prep_rwcp.sh 6.67 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179


#!/bin/bash
# Copyright 2015  Johns Hopkins University (author: Vijayaditya Peddinti)
# Apache 2.0
# This script downloads the RWCP impulse responses and ambient noise
# (Provided by MERL)
#-----------------------------------------
# all data is headerless binary type with little endian and 48 KHz
# impulse responses are float32 (4 byte)
# noises are short (2 byte)
# Data is multi-channel and each directory has a recording with each channel
# as a separate file
# Impulse responses


# Default option values. They are assigned before utils/parse_options.sh is
# sourced so that it can see (and potentially override) them.
download=true      # when "true", fetch and unpack RWCP.tar.gz under <rir-home>
sampling_rate=8k   # passed to sox as the output rate (-r)
output_bit=16      # passed to sox as the output sample size (-b)
DBname=RWCP        # database label
file_splitter=     # script to generate job scripts given the command file

# Environment set-up and command-line option handling.
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# Exactly three positional arguments must remain at this point.
if [ $# -ne 3 ]; then
  printf '%s\n' \
    "Usage: " \
    "  $0 [options] <rir-home> <output-dir> <log-dir>" \
    "e.g.:" \
    " $0  --download true db/RIR_databases/ data/impulses_noises exp/make_reverb/log"
  exit 1
fi

RIR_home=$1     # directory holding (or receiving) the RWCP database
output_dir=$2   # destination of the generated wav files
log_dir=$3      # destination of command files and file lists

# Optionally download and unpack the RWCP archive into $RIR_home.
if [ "$download" = true ]; then
  mkdir -p "$RIR_home" || exit 1
  # RWCP sound scene database
  #==========================
  # The work happens in a subshell so the caller's working directory is left
  # untouched. An `exit` inside ( ... ) only terminates the subshell, so the
  # subshell's status is checked afterwards to really abort the script when
  # the cd, download or extraction fails.
  (
    cd "$RIR_home" || exit 1
    rm -rf RWCP.tar.gz
    wget http://www.openslr.org/resources/13/RWCP.tar.gz || exit 1
    tar -zxvf RWCP.tar.gz >/dev/null || exit 1
  ) || exit 1
fi

RWCP_home=$RIR_home/RWCP
# Microphone-array impulse-response directories; each one is assigned its own
# type number (1, 2, 3) in the loop below.
RWCP_dirs=(
  "$RWCP_home/micarray/MICARRAY/data1/"
  "$RWCP_home/micarray/MICARRAY/data3/"
  "$RWCP_home/micarray/MICARRAY/data5/"
)

# File collecting one sox command per output wav; it is executed later.
# It is (re)initialised with a single empty line.
command_file=$log_dir/RWCP_read_rir_noise.sh
echo "" > "$command_file"

# micarray database
type_num=0
for base_dir_name in "${RWCP_dirs[@]}"; do
  type_num=$((type_num + 1))
  # Leaf directories are selected through a hard-link count of 2 (a directory
  # without sub-directories on traditional Unix filesystems -- verify if the
  # data lives on a filesystem that does not maintain directory link counts).
  # The find status is checked outside the command substitution: an `exit`
  # placed inside $( ... ) would only end the substitution's subshell.
  found_dirs=$(find "$base_dir_name" -type d -links 2 -print) || {
    echo "$0: failed to search $base_dir_name" >&2
    exit 1
  }
  # Intentionally unquoted: split the find output into one element per path.
  leaf_directories=( $found_dirs )
  total_files=${#leaf_directories[@]}
  echo "Found ${total_files} impulse responses in ${base_dir_name}."
  rir_list=$log_dir/RWCP_type$type_num.rir.list
  echo "" > "$rir_list"
  # create the list of commands to be executed
  for leaf_dir_name in "${leaf_directories[@]}"; do
    # Channel files are named <dir-basename>.<channel>; the numeric suffixes
    # give the range of channels to merge.
    first_channel=$(ls "$leaf_dir_name" | sed -e "s/.*\.//g" | sort -n | head -1)
    last_channel=$(ls "$leaf_dir_name" | sed -e "s/.*\.//g" | sort -nr | head -1)
    file_base_name=$(basename "$leaf_dir_name")
    # Output name: path relative to the base directory, with runs of path
    # separators turned into "_" and lower-cased.
    output_file_name=$(echo "${leaf_dir_name#"$base_dir_name"}" \
      | sed -e "s/[\/\]\+/_/g" | tr '[:upper:]' '[:lower:]')
    output_file_name=RWCP_type${type_num}_rir_${output_file_name}.wav
    # One headerless float32 / 48k mono input specification per channel.
    channel_files=
    for i in $(seq $first_channel $last_channel); do
      channel_files="$channel_files -t raw -e float -b 32 -c 1 -r 48k  $leaf_dir_name/$file_base_name.$i "
    done
    # sox -M merges the mono inputs into one multi-channel output file.
    echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> "$command_file"
    echo "${output_dir}/${output_file_name}" >> "$rir_list"
  done
done

# robot - non-directional microphone
# sox is not able to handle input scaling and there is lot of clipping
# so we scale the values in python first
tempdir_robo=$(mktemp -d "$PWD/tempXXXX") || exit 1
# Make sure the scratch directory is also removed when the script stops early
# on an error; the expansion is deferred until the trap fires.
trap 'rm -rf -- "$tempdir_robo"' EXIT

# Helper that reads a headerless float32 file (argv[1]), rescales it so the
# peak magnitude is 0.9 of the int32 full scale and writes a 48 kHz wav
# (argv[2]). The delimiter is quoted so the shell never expands the body.
cat << 'EOF' > "$tempdir_robo/raw_read.py"
import sys, numpy as np, argparse, scipy.signal as signal, os.path, glob, scipy.io, scipy.io.wavfile
precision = np.float32
file_handle = open(sys.argv[1], 'rb')
data = np.fromfile(file_handle, dtype = precision)
data = (0.9 * data / np.max(np.abs(data))) * (2**31)
data = data.astype('int32', copy = False)
scipy.io.wavfile.write(sys.argv[2], 48000, data)
EOF

# The type number continues from the value left by the preceding section.
type_num=$((type_num + 1))
# The find status is checked outside the command substitution: an `exit`
# placed inside $( ... ) would only end the substitution's subshell.
found_files=$(find "$RWCP_home/robot/data/non" -name '*.dat' -type f -print) || {
  echo "$0: failed to search $RWCP_home/robot/data/non" >&2
  exit 1
}
# Intentionally unquoted: split the find output into one element per path.
data_files=( $found_files )
files_done=0
total_files=${#data_files[@]}
echo "" > "$log_dir/RWCP_type$type_num.rir.list"
echo "Found $total_files impulse responses in ${RWCP_home}/robot/data/non."
# create the list of commands to be executed
for data_file in "${data_files[@]}"; do
  # files_done gives every intermediate wav a unique name in the temp dir.
  temp_file=$tempdir_robo/$files_done.wav
  # The conversion runs immediately; only the sox resampling step is deferred
  # to the command file.
  python "$tempdir_robo/raw_read.py" "$data_file" "$temp_file" || {
    echo "$0: failed to convert $data_file" >&2
    exit 1
  }
  output_file_name=RWCP_type${type_num}_rir_$(basename "$data_file" .dat | tr '[:upper:]' '[:lower:]').wav
  echo "sox -t wav $temp_file -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}"   >> "$command_file"
  echo "${output_dir}/${output_file_name}" >> "$log_dir/RWCP_type$type_num.rir.list"
  files_done=$((files_done + 1))
done

# Ambient noise
# The type number continues from the value left by the preceding section.
type_num=$((type_num + 1))
base_dir_name=$RWCP_home/micarray/MICARRAY/data6/
# Leaf directories are selected through a hard-link count of 2. The find
# status is checked outside the command substitution: an `exit` placed inside
# $( ... ) would only end the substitution's subshell.
found_dirs=$(find "$base_dir_name" -type d -links 2 -print) || {
  echo "$0: failed to search $base_dir_name" >&2
  exit 1
}
# Intentionally unquoted: split the find output into one element per path.
leaf_directories=( $found_dirs )
total_files=${#leaf_directories[@]}
noise_list=$log_dir/RWCP_type$type_num.noise.list
echo "" > "$noise_list"
echo "Found $total_files noises in ${base_dir_name}."
for leaf_dir_name in "${leaf_directories[@]}"; do
  # Channel files are named <dir-basename>.<channel>; the numeric suffixes
  # give the range of channels to merge.
  first_channel=$(ls "$leaf_dir_name" | sed -e "s/.*\.//g" | sort -n | head -1)
  last_channel=$(ls "$leaf_dir_name" | sed -e "s/.*\.//g" | sort -nr | head -1)
  file_base_name=$(basename "$leaf_dir_name")
  # Output name: path relative to the base directory, with runs of path
  # separators turned into "_" and lower-cased.
  output_file_name=$(echo "${leaf_dir_name#"$base_dir_name"}" \
    | sed -e "s/[\/\]\+/_/g" | tr '[:upper:]' '[:lower:]')
  output_file_name=RWCP_type${type_num}_noise_${output_file_name}.wav
  # One headerless 16-bit signed / 48k mono input specification per channel.
  channel_files=
  for i in $(seq $first_channel $last_channel); do
    channel_files="$channel_files -t raw -e signed-integer -b 16 -c 1 -r 48k  $leaf_dir_name/$file_base_name.$i "
  done
  # sox -M merges the mono inputs into one multi-channel output file.
  echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> "$command_file"

  echo "${output_dir}/${output_file_name}" >> "$noise_list"
done

# Decide which script and log file the job runner will use.
#
# With a non-empty $file_splitter, the splitter is run exactly once on the
# command file and whatever it prints on stdout becomes the job count; the
# script/log names then carry a literal "JOB" placeholder. Without a splitter
# the whole command file is run as a single job.
#
# Globals read:    file_splitter, command_file
# Globals written: num_jobs, job_file, job_log
select_job_files() {
  if [ -n "$file_splitter" ]; then
    # $file_splitter is left unquoted so it may carry its own arguments.
    # The status is tested outside $( ... ): an `exit` inside the command
    # substitution would only leave its subshell.
    num_jobs=$($file_splitter "$command_file") || exit 1
    job_file=${command_file%.sh}.JOB.sh
    job_log=${command_file%.sh}.JOB.log
  else
    num_jobs=1
    job_file=$command_file
    job_log=${command_file%.sh}.log
  fi
}
select_job_files

# Run the generated commands through the job runner held in $decode_cmd
# (not set in this file -- presumably provided by cmd.sh; confirm there).
# The run is timed, and any failure aborts the script with status 1.
if ! time $decode_cmd --max-jobs-run 40 JOB=1:$num_jobs $job_log \
  sh $job_file; then
  exit 1
fi

# get the RWCP database noise mic and room settings to pair with corresponding impulse responses
# type_num is fixed to 5, the type number given to the ambient-noise files
# (three micarray directories, then robot, then noise).
type_num=5
# The per-pattern description files are written below this directory.
mkdir -p "$output_dir/info"
# For every noise wav, drop the "RWCP_type<N>_noise" prefix and the last
# "_"-separated field; the remaining middle part is the pattern. The inline
# Python uses the function form of print so that it runs under both
# Python 2 and Python 3.
noise_patterns=( $(ls "${output_dir}"/RWCP_type${type_num}_noise*.wav | xargs -n1 basename | python -c "
import sys
for line in sys.stdin:
  name = line.split('RWCP_type${type_num}_noise')[1]
  print('_'.join(name.split('_')[1:-1]))
" | sort -u) )

for noise_pattern in "${noise_patterns[@]}"; do
  # Each set file gets two lines: noise_files=<...> and impulse_files=<...>,
  # with the file names separated by two spaces.
  set_file=$output_dir/info/noise_impulse_RWCP_$noise_pattern
  echo -n "noise_files=" > "$set_file"
  # Noise files: names containing the pattern that belong to the noise type.
  ls "${output_dir}"/*"${noise_pattern}"*.wav | grep "type${type_num}" | grep "noise" | awk '{ ORS="  "; print; } END{print "\n"}' >> "$set_file"
  echo -n "impulse_files=" >> "$set_file"
  # Impulse responses: names containing the pattern from every other type.
  ls "${output_dir}"/*"${noise_pattern}"*.wav | grep -v "type${type_num}" | grep "rir" | awk '{ ORS="  "; print; } END{print "\n"}' >> "$set_file"
done


# remove the tempdir we created to tackle the scaling problem
# The path is built from $PWD, so it is quoted (and preceded by --) to keep
# rm -rf from acting on word-split fragments; nothing is done when unset.
if [ -n "$tempdir_robo" ]; then
  rm -rf -- "$tempdir_robo"
fi