prep_rwcp.sh
6.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/bin/bash
# Copyright 2015 Johns Hopkins University (author: Vijayaditya Peddinti)
# Apache 2.0
# This script downloads the RWCP impulse responses and ambient noise
# (Provided by MERL)
#-----------------------------------------
# all data is headerless binary type with little endian and 48 KHz
# impulse responses are float32 (4 byte)
# noises are short (2 byte)
# Data is multi-channel and each directory has a recording with each channel
# as a seperate file
# Impulse responses
download=true
sampling_rate=8k
output_bit=16
DBname=RWCP
file_splitter= #script to generate job scripts given the command file
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if [ $# != 3 ]; then
echo "Usage: "
echo " $0 [options] <rir-home> <output-dir> <log-dir>"
echo "e.g.:"
echo " $0 --download true db/RIR_databases/ data/impulses_noises exp/make_reverb/log"
exit 1;
fi
RIR_home=$1
output_dir=$2
log_dir=$3
if [ "$download" = true ]; then
mkdir -p $RIR_home
# RWCP sound scene database
#==========================
(cd $RIR_home;
rm -rf RWCP.tar.gz
wget http://www.openslr.org/resources/13/RWCP.tar.gz || exit 1;
tar -zxvf RWCP.tar.gz >/dev/null
)
fi
RWCP_home=$RIR_home/RWCP
RWCP_dirs=[]
RWCP_dirs[0]=$RWCP_home/micarray/MICARRAY/data1/
RWCP_dirs[1]=$RWCP_home/micarray/MICARRAY/data3/
RWCP_dirs[2]=$RWCP_home/micarray/MICARRAY/data5/
command_file=$log_dir/RWCP_read_rir_noise.sh
echo "">$command_file
# micarray database
type_num=0
for base_dir_name in ${RWCP_dirs[@]}; do
type_num=$((type_num + 1))
leaf_directories=( $(find $base_dir_name -type d -links 2 -print || exit -1) )
files_done=0
total_files=$(echo ${leaf_directories[@]}|wc -w)
echo "Found ${total_files} impulse responses in ${base_dir_name}."
echo "" > $log_dir/RWCP_type$type_num.rir.list
# create the list of commands to be executed
for leaf_dir_name in ${leaf_directories[@]}; do
first_channel=$(ls $leaf_dir_name|sed -e"s/.*\.//g"|sort -n|head -1)
last_channel=$(ls $leaf_dir_name|sed -e"s/.*\.//g"|sort -nr|head -1)
file_base_name=$(basename $leaf_dir_name)
output_file_name=`echo ${leaf_dir_name#$base_dir_name}| sed -e"s/[\/\]\+/_/g" | tr '[:upper:]' '[:lower:]'`
output_file_name=RWCP_type${type_num}_rir_${output_file_name}.wav
channel_files=
for i in `seq $first_channel $last_channel`; do
channel_files="$channel_files -t raw -e float -b 32 -c 1 -r 48k $leaf_dir_name/$file_base_name.$i ";
done
echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file
echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.rir.list
files_done=$((files_done + 1))
done
done
# robot - non-directional microphone
# sox is not able to handle input scaling and there is lot of clipping
# so we scale the values in python first
tempdir_robo=`mktemp -d $PWD/tempXXXX`
cat << EOF > $tempdir_robo/raw_read.py
import sys, numpy as np, argparse, scipy.signal as signal, os.path, glob, scipy.io, scipy.io.wavfile
precision = np.float32
file_handle = open(sys.argv[1], 'rb')
data = np.fromfile(file_handle, dtype = precision)
data = (0.9 * data / np.max(np.abs(data))) * (2**31)
data = data.astype('int32', copy = False)
scipy.io.wavfile.write(sys.argv[2], 48000, data)
EOF
type_num=$((type_num + 1))
data_files=( $(find $RWCP_home/robot/data/non -name '*.dat' -type f -print || exit -1) )
files_done=0
total_files=$(echo ${data_files[@]}|wc -w)
echo "" > $log_dir/RWCP_type$type_num.rir.list
echo "Found $total_files impulse responses in ${RWCP_home}/robot/data/non."
# create the list of commands to be executed
for data_file in ${data_files[@]}; do
temp_file=$tempdir_robo/$files_done.wav
python $tempdir_robo/raw_read.py $data_file $temp_file
output_file_name=RWCP_type${type_num}_rir_`basename $data_file .dat | tr '[:upper:]' '[:lower:]'`.wav
echo "sox -t wav $temp_file -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file
echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.rir.list
files_done=$((files_done + 1))
done
# Ambient noise
type_num=$((type_num + 1))
base_dir_name=$RWCP_home/micarray/MICARRAY/data6/
leaf_directories=( $(find $base_dir_name -type d -links 2 -print || exit -1) )
files_done=0
total_files=$(echo ${leaf_directories[@]}|wc -w)
echo "" > $log_dir/RWCP_type$type_num.noise.list
echo "Found $total_files noises in ${base_dir_name}."
for leaf_dir_name in ${leaf_directories[@]}; do
first_channel=$(ls $leaf_dir_name|sed -e"s/.*\.//g"|sort -n|head -1)
last_channel=$(ls $leaf_dir_name|sed -e"s/.*\.//g"|sort -nr|head -1)
file_base_name=$(basename $leaf_dir_name)
output_file_name=`echo ${leaf_dir_name#$base_dir_name}| sed -e"s/[\/\]\+/_/g" | tr '[:upper:]' '[:lower:]'`
output_file_name=RWCP_type${type_num}_noise_${output_file_name}.wav
channel_files=
for i in `seq $first_channel $last_channel`; do
channel_files="$channel_files -t raw -e signed-integer -b 16 -c 1 -r 48k $leaf_dir_name/$file_base_name.$i ";
done
echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file
echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.noise.list
files_done=$((files_done + 1))
done
if [ ! -z "$file_splitter" ]; then
num_jobs=$($file_splitter $command_file || exit 1)
job_file=${command_file%.sh}.JOB.sh
else
num_jobs=1
job_file=$command_file
fi
if [ ! -z "$file_splitter" ]; then
num_jobs=$($file_splitter $command_file || exit 1)
job_file=${command_file%.sh}.JOB.sh
job_log=${command_file%.sh}.JOB.log
else
num_jobs=1
job_file=$command_file
job_log=${command_file%.sh}.log
fi
# execute the commands using the above created array jobs
time $decode_cmd --max-jobs-run 40 JOB=1:$num_jobs $job_log \
sh $job_file || exit 1;
# get the RWCP database noise mic and room settings to pair with corresponding impulse responses
type_num=5
noise_patterns=( $(ls ${output_dir}/RWCP_type${type_num}_noise*.wav | xargs -n1 basename | python -c"
import sys
for line in sys.stdin:
name = line.split('RWCP_type${type_num}_noise')[1]
print '_'.join(name.split('_')[1:-1])
"|sort -u) )
for noise_pattern in ${noise_patterns[@]}; do
set_file=$output_dir/info/noise_impulse_RWCP_$noise_pattern
echo -n "noise_files=" > $set_file
ls ${output_dir}/*${noise_pattern}*.wav | grep "type${type_num}" | grep "noise" | awk '{ ORS=" "; print; } END{print "\n"}' >> $set_file
echo -n "impulse_files=" >> $set_file
ls ${output_dir}/*${noise_pattern}*.wav | grep -v "type${type_num}" | grep "rir" | awk '{ ORS=" "; print; } END{print "\n"}' >> $set_file
done
# remove the tempdir we created to tackle the scaling problem
rm -rf $tempdir_robo