Blame view
egs/reverb/s5/local/prepare_simu_data.sh
6.83 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
#!/bin/bash
#
# Copyright 2018 Johns Hopkins University (Author: Shinji Watanabe)
# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
# Apache 2.0
# This script is adapted from data preparation scripts in the Kaldi reverb recipe
# https://github.com/kaldi-asr/kaldi/tree/master/egs/reverb/s5/local
#
# Builds the file lists and data directories for the simulated REVERB data.
#
# Usage: local/prepare_simu_data.sh [opts] <reverb-dir> <wsjcam0-dir>
#
# For every channel configuration (1ch, 2ch, 8ch) the script reads the
# "SimData" task files under
#   data/local/reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/<N>ch
# and writes, under data/local/data, wav lists (*_wav.scp), transcripts
# (*.trans1, *.txt) and utt2spk/spk2utt maps.  It then populates
#   data/{tr,dt,et}_simu_<N>ch
#   data/{dt,et}_simu_<N>ch_wpe
#   data/{dt,et}_simu_{2,8}ch_beamformit
#   data/{dt,et}_cln
# and runs ./utils/fix_data_dir.sh on each of them.
#
# Must be run from the recipe directory: ./utils, ./path.sh and ./local are
# referenced relative to the current working directory.

# Begin configuration section.
wavdir=${PWD}/wav  # root of the training wavs and of the WPE-processed wavs
# End configuration section

# NOTE(review): parse_options.sh presumably lets "--wavdir <dir>" override the
# default above -- confirm against utils/parse_options.sh.
. ./utils/parse_options.sh
. ./path.sh

echo >&2 "$0" "$@"
if [ $# -ne 2 ]; then
  echo >&2 "$0" "$@"
  echo >&2 "$0: Error: wrong number of arguments"
  echo -e >&2 "Usage: $0 [opts] <reverb-dir> <wsjcam0-dir>"
  echo -e >&2 "eg: $0 /export/corpora5/REVERB_2014/REVERB /export/corpora3/LDC/LDC95S24/wsjcam0"
  exit 1
fi

set -e -o pipefail

reverb=$1
wsjcam0=$2

# tool directory
tooldir=${PWD}/data/local/reverb_tools

# working directory
dir=${PWD}/data/local/data
mkdir -p "${dir}"

# make a one dot file for train, dev, and eval data
# the directory structure of WSJCAM0 is not consistent and we need such process for each task
cp "${wsjcam0}/data/primary_microphone/etc/si_tr.dot" "${dir}/tr.dot"
cat "${wsjcam0}"/data/primary_microphone/etc/si_dt*.dot | sort > "${dir}/dt.dot"
cat "${wsjcam0}"/data/*/si_et*/*/*.dot | sort > "${dir}/et.dot"

noiseword="<NOISE>"

#######################################
# Print the names of the SimData task files of one data set, one per line.
# Globals:   taskdir (read)
# Arguments: $1 - data set (tr, dt or et)
#            $2 - optional extended regex the file name must also match
#                 (e.g. 'far|near'); empty or omitted selects every file
# Outputs:   matching file names (without directory) on stdout
#######################################
list_task_files() {
  local task=$1 filter=${2:-} path name
  for path in "${taskdir}"/*; do
    name=${path##*/}
    [[ ${name} == *SimData* && ${name} == *"_${task}_"* ]] || continue
    if [[ -n ${filter} && ! ${name} =~ ${filter} ]]; then
      continue
    fi
    printf '%s\n' "${name}"
  done
}

#######################################
# Emit "<utt-id> <wav-path>" lines for one data set.
# Each task-file line whose last path component starts with 8 word characters
# yields one record: the id is those 8 characters, the path is
# <wav-root>/REVERB_WSJCAM0_<task>/data followed by the task-file line.
# sed then rewrites the id to <first 3 chars>_<task-file name>_<id>.
# Globals:   taskdir (read)
# Arguments: $1 - data set (tr, dt or et)
#            $2 - wav root directory
#            $3 - optional task-file name filter (see list_task_files)
# Outputs:   wav list on stdout
#######################################
make_wav_list() {
  local task=$1 wavroot=$2 filter=${3:-} x
  while IFS= read -r x; do
    # One record per line: the trailing "\n" is what lets sed and every later
    # line-oriented tool see each utterance separately.
    perl -se 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, " ", $dir, $_, "\n"; } }' \
      -- "-dir=${wavroot}/REVERB_WSJCAM0_${task}/data" "${taskdir}/${x}" \
      | sed -e "s/^\(...\)/\1_${x}_\1/"
  done < <(list_task_files "${task}" "${filter}")
}

#######################################
# Emit the raw transcripts for one data set.
# The 8-character ids are extracted from the task files (one per line), piped
# through local/find_transcripts_singledot.pl together with the data set's
# merged .dot file, and the leading id of each output line is rewritten the
# same way as in make_wav_list.
# Globals:   taskdir, dir (read)
# Arguments: $1 - data set (tr, dt or et)
#            $2 - optional task-file name filter (see list_task_files)
# Outputs:   transcript lines on stdout
#######################################
make_transcript() {
  local task=$1 filter=${2:-} x
  while IFS= read -r x; do
    perl -e 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, "\n"; } }' \
      "${taskdir}/${x}" \
      | perl local/find_transcripts_singledot.pl "${dir}/${task}.dot" \
      | sed -e "s/^\(...\)/\1_${x}_\1/"
  done < <(list_task_files "${task}" "${filter}")
}

#######################################
# Derive utt2spk and spk2utt from a wav list.
# The speaker is the part of the utterance id before the first underscore.
# Arguments: $1 - wav list to read
#            $2 - output prefix; writes <prefix>.utt2spk and <prefix>.spk2utt
#######################################
make_spk_maps() {
  local scp=$1 prefix=$2
  awk '{ split($1, parts, "_"); print $1 " " parts[1] }' "${scp}" \
    > "${prefix}.utt2spk"
  ./utils/utt2spk_to_spk2utt.pl < "${prefix}.utt2spk" > "${prefix}.spk2utt"
}

#######################################
# Populate one data directory and run fix_data_dir.sh on it.
# Arguments: $1 - data directory to create
#            $2 - wav list to sort into <datadir>/wav.scp
#            $3 - prefix of the .txt/.utt2spk/.spk2utt files to sort into
#                 text, utt2spk and spk2utt
#            $4... - optional sed expressions applied, in order, to the
#                 sorted wav list
#######################################
install_data_dir() {
  local datadir=$1 wav_scp=$2 prefix=$3
  shift 3
  mkdir -p "${datadir}"
  if [ $# -eq 0 ]; then
    sort "${wav_scp}" > "${datadir}/wav.scp"
  else
    local sed_args=() expr
    for expr in "$@"; do
      sed_args+=(-e "${expr}")
    done
    sort "${wav_scp}" | sed "${sed_args[@]}" > "${datadir}/wav.scp"
  fi
  sort "${prefix}.txt" > "${datadir}/text"
  sort "${prefix}.utt2spk" > "${datadir}/utt2spk"
  sort "${prefix}.spk2utt" > "${datadir}/spk2utt"
  ./utils/fix_data_dir.sh "${datadir}"
}

# The helpers above are deliberately called without a trailing "|| exit 1":
# on the left of "||" errexit is disabled, so a failure in any iteration but
# the last would go unnoticed.  "set -e -o pipefail" already aborts the script.
for nch in 1 2 8; do
  taskdir=${tooldir}/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/${nch}ch
  if [ ! -d "${taskdir}" ]; then
    echo >&2 "$0: Error: task file directory ${taskdir} not found"
    exit 1
  fi

  # make a wav list
  # training wavs are read from ${wavdir}; dt/et wavs from the REVERB corpus
  make_wav_list tr "${wavdir}" > "${dir}/tr_simu_${nch}ch_wav.scp"
  for task in dt et; do
    make_wav_list "${task}" "${reverb}" 'far|near' \
      > "${dir}/${task}_simu_${nch}ch_wav.scp"
    if [[ ${nch} == 1 ]]; then
      # the clean condition is only listed from the 1ch task files
      make_wav_list "${task}" "${reverb}" 'cln' > "${dir}/${task}_cln_wav.scp"
    fi
  done

  # wav lists pointing at the WPE output tree under ${wavdir}/WPE/<N>ch
  make_wav_list tr "${wavdir}/WPE/${nch}ch" \
    > "${dir}/tr_simu_${nch}ch_wpe_wav.scp"
  for task in dt et; do
    make_wav_list "${task}" "${wavdir}/WPE/${nch}ch" 'far|near' \
      > "${dir}/${task}_simu_${nch}ch_wpe_wav.scp"
  done

  # make a transcript
  make_transcript tr > "${dir}/tr_simu_${nch}ch.trans1"
  local/normalize_transcript.pl "${noiseword}" \
    < "${dir}/tr_simu_${nch}ch.trans1" > "${dir}/tr_simu_${nch}ch.txt"
  for task in dt et; do
    make_transcript "${task}" 'far|near' > "${dir}/${task}_simu_${nch}ch.trans1"
    local/normalize_transcript.pl "${noiseword}" \
      < "${dir}/${task}_simu_${nch}ch.trans1" > "${dir}/${task}_simu_${nch}ch.txt"
    if [[ ${nch} == 1 ]]; then
      make_transcript "${task}" 'cln' > "${dir}/${task}_cln.trans1"
      local/normalize_transcript.pl "${noiseword}" \
        < "${dir}/${task}_cln.trans1" > "${dir}/${task}_cln.txt"
    fi
  done

  # Make the utt2spk and spk2utt files.
  for task in tr dt et; do
    make_spk_maps "${dir}/${task}_simu_${nch}ch_wav.scp" \
      "${dir}/${task}_simu_${nch}ch"
  done
  if [[ ${nch} == 1 ]]; then
    # the clean lists exist only once, so their maps are built only once
    for task in dt et; do
      make_spk_maps "${dir}/${task}_cln_wav.scp" "${dir}/${task}_cln"
    done
  fi
done

# finally copy the above files to the data directory
for nch in 1 2 8; do
  for task in tr dt et; do
    install_data_dir "data/${task}_simu_${nch}ch" \
      "${dir}/${task}_simu_${nch}ch_wav.scp" "${dir}/${task}_simu_${nch}ch"

    if [[ ${task} != 'tr' ]]; then
      # The WPE and BeamformIt directories reuse the 1ch lists, with the
      # wav paths redirected to the <N>ch WPE output tree.
      install_data_dir "data/${task}_simu_${nch}ch_wpe" \
        "${dir}/${task}_simu_1ch_wpe_wav.scp" "${dir}/${task}_simu_1ch" \
        "s/WPE\/1ch/WPE\/${nch}ch/"

      if [[ ${nch} != 1 ]]; then
        install_data_dir "data/${task}_simu_${nch}ch_beamformit" \
          "${dir}/${task}_simu_1ch_wpe_wav.scp" "${dir}/${task}_simu_1ch" \
          "s/ch1/bf${nch}/" "s/WPE\/1ch/WPE\/${nch}ch/"
      else
        install_data_dir "data/${task}_cln" \
          "${dir}/${task}_cln_wav.scp" "${dir}/${task}_cln"
      fi
    fi
  done
done