Blame view
egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh
3.98 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
#!/bin/bash
set -e

# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12

# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015

# Purpose:
#   Build the data directories data/{tr05,dt05,et05}_simu_<enhancement-name>
#   (wav.scp, text, utt2spk, spk2utt) for enhanced simulated CHiME4 speech.
# Usage:
#   simu_enhan_chime4_data_prep.sh [--eval-flag <true|false>] \
#     <enhancement-name> <enhanced-speech-directory>
# Must be run from the recipe directory: utils/, local/ and path.sh are
# referenced relative to the current working directory.
# Expects data/local/data/dot_files.flist and data/local/data/{dt05,et05}_simu.dot
# to exist already; intermediate files are written to data/local/data.

# Config:
eval_flag=true # make it true when the evaluation data are released

. utils/parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  printf '\nUSAGE: %s <enhancement-name> <enhanced-speech-directory>\n\n' "$(basename "$0")"
  echo "The argument should be the directory that only contains enhanced speech data."
  exit 1;
fi

echo "$0 $*"  # Print the command line for logging

enhan=$1
audio_dir=$2

# The script changes into data/local/data before running find, so a relative
# <enhanced-speech-directory> must be made absolute first; otherwise it would
# be resolved against the wrong directory and wav.scp would hold unusable paths.
if [ ! -d "$audio_dir" ]; then
  echo "$0: enhanced speech directory '$audio_dir' does not exist" >&2
  exit 1;
fi
audio_dir=$(cd "$audio_dir" && pwd)

dir=$(pwd)/data/local/data
mkdir -p "$dir"
local=$(pwd)/local
utils=$(pwd)/utils
odir=$(pwd)/data

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe" >&2
  exit 1;
fi

# Subsets to process; the evaluation set is only included when eval_flag is true.
sets=(tr05 dt05)
if $eval_flag; then
  sets+=(et05)
fi

# Names of the data directories to create, e.g. tr05_simu_<enhan>.
list_set=()
for s in "${sets[@]}"; do
  list_set+=("${s}_simu_${enhan}")
done

cd "$dir"

# Collect the enhanced wav files of each subset (bus/caf/ped/str environments).
# No pipefail on purpose: grep matching nothing must not abort the script.
for s in "${sets[@]}"; do
  find "$audio_dir/" -name '*.wav' \
    | grep "${s}_bus_simu\|${s}_caf_simu\|${s}_ped_simu\|${s}_str_simu" \
    | sort -u > "${s}_simu_${enhan}.flist"
done

# make a scp file from file list
for x in "${list_set[@]}"; do
  # Utterance id = file basename with the .wav extension replaced by _SIMU
  # (anchored so that only the extension is rewritten).
  awk -F'[/]' '{print $NF}' "$x.flist" | sed -e 's/\.wav$/_SIMU/' > "${x}_wav.ids"
  paste -d" " "${x}_wav.ids" "$x.flist" | sort -k 1 > "${x}_wav.scp"
done

# make a transcription from dot
# simulation training data extract dot file from original WSJ0 data
# since it is generated from these data
if [ ! -e dot_files.flist ]; then
  echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh" >&2
  exit 1;
fi
# The second '_'-separated field of each wav.scp line is lower-cased and
# looked up in the dot files listed in dot_files.flist.
awk -F'[_]' '{print $2}' "tr05_simu_${enhan}_wav.scp" \
  | tr '[:upper:]' '[:lower:]' \
  | "$local/find_noisy_transcripts.pl" dot_files.flist \
  | cut -f 2- -d" " > "tr05_simu_$enhan.txt"
cut -f 1 -d" " "tr05_simu_${enhan}_wav.scp" > "tr05_simu_$enhan.ids"
paste -d" " "tr05_simu_$enhan.ids" "tr05_simu_$enhan.txt" \
  | sort -k 1 > "tr05_simu_$enhan.trans1"

# dt05 and et05 simulation data are generated from the CHiME4 booth recording
# and we use CHiME4 dot files
for s in "${sets[@]}"; do
  if [ "$s" = tr05 ]; then
    continue  # training transcripts were built from the WSJ0 dot files above
  fi
  if [ ! -e "${s}_simu.dot" ]; then
    # Without this check a missing dot file would silently give empty transcripts.
    echo "Could not find $dir/${s}_simu.dot" >&2
    exit 1;
  fi
  # Each dot line ends with "(<utterance-id>)": the id (plus _SIMU) becomes the
  # key, and the line with the parenthesised part removed becomes the text.
  sed -e 's/(\(.*\))/\1/' "${s}_simu.dot" \
    | awk '{print $NF "_SIMU"}' > "${s}_simu_$enhan.ids"
  sed -e 's/(.*)//' "${s}_simu.dot" > "${s}_simu_$enhan.txt"
  paste -d" " "${s}_simu_$enhan.ids" "${s}_simu_$enhan.txt" \
    | sort -k 1 > "${s}_simu_$enhan.trans1"
done

# Do some basic normalization steps.  At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in "${list_set[@]}"; do
  # Note: this overwrites the text-only $x.txt written above with the
  # normalized "<utt-id> <words>" transcript.
  "$local/normalize_transcript.pl" "$noiseword" < "$x.trans1" \
    | sort > "$x.txt" || exit 1;
done

# Make the utt2spk and spk2utt files.
for x in "${list_set[@]}"; do
  # Speaker id = the part of the utterance id before the first '_'.
  awk -F'_' '{print $1}' "${x}_wav.scp" > "$x.spk"
  awk '{print $1}' "${x}_wav.scp" > "$x.utt"
  paste -d" " "$x.utt" "$x.spk" > "$x.utt2spk"
  "$utils/utt2spk_to_spk2utt.pl" < "$x.utt2spk" > "$x.spk2utt" || exit 1;
done

# copying data to data/...
for x in "${list_set[@]}"; do
  mkdir -p "$odir/$x"
  cp "${x}_wav.scp" "$odir/$x/wav.scp" || exit 1;
  cp "${x}.txt"     "$odir/$x/text"    || exit 1;
  cp "${x}.spk2utt" "$odir/$x/spk2utt" || exit 1;
  cp "${x}.utt2spk" "$odir/$x/utt2spk" || exit 1;
done

echo "Data preparation succeeded"