Blame view
egs/fisher_callhome_spanish/s5/local/callhome_data_prep.sh
6.19 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
#!/bin/bash
#
# Copyright 2014  Gaurav Kumar.   Apache 2.0
#
# Prepares a Kaldi data directory from the Callhome Spanish dataset.
#
# Usage (to be run from one directory above this script):
#   local/callhome_data_prep.sh <abs-path>/LDC96S35 <abs-path>/LDC96T17
#
# Inputs:  the Callhome Spanish speech directory (*.sph files) and the
#          matching transcript directory, given as ABSOLUTE paths because
#          they are symlinked from data/local/data/links.
# Outputs: data/local/data/callhome_train_all/{text,segments,utt2spk,
#          spk2utt,wav.scp,reco2file_and_channel,spk2gender}
#
# Note: when creating your own data preparation scripts, it's a good idea
# to make sure that the speaker id (if present) is a prefix of the utterance
# id, that the output scp file is sorted on utterance id, and that the
# transcription file is exactly the same length as the scp file and is also
# sorted on utterance id (missing transcriptions should be removed from the
# scp file using e.g. scripts/filter_scp.pl)

stage=0

export LC_ALL=C

# Print a message to stderr and abort.
die() {
  echo "$*" >&2
  exit 1
}

# Count the files below directory $1 whose name matches pattern $2
# (case-insensitively).
count_files() {
  find "$1" -iname "$2" | wc -l
}

if [ $# -lt 2 ]; then
  die "Arguments should be the location of the Callhome Spanish Speech and Transcript Directories, see ../run.sh for example."
fi

cdir=$(pwd)
dir=$cdir/data/local/data
local=$cdir/local
utils=$cdir/utils
tmpdir=$cdir/data/local/tmp

. ./path.sh || exit 1; # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  die "Could not find (or execute) the sph2pipe program at $sph2pipe"
fi

# Make sure the working directories exist before anything is written to them.
mkdir -p "$dir" "$tmpdir" || exit 1
cd "$dir" || exit 1

# Make a directory of links to the Callhome speech and transcript
# directories. This relies on the command line arguments being absolute
# pathnames.
#rm -r links/ 2>/dev/null
mkdir -p links/
for src in "$@"; do
  link=links/$(basename "$src")
  # Keep an existing link so that the script can be re-run.
  if [ ! -e "$link" ] && [ ! -L "$link" ]; then
    ln -s "$src" "$link" || exit 1
  fi
done

# Basic spot checks to see if we got the data that we needed
if [ ! -d links/LDC96S35 ] || [ ! -d links/LDC96T17 ]; then
  die "The speech and the data directories need to be named LDC96S35 and LDC96T17 respectively"
fi

speech_root=links/LDC96S35/CALLHOME/SPANISH/SPEECH
if [ ! -d "$speech_root/DEVTEST" ] || [ ! -d "$speech_root/EVLTEST" ] ||
   [ ! -d "$speech_root/TRAIN" ]; then
  die "Dev, Eval or Train directories missing or not properly organised within the speech data dir"
fi

# Check the transcripts directories as well to see if they exist
trans_root=links/LDC96T17/callhome_spanish_trans_970711/transcrp
if [ ! -d "$trans_root/devtest" ] || [ ! -d "$trans_root/evltest" ] ||
   [ ! -d "$trans_root/train" ]; then
  die "Transcript directories missing or not properly organised"
fi

speech_train=$dir/$speech_root/TRAIN
speech_dev=$dir/$speech_root/DEVTEST
speech_test=$dir/$speech_root/EVLTEST
transcripts_train=$dir/$trans_root/train
transcripts_dev=$dir/$trans_root/devtest
transcripts_test=$dir/$trans_root/evltest

fcount_train=$(count_files "$speech_train" '*.SPH')
fcount_dev=$(count_files "$speech_dev" '*.SPH')
fcount_test=$(count_files "$speech_test" '*.SPH')
fcount_t_train=$(count_files "$transcripts_train" '*.txt')
fcount_t_dev=$(count_files "$transcripts_dev" '*.txt')
fcount_t_test=$(count_files "$transcripts_test" '*.txt')

# Now check if we got all the files that we needed. The comparison is
# arithmetic so that whitespace-padded `wc -l` output is still accepted.
if (( fcount_train != 80 || fcount_dev != 20 || fcount_test != 20 ||
      fcount_t_train != 80 || fcount_t_dev != 20 || fcount_t_test != 20 )); then
  echo "Incorrect number of files in the data directories" >&2
  die "The partitions should contain 80/20/20 files"
fi

if [ $stage -le 0 ]; then
  # Gather all the speech files together to create a file list
  (
    find "$speech_train" -iname '*.sph'
    find "$speech_dev" -iname '*.sph'
    find "$speech_test" -iname '*.sph'
  ) > "$tmpdir/callhome_train_sph.flist"

  # Get all the transcripts in one place
  (
    find "$transcripts_train" -iname '*.txt'
    find "$transcripts_dev" -iname '*.txt'
    find "$transcripts_test" -iname '*.txt'
  ) > "$tmpdir/callhome_train_transcripts.flist"
fi

if [ $stage -le 1 ]; then
  # The later stages read callhome.text.1 and callhome_reco2file_and_channel
  # from $tmpdir, so stop here if the transcript script fails.
  "$local/callhome_make_trans.pl" "$tmpdir" || exit 1
  mkdir -p "$dir/callhome_train_all"
  mv "$tmpdir/callhome_reco2file_and_channel" "$dir/callhome_train_all/"
fi

if [ $stage -le 2 ]; then
  # Sort, trim leading/trailing whitespace and squeeze whitespace runs.
  # The trim is written as two separate expressions because "|" is a literal
  # character (not alternation) in a sed basic regular expression.
  sort "$tmpdir/callhome.text.1" \
    | sed -e 's/^[[:space:]][[:space:]]*//' \
          -e 's/[[:space:]][[:space:]]*$//' \
          -e 's/[[:space:]][[:space:]]*/ /g' \
    > "$dir/callhome_train_all/callhome.text"

  # Create segments file and utt2spk file
  # Utterance ids look like <reco>-<A|B>-<rest>; the speaker is <reco>-<A|B>.
  perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' \
    < "$dir/callhome_train_all/callhome.text" \
    > "$dir/callhome_train_all/callhome_utt2spk" \
    || die "Error producing utt2spk file"

  # The two trailing numeric fields of the utterance id are multiplied by
  # 0.01 and written as the segment start and end (two decimal places).
  perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2;
    $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4);
    print "$utt $reco $s $e\n"; ' \
    < "$dir/callhome_train_all/callhome.text" \
    > "$dir/callhome_train_all/callhome_segments" || exit 1

  "$utils/utt2spk_to_spk2utt.pl" < "$dir/callhome_train_all/callhome_utt2spk" \
    > "$dir/callhome_train_all/callhome_spk2utt"
fi

if [ $stage -le 3 ]; then
  # Convert every listed sph file to an absolute path. Reading line by line
  # keeps paths intact instead of word-splitting the list.
  while IFS= read -r f; do
    make_absolute.sh "$f"
  done < "$tmpdir/callhome_train_sph.flist" > "$tmpdir/callhome_train_sph_abs.flist"

  # <lower-cased basename> <absolute path>; the extension is matched
  # case-insensitively, like the find calls that produced the list.
  perl -ane 'm:/([^/]+)\.SPH$:i || die "bad line $_; "; print lc($1)," $_"; ' \
    < "$tmpdir/callhome_train_sph_abs.flist" > "$tmpdir/callhome_sph.scp" || exit 1

  # One wav.scp entry per channel: channel 1 is side A, channel 2 is side B.
  awk -v sph2pipe="$sph2pipe" '{
      printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
      printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
    }' "$tmpdir/callhome_sph.scp" \
    | sort -k1,1 -u > "$dir/callhome_train_all/callhome_wav.scp" || exit 1
fi

if [ $stage -le 4 ]; then
  # Build the speaker to gender map. Per the original note, the temporary
  # file with the speaker gender information is already created by the
  # transcript preparation script.
  cd "$cdir" || exit 1
  # TODO: needs to be rewritten
  "$local/callhome_make_spk2gender.sh" > "$dir/callhome_train_all/callhome_spk2gender"
fi

# Rename files from the callhome directory
if [ $stage -le 5 ]; then
  cd "$dir/callhome_train_all" || exit 1
  mv callhome.text text
  mv callhome_segments segments
  mv callhome_spk2utt spk2utt
  mv callhome_wav.scp wav.scp
  mv callhome_reco2file_and_channel reco2file_and_channel
  mv callhome_spk2gender spk2gender
  mv callhome_utt2spk utt2spk
fi

# Run the final checks from the starting directory, whichever stages ran.
cd "$cdir" || exit 1
fix_data_dir.sh "$dir/callhome_train_all" || exit 1
"$utils/validate_data_dir.sh" --no-feats "$dir/callhome_train_all" || exit 1

echo "CALLHOME spanish Data preparation succeeded."

exit 0;