Blame view
egs/rm/s5/local/rm_data_prep.sh
4.3 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
#!/bin/bash # # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey). Apache 2.0. # To be run from one directory above this script. # The input is the 3 CDs from the LDC distribution of Resource Management. # The script's argument is a directory which has three subdirectories: # rm1_audio1 rm1_audio2 rm2_audio # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance # id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) if [ $# != 1 ]; then echo "Usage: ../../local/RM_data_prep.sh /path/to/RM" exit 1; fi export LC_ALL=C RMROOT=$1 tmpdir=data/local/tmp mkdir -p $tmpdir . ./path.sh || exit 1; # for KALDI_ROOT if [ ! -d $RMROOT/rm1_audio1 -o ! -d $RMROOT/rm1_audio2 ]; then echo "Error: rm_data_prep.sh requires a directory argument (an absolute pathname) that contains rm1_audio1 and rm1_audio2" exit 1; fi if [ ! -d $RMROOT/rm2_audio ]; then echo "**Warning: $RMROOT/rm2_audio does not exist; won't create spk2gender file correctly***" sleep 1 fi ( find $RMROOT/rm1_audio1/rm1/ind_trn -iname '*.sph'; find $RMROOT/rm1_audio2/2_4_2/rm1/ind/dev_aug -iname '*.sph'; ) | perl -ane ' m:/sa\d.sph:i || m:/sb\d\d.sph:i || print; ' > $tmpdir/train_sph.flist dir=data/train mkdir -p $dir # make_trans.pl also creates the utterance id's and the kaldi-format scp file. local/make_trans.pl trn $tmpdir/train_sph.flist $RMROOT/rm1_audio1/rm1/doc/al_sents.snr >(sort -k1 >$dir/text) \ >(sort -k1 >$dir/sph.scp) sleep 0.25 # At one point I had the next line failing because $dir/sph.scp appeared not # to exist. Adding this sleep statement appeared to fix the problem. sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; awk '{printf("%s '$sph2pipe' -f wav %s | ", $1, $2);}' <$dir/sph.scp > $dir/wav.scp rm $dir/sph.scp cat $dir/wav.scp | perl -ane 'm/^((\w+)\w_\w+_\w+) / || die; print "$1 $2 "' > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt for ntest in 1_mar87 2_oct87 4_feb89 5_oct89 6_feb91 7_sep92; do n=`echo $ntest | cut -d_ -f 1` # e.g. n = 1, 2, 4, 5.. test=`echo $ntest | cut -d_ -f 2` # e.g. test=mar87, oct87... dir=data/test_${test} mkdir -p $dir root=$RMROOT/rm1_audio2/2_4_2 for x in `grep -v ';' $root/rm1/doc/tests/$ntest/${n}_indtst.ndx`; do echo "$root/$x "; done | sort > $dir/sph.flist local/make_trans.pl ${test} $dir/sph.flist $RMROOT/rm1_audio1/rm1/doc/al_sents.snr \ >(sort -k1 >$dir/text) >(sort -k1 >$dir/sph.scp) sleep 0.25 # At one point I had the next line failing because $dir/sph.scp appeared not # to exist. Adding this sleep statement appeared to fix the problem. awk '{printf("%s '$sph2pipe' -f wav %s | ", $1, $2);}' <$dir/sph.scp >$dir/wav.scp rm $dir/sph.flist $dir/sph.scp cat $dir/wav.scp | perl -ane 'm/^((\w+)\w_\w+_\w+) / || die; print "$1 $2 "' > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt done cat $RMROOT/rm1_audio2/2_5_1/rm1/doc/al_spkrs.txt \ $RMROOT/rm2_audio/3-1.2/rm2/doc/al_spkrs.txt | \ perl -ane 'tr/A-Z/a-z/;print;' | grep -v ';' | \ awk '{print $1, $2}' | sort | uniq > $tmpdir/spk2gender || exit 1; for t in train test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92; do utils/filter_scp.pl data/$t/spk2utt $tmpdir/spk2gender >data/$t/spk2gender done local/make_rm_lm.pl $RMROOT/rm1_audio1/rm1/doc/wp_gram.txt > $tmpdir/G.txt || exit 1; mkdir -p data/local/dict # Getting lexicon local/make_rm_dict.pl $RMROOT/rm1_audio2/2_4_2/score/src/rdev/pcdsril.txt \ > data/local/dict/lexicon.txt || exit 1; # Get phone lists... grep -v -w sil data/local/dict/lexicon.txt | \ awk '{for(n=2;n<=NF;n++) { p[$n]=1; }} END{for(x in p) {print x}}' | sort > data/local/dict/nonsilence_phones.txt echo sil > data/local/dict/silence_phones.txt echo sil > data/local/dict/optional_silence.txt touch data/local/dict/extra_questions.txt # no extra questions, as we have no stress or tone markers. echo RM_data_prep succeeded. |