# egs/heroico/s5/run.sh
#!/bin/bash

# Recipe for the LDC Heroico Spanish corpus (LDC2006S37): data/lexicon/LM
# preparation, MFCC extraction, GMM training (mono -> deltas -> LDA+MLLT
# -> SAT), decoding of each stage in the background, then chain models.

. ./cmd.sh
. ./path.sh

stage=0

# the location of the LDC corpus; this location works for the CLSP grid.
datadir=/export/corpora5/LDC/LDC2006S37

# The corpus and lexicon are on openslr.org
#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz"

# Location of the Movie subtitles text corpus
subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip"

. utils/parse_options.sh

set -e
set -o pipefail
set -u

# don't change tmpdir, the location is used explicitly in scripts in local/.
tmpdir=data/local/tmp

if [ $stage -le 0 ]; then
  if [ ! -d "$datadir" ]; then
    echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
    # Tell the user which option to set; expanding the variable here would
    # just echo back the (wrong) current value.
    echo " and set --datadir to the directory where it is located (currently: $datadir)."
    exit 1
  fi
  if [ ! -s santiago.txt ]; then
    echo "$0: downloading the lexicon"
    # Use the URL configured above rather than a second hard-coded copy.
    wget -c "$lexicon_url"
    tar -xvzf santiago.tar.gz
  fi
  # Get data for lm training
  local/subs_download.sh "$subtitles_url"
fi

if [ $stage -le 1 ]; then
  echo "Making lists for building models."
  local/prepare_data.sh "$datadir"
fi

if [ $stage -le 2 ]; then
  mkdir -p data/local/dict $tmpdir/dict
  local/prepare_dict.sh
fi

if [ $stage -le 3 ]; then
  utils/prepare_lang.sh \
    data/local/dict "<UNK>" \
    data/local/lang data/lang
fi

if [ $stage -le 4 ]; then
  mkdir -p $tmpdir/subs/lm
  local/subs_prepare_data.pl
fi

if [ $stage -le 5 ]; then
  echo "point 1"
  local/prepare_lm.sh $tmpdir/subs/lm/in_vocabulary.txt
fi

if [ $stage -le 6 ]; then
  echo "point 2"
  utils/format_lm.sh \
    data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \
    data/lang_test
fi

if [ $stage -le 7 ]; then
  echo "$0: extracting acoustic features."
  mkdir -p exp
  for fld in native nonnative test devtest train; do
    # Remove stale CMVN stats so they are recomputed for fresh features.
    if [ -e data/$fld/cmvn.scp ]; then
      rm data/$fld/cmvn.scp
    fi
    steps/make_mfcc.sh --cmd "$train_cmd" --nj 4 data/$fld
    utils/fix_data_dir.sh data/$fld
    steps/compute_cmvn_stats.sh data/$fld
    utils/fix_data_dir.sh data/$fld
  done
fi

if [ $stage -le 8 ]; then
  echo "$0 monophone training"
  steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1;
  # evaluation: decode in the background so the next training stage can start
  (
    # make decoding graph for monophones
    utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
    # test monophones
    for x in native nonnative devtest test; do
      steps/decode.sh --nj 8 --cmd "$decode_cmd" \
        exp/mono/graph data/$x exp/mono/decode_${x} || exit 1;
    done
  ) &
fi

if [ $stage -le 9 ]; then
  # align with monophones
  steps/align_si.sh --nj 8 --cmd "$train_cmd" \
    data/train data/lang exp/mono exp/mono_ali

  echo "$0 Starting triphone training in exp/tri1"
  steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \
    1500 25000 data/train data/lang exp/mono_ali exp/tri1

  # wait for the previous decoding jobs to finish in case there's just one
  # machine.
  wait
  (
    utils/mkgraph.sh \
      data/lang_test exp/tri1 exp/tri1/graph || exit 1;
    for x in native nonnative devtest test; do
      steps/decode.sh --nj 8 --cmd "$decode_cmd" \
        exp/tri1/graph data/$x exp/tri1/decode_${x} || exit 1;
    done
  ) &
fi

if [ $stage -le 10 ]; then
  echo "$0: Starting delta system alignment"
  steps/align_si.sh \
    --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali

  echo "$0: starting lda+mllt triphone training in exp/tri2b"
  steps/train_lda_mllt.sh \
    --splice-opts "--left-context=3 --right-context=3" \
    2000 30000 data/train data/lang exp/tri1_ali exp/tri2b

  # wait for the previous decoding jobs to finish in case there's just one
  # machine.
  wait
  (
    utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1;
    for x in native nonnative devtest test; do
      steps/decode.sh --nj 8 --cmd "$decode_cmd" \
        exp/tri2b/graph data/$x exp/tri2b/decode_${x} || exit 1;
    done
  ) &
fi

if [ $stage -le 11 ]; then
  echo "$0: Starting LDA+MLLT system alignment"
  steps/align_si.sh \
    --use-graphs true --nj 8 --cmd "$train_cmd" \
    data/train data/lang exp/tri2b exp/tri2b_ali

  echo "$0 Starting (SAT) triphone training in exp/tri3b"
  steps/train_sat.sh \
    --cmd "$train_cmd" \
    3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b

  echo "$0 Starting exp/tri3b_ali"
  steps/align_fmllr.sh \
    --nj 8 --cmd "$train_cmd" \
    data/train data/lang exp/tri3b exp/tri3b_ali

  # wait for the previous decoding jobs to finish in case there's just one
  # machine.
  wait
  (
    # make decoding graphs for SAT models
    utils/mkgraph.sh \
      data/lang_test exp/tri3b exp/tri3b/graph || exit 1;
    for x in native nonnative devtest test; do
      echo "$0: decoding $x with tri3b models."
      steps/decode_fmllr.sh \
        --nj 8 --cmd "$decode_cmd" exp/tri3b/graph data/$x exp/tri3b/decode_${x}
    done
  ) &
fi

if [ $stage -le 12 ]; then
  echo "$0: train and test chain models."
  local/chain/run_tdnn.sh
fi

# barrier for the last background decoding jobs
wait