Blame view
egs/gale_arabic/s5c/run.sh
4.34 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
#!/bin/bash

# Copyright 2014 QCRI (author: Ahmed Ali)
#           2019 Dongji Gao
# Apache 2.0

# This is an example script for subword implementation.
#
# The recipe runs as a sequence of numbered stages; passing --stage N skips
# every stage whose number is lower than N:
#   0  data, subword dictionary/lang and LM preparation
#   1  MFCC + CMVN feature extraction for train and test
#   2  monophone training on a 10000-utterance subset
#   3  monophone alignment + delta triphone training (tri1)
#   4  tri1 decoding                      (only when decode_gmm is true)
#   5  tri1 alignment + LDA+MLLT training (tri2b)
#   6  tri2b decoding                     (only when decode_gmm is true)
#   7  tri2b alignment + SAT-basis training (tri3b) + fMLLR alignment
#   8  tri3b decoding                     (only when decode_gmm is true)
#   9  chain TDNN training via local/chain/run_tdnn.sh

# Abort on the first unhandled failure. This is set here rather than in the
# shebang so that it still applies when the script is run as `bash run.sh`.
set -e

# Defaults below can be overridden on the command line; they are parsed by
# utils/parse_options.sh further down.
num_jobs=120
num_decode_jobs=40
decode_gmm=true
stage=0
overwrite=false
num_merges=1000

# Audio (dirN) and transcript (textN) corpus locations.
dir1=/export/corpora/LDC/LDC2013S02/
dir2=/export/corpora/LDC/LDC2013S07/
dir3=/export/corpora/LDC/LDC2014S07/
text1=/export/corpora/LDC/LDC2013T17/
text2=/export/corpora/LDC/LDC2013T04/
text3=/export/corpora/LDC/LDC2014T17/

# NOTE(review): not referenced anywhere else in this script; kept so that the
# set of accepted command-line options does not change.
galeData=GALE

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. ./path.sh
. ./utils/parse_options.sh  # e.g. this parses the above options
                            # if supplied.

if [ "$stage" -le 0 ]; then

  # Refuse to redo data preparation over an existing data/train/text unless
  # the caller explicitly asked for it with --overwrite true.
  if [ -f data/train/text ] && [ "$overwrite" != true ]; then
    echo "$0: Not processing, probably script have run from wrong stage"
    echo "Exiting with status 1 to avoid data corruption"
    exit 1;
  fi

  echo "$0: preparing data..."
  local/prepare_data.sh --dir1 "$dir1" --dir2 "$dir2" --dir3 "$dir3" \
    --text1 "$text1" --text2 "$text2" --text3 "$text3"

  echo "$0: Preparing lexicon and LM..."
  local/prepare_dict_subword.sh --num_merges "$num_merges"

  utils/subword/prepare_lang_subword.sh data/local/dict "<UNK>" data/local/lang data/lang

  # The same path is passed as both the input and the output text file.
  for part in train test; do
    utils/subword/prepare_subword_text.sh "data/$part/text" data/local/pair_code.txt "data/$part/text"
  done

  local/prepare_lm_subword.sh

  utils/format_lm.sh data/lang data/local/lm/lm.gz \
    data/local/dict/lexicon.txt data/lang_test
fi

mfccdir=mfcc
if [ "$stage" -le 1 ]; then
  echo "$0: Preparing the test and train feature files..."
  for x in train test; do
    steps/make_mfcc.sh --cmd "$train_cmd" --nj "$num_jobs" \
      "data/$x" "exp/make_mfcc/$x" "$mfccdir"
    utils/fix_data_dir.sh "data/$x"  # some files fail to get mfcc for many reasons
    steps/compute_cmvn_stats.sh "data/$x" "exp/make_mfcc/$x" "$mfccdir"
  done
fi

if [ "$stage" -le 2 ]; then
  echo "$0: creating sub-set and training monophone system"
  utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1;

  steps/train_mono.sh --nj 40 --cmd "$train_cmd" \
    data/train.10K data/lang exp/mono_subword || exit 1;
fi

if [ "$stage" -le 3 ]; then
  echo "$0: Aligning data using monophone system"
  steps/align_si.sh --nj "$num_jobs" --cmd "$train_cmd" \
    data/train data/lang exp/mono_subword exp/mono_ali_subword || exit 1;

  echo "$0: training triphone system with delta features"
  steps/train_deltas.sh --cmd "$train_cmd" \
    2500 30000 data/train data/lang exp/mono_ali_subword exp/tri1_subword || exit 1;
fi

# decode_gmm is compared as a string rather than executed as a command.
if [ "$stage" -le 4 ] && [ "$decode_gmm" = true ]; then
  utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph
  steps/decode.sh --nj "$num_decode_jobs" --cmd "$decode_cmd" \
    exp/tri1_subword/graph data/test exp/tri1_subword/decode
fi

if [ "$stage" -le 5 ]; then
  echo "$0: Aligning data and retraining and realigning with lda_mllt"
  steps/align_si.sh --nj "$num_jobs" --cmd "$train_cmd" \
    data/train data/lang exp/tri1_subword exp/tri1_ali_subword || exit 1;

  steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \
    data/train data/lang exp/tri1_ali_subword exp/tri2b_subword || exit 1;
fi

if [ "$stage" -le 6 ] && [ "$decode_gmm" = true ]; then
  utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph
  steps/decode.sh --nj "$num_decode_jobs" --cmd "$decode_cmd" \
    exp/tri2b_subword/graph data/test exp/tri2b_subword/decode
fi

if [ "$stage" -le 7 ]; then
  echo "$0: Aligning data and retraining and realigning with sat_basis"
  steps/align_si.sh --nj "$num_jobs" --cmd "$train_cmd" \
    data/train data/lang exp/tri2b_subword exp/tri2b_ali_subword || exit 1;

  steps/train_sat_basis.sh --cmd "$train_cmd" \
    5000 100000 data/train data/lang exp/tri2b_ali_subword exp/tri3b_subword || exit 1;

  steps/align_fmllr.sh --nj "$num_jobs" --cmd "$train_cmd" \
    data/train data/lang exp/tri3b_subword exp/tri3b_ali_subword || exit 1;
fi

if [ "$stage" -le 8 ] && [ "$decode_gmm" = true ]; then
  utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph
  steps/decode_fmllr.sh --nj "$num_decode_jobs" --cmd \
    "$decode_cmd" exp/tri3b_subword/graph data/test exp/tri3b_subword/decode
fi

if [ "$stage" -le 9 ]; then
  # NOTE(review): the message mentions e2e alignments, but the call passes the
  # tri3b GMM system via --gmm; confirm which one run_tdnn.sh actually uses.
  echo "$0: Training a regular chain model using the e2e alignments..."
  local/chain/run_tdnn.sh --gmm tri3b_subword
fi

echo "$0: training succeed"
exit 0