Blame view
egs/gale_arabic/s5/run.sh
5.52 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
#!/bin/bash

# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0

# Abort on the first failing command.
set -e

. ./path.sh
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.

# Number of parallel jobs for training and for decoding, respectively.
nJobs=120
nDecodeJobs=40

#NB: You can add whatever number of corpora you like. The supported extensions
#NB: (formats) are wav and flac. Flac will be converted using sox and, in contrast
#NB: with the old approach, the conversion will be on-the-fly and one-time-only
#NB: during the parametrization.

#NB: Text corpora specification. We support either tgz files, which are unpacked,
#NB: or just plain (already unpacked) directories. The list of transcripts is then
#NB: obtained using the find command.

#This is the CLSP configuration. We add the 2014 GALE data. We got around 2%
#improvement just by including it. The gain might be larger if someone would tweak
#the number of leaves and states and so on.

#Make sure you edit this section to reflect where you keep the LDC data on your cluster
audio=(
  /data/sls/scratch/amali/data/GALE/LDC2013S02
  /data/sls/scratch/amali/data/GALE/LDC2013S07
  /data/sls/scratch/amali/data/GALE/LDC2014S07
)
text=(
  /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz
  /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz
  /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz
)

# Working directory where the prepared GALE data will be placed.
galeData=GALE

#prepare the data
#split train dev test
#prepare lexicon and LM

# You can run the script from here automatically, but it is recommended to run
# the data preparation and feature extraction manually, and only once,
# by copying and pasting into your shell.
#copy the audio files to a local wav folder and convert flac files to wav
local/gale_data_prep_audio.sh "${audio[@]}" "$galeData" || exit 1;

#get the transcription and remove empty prompts and all noise markers
local/gale_data_prep_txt.sh "${text[@]}" "$galeData" || exit 1;

# split the data into reports and conversational; each class will have train/dev and test
local/gale_data_prep_split.sh "$galeData" || exit 1;

# get the QCRI dictionary and add silence and <UNK>
local/gale_prep_dict.sh || exit 1;

#prepare the language resources
utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang || exit 1;

# LM training
local/gale_train_lms.sh || exit 1;
local/gale_format_data.sh || exit 1; # G compilation, check LG composition

# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
for x in train test; do
  steps/make_mfcc.sh --cmd "$train_cmd" --nj $nJobs \
    data/$x exp/make_mfcc/$x $mfccdir || exit 1;
  utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons
  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done

# Here we start the AM

# Let's create a subset with 10k segments to make quick flat-start training:
utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1;

# Train monophone models on a subset of the data, 10K segments
# Note: the --boost-silence option should probably be omitted by default
steps/train_mono.sh --nj 40 --cmd "$train_cmd" \
  data/train.10K data/lang exp/mono || exit 1;

# Get alignments from monophone system.
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
  data/train data/lang exp/mono exp/mono_ali || exit 1;

# train tri1 [first triphone pass]
steps/train_deltas.sh --cmd "$train_cmd" \
  2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;

# First triphone decoding
utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \
  exp/tri1/graph data/test exp/tri1/decode || exit 1;

steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
  data/train data/lang exp/tri1 exp/tri1_ali || exit 1;

# Train tri2a, which is deltas + delta-deltas
steps/train_deltas.sh --cmd "$train_cmd" \
  3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1;

# tri2a decoding
utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph || exit 1;
steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \
  exp/tri2a/graph data/test exp/tri2a/decode || exit 1;

# train and decode tri2b [LDA+MLLT]
steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \
  data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1;
steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \
  exp/tri2b/graph data/test exp/tri2b/decode || exit 1;

# Align all data with LDA+MLLT system (tri2b)
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
  --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;

# From 2b system, train 3b which is LDA + MLLT + SAT.
steps/train_sat.sh --cmd "$train_cmd" \
  5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph || exit 1;
steps/decode_fmllr.sh --nj $nDecodeJobs --cmd "$decode_cmd" \
  exp/tri3b/graph data/test exp/tri3b/decode || exit 1;

# From 3b system, align all data.
steps/align_fmllr.sh --nj $nJobs --cmd "$train_cmd" \
  data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;

# train MMI/MPE and SGMM systems in parallel; we keep them for completeness,
# but they are not getting the best results
local/run_mmi_mpe.sh &
mmi_pid=$!
local/run_sgmm.sh &
sgmm_pid=$!
# NOTE: a bare 'wait' would silently ignore background-job failures ('set -e'
# does not cover them), so check each job's exit status explicitly.
wait $mmi_pid || exit 1;
wait $sgmm_pid || exit 1;

local/nnet/run_dnn.sh || exit 1;

time=$(date +"%Y-%m-%d-%H-%M-%S")

#get WER
for x in exp/*/decode*; do
  [ -d "$x" ] && grep WER "$x"/wer_* | utils/best_wer.sh
done | sort -n -r -k2 > "RESULTS.$USER.$time" # to make sure you keep the results timed and owned

#get detailed WER: reports, conversational and combined
local/split_wer.sh "$galeData" > "RESULTS.details.$USER.$time"

echo "training succeeded"
exit 0