egs/tedlium/s5_r2_wsj/local/train_lm.sh
#!/bin/bash

# Copyright 2016  Vincent Nguyen
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
#
# This script trains an LM on the Cantab-TEDLIUM text data, the TED-LIUM
# acoustic training transcripts and the WSJ si284 transcripts.
# It is based on the example scripts distributed with PocoLM.
# It will first check whether pocolm is installed and, if not, will ask you to
# install it and exit.
# It will then get the source data from the pre-downloaded Cantab-TEDLIUM files
# and the pre-prepared data/train and data/train_si284 text sources.

set -e

stage=0
cmd=run.pl
vocab_size=   # If set (e.g. --vocab-size 150000), limit the vocabulary to
              # roughly this many of the most frequent words; empty means no limit.

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh || exit 1;

dir=data/local/local_lm
lm_dir=${dir}/data

mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH

( # First make sure the pocolm toolkit is installed.
 cd $KALDI_ROOT/tools || exit 1;
 if [ -d pocolm ]; then
   echo Not installing the pocolm toolkit since it is already there.
 else
   echo "$0: Please install the PocoLM toolkit with: "
   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
   exit 1;
 fi
) || exit 1;

num_dev_sentences=10000

bypass_metaparam_optim_opt=

if [ $stage -le 0 ]; then
  mkdir -p ${dir}/data
  mkdir -p ${dir}/data/text

  echo "$0: Getting the Data sources"

  rm ${dir}/data/text/* 2>/dev/null || true

  # Unzip the TEDLIUM LM data sources, normalize apostrophe+suffix to the
  # previous word, and gzip the result.  The dummy 'foo' field is a fake
  # utterance-id that normalize_transcript.pl expects; the final 'cut'
  # removes it again.
  gunzip -c db/TEDLIUM_release2/LM/*.en.gz | sed 's/ <\/s>//g' | \
    local/join_suffix.py | awk '{print "foo "$0}' | \
    local/normalize_transcript.pl '<NOISE>' | cut -d ' ' -f 2- | \
    gzip -c > ${dir}/data/text/train.txt.gz

  # Use a subset of the annotated training data as the dev set.
  # Note: the name 'dev' is treated specially by pocolm, it automatically
  # becomes the dev set.
  head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt

  # ... and use the rest of the training data as an additional data source.
  # We can later fold the dev data into this.
  tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt

  # The WSJ si284 transcripts are a further data source.
  cat data/train_si284/text | cut -d " " -f 2- > ${dir}/data/text/wsj_si284.txt

  # For reporting perplexities, we'll use the "real" dev set.
  # (A subset of the training data is used as ${dir}/data/text/ted.txt to work
  # out interpolation weights.)
  # Note: we can't put it in ${dir}/data/text/, because then pocolm would use
  # it as one of the data sources.
  cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt
fi

if [ $stage -le 1 ]; then
  mkdir -p $dir/data/work
  get_word_counts.py $dir/data/text $dir/data/work/word_counts
  touch $dir/data/work/word_counts/.done
fi
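
# Illustrative sanity check (not part of the recipe): each *.counts file
# written above is assumed to contain "<count> <word>" lines, so the most
# frequent words per source can be eyeballed before the vocabulary is decided
# in the next stage, e.g.:
# for f in $dir/data/work/word_counts/*.counts; do
#   echo "== $f: 10 most frequent words =="
#   sort -k1,1nr $f | head
# done
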
if [ $stage -le 2 ]; then
  # Decide on the vocabulary.
  cat $dir/data/work/word_counts/{ted,dev}.counts | \
    local/lm/merge_word_counts.py 2 > $dir/data/work/ted.wordlist_counts

  cat $dir/data/work/word_counts/train.counts | \
    local/lm/merge_word_counts.py 5 > $dir/data/work/train.wordlist_counts

  cat $dir/data/work/word_counts/wsj_si284.counts | \
    local/lm/merge_word_counts.py 2 > $dir/data/work/wsj_si284.wordlist_counts

  # Keep only words that contain at least one letter, merge the per-source
  # counts and sort by count, most frequent first.
  cat $dir/data/work/{ted,train,wsj_si284}.wordlist_counts | \
    perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[0] $F[1]\n"; }' | \
    local/lm/merge_word_counts.py 1 | sort -k 1,1nr > $dir/data/work/final.wordlist_counts

  if [ ! -z "$vocab_size" ]; then
    # Keep the $vocab_size most frequent words (plus any words tied on count
    # with the last one kept).
    awk -v sz=$vocab_size 'BEGIN{count=-1;} { i+=1; if (i == int(sz)) { count = $1; }; if (count > 0 && count != $1) { exit(0); } print $0; }' \
      $dir/data/work/final.wordlist_counts
  else
    cat $dir/data/work/final.wordlist_counts
  fi | awk '{print $2}' > $dir/data/work/wordlist
fi

order=4
wordlist=${dir}/data/work/wordlist

min_counts='train=2 ted=1 wsj_si284=5'
# Uncomment these lines if you want to remove the WSJ data from the LM.  It
# should not affect things much; the WSJ data improves perplexity by a couple
# of points.
# min_counts='train=2 ted=1'
# [ -f $dir/data/text/wsj_si284.txt ] && mv $dir/data/text/wsj_si284.txt $dir/data/
# [ -f $dir/data/work/word_counts/wsj_si284.counts ] && mv $dir/data/work/word_counts/wsj_si284.counts $dir/data/work

lm_name="`basename ${wordlist}`_${order}"
if [ -n "${min_counts}" ]; then
  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "," "." | tr "=" "-"`"
fi
unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm

if [ $stage -le 3 ]; then
  echo "$0: training the unpruned LM"
  $cmd ${unpruned_lm_dir}/log/train.log \
    train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
                --limit-unk-history=true \
                --fold-dev-into=ted ${bypass_metaparam_optim_opt} \
                --min-counts="${min_counts}" \
                ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}

  for x in real_dev_set; do
    $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \
      get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir}
    cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity'
  done

  # Perplexity with just the Cantab-TEDLIUM LM data and TED text: [perplexity = 157.87] over 18290.0 words
  # Perplexity with the WSJ text added:
  # log-prob of data/local/local_lm/data/real_dev_set.txt given model
  # data/local/local_lm/data/wordlist_4_train-2_ted-1_wsj_si284-5.pocolm was
  # -5.05607815615 per word [perplexity = 156.973681282] over 18290.0 words.
fi

if [ $stage -le 4 ]; then
  echo "$0: pruning the LM (to larger size)"
  # Using 10 million n-grams for a big LM for rescoring purposes.
  size=10000000
  $cmd ${dir}/data/lm_${order}_prune_big/log/prune_lm.log \
    prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 \
      ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big

  for x in real_dev_set; do
    $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log \
      get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_big
    cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log | grep -F '[perplexity'
  done

  # Current results, after adding --limit-unk-history=true:
  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model
  # data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word
  # [perplexity = 175.147449465] over 18290.0 words.

  mkdir -p ${dir}/data/arpa
  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | \
    gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
fi
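
# Downstream-usage sketch (illustrative only, not run here): the big pruned LM
# is intended for lattice rescoring.  Assuming the surrounding recipe has a
# data/lang directory and wants the rescoring LM in data/lang_rescore (both
# names are assumptions), one way to use the ARPA file produced above would be:
# utils/build_const_arpa_lm.sh ${dir}/data/arpa/${order}gram_big.arpa.gz \
#   data/lang data/lang_rescore
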
if [ $stage -le 5 ]; then
  echo "$0: pruning the LM (to smaller size)"
  # Using 2 million n-grams for a smaller LM for graph building.  Prune from
  # the bigger-pruned LM, it'll be faster.
  size=2000000
  $cmd ${dir}/data/lm_${order}_prune_small/log/prune_lm.log \
    prune_lm_dir.py --target-num-ngrams=$size \
      ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small

  for x in real_dev_set; do
    $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log \
      get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_small
    cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log | grep -F '[perplexity'
  done

  # Current results, after adding --limit-unk-history=true (needed for modeling
  # OOVs and not blowing up LG.fst):
  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model
  # data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word
  # [perplexity = 199.202824404] over 18290.0 words.

  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | \
    gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
fi
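
# Downstream-usage sketch (illustrative only, not run here): the small pruned
# LM is meant for first-pass graph building.  Assuming a data/lang directory, a
# lexicon at data/local/dict/lexicon.txt and an output directory data/lang_test
# (all assumptions about the surrounding recipe), the ARPA file could be turned
# into G.fst with:
# utils/format_lm.sh data/lang ${dir}/data/arpa/${order}gram_small.arpa.gz \
#   data/local/dict/lexicon.txt data/lang_test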