Blame view
egs/yomdle_russian/v1/local/train_lm.sh
5.21 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
#!/bin/bash
# Copyright 2016  Vincent Nguyen
#           2016  Johns Hopkins University (author: Daniel Povey)
#           2017  Ashish Arora
#           2017  Hossein Hadian
# Apache 2.0
#
# This script trains a LM on the training transcriptions and corpus text.
# It is based on the example scripts distributed with PocoLM.
#
# It checks that the pocolm directory exists under $KALDI_ROOT/tools; if it
# does not, the script prints the install command and exits with status 1
# (it does not install pocolm itself).
#
# Variables defined before parse_options.sh is sourced (presumably
# overridable as --stage/--dir/--order; see utils/parse_options.sh):
#   stage  first stage to run (0: prepare data, 1: train unpruned LM,
#          2: prune to the "big" LM, 3: prune to the "small" LM)
#   dir    working/output directory
#   order  n-gram order of the LM
#
# Outputs (all below ${dir}/data):
#   arpa/${order}gram_unpruned.arpa.gz
#   arpa/${order}gram_big.arpa.gz
#   arpa/${order}gram_small.arpa.gz

set -e
stage=0
dir=data/local/local_lm
order=6

echo "$0 $*"  # Print the command line for logging
. ./utils/parse_options.sh || exit 1;

lm_dir=${dir}/data

mkdir -p "$dir"
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH="$KALDI_ROOT/tools/pocolm/scripts:$PATH"
( # First make sure the pocolm toolkit is installed.
  cd "$KALDI_ROOT/tools" || exit 1;
  if [ -d pocolm ]; then
    echo Not installing the pocolm toolkit since it is already there.
  else
    echo "$0: Please install the PocoLM toolkit with: "
    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
    exit 1;
  fi
) || exit 1;

bypass_metaparam_optim_opt=
# If you want to bypass the metaparameter optimization steps with specific metaparameters
# un-comment the following line, and change the numbers to some appropriate values.
# You can find the values from output log of train_lm.py.
# These example numbers of metaparameters is for 4-gram model (with min-counts)
# running with train_lm.py.
# The dev perplexity should be close to the non-bypassed model.
#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.031,0.860,0.678,0.194,0.037,0.006,0.928,0.712,0.454,0.220,0.926,0.844,0.749,0.358,0.966,0.879,0.783,0.544,0.966,0.826,0.674,0.450"
# Note: to use these example parameters, you may need to remove the .done files
# to make sure the make_lm_dir.py be called and train only 3-gram model
#for order in 3; do
#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done

# Names shared by stages 1 and 2.  They are computed here, outside any stage
# block, so that resuming with --stage 2 still knows where the unpruned LM
# lives.  Note: min_counts only contributes to the LM directory name in this
# script; it is not passed to train_lm.py.
min_counts='train=1'
wordlist=${dir}/data/wordlist
lm_name="$(basename "${wordlist}")_${order}"
if [ -n "${min_counts}" ]; then
  lm_name+="_$(printf '%s\n' "${min_counts}" | tr -s "[:blank:]" "_" | tr "=" "-")"
fi
unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm

if [ "$stage" -le 0 ]; then
  mkdir -p "${dir}/data/text"

  echo "$0: Getting the Data sources"

  # Best-effort cleanup of data sources left over from a previous run; a
  # failure here (e.g. nothing to delete) is deliberately ignored.
  rm -f -- "${dir}"/data/text/* 2>/dev/null || true

  # use the validation data as the dev set.
  # Note: the name 'dev' is treated specially by pocolm, it automatically
  # becomes the dev set.
  cat data/local/text/cleaned/bpe_val.txt > "${dir}/data/text/dev.txt"

  # use the training data as an additional data source.
  # we can later fold the dev data into this.
  # (the first space-separated field of each line is dropped.)
  cut -d " " -f 2- < data/train/text > "${dir}/data/text/train.txt"
  cat data/local/text/cleaned/bpe_corpus.txt > "${dir}/data/text/corpus_text.txt"

  # for reporting perplexities, we'll use the "real" dev set.
  # (the validation data is used as ${dir}/data/text/dev.txt to work
  # out interpolation weights.)
  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
  # it as one of the data sources.
  cut -d " " -f 2- < data/test/text > "${dir}/data/real_dev_set.txt"

  # get the wordlist from train and corpus text.
  # Every whitespace character is turned into a newline so that there is one
  # token per line; blank lines are dropped, tokens are counted, and the
  # result is sorted by descending count.
  cat "${dir}"/data/text/{train,corpus_text}.txt \
    | tr '[:space:]' '[\n*]' \
    | grep -v "^\s*$" \
    | sort \
    | uniq -c \
    | sort -bnr > "${dir}/data/word_count"
  # word_count lines are "<count> <word>"; keep only the word.
  awk '{print $2}' "${dir}/data/word_count" > "${dir}/data/wordlist"
fi

if [ "$stage" -le 1 ]; then
  # decide on the vocabulary.
  # Note: you'd use --wordlist if you had a previously determined word-list
  # that you wanted to use.
  # Note: if you have more than one order, use a certain amount of words as the
  # vocab and want to restrict max memory for 'sort',
  # (the note above is left unfinished in the upstream script.)
  echo "$0: training the unpruned LM"

  # ${var:+"..."} expands to nothing when the bypass option is empty, so no
  # empty argument is handed to train_lm.py.
  train_lm.py --wordlist="${wordlist}" --num-splits=20 --warm-start-ratio=20 \
    --limit-unk-history=true \
    ${bypass_metaparam_optim_opt:+"${bypass_metaparam_optim_opt}"} \
    "${dir}/data/text" "${order}" "${lm_dir}/work" "${unpruned_lm_dir}"

  get_data_prob.py "${dir}/data/real_dev_set.txt" "${unpruned_lm_dir}" 2>&1 \
    | grep -F '[perplexity'

  mkdir -p "${dir}/data/arpa"
  format_arpa_lm.py "${unpruned_lm_dir}" \
    | gzip -c > "${dir}/data/arpa/${order}gram_unpruned.arpa.gz"
fi

if [ "$stage" -le 2 ]; then
  echo "$0: pruning the LM (to larger size)"
  # Using 10 million n-grams for a big LM for rescoring purposes.
  size=10000000
  prune_lm_dir.py --target-num-ngrams="$size" --initial-threshold=0.02 \
    "${unpruned_lm_dir}" "${dir}/data/lm_${order}_prune_big"

  get_data_prob.py "${dir}/data/real_dev_set.txt" "${dir}/data/lm_${order}_prune_big" 2>&1 \
    | grep -F '[perplexity'
  #[perplexity = 22.0613098868] over 151116.0 words

  mkdir -p "${dir}/data/arpa"
  format_arpa_lm.py "${dir}/data/lm_${order}_prune_big" \
    | gzip -c > "${dir}/data/arpa/${order}gram_big.arpa.gz"
fi

if [ "$stage" -le 3 ]; then
  echo "$0: pruning the LM (to smaller size)"
  # Using 2 million n-grams for a smaller LM for graph building.  Prune from the
  # bigger-pruned LM, it'll be faster.
  size=2000000
  prune_lm_dir.py --target-num-ngrams="$size" \
    "${dir}/data/lm_${order}_prune_big" "${dir}/data/lm_${order}_prune_small"

  get_data_prob.py "${dir}/data/real_dev_set.txt" "${dir}/data/lm_${order}_prune_small" 2>&1 \
    | grep -F '[perplexity'
  #[perplexity = 23.4801171202] over 151116.0 words

  # Created here as well so that this stage can be run on its own.
  mkdir -p "${dir}/data/arpa"
  format_arpa_lm.py "${dir}/data/lm_${order}_prune_small" \
    | gzip -c > "${dir}/data/arpa/${order}gram_small.arpa.gz"
fi