egs/librispeech/s5/local/lm/train_lm.sh
#!/bin/bash

# Copyright 2014  Vassil Panayotov
# Apache 2.0

# This is the top-level LM training script

. ./path.sh || exit 1
. ./cmd.sh || exit 1

# use to skip some of the initial steps
stage=1

# how many words we want in the LM's vocabulary
vocab_size=200000

# LM pruning threshold for the 'small' trigram model
prune_thresh_small=0.0000003

# LM pruning threshold for the 'medium' trigram model
prune_thresh_medium=0.0000001

# how many text normalization jobs to run in parallel
normjobs=2

. utils/parse_options.sh || exit 1

if [[ $# -ne 4 ]]; then
  echo "Usage: $0 <lm-texts-root> <tmp-dir> <txt-norm-root> <out-lm-dir>"
  echo "where,"
  echo " <lm-texts-root>: the root directory containing the raw (unnormalized) LM training texts"
  echo " <tmp-dir>: store the temp files into this dir"
  echo " <txt-norm-root>: store the normalized texts in subdirectories under this root dir"
  echo " <out-lm-dir>: the directory to store the trained ARPA model"
  exit 1
fi

corpus_dir=$1/corpus
tmp_dir=$2
norm_dir=$3
lm_dir=$4

[[ -d "$corpus_dir" ]] || { echo "No such directory '$corpus_dir'"; exit 1; }

split_prefix=$tmp_dir/split

if [ "$stage" -le 1 ]; then
  mkdir -p $tmp_dir
  echo "Splitting into $normjobs parts, to allow for parallel processing ..."
  split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs | sed 's/,$//')}")
  find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
    tee $tmp_dir/all_texts.txt |\
    utils/split_scp.pl /dev/stdin $split_files
  echo "Checking the splits ..."
  total_count=$(wc -l <$tmp_dir/all_texts.txt)
  split_count=$(cat $split_files | wc -l | awk 'BEGIN{c=0} {c+=$1;} END{print c}')
  [[ "$total_count" -eq "$split_count" ]] || { echo "Inconsistent counts"; exit 1; }
fi

if [ "$stage" -le 2 ]; then
  echo "Performing text normalization ($normjobs jobs) - check $tmp_dir/txt_norm.JOB.log ..."
  mkdir -p $norm_dir
  $mkgraph_cmd JOB=1:$normjobs $tmp_dir/txt_norm.JOB.log \
    local/lm/normalize_text.sh $split_prefix-JOB $norm_dir || exit 1
  echo "Finished OK"
fi

word_counts=$lm_dir/word_counts.txt
vocab=$lm_dir/librispeech-vocab.txt
full_corpus=$lm_dir/librispeech-lm-norm.txt.gz

if [ "$stage" -le 3 ]; then
  echo "Selecting the vocabulary ($vocab_size words) ..."
  mkdir -p $lm_dir
  echo "Making the corpus and the vocabulary ..."
  # The following pipeline does the following:
  # 1) Eliminates duplicate sentences and saves the resulting corpus
  # 2) Splits the corpus into words (one word per line)
  # 3) Sorts the words with respect to their frequency
  # 4) Saves the list of the first $vocab_size words, sorted by frequency
  # 5) Saves an alphabetically sorted vocabulary that includes the $vocab_size most frequent words
  for f in $(find $norm_dir -iname '*.txt'); do cat $f; done |\
    sort -u | tee >(gzip >$full_corpus) | tr -s '[[:space:]]' '\n' |\
    sort | uniq -c | sort -k1 -n -r |\
    head -n $vocab_size | tee $word_counts | awk '{print $2}' | sort >$vocab || exit 1
  echo "Word counts saved to '$word_counts'"
  echo "Vocabulary saved as '$vocab'"
  echo "All unique sentences (in sorted order) stored in '$full_corpus'"
  echo "Counting the total number of word tokens in the corpus ..."
  echo "There are $(wc -w < <(zcat $full_corpus)) tokens in the corpus"
fi
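# Optional sanity checks after stage 3 (not part of the recipe itself; the
# variables are the ones defined above). Before spending hours on LM training,
# you can confirm the corpus and vocabulary look sane, e.g.:
#   zcat $full_corpus | head -n 3   # a few normalized, de-duplicated sentences
#   head -n 10 $word_counts         # the most frequent words and their counts
#   wc -l <$vocab                   # at most $vocab_size lines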
trigram_lm=$lm_dir/lm_tglarge.arpa.gz
if [ "$stage" -le 4 ]; then
  echo "Training a 3-gram LM ..."
  command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
  echo "This implementation assumes that you have a lot of free RAM (>12GB) on your machine"
  echo "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html"
  ngram-count -order 3 -kndiscount -interpolate \
    -unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $trigram_lm || exit 1
  du -h $trigram_lm
fi

trigram_pruned_small=$lm_dir/lm_tgsmall.arpa.gz
if [ "$stage" -le 5 ]; then
  echo "Creating a 'small' pruned 3-gram LM (threshold: $prune_thresh_small) ..."
  command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
  ngram -prune $prune_thresh_small -lm $trigram_lm -write-lm $trigram_pruned_small || exit 1
  du -h $trigram_pruned_small
fi

trigram_pruned_medium=$lm_dir/lm_tgmed.arpa.gz
if [ "$stage" -le 6 ]; then
  echo "Creating a 'medium' pruned 3-gram LM (threshold: $prune_thresh_medium) ..."
  command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
  ngram -prune $prune_thresh_medium -lm $trigram_lm -write-lm $trigram_pruned_medium || exit 1
  du -h $trigram_pruned_medium
fi

fourgram_lm=$lm_dir/lm_fglarge.arpa.gz
if [ "$stage" -le 7 ]; then
  # This requires even more RAM than the 3-gram
  echo "Training a 4-gram LM ..."
  command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
  ngram-count -order 4 -kndiscount -interpolate \
    -unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $fourgram_lm || exit 1
  du -h $fourgram_lm
fi

exit 0
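# Example invocation (illustrative only; the paths below are hypothetical and
# not taken from this script). The first argument must contain a 'corpus'
# subdirectory holding the raw LM training texts:
#
#   local/lm/train_lm.sh /export/data/lm_corpus \
#     data/local/lm_tmp data/local/lm_norm data/local/lm
#
# On success, data/local/lm will contain word_counts.txt, librispeech-vocab.txt,
# librispeech-lm-norm.txt.gz, and the gzipped lm_tglarge, lm_tgsmall, lm_tgmed
# and lm_fglarge ARPA models.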