egs/wsj/s5/steps/cleanup/make_biased_lm_graphs.sh
#!/bin/bash

# Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
#                2016  Vimal Manohar
# Apache 2.0

# This script creates biased decoding graphs, based on the data transcripts,
# as HCLG.fsts.scp in the specified <graph-dir>; these can be consumed by
# decode_segmentation.sh.  It is for use in data cleanup and data filtering.

set -u
set -o pipefail
set -e

# Begin configuration section.
nj=10
cmd=run.pl
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
top_n_words=100  # Number of common words that we compile into each graph
                 # (the most frequent words in the input text).
top_n_words_weight=1.0  # This weight is applied before renormalization; it
                        # can be more or less than 1.
min_words_per_graph=100  # Utterances will be grouped so that they have at
                         # least this many words, before making the graph.
stage=0
### Options for make_one_biased_lm.py.
ngram_order=4  # Maximum n-gram order to use (but see also
               # --min-lm-state-count).
min_lm_state_count=10  # Make this smaller (e.g. 2) for a more strongly
                       # biased LM.
discounting_constant=0.3  # Strictly between 0 and 1.  Make this closer to 0
                          # for a more strongly biased LM.
# End configuration options.

echo "$0 $@"  # Print the command line for logging.

[ -f path.sh ] && . ./path.sh  # Source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "usage: $0 <data-dir|text> <lang-dir> <dir> <graph-dir>"
  echo "e.g.: $0 data/train data/lang exp/tri3_cleanup exp/tri3_cleanup/graphs"
  echo " This script creates biased decoding graphs per utterance (or possibly"
  echo " groups of utterances, depending on --min-words-per-graph).  Its output"
  echo " goes to <graph-dir>/HCLG.fsts.scp, indexed by utterance.  Directory <dir>"
  echo " is required to be a model or alignment directory, containing 'tree' and"
  echo " 'final.mdl'."
  echo "Main options (for others, see top of script file):"
  echo "  --scale-opts <scale-opts>       # Options relating to language model scale;"
  echo "                                  # default is"
  echo "                                  # '--transition-scale=1.0 --self-loop-scale=0.1'."
  echo "  --top-n-words <N>               # Number of most-common words to add with"
  echo "                                  # unigram probabilities into each graph"
  echo "                                  # (default: 100)."
  echo "  --top-n-words-weight <float>    # Weight given to the top-n-words portion of"
  echo "                                  # the graph (before renormalizing); may be"
  echo "                                  # any positive number (default: 1.0)."
  echo "  --min-words-per-graph <N>       # A constant that controls the grouping of"
  echo "                                  # utterances (we make the LMs for groups of"
  echo "                                  # utterances).  Default: 100."
  echo "  --ngram-order <N>               # Maximum n-gram order that may be used, in"
  echo "                                  # the range [2,7] (but also see"
  echo "                                  # --min-lm-state-count).  Default: 4."
  echo "  --min-lm-state-count <N>        # Minimum count for an LM-state of order >2"
  echo "                                  # to be kept; states with lower counts are"
  echo "                                  # completely pruned away [bigrams will always"
  echo "                                  # be kept].  Default: 10.  Smaller -> more"
  echo "                                  # strongly biased LM."
  echo "  --discounting-constant <float>  # Discounting constant for Kneser-Ney,"
  echo "                                  # strictly between 0 and 1.  Default: 0.3."
  echo "                                  # Smaller -> more strongly biased LM."
  echo "  --config <config-file>          # Config file containing options."
  echo "  --nj <nj>                       # Number of parallel jobs."
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # How to run jobs."
  exit 1;
fi

data_or_text=$1
lang=$2
dir=$3
graph_dir=$4

if [ -d $data_or_text ]; then
  text=$data_or_text/text
else
  text=$data_or_text
fi

mkdir -p $graph_dir

for f in $text $lang/oov.int $dir/tree $dir/final.mdl \
         $lang/L_disambig.fst $lang/phones/disambig.int; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

utils/lang/check_phones_compatible.sh $lang/phones.txt $dir/phones.txt
cp $lang/phones.txt $graph_dir

oov=$(cat $lang/oov.int) || exit 1;

mkdir -p $graph_dir/log

# Create top_words.{int,txt}: the most frequent words, with their (weighted)
# unigram probabilities.
if [ $stage -le 0 ]; then
  export LC_ALL=C
  # The following pipe will be broken due to the 'head'; don't fail.
  set +o pipefail
  utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $text | \
    awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
    sort -nr | head -n $top_n_words > $graph_dir/word_counts.int
  set -o pipefail
  total_count=$(awk '{x+=$1} END{print x}' < $graph_dir/word_counts.int)
  # Print the top-n words with their unigram probabilities.
  awk -v tot=$total_count -v weight=$top_n_words_weight '{print $2, ($1*weight)/tot;}' \
    <$graph_dir/word_counts.int >$graph_dir/top_words.int
  utils/int2sym.pl -f 1 $lang/words.txt <$graph_dir/top_words.int >$graph_dir/top_words.txt
fi
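# At this point word_counts.int has lines of the form "<count> <word-id>"
# (the output of 'uniq -c'), and top_words.int has lines "<word-id> <prob>",
# where <prob> = count * top_n_words_weight / total_count; e.g. a word seen
# 500 times out of 10000 top-word tokens, with weight 1.0, gets probability
# 0.05.  top_words.txt is the same, with word-ids mapped back to words.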
word_disambig_symbol=$(grep -w "#0" $lang/words.txt | awk '{print $2}')

if [ -z "$word_disambig_symbol" ]; then
  echo "$0: error getting word disambiguation symbol"
  exit 1
fi

mkdir -p $graph_dir/texts

split_text=
for n in $(seq $nj); do
  split_text="$split_text $graph_dir/texts/text.$n"
done
utils/split_scp.pl $text $split_text

mkdir -p $graph_dir/log $graph_dir/fsts

# Make $dir an absolute pathname.
dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD})

if [ $stage -le 1 ]; then
  echo "$0: creating utterance-group-specific decoding graphs with biased LMs"

  # These options are passed through directly to make_one_biased_lm.py.
  lm_opts="--word-disambig-symbol=$word_disambig_symbol --ngram-order=$ngram_order --min-lm-state-count=$min_lm_state_count --discounting-constant=$discounting_constant --top-words=$graph_dir/top_words.int"

  # Pipeline: map the transcripts to integers; estimate a biased LM per group
  # of utterances and emit it as an FST; then compile the decoding graphs.
  $cmd JOB=1:$nj $graph_dir/log/compile_decoding_graphs.JOB.log \
    utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $graph_dir/texts/text.JOB \| \
    steps/cleanup/make_biased_lms.py --min-words-per-graph=$min_words_per_graph \
      --lm-opts="$lm_opts" $graph_dir/fsts/utt2group.JOB \| \
    compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
      $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- \
      ark,scp:$graph_dir/fsts/HCLG.fsts.JOB.ark,$graph_dir/fsts/HCLG.fsts.JOB.scp || exit 1
fi

for j in $(seq $nj); do cat $graph_dir/fsts/HCLG.fsts.$j.scp; done > $graph_dir/fsts/HCLG.fsts.per_utt.scp
for j in $(seq $nj); do cat $graph_dir/fsts/utt2group.$j; done > $graph_dir/fsts/utt2group

cp $lang/words.txt $graph_dir/
cp -r $lang/phones $graph_dir/

# The following command gives us an scp file indexed by utterance-id: each
# utterance is mapped, via utt2group, to the graph of the group it belongs to.
utils/apply_map.pl -f 2 $graph_dir/fsts/HCLG.fsts.per_utt.scp \
  <$graph_dir/fsts/utt2group >$graph_dir/HCLG.fsts.scp

# Sanity check: fail if more than 10% of the utterances in the input text
# ended up with no entry in HCLG.fsts.scp.
n1=$(wc -l <$text)
n2=$(wc -l <$graph_dir/HCLG.fsts.scp)
if [ $((n1*9)) -gt $((n2*10)) ]; then
  echo "$0: too many utterances have no scp entry; something seems to have gone wrong."
  exit 1
fi

exit 0;
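# Example usage (a sketch, assuming a standard Kaldi egs/*/s5 recipe layout in
# which data/train, data/lang, and a model directory exp/tri3_cleanup
# containing 'tree' and 'final.mdl' already exist; the directory names follow
# the "e.g." line in the usage message and are illustrative):
#
#   steps/cleanup/make_biased_lm_graphs.sh --nj 20 \
#     data/train data/lang exp/tri3_cleanup exp/tri3_cleanup/graphs
#
# The resulting exp/tri3_cleanup/graphs/HCLG.fsts.scp is indexed by
# utterance-id (utterances in the same group share a graph, via utt2group)
# and can be consumed by steps/cleanup/decode_segmentation.sh.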