Blame view
egs/wsj/s5/utils/lang/adjust_unk_graph.sh
1.45 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
#!/bin/bash
# Copyright 2018 Xiaohui Zhang
# Apache 2.0

# This script copies a fully expanded decoding graph (HCLG.fst) and scales the scores
# of all arcs whose output symbol is a user-specified OOV symbol (or any other word).
# This achieves an effect equivalent to utils/lang/adjust_unk_arpa.pl, which scales
# the LM prob of all ngrams predicting an OOV symbol, while avoiding re-creating the graph.
#
# Arguments:
#   $1  <oov-dict-entry>  word whose arcs are rescaled; looked up in <in-graph-dir>/words.txt
#   $2  <scale>           positive number; -log(scale) is added to the weight of every
#                         arc whose output label is the id of <oov-dict-entry>
#   $3  <in-graph-dir>    existing graph directory (read only)
#   $4  <out-graph-dir>   destination directory (created if it does not exist)
#
# Exit status: 0 on success; 1 on a usage error, a missing input file, an invalid
# scale, an unknown word, or a failure of any copy / FST command.

set -o pipefail

if [ $# != 4 ]; then
  echo "Usage: utils/lang/adjust_unk_graph.sh <oov-dict-entry> <scale> <in-graph-dir> <out-graph-dir>"
  echo "e.g.: utils/lang/adjust_unk_graph.sh \"<unk>\" 0.1 exp/tri1/graph exp/tri1/graph_unk_scale_0.1"
  exit 1;
fi

if [ -f path.sh ]; then . ./path.sh; fi

oov_word=$1
unk_scale=$2
graphdir_in=$3
graphdir_out=$4

# log(scale) is only defined for scale > 0; non-numeric text evaluates to 0 in awk
# and is therefore rejected as well.
if ! awk -v s="$unk_scale" 'BEGIN { exit !(s + 0 > 0) }'; then
  echo "adjust_unk_graph.sh: scale must be a positive number, got '$unk_scale'"
  exit 1;
fi

# Writing in place would truncate HCLG.fst (via the output redirection below)
# before fstprint gets a chance to read it.
if [ "$graphdir_in" -ef "$graphdir_out" ]; then
  echo "adjust_unk_graph.sh: <in-graph-dir> and <out-graph-dir> must be different directories"
  exit 1;
fi

# Verify every input up front so a missing file cannot leave a half-populated
# output directory behind.
required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt"
for f in $required; do
  if [ ! -e "$graphdir_in/$f" ]; then
    echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist"
    exit 1;
  fi
done

mkdir -p "$graphdir_out" || exit 1;

# HCLG.fst is regenerated below, so only the auxiliary files are copied.
for f in $required; do
  [ "$f" = "HCLG.fst" ] && continue
  cp -r "$graphdir_in/$f" "$graphdir_out" || exit 1;
done

# Map the word to its integer id. An empty result or a non-zero status from
# sym2int.pl is treated as "word not in words.txt".
oov_id=$(printf '%s\n' "$oov_word" | utils/sym2int.pl "$graphdir_in/words.txt")
if [ $? -ne 0 ] || [ -z "$oov_id" ]; then
  echo "adjust_unk_graph.sh: the specified oov symbol $oov_word is out of the vocabulary."
  exit 1;
fi

# In fstprint's text output, field 4 of an arc line is the output label and
# field 5 the weight (absent means 0). NF>=4 keeps lines without an output-label
# field (e.g. final-state lines) from ever matching.
fstprint "$graphdir_in/HCLG.fst" | \
  awk -v oov="$oov_id" -v unk_scale="$unk_scale" \
    '{if(NF>=4 && $4==oov) $5=$5-log(unk_scale);print $0}' | \
  fstcompile > "$graphdir_out/HCLG.fst" || exit 1;