#!/usr/bin/env bash

# This script demonstrates how to use the grammar-decoding framework to build
# graphs made out of more than one part.  It uses `fstequivalent` to show that
# the graph constructed this way is equivalent to what you would create if you
# had the LM all as a single piece.  This uses the command-line tools to expand
# to a regular FST (--write-as-grammar=false).  In practice you might not want
# to do that, since the result might be large and writing the entire thing
# might take too much time.  The code itself allows you to construct these
# GrammarFst objects in a lightweight way and decode using them.

# Unfortunately the filenames here are not very well thought through.  I hope
# to rework this when I have time.

stage=0
run_g2p=false  # set this to true to run the g2p stage; it's slow, so by
               # default we fake it by providing what it previously output.
set -e

. ./path.sh
. utils/parse_options.sh


tree_dir=exp/chain/tree_sp
lang_base=data/lang_nosp_basevocab
lang_ext=data/lang_nosp_extvocab

# For the purposes of this script we just need a biphone tree and associated
# transition-model for testing, because we're testing it at the graph level,
# i.e. testing equivalence of compiled HCLG graphs; there is no decoding
# involved here.

# We're doing this with the "no-silprobs" dictionary dir for now, as we
# need to write some scripts to support silprobs with this.

# For reference, here is how we could create the 'lang' dir for the
# baseline.
#utils/prepare_lang.sh data/local/dict_nosp \
#   "<UNK>" data/local/lang_tmp_nosp data/lang_nosp

if [ $stage -le 0 ]; then
  cp -r data/local/dict_nosp data/local/dict_nosp_basevocab
  echo "#nonterm:unk" > data/local/dict_nosp_basevocab/nonterminals.txt

  utils/prepare_lang.sh data/local/dict_nosp_basevocab \
       "<UNK>" data/local/lang_tmp_nosp $lang_base
fi

if [ $stage -le 1 ]; then
  # note: <UNK> does appear in that ARPA file with a reasonable probability
  # (0.0)... presumably because the vocab the ARPA file was built with was not
  # vast, so there were plenty of OOVs.  It would be possible to adjust its
  # probability with adjust_unk_arpa.pl, but for now we just leave it as-is.
  # <UNK> appears quite a few times in the ARPA file; in the language model we
  # replace it with #nonterm:unk, which will later be expanded to our custom
  # graph of new words.

  # We don't want the #nonterm:unk on the output side of G.fst, or it would
  # appear in the decoded output, so we remove it using the 'fstrmsymbols' command.

  nonterm_unk=$(grep '#nonterm:unk' $lang_base/words.txt | awk '{print $2}')

  gunzip -c  data/local/lm/lm_tgsmall.arpa.gz | \
    sed 's/<UNK>/#nonterm:unk/g' | \
    arpa2fst --disambig-symbol=#0 \
             --read-symbol-table=$lang_base/words.txt - | \
    fstrmsymbols --remove-from-output=true "echo $nonterm_unk|" - $lang_base/G.fst
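
  # Optional sanity check (a sketch, not run by this script): after the
  # fstrmsymbols step, #nonterm:unk should still appear in the input-label
  # (3rd) column of the printed G.fst but no longer in the output-label (4th)
  # column:
  # fstprint --isymbols=$lang_base/words.txt --osymbols=$lang_base/words.txt $lang_base/G.fst | \
  #   awk '{print $4}' | grep -c '#nonterm:unk'   # expect 0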
fi


if [ $stage -le 2 ]; then
  # make the top-level part of the graph.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_base $tree_dir $tree_dir/extvocab_nosp_top
fi

if [ $stage -le 3 ] && $run_g2p; then
  # you may have to install sequitur manually to get this to work.
  dict=data/local/dict_nosp_basevocab
  steps/dict/train_g2p.sh --silence-phones $dict/silence_phones.txt $dict/lexicon.txt  $tree_dir/extvocab_nosp_g2p
fi


if [ $stage -le 4 ]; then
  # Create $tree_dir/extvocab_nosp_lexicon as a dir containing just the newly
  # created vocabulary entries (with the same phone list as our old setup, not
  # that it matters).

  mkdir -p $tree_dir/extvocab_nosp_lexicon

  # First find a list of words in the test set that are out of vocabulary.
  # Of course this is totally cheating.
  awk -v w=data/lang/words.txt 'BEGIN{while(getline <w) seen[$1] = $1} {for(n=2;n<=NF;n++) if(!($n in seen)) oov[$n] = 1}
                                END{ for(k in oov) print k;}' < data/dev_clean_2/text > $tree_dir/extvocab_nosp_lexicon/words
  echo "$0: generating g2p entries for $(wc -l <$tree_dir/extvocab_nosp_lexicon/words) words"

  if $run_g2p; then
    steps/dict/apply_g2p.sh $tree_dir/extvocab_nosp_lexicon/words $tree_dir/extvocab_nosp_g2p  $tree_dir/extvocab_nosp_lexicon
  else
    cat <<EOF >$tree_dir/extvocab_nosp_lexicon/lexicon.lex
HARDWIGG	0.962436	HH AA1 R D W IH1 G
SUDVESTR	0.162048	S AH1 D V EY1 S T R
SUDVESTR	0.133349	S AH1 D V EH1 S T R
SUDVESTR	0.114376	S AH1 D V EH1 S T ER0
VINOS	0.558345	V IY1 N OW0 Z
VINOS	0.068883	V AY1 N OW0 Z
VINOS	0.068431	V IY1 N OW0 S
DOMA	0.645714	D OW1 M AH0
DOMA	0.118255	D UW1 M AH0
DOMA	0.080682	D OW0 M AH0
GWYNPLAINE'S	0.983053	G W IH1 N P L EY1 N Z
SHIMERDA	0.610922	SH IH0 M EH1 R D AH0
SHIMERDA	0.175678	SH IY0 M EH1 R D AH0
SHIMERDA	0.069785	SH AY1 M ER1 D AH0
MYRDALS	0.479183	M IH1 R D AH0 L Z
MYRDALS	0.135225	M ER1 D AH0 L Z
MYRDALS	0.115478	M IH1 R D L Z
HEUCHERA	0.650042	HH OY1 K IH1 R AH0
HEUCHERA	0.119363	HH OY1 K EH1 R AH0
HEUCHERA	0.077907	HH OY1 K ER0 AH0
IMPARA	0.906222	IH0 M P AA1 R AH0
VERLOC'S	0.564847	V ER0 L AA1 K S
VERLOC'S	0.173540	V ER1 L AH0 K S
VERLOC'S	0.050543	V ER1 L AA1 K S
UNTRUSSING	0.998019	AH0 N T R AH1 S IH0 NG
DARFHULVA	0.317057	D AA2 F UH1 L V AH0
DARFHULVA	0.262882	D AA2 F HH UH1 L V AH0
DARFHULVA	0.064055	D AA2 F HH UW1 L V AH0
FINNACTA	0.594586	F IH1 N AH0 K T AH0
FINNACTA	0.232454	F IH1 N AE1 K T AH0
FINNACTA	0.044733	F IH1 N IH0 K T AH0
YOKUL	0.845279	Y OW1 K AH0 L
YOKUL	0.051082	Y OW2 K AH0 L
YOKUL	0.029435	Y OW0 K AH0 L
CONGAL	0.504228	K AA1 NG G AH0 L
CONGAL	0.151648	K AA2 NG G AH0 L
CONGAL	0.137837	K AH0 N JH AH0 L
DELECTASTI	0.632180	D IH0 L EH0 K T EY1 S T IY0
DELECTASTI	0.203808	D IH0 L EH1 K T EY1 S T IY0
DELECTASTI	0.066722	D IH0 L EH0 K T AE1 S T IY0
YUNDT	0.975077	Y AH1 N T
QUINCI	0.426115	K W IH1 N S IY0
QUINCI	0.369324	K W IH1 N CH IY0
QUINCI	0.064507	K W IY0 N CH IY0
BIRDIKINS	0.856979	B ER1 D IH0 K AH0 N Z
BIRDIKINS	0.045315	B ER1 D AH0 K AH0 N Z
SNEFFELS	0.928413	S N EH1 F AH0 L Z
FJORDUNGR	0.130629	F Y AO1 R D UW0 NG G R
FJORDUNGR	0.125082	F Y AO1 R D AH0 NG G R
FJORDUNGR	0.111035	F Y AO1 R D UH1 NG R
YULKA	0.540253	Y UW1 L K AH0
YULKA	0.295588	Y AH1 L K AH0
YULKA	0.076631	Y UH1 L K AH0
LACQUEY'S	0.987908	L AE1 K IY0 Z
OSSIPON'S	0.651400	AA1 S AH0 P AA2 N Z
OSSIPON'S	0.118444	AA1 S AH0 P AA0 N Z
OSSIPON'S	0.106377	AA1 S AH0 P AH0 N Z
SAKNUSSEMM	0.060270	S AE1 K N AH1 S EH1 M
SAKNUSSEMM	0.044992	S AE1 K N AH0 S EH1 M
SAKNUSSEMM	0.044084	S AA0 K N AH1 S EH1 M
CONGAL'S	0.618287	K AA1 NG G AH0 L Z
CONGAL'S	0.185952	K AA2 NG G AH0 L Z
CONGAL'S	0.115143	K AH0 N G AH0 L Z
TARRINZEAU	0.159153	T AA1 R IY0 N Z OW1
TARRINZEAU	0.136536	T AA1 R AH0 N Z OW1
TARRINZEAU	0.100924	T EH1 R IY0 N Z OW1
SHIMERDAS	0.230819	SH IH0 M EH1 R D AH0 Z
SHIMERDAS	0.216235	SH IH0 M EH1 R D AH0 S
SHIMERDAS	0.073311	SH AY1 M ER1 D AH0 Z
RUGGEDO'S	0.821285	R UW0 JH EY1 D OW0 Z
RUGGEDO'S	0.166825	R AH1 G AH0 D OW0 Z
CORNCAKES	0.934118	K AO1 R N K EY2 K S
VENDHYA	0.616662	V EH0 N D Y AH0
VENDHYA	0.178349	V EH1 N D Y AH0
VENDHYA	0.160768	V AA1 N D Y AH0
GINGLE	0.919815	G IH1 NG G AH0 L
STUPIRTI	0.422653	S T UW0 P IH1 R T IY0
STUPIRTI	0.126925	S T UW1 P IH0 R T IY0
STUPIRTI	0.078422	S T UW1 P AH0 R T IY0
HERBIVORE	0.950887	HH ER1 B IH0 V AO2 R
BRION'S	0.838326	B R AY1 AH0 N Z
BRION'S	0.140310	B R IY0 AH0 N Z
DELAUNAY'S	0.993259	D EH1 L AO0 N EY0 Z
KHOSALA	0.920908	K OW0 S AA1 L AH0
BRANDD	0.827461	B R AE1 N D
BRANDD	0.085646	B R AE2 N D
GARDAR	0.598675	G AA0 R D AA1 R
GARDAR	0.289831	G AA1 R D AA2 R
GARDAR	0.057983	G AA0 R D AA2 R
MACKLEWAIN	0.570209	M AE1 K AH0 L W EY0 N
MACKLEWAIN	0.101477	M AH0 K AH0 L W EY0 N
MACKLEWAIN	0.067905	M AE1 K AH0 L W EY2 N
LIBANO	0.993297	L IY0 B AA1 N OW0
MOLING	0.782578	M OW1 L IH0 NG
MOLING	0.059362	M OW2 L IH0 NG
MOLING	0.056217	M AA1 L IH0 NG
BENNYDECK'S	0.583859	B EH1 N IY0 D EH0 K S
BENNYDECK'S	0.276699	B EH1 N IH0 D EH0 K S
BENNYDECK'S	0.028343	B EH1 N IH0 D IH0 K S
MACKLEWAIN'S	0.615766	M AE1 K AH0 L W EY0 N Z
MACKLEWAIN'S	0.109585	M AH0 K AH0 L W EY0 N Z
MACKLEWAIN'S	0.039423	M AE1 K AH0 L W AH0 N Z
PRESTY	0.616071	P R EH1 S T IY0
PRESTY	0.288701	P R AH0 S T IY0
BREADHOUSE	0.995874	B R EH1 D HH AW2 S
BUZZER'S	0.992495	B AH1 Z ER0 Z
BHUNDA	0.502439	B UW1 N D AH0
BHUNDA	0.267733	B AH0 N D AH0
BHUNDA	0.193772	B UH1 N D AH0
PINKIES	0.998440	P IH1 NG K IY0 Z
TROKE	0.723320	T R OW1 K
TROKE	0.269707	T R OW2 K
OSSIPON	0.728486	AA1 S AH0 P AA2 N
OSSIPON	0.098752	AA1 S AH0 P AH0 N
OSSIPON	0.033957	AA1 S AH0 P AO0 N
RIVERLIKE	0.991731	R IH1 V ER0 L AY2 K
NICLESS	0.478183	N IH1 K L AH0 S
NICLESS	0.159889	N IH0 K L AH0 S
NICLESS	0.120611	N IH1 K L IH0 S
TRAMPE	0.959184	T R AE1 M P
VERLOC	0.610461	V ER0 L AA1 K
VERLOC	0.128479	V ER1 L AH0 K
VERLOC	0.073687	V ER1 L AA0 K
GANNY	0.991703	G AE1 N IY0
AMBROSCH	0.302906	AE0 M B R OW1 SH
AMBROSCH	0.201163	AE0 M B R AO1 SH
AMBROSCH	0.109274	AE1 M B R AO1 SH
FIBI	0.619154	F IH1 B IY0
FIBI	0.163168	F IY1 B IY0
FIBI	0.083443	F AY1 B IY0
IROLG	0.823123	IH0 R OW1 L G
IROLG	0.053196	IH0 R OW1 L JH
IROLG	0.021038	IH0 R OW1 L JH IY1
BALVASTRO	0.251546	B AA0 L V AA1 S T R OW0
BALVASTRO	0.213351	B AE0 L V AE1 S T R OW0
BALVASTRO	0.133005	B AA0 L V AE1 S T R OW0
BOOLOOROO	0.676757	B UW1 L UW1 R UW0
BOOLOOROO	0.173653	B UW1 L UH2 R UW0
BOOLOOROO	0.086501	B UW1 L UH0 R UW0
EOF
  fi

  # extend_lang.sh needs it to have basename 'lexiconp.txt'.
  mv $tree_dir/extvocab_nosp_lexicon/lexicon.lex $tree_dir/extvocab_nosp_lexicon/lexiconp.txt

  [ -f data/lang_nosp_extvocab/G.fst ] && rm data/lang_nosp_extvocab/G.fst
  utils/lang/extend_lang.sh  data/lang_nosp_basevocab $tree_dir/extvocab_nosp_lexicon/lexiconp.txt  data/lang_nosp_extvocab
fi

if [ $stage -le 5 ]; then
  # make the G.fst for the extra words.  Just assign equal probabilities to all of
  # them.  The words will all transition from state 1 to 2.
  cat <<EOF > $lang_ext/G.txt
0    1    #nonterm_begin <eps>
2    3    #nonterm_end <eps>
3
EOF
  lexicon=$tree_dir/extvocab_nosp_lexicon/lexiconp.txt
  num_words=$(wc -l <$lexicon)
  cost=$(perl -e "print log($num_words)");
  awk -v cost=$cost '{print 1, 2, $1, $1, cost}' <$lexicon >>$lang_ext/G.txt
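  # Each appended line is an arc from state 1 to state 2 with the word as both
  # input and output label; since all N lexicon entries get equal probability
  # 1/N, the cost is -log(1/N) = log(N).  With the fake lexicon above (~139
  # entries) each line would look roughly like:
  #   1 2 HARDWIGG HARDWIGG 4.93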
  fstcompile --isymbols=$lang_ext/words.txt --osymbols=$lang_ext/words.txt <$lang_ext/G.txt | \
    fstarcsort --sort_type=ilabel >$lang_ext/G.fst
fi

if [ $stage -le 6 ]; then
  # make the part of the graph that will be included.
  # Refer to the 'compile-graph' commands in ./simple_demo.sh for how you'd do
  # this in code.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_ext $tree_dir $tree_dir/extvocab_nosp_part
fi

if [ $stage -le 7 ]; then
  offset=$(grep nonterm_bos $lang_ext/phones.txt | awk '{print $2}')
  nonterm_unk=$(grep nonterm:unk $lang_ext/phones.txt | awk '{print $2}')
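  # $offset is the integer id of the special phone #nonterm_bos; make-grammar-fst
  # needs it (as --nonterm-phones-offset) to interpret the nonterminal-related
  # symbols encoded in the HCLG graphs.  $nonterm_unk is the phone-id of
  # #nonterm:unk, i.e. the nonterminal whose instances in the top-level graph
  # get replaced by the sub-graph built from the new words.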

  mkdir -p $tree_dir/extvocab_nosp_combined
  [ -d $tree_dir/extvocab_nosp_combined/phones ] && rm -r $tree_dir/extvocab_nosp_combined/phones
  # the decoding script expects words.txt and phones/, copy them from the extvocab_part
  # graph directory where they will have suitable values.
  cp -r $tree_dir/extvocab_nosp_part/{words.txt,phones.txt,phones/} $tree_dir/extvocab_nosp_combined

  # the following, due to --write-as-grammar=false, compiles it into an FST
  # which can be decoded by our normal decoder.
  make-grammar-fst --write-as-grammar=false --nonterm-phones-offset=$offset $tree_dir/extvocab_nosp_top/HCLG.fst \
                   $nonterm_unk $tree_dir/extvocab_nosp_part/HCLG.fst  $tree_dir/extvocab_nosp_combined/HCLG.fst

  # the following compiles it and writes it as a GrammarFst.  The size is 176M,
  # vs. 182M for HCLG.fst.  In other examples, of course, the difference might
  # be larger.

  make-grammar-fst --write-as-grammar=true --nonterm-phones-offset=$offset $tree_dir/extvocab_nosp_top/HCLG.fst \
                $nonterm_unk $tree_dir/extvocab_nosp_part/HCLG.fst  $tree_dir/extvocab_nosp_combined/HCLG.gra
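
  # As a further (hypothetical, not run here) check you could inspect the
  # expanded graph with fstinfo, or, if you had also built a single-piece graph
  # covering the full extended vocabulary, test equivalence with fstequivalent;
  # the single-piece graph dir below is a placeholder:
  # fstinfo $tree_dir/extvocab_nosp_combined/HCLG.fst | head
  # fstequivalent --random=true --npath=100 \
  #    $tree_dir/extvocab_nosp_combined/HCLG.fst <single-piece-graph-dir>/HCLG.fst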
fi


if [ $stage -le 8 ]; then
  # OK, now we actually decode the test data.  For reference, the command that
  # was used to decode the test data in the current (at the time of writing)
  # chain TDNN system local/chain/run_tdnn.sh (as figured out by running it
  # from that stage) was:
  # steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
  #   --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
  #   exp/chain/tree_sp/graph_tgsmall data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2

  # We just replace the graph with the one in $tree_dir/extvocab_nosp_combined.

  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
    --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
    exp/chain/tree_sp/extvocab_nosp_combined data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb



  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb/wer_* | utils/best_wer.sh
  # %WER 11.79 [ 2375 / 20138, 195 ins, 343 del, 1837 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb/wer_12_0.0

  # ... versus the baseline below.  Note that the baseline is not 100%
  # comparable, as it used the silence probabilities, which grammar decoding
  # does not (yet) support:
  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2/wer_* | utils/best_wer.sh
  # %WER 12.01 [ 2418 / 20138, 244 ins, 307 del, 1867 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2/wer_13_0.0
fi

if [ $stage -le 9 ]; then
  steps/nnet3/decode_grammar.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
    --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
    exp/chain/tree_sp/extvocab_nosp_combined data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb_gra

  #  The WER when decoding with the grammar FST directly is exactly the same:
  # s5:  grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb_gra/wer_* | utils/best_wer.sh
  # %WER 11.79 [ 2375 / 20138, 195 ins, 343 del, 1837 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb_gra/wer_12_0.0
fi