Blame view
egs/lre07/v2/local/dnn/fisher_create_test_lang.sh
2.25 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
#!/bin/bash
#
# Creates the test-time language directory data/lang_test:
#   1. copies the contents of data/lang into data/lang_test,
#   2. compiles the ARPA LM data/local/lm/3gram-mincount/lm_unpruned.gz
#      into data/lang_test/G.fst,
#   3. runs a few sanity checks on G, L_disambig and their composition.
#
# Takes no arguments and must be run from the recipe directory (all paths
# are relative).  Exits with status 1 if the ARPA LM is missing or G.fst
# cannot be built; the later sanity checks only print a message on failure.

if [ -f path.sh ]; then . ./path.sh; fi

# Report a pipeline as failed when ANY of its stages fails, not only the
# last one.  Without this, an error in e.g. gunzip or arpa2fst below would
# go unnoticed and the script would still claim success.
set -o pipefail

mkdir -p data/lang_test

arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
if [ ! -f "$arpa_lm" ]; then
  echo "No such file $arpa_lm" >&2
  exit 1
fi

cp -r data/lang/* data/lang_test

# grep -v '<s> <s>' etc. is only for future-proofing this script.  Our
# LM doesn't have these "invalid combinations".  These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list.  Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  arpa2fst - | fstprint | \
  utils/remove_oovs.pl /dev/null | \
  utils/eps2disambig.pl | utils/s2eps.pl | \
  fstcompile --isymbols=data/lang_test/words.txt \
    --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst \
  || { echo "$0: error creating data/lang_test/G.fst" >&2; exit 1; }

# A non-zero status from fstisstochastic is deliberately not treated as an
# error here; the numbers it prints are for the user to inspect.
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst

## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head

echo Performing further checks

# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.

# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.

# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably relates to a bug
# in this version of OpenFst that makes determinization slow for
# some case).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
  fstdeterminizestar >/dev/null || echo Error

# Checking that LG is stochastic:
# (this reads L_disambig.fst from data/lang, the directory that was copied
# into data/lang_test above)
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
  fstisstochastic || echo "[log:] LG is not stochastic"

echo "$0 succeeded"