Blame view
egs/aidatatang_200zh/s5/local/format_data.sh
1.77 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
#!/bin/bash
#
# Puts the prepared data into its final layout:
#   * copies spk2utt, utt2spk, wav.scp and text from data/local/{train,dev}
#     into data/{train,dev};
#   * recreates data/lang_test as a copy of data/lang and compiles the ARPA
#     language model into data/lang_test/G.fst;
#   * runs a series of diagnostic checks on the resulting FSTs.
#
# Takes no arguments; must be run from the recipe directory so that ./path.sh
# and the relative data/ paths resolve.
#
# Exit status: 1 if the ARPA LM is missing, if any copy fails, or if G.fst
# cannot be built.  The diagnostic checks at the end only print a message
# when they fail; they do not change the exit status.

. ./path.sh

mkdir -p data/train data/dev

arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
if [ ! -f "$arpa_lm" ]; then
  echo No such file "$arpa_lm"
  exit 1
fi

# Copy stuff into its final locations...
for part in train dev; do
  for f in spk2utt utt2spk wav.scp text; do
    cp "data/local/$part/$f" "data/$part/$f" || exit 1
  done
done

# Start from a clean data/lang_test.  Both steps are checked: if the old
# directory survived, "cp -r" would nest the copy as data/lang_test/lang.
rm -rf data/lang_test || exit 1
cp -r data/lang data/lang_test || exit 1

# Compile the ARPA LM into G.fst.  '#0' is passed literally as the
# disambiguation symbol (quoted so it can never be read as a comment).
gunzip -c "$arpa_lm" | \
  arpa2fst --disambig-symbol='#0' \
           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
# Snapshot the per-stage exit codes immediately; any later command resets them.
lm_status=("${PIPESTATUS[@]}")
if [ "${lm_status[0]}" -ne 0 ] || [ "${lm_status[1]}" -ne 0 ]; then
  echo "Error creating data/lang_test/G.fst from $arpa_lm"
  exit 1
fi

echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst

## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head

echo Performing further checks

# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.

# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.

# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably relates to a bug
# in this version of OpenFst that makes determinization slow for
# some case).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
  fstdeterminizestar >/dev/null || echo Error

# Checking that LG is stochastic:
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
  fstisstochastic || echo LG is not stochastic

echo format_data succeeded.