Blame view
Scripts/03_compile_LM_LIA.sh
3.51 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
#!/bin/sh
#================================#
# Compile the grammar FST (G.fst) of an experiment from an ARPA language model.
#
# Usage: 03_compile_LM_LIA.sh EXPE_DIR
#
#   EXPE_DIR  experiment directory; EXPE_DIR/LANGUAGE_MODEL and
#             EXPE_DIR/ACOUSTIC_MODEL are created if missing.
#
# Files written under EXPE_DIR/LANGUAGE_MODEL:
#   lex.txt     "<eps>", the first column of the lexicon (adjacent duplicates
#               removed), then "#0"
#   words.txt   symbol table: one "<symbol> <id>" pair per line, ids from 0
#   oovsML.txt  output of find_arpa_oovs.pl for the ARPA model and words.txt
#   G.fst       compiled, epsilon-removed FST built from the ARPA model
#
# External commands used: find_arpa_oovs.pl, remove_oovs.pl, eps2disambig.pl,
# s2eps.pl, arpa2fst, fstprint, fstcompile, fstrmepsilon (plus the fst* checks
# when CHECK=1).
# NOTE(review): these are presumably made reachable by path.sh -- confirm.
#
# Exit status: 0 on success, non-zero if an input is missing or a step fails.
#================================#

EXPE_DIR=$1
if [ -z "$EXPE_DIR" ]; then
  # Without this guard an empty argument would create /LANGUAGE_MODEL etc.
  echo "Usage: $0 EXPE_DIR" >&2
  exit 1
fi

# The path is relative: the script has to be started from its own directory.
if [ ! -f ../LIA_kaldiUtils/path.sh ]; then
  echo "$0: cannot find ../LIA_kaldiUtils/path.sh (run from the script directory)" >&2
  exit 1
fi
. ../LIA_kaldiUtils/path.sh
#echo $PATH

# Set to 1 to run the sanity checks on the FSTs after G.fst has been built.
CHECK=0

LM_SOURCE=../LM_DATABASE
LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
AC_DIR=$EXPE_DIR/ACOUSTIC_MODEL
# Only referenced by the disabled steps kept as comments below.
AC_DATA=$EXPE_DIR/ac_Data/
# Read by the CHECK=1 section (phones.txt, words.txt, L.fst, L_disambig.fst).
LM_DATA=$EXPE_DIR/ling_Data/

mkdir -p "$AC_DIR" || exit 1
mkdir -p "$LM_DIR" || exit 1

#--------------------------------#
# Earlier value; it used to be assigned and then immediately overwritten by
# the absolute path below, so it never had any effect.
#arpa_lm=$LM_SOURCE/ML_3g.arpa
arpa_lm=/local_disk/hera2/PERCOL/bigot/KALDI/LM_DATABASE/4G_DATA/ARPA/corpus_oral.DICO.dic.sri.n4.sort.arpa

#SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
#KALDI_LEXICON=$LM_DIR/lexicon.txt
KALDI_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt

#--------------------------------#
# Disabled steps, kept for reference.
#NON_SILENCE_PHONEMES=$LM_DATA/nonsilence_phones.txt
#SILENCE_PHONEMES=$LM_DATA/silence_phones.txt
#OPTIONAL_SILENCES=$LM_DATA/optional_silence.txt
#EXTRA_QUESTIONS=$LM_DATA/extra_questions.txt
#---------------------------------#
#echo 'creating kaldi lexicon' $KALDI_LEXICON from $SPEERAL_LEXICON
#head $KALDI_LEXICON $SPEERAL_LEXICON
#awk 'BEGIN{getline}($0 !~ /^#/) { print}' $SPEERAL_LEXICON | sort | awk '($0 !~ /^[:space:]*$/) {print}' > $KALDI_LEXICON || exit 1;
# Add to the lexicon the silences, noises etc.
#echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
#sort -k1 -u $KALDI_LEXICON -o $KALDI_LEXICON
#sed -i 1i"<UNK> SPN" $KALDI_LEXICON

# Fail early with a clear message instead of producing empty or partial files.
if [ ! -f "$KALDI_LEXICON" ]; then
  echo "No such file $KALDI_LEXICON" >&2
  exit 1
fi
if [ ! -f "$arpa_lm" ]; then
  echo "No such file $arpa_lm" >&2
  exit 1
fi

#--------------------------------#
# Creating lex.txt and words.txt.
# Trace of the extraction command (message text kept exactly as before).
echo "cut -f1 -d" " $KALDI_LEXICON | uniq > $LM_DIR/lex.txt"

# Written in one pass: no in-place editing (GNU-only "sed -i"), and "<eps>"
# is always the first symbol, even for an empty lexicon.
{
  echo '<eps>'
  cut -f1 -d' ' "$KALDI_LEXICON" | uniq
  echo '#0'
} > "$LM_DIR/lex.txt" || exit 1

# Number every whitespace-separated token of lex.txt, starting at 0.
# awk splits fields the way the former "for line in $(cat ...)" loop split
# words, but performs no pathname expansion, so bracketed entries such as
# "[NOISE]" can no longer be replaced by matching file names.
awk 'BEGIN { id = 0 } { for (i = 1; i <= NF; i++) print $i, id++ }' \
  "$LM_DIR/lex.txt" > "$LM_DIR/words.txt" || exit 1

#cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | grep -v SIL > $NON_SILENCE_PHONEMES || exit 1;
#(echo SIL; echo SPN; echo NSN; echo LAU ) > $SILENCE_PHONEMES
#echo SIL > $OPTIONAL_SILENCES
# No "extra questions" in the input to this setup, as we don't have stress or tone.
#echo -n > $EXTRA_QUESTIONS
#SEGMENT=$AC_DATA/segments
#TEXT=$LM_DATA/text

#================================#
echo "$arpa_lm"

# The ARPA file is fed on standard input, as before (variant for a gzipped
# model kept below).
#zcat $arpa_lm | find_arpa_oovs.pl $LM_DIR/words.txt > $LM_DIR/oovsML.txt
if ! find_arpa_oovs.pl "$LM_DIR/words.txt" < "$arpa_lm" > "$LM_DIR/oovsML.txt"; then
  echo "$0: find_arpa_oovs.pl failed on $arpa_lm" >&2
  exit 1
fi
#echo $LM_DATA/oovsML.txt
#echo $arpa_lm

# ARPA -> G.fst: drop the lines containing '<s> <s>', '</s> <s>' or
# '</s> </s>', convert to an FST, print it as text, filter it through
# remove_oovs.pl (given oovsML.txt), eps2disambig.pl and s2eps.pl, recompile
# it against words.txt and remove epsilons.
# (Variant for a gzipped model: gunzip -c "$arpa_lm" | ...)
grep -v '<s> <s>' "$arpa_lm" \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | arpa2fst - \
  | fstprint \
  | remove_oovs.pl "$LM_DIR/oovsML.txt" \
  | eps2disambig.pl \
  | s2eps.pl \
  | fstcompile --isymbols="$LM_DIR/words.txt" --osymbols="$LM_DIR/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon > "$LM_DIR/G.fst"
# Plain sh reports only the status of the last pipeline stage (fstrmepsilon);
# it is propagated unchanged, as the former bare "exit" did.
status=$?
if [ "$status" -ne 0 ]; then
  echo "$0: building $LM_DIR/G.fst failed" >&2
  exit "$status"
fi

# Optional sanity checks. They used to sit after an unconditional "exit" and
# could never run; they are now controlled by CHECK alone (default 0, so the
# default behaviour is unchanged). G.fst is read from LM_DIR, where it is
# written above; the former checks pointed at LM_DATA/G.fst, which this
# script never creates.
# NOTE(review): phones.txt, words.txt, L.fst and L_disambig.fst are expected
# in LM_DATA and are not produced by this script -- confirm they exist before
# enabling CHECK.
if [ "$CHECK" = 1 ]; then
  echo "Checking how stochastic G is (the first of these numbers should be small):"
  fstisstochastic "$LM_DIR/G.fst"

  echo "First few lines of lexicon FST:"
  fstprint --isymbols="$LM_DATA/phones.txt" --osymbols="$LM_DATA/words.txt" \
    "$LM_DATA/L.fst" | head

  echo Performing further checks

  # Checking that G.fst is determinizable.
  fstdeterminize "$LM_DIR/G.fst" /dev/null || echo Error determinizing G.

  # Checking that L_disambig.fst is determinizable.
  fstdeterminize "$LM_DATA/L_disambig.fst" /dev/null || echo Error determinizing L.

  fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DIR/G.fst" \
    | fstdeterminizestar > /dev/null || echo Error

  # Checking that LG is stochastic:
  fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DIR/G.fst" \
    | fstisstochastic || echo LG is not stochastic
fi

# Original trailing note (translated from French): "to avoid redoing the
# processing".
exit 0