Blame view
Scripts/02_lexicon_LIA.sh
2.05 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
#!/bin/sh
#
# 02_lexicon_LIA.sh -- build the Kaldi dictionary directory for an experiment
# from the LIA phonetised lexicon, then run prepare_lang.sh on it.
#
# Usage:
#   02_lexicon_LIA.sh EXPE_DIR
#
# Arguments:
#   EXPE_DIR  experiment root; everything is written below it.
#
# Reads (paths are relative to the current working directory):
#   ../LIA_kaldiUtils/path.sh       environment setup, sourced
#   ../LM_DATABASE/LEXIQUE_V3.fmt   phonetised lexicon
#
# Writes:
#   $EXPE_DIR/ling_Data/oov.txt
#   $EXPE_DIR/ling_Data/dict/{LEXIQUE_V3.fmt,lexicon1.txt,lexicon2.txt,
#       lexicon.txt,nonsilence_phones.txt,silence_phones.txt,
#       optional_silence.txt,extra_questions.txt}
#   $EXPE_DIR/TEMP                  scratch directory handed to prepare_lang.sh
#
# Exit status: 1 on any preparation failure, otherwise the exit status of
# prepare_lang.sh.

# Print a message on stderr and abort.
die() {
  printf '%s: %s\n' "$0" "$*" >&2
  exit 1
}

#. 00_init_paths.sh

echo "==> $0"

# Without this guard an empty EXPE_DIR would make every path below start at
# the filesystem root (/ling_Data/, /TEMP).
EXPE_DIR=${1:-}
[ -n "$EXPE_DIR" ] || die "usage: $0 EXPE_DIR"

[ -f ../LIA_kaldiUtils/path.sh ] || die "cannot find ../LIA_kaldiUtils/path.sh"
# NOTE(review): prepare_lang.sh is presumably put on PATH by this file.
. ../LIA_kaldiUtils/path.sh

# ================================== #
# Disabled settings, kept for reference:
#LM_SOURCE=../LM_DATABASE
#LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
#SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
#TEXT=$LM_DATA/text
#CLEAN_TEXT=$LM_DIR/text.no_oov
# ================================== #

# The trailing slashes are kept so that the paths echoed and passed to
# prepare_lang.sh are exactly what they have always been.
LM_DATA=$EXPE_DIR/ling_Data/
DICT=$LM_DATA/dict/

mkdir -p -- "$LM_DATA" "$DICT" "$EXPE_DIR/TEMP" \
  || die "cannot create output directories under $EXPE_DIR"

NON_SILENCE_PHONEMES=$DICT/nonsilence_phones.txt
SILENCE_PHONEMES=$DICT/silence_phones.txt
OPTIONAL_SILENCES=$DICT/optional_silence.txt
EXTRA_QUESTIONS=$DICT/extra_questions.txt
OOV=$LM_DATA/oov.txt

# LEXICON is only referenced by disabled code; kept for reference.
LEXICON=../LM_DATABASE/LEXIQUE_V3.lst
LEXICON_PHON=../LM_DATABASE/LEXIQUE_V3.fmt
LEXICON_COPY=$DICT/${LEXICON_PHON##*/}

# Log the lexicon path and stop early if it is missing.
ls -- "$LEXICON_PHON" || die "lexicon not found: $LEXICON_PHON"

#====================================#

cp -- "$LEXICON_PHON" "$DICT" || die "cannot copy $LEXICON_PHON to $DICT"

# Keep every line except: the first line of the file, lines starting with
# '#', and blank lines. 'NF > 0' is used for the blank-line test because the
# former pattern /^[:space:]*$/ was a plain bracket expression (the letters
# ':space'), not the whitespace class. Each stage is checked separately since
# POSIX sh has no pipefail.
awk 'NR > 1 && $0 !~ /^#/ && NF > 0' "$LEXICON_COPY" > "$DICT/lexicon1.txt" \
  || die "cannot filter $LEXICON_COPY"
sort -o "$DICT/lexicon1.txt" "$DICT/lexicon1.txt" \
  || die "cannot sort $DICT/lexicon1.txt"
#cat $LEXICON | sed 's/(.)//' > $DICT/lexicon.txt

wc -l "$DICT/lexicon1.txt"

# Collect the distinct symbols found in columns 2..NF of the lexicon,
# leaving out any symbol that contains "SIL".
awk '{
       for (n = 2; n <= NF; n++)
         if ($n !~ /SIL/) phones[$n] = 1
     }
     END { for (p in phones) print p }' "$DICT/lexicon1.txt" \
  > "$NON_SILENCE_PHONEMES" || die "cannot extract phones"
# An empty phone list means the lexicon was unusable.
[ -s "$NON_SILENCE_PHONEMES" ] || die "no non-silence phones found"

printf '%s\n' SIL SPN NSN LAU > "$SILENCE_PHONEMES" || exit 1
printf '%s\n' SIL > "$OPTIONAL_SILENCES" || exit 1
printf '%s\n' '<UNK>' > "$OOV" || exit 1

# No "extra questions" in the input to this setup, as we don't have stress or tone.
touch -- "$EXTRA_QUESTIONS" || exit 1

# Add to the lexicon the silences, noises etc.
# NOTE(review): the '!SIL SIL' entry is disabled; it is assumed that only
# that one entry was commented out in the original -- confirm.
{
  #echo '!SIL SIL'
  printf '%s\n' \
    '[VOCALIZED-NOISE] SPN' \
    '[NOISE] NSN' \
    '[LAUGHTER] LAU' \
    '<UNK> SPN' &&
  cat -- "$DICT/lexicon1.txt"
} > "$DICT/lexicon2.txt" || exit 1

cp -- "$DICT/lexicon2.txt" "$DICT/lexicon.txt" || exit 1

#utils/prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
#prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
echo "====> prepare_lang.sh $DICT <UNK> $EXPE_DIR/TEMP $LM_DATA"
prepare_lang.sh "$DICT" "<UNK>" "$EXPE_DIR/TEMP" "$LM_DATA"