02_lexicon_LIA.sh 2.05 KB
#!/bin/sh

#. 00_init_paths.sh 
echo "==> $0"

# The experiment directory is mandatory: every output path below is derived
# from it, so an empty value would make them root-relative (/ling_Data/...).
if [ -z "${1:-}" ]; then
  echo "usage: $0 EXPE_DIR" >&2
  exit 1
fi
EXPE_DIR=$1

# Load the environment settings. The path is relative, so the script has to be
# started from a directory where ../LIA_kaldiUtils exists.
# NOTE(review): presumably this is what puts prepare_lang.sh on PATH - confirm.
if [ ! -f ../LIA_kaldiUtils/path.sh ]; then
  echo "$0: cannot find ../LIA_kaldiUtils/path.sh (wrong working directory?)" >&2
  exit 1
fi
. ../LIA_kaldiUtils/path.sh

# ================================== #
#LM_SOURCE=../LM_DATABASE
#LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
#SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
#TEXT=$LM_DATA/text
#CLEAN_TEXT=$LM_DIR/text.no_oov
# ================================== #

# Output locations under the experiment directory. The trailing slashes are
# kept on purpose so the strings logged and passed on later stay unchanged.
LM_DATA=$EXPE_DIR/ling_Data/
DICT=$LM_DATA/dict/

# Create the whole output tree up front and stop if that is not possible;
# every later step writes into these directories.
mkdir -p "$LM_DATA" "$DICT" "$EXPE_DIR/TEMP" || exit 1

# Files generated in the dictionary directory.
NON_SILENCE_PHONEMES=$DICT/nonsilence_phones.txt
SILENCE_PHONEMES=$DICT/silence_phones.txt
OPTIONAL_SILENCES=$DICT/optional_silence.txt
EXTRA_QUESTIONS=$DICT/extra_questions.txt
OOV=$LM_DATA/oov.txt

# Source lexicons, relative to the current working directory.
# LEXICON is not used by any active command in this script.
LEXICON=../LM_DATABASE/LEXIQUE_V3.lst
LEXICON_PHON=../LM_DATABASE/LEXIQUE_V3.fmt

# Show the phonetic lexicon in the log and fail fast if it is missing,
# instead of carrying on and producing an empty dictionary.
ls "$LEXICON_PHON" || exit 1

#====================================# 

# Keep a copy of the phonetic lexicon next to the generated dictionary files.
cp "$LEXICON_PHON" "$DICT" || exit 1

# Build lexicon1.txt from that copy: skip the first line, drop lines starting
# with '#', drop empty or whitespace-only lines, then sort.
# The blank-line test must use the POSIX class [[:space:]]; a bare [:space:]
# is a bracket expression matching only the characters ':', 's', 'p', 'a',
# 'c' and 'e', which is not what is wanted here.
awk 'NR > 1 && $0 !~ /^#/ && $0 !~ /^[[:space:]]*$/' \
  "$DICT/${LEXICON_PHON##*/}" |
  sort > "$DICT/lexicon1.txt" || exit 1


#cat $LEXICON | sed 's/(.)//' > $DICT/lexicon.txt
# Log the number of entries that were kept.
wc -l "$DICT/lexicon1.txt"


# Collect every distinct symbol found after the first field of each entry and
# write them one per line. grep -v SIL removes any symbol containing "SIL";
# grep also returns non-zero when nothing is left, which aborts the script.
awk '{ for (n = 2; n <= NF; n++) phones[$n] = 1 }
     END { for (p in phones) print p }' "$DICT/lexicon1.txt" |
  grep -v SIL > "$NON_SILENCE_PHONEMES" || exit 1


#cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' |  grep -v SIL > $NON_SILENCE_PHONEMES  || exit 1;
#head $NON_SILENCE_PHONEMES

# Silence/noise symbols, one per line.
printf '%s\n' SIL SPN NSN LAU > "$SILENCE_PHONEMES" || exit 1
# The single optional-silence symbol.
printf '%s\n' SIL > "$OPTIONAL_SILENCES" || exit 1
# The out-of-vocabulary word.
printf '%s\n' '<UNK>' > "$OOV" || exit 1
# No "extra questions" in the input to this setup, as we don't have stress or tone.
touch "$EXTRA_QUESTIONS" || exit 1

# Add the special entries to the lexicon: lexicon2.txt is the '<UNK> SPN'
# entry followed by the contents of lexicon1.txt.
# Further special entries, currently disabled:
#echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
{
  printf '%s\n' '<UNK> SPN'
  cat "$DICT/lexicon1.txt"
} > "$DICT/lexicon2.txt" || exit 1

# lexicon.txt is a plain copy of lexicon2.txt; stop if the copy fails so the
# next step never runs on a missing or stale lexicon.
cp "$DICT/lexicon2.txt" "$DICT/lexicon.txt" || exit 1

#utils/prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
#prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
# Log the invocation, then run prepare_lang.sh with the dictionary directory,
# the OOV word, a scratch directory and the output directory. Arguments are
# quoted so a path containing spaces or glob characters stays one argument.
# NOTE(review): prepare_lang.sh is resolved through PATH - presumably the
# Kaldi utility made available by path.sh; confirm.
echo "====> prepare_lang.sh $DICT <UNK> $EXPE_DIR/TEMP $LM_DATA"
prepare_lang.sh "$DICT" "<UNK>" "$EXPE_DIR/TEMP" "$LM_DATA"