03_compile_LM_LIA.sh 3.51 KB
#!/bin/sh


#================================"

# Experiment layout and shared settings.
# Argument 1 (required): experiment directory (EXPE_DIR).
# Relative paths below (../LIA_kaldiUtils, ../LM_DATABASE) are resolved against
# the current working directory.
if [ -z "$1" ]; then
	# With an empty EXPE_DIR the mkdir calls below would create
	# /ACOUSTIC_MODEL and /LANGUAGE_MODEL at the filesystem root.
	echo "Usage: $0 EXPE_DIR" >&2
	exit 1
fi
EXPE_DIR=$1
# Settings file sourced into this shell; its content is not shown here.
. ../LIA_kaldiUtils/path.sh || exit 1

#echo $PATH
# Flag consulted by the FST check section at the end of the script.
CHECK=0

# NOTE(review): silprob and AC_DATA are not referenced in the visible part of
# this script; kept unchanged.
silprob=0.5
LM_SOURCE=../LM_DATABASE
LM_DIR="$EXPE_DIR/LANGUAGE_MODEL"
AC_DIR="$EXPE_DIR/ACOUSTIC_MODEL"
AC_DATA="$EXPE_DIR/ac_Data/"
LM_DATA="$EXPE_DIR/ling_Data/"
# Output directories; stop if they cannot be created, since later steps
# write into them.
mkdir -p "$AC_DIR" || exit 1
mkdir -p "$LM_DIR" || exit 1

#--------------------------------#
# ARPA language model to compile. The first value is immediately replaced by
# the absolute path assigned on the next line, which is the one actually used.
arpa_lm="${LM_SOURCE}/ML_3g.arpa"
arpa_lm="/local_disk/hera2/PERCOL/bigot/KALDI/LM_DATABASE/4G_DATA/ARPA/corpus_oral.DICO.dic.sri.n4.sort.arpa"
#SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
#KALDI_LEXICON=$LM_DIR/lexicon.txt 
# Lexicon file whose first space-delimited column supplies the word list.
KALDI_LEXICON="${LM_SOURCE}/LEXIQUE_V3.fmt"
#--------------------------------#
#NON_SILENCE_PHONEMES=$LM_DATA/nonsilence_phones.txt
#SILENCE_PHONEMES=$LM_DATA/silence_phones.txt   
#OPTIONAL_SILENCES=$LM_DATA/optional_silence.txt 
#EXTRA_QUESTIONS=$LM_DATA/extra_questions.txt
#---------------------------------#

#echo 'creating kaldi lexicon' $KALDI_LEXICON from $SPEERAL_LEXICON
#head $KALDI_LEXICON $SPEERAL_LEXICON
#awk 'BEGIN{getline}($0 !~ /^#/) { print}' $SPEERAL_LEXICON | sort | awk '($0 !~ /^[:space:]*$/) {print}'  > $KALDI_LEXICON || exit 1;

# Add to the lexicon the silences, noises etc. 
#echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
#sort -k1 -u  $KALDI_LEXICON -o $KALDI_LEXICON  
#sed -i 1i"<UNK> SPN" $KALDI_LEXICON

# Create lex.txt (word list) and words.txt (word symbol table) in $LM_DIR.
if [ ! -r "$KALDI_LEXICON" ]; then
	echo "Cannot read lexicon $KALDI_LEXICON" >&2
	exit 1
fi

# Log the extraction command that is about to run.
echo "cut -f1 -d\" \" $KALDI_LEXICON | uniq > $LM_DIR/lex.txt"

# lex.txt holds "<eps>" first, then the first space-delimited column of the
# lexicon with adjacent duplicates collapsed, then "#0" last. Writing the
# three parts in one group avoids GNU-only "sed -i 1i", which also inserts
# nothing when the list is empty.
{
	echo "<eps>"
	cut -f1 -d" " "$KALDI_LEXICON" | uniq
	echo "#0"
} > "$LM_DIR/lex.txt" || exit 1

# words.txt holds one "<token> <id>" line per whitespace-separated token of
# lex.txt, with ids counting from 0 in file order. awk is used instead of
# "for line in $(cat ...)" so entries such as [NOISE] or * are never treated
# as filename patterns by the shell.
awk 'BEGIN { id = 0 } { for (i = 1; i <= NF; i++) print $i, id++ }' \
	"$LM_DIR/lex.txt" > "$LM_DIR/words.txt" || exit 1


#cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' |  grep -v SIL > $NON_SILENCE_PHONEMES  || exit 1; 
#(echo SIL; echo SPN; echo NSN; echo LAU ) > $SILENCE_PHONEMES
#echo SIL > $OPTIONAL_SILENCES 

# No "extra questions" in the input to this setup, as we don't have stress or tone.
#echo -n > $EXTRA_QUESTIONS 
#[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
#SEGMENT=$AC_DATA/segments
#TEXT=$LM_DATA/text
#================================"
# Compile the ARPA language model into $LM_DIR/G.fst.
# Reads:  $arpa_lm, $LM_DIR/words.txt
# Writes: $LM_DIR/oovsML.txt, $LM_DIR/G.fst
if [ ! -f "$arpa_lm" ]; then
	echo "No such file $arpa_lm" >&2
	exit 1
fi
echo "$arpa_lm"

# Build the out-of-vocabulary list from the ARPA file and words.txt, using the
# external helper find_arpa_oovs.pl (not shown here).
#zcat $arpa_lm  | find_arpa_oovs.pl $LM_DIR/words.txt  > $LM_DIR/oovsML.txt
find_arpa_oovs.pl "$LM_DIR/words.txt" < "$arpa_lm" > "$LM_DIR/oovsML.txt" || exit 1
#echo $LM_DATA/oovsML.txt
#echo $arpa_lm
#gunzip -c "$arpa_lm" | \

# Pipeline: drop ARPA lines containing the sequences "<s> <s>", "</s> <s>" or
# "</s> </s>", convert to an FST, print it as text, filter it through the
# helper scripts (using the OOV list built above), recompile it with
# words.txt as both symbol tables, and remove epsilons.
# Only the status of the last stage is checked by "|| exit 1".
grep -v '<s> <s>' "$arpa_lm" | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | \
fstprint | \
remove_oovs.pl "$LM_DIR/oovsML.txt" | \
eps2disambig.pl | \
s2eps.pl | \
fstcompile --isymbols="$LM_DIR/words.txt" --osymbols="$LM_DIR/words.txt" --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > "$LM_DIR/G.fst" || exit 1

# Status of the command that ran just before this point (the G.fst build).
# The script always finishes with this status, whether or not the optional
# checks below are run.
final_status=$?

# Optional sanity checks on the compiled FSTs; run only when CHECK is 1.
# The POSIX "=" operator is used because the script runs under /bin/sh.
if [ "$CHECK" = 1 ]; then
	# G.fst is read from $LM_DIR, which is where this script writes it.
	echo  "Checking how stochastic G is (the first of these numbers should be small):"
	fstisstochastic "$LM_DIR/G.fst"

	# NOTE(review): L.fst, L_disambig.fst, phones.txt and words.txt are read
	# from $LM_DATA and are not created by this script; confirm they exist
	# there (this script writes its own words.txt under $LM_DIR).
	echo "First few lines of lexicon FST:"
	fstprint --isymbols="$LM_DATA/phones.txt" --osymbols="$LM_DATA/words.txt" "$LM_DATA/L.fst"  | head

	echo Performing further checks
	# Checking that G.fst is determinizable.
	fstdeterminize "$LM_DIR/G.fst" /dev/null || echo Error determinizing G.


	# Checking that L_disambig.fst is determinizable.
	fstdeterminize "$LM_DATA/L_disambig.fst" /dev/null || echo Error determinizing L.

	# Checking that the composition of L_disambig and G can be determinized.
	fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DIR/G.fst" | \
	fstdeterminizestar >/dev/null || echo Error

	# Checking that LG is stochastic:
	fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DIR/G.fst" | \
	fstisstochastic || echo LG is not stochastic


fi # guard to avoid redoing this processing on every run #

exit "$final_status"