03_compile_LM_LIA.sh
3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/bin/sh
#================================"
# Prepares the language-model side of an experiment directory.
#
# Usage: <this script> EXPE_DIR
#   EXPE_DIR  root directory of the experiment; the LANGUAGE_MODEL and
#             ACOUSTIC_MODEL sub-directories are created under it.
EXPE_DIR=$1
# Refuse to run without an experiment directory: with an empty value every
# path derived below would collapse to /LANGUAGE_MODEL, /ACOUSTIC_MODEL, ...
if [ -z "$EXPE_DIR" ]; then
  echo "Usage: $0 EXPE_DIR" >&2
  exit 1
fi
# Environment settings shared by the LIA Kaldi scripts
# (presumably puts the Kaldi/OpenFst tools on PATH -- confirm in path.sh).
. ../LIA_kaldiUtils/path.sh
#echo $PATH
# CHECK gates the sanity-check section at the bottom of the script (1 = run).
CHECK=0
# silprob and AC_DATA are not referenced in this part of the script.
silprob=0.5
LM_SOURCE=../LM_DATABASE
LM_DIR="$EXPE_DIR/LANGUAGE_MODEL"
AC_DIR="$EXPE_DIR/ACOUSTIC_MODEL"
AC_DATA="$EXPE_DIR/ac_Data/"
LM_DATA="$EXPE_DIR/ling_Data/"
# Output directories; stop immediately if they cannot be created, since every
# later step writes into them.
mkdir -p "$AC_DIR" "$LM_DIR" || exit 1
#--------------------------------#
# Inputs: ARPA language model and pronunciation lexicon.
# Alternative ARPA model located under $LM_SOURCE (disabled; the absolute
# path assigned below is the one in use):
#arpa_lm=$LM_SOURCE/ML_3g.arpa
arpa_lm="/local_disk/hera2/PERCOL/bigot/KALDI/LM_DATABASE/4G_DATA/ARPA/corpus_oral.DICO.dic.sri.n4.sort.arpa"
# The lexicon is read directly from $LM_SOURCE; the conversion from a
# separate SPEERAL lexicon (commented out below) is disabled.
#SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
#KALDI_LEXICON=$LM_DIR/lexicon.txt
KALDI_LEXICON="${LM_SOURCE}/LEXIQUE_V3.fmt"
#--------------------------------#
# Disabled: phone-set description files.
#NON_SILENCE_PHONEMES=$LM_DATA/nonsilence_phones.txt
#SILENCE_PHONEMES=$LM_DATA/silence_phones.txt
#OPTIONAL_SILENCES=$LM_DATA/optional_silence.txt
#EXTRA_QUESTIONS=$LM_DATA/extra_questions.txt
#---------------------------------#
# Disabled: derivation of the Kaldi lexicon from the SPEERAL lexicon.
#echo 'creating kaldi lexicon' $KALDI_LEXICON from $SPEERAL_LEXICON
#head $KALDI_LEXICON $SPEERAL_LEXICON
#awk 'BEGIN{getline}($0 !~ /^#/) { print}' $SPEERAL_LEXICON | sort | awk '($0 !~ /^[:space:]*$/) {print}' > $KALDI_LEXICON || exit 1;
# Add to the lexicon the silences, noises etc.
#echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
#sort -k1 -u $KALDI_LEXICON -o $KALDI_LEXICON
#sed -i 1i"<UNK> SPN" $KALDI_LEXICON
# Build the word symbol table from the lexicon.
#   lex.txt   : "<eps>", then the first space-separated field of every lexicon
#               line (adjacent duplicates collapsed by uniq), then "#0".
#   words.txt : one "<symbol> <id>" pair per line, ids counted from 0 in
#               lex.txt order.
if [ ! -r "$KALDI_LEXICON" ]; then
  echo "Cannot read lexicon $KALDI_LEXICON" >&2
  exit 1
fi
echo "cut -f1 -d" " $KALDI_LEXICON | uniq > $LM_DIR/lex.txt"
# Write the three parts in one go instead of patching the file afterwards with
# `sed -i 1i`, which is GNU-specific and inserts nothing into an empty file.
{
  echo '<eps>'
  cut -f1 -d' ' "$KALDI_LEXICON" | uniq
  echo '#0'
} > "$LM_DIR/lex.txt" || exit 1
# Number every whitespace-separated token. awk is used rather than
# `for w in $(cat ...)` so that entries containing glob characters (*, ?, [)
# are never expanded against the files of the current directory.
awk 'BEGIN { id = 0 } { for (i = 1; i <= NF; i++) print $i, id++ }' \
  "$LM_DIR/lex.txt" > "$LM_DIR/words.txt" || exit 1
#cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | grep -v SIL > $NON_SILENCE_PHONEMES || exit 1;
#(echo SIL; echo SPN; echo NSN; echo LAU ) > $SILENCE_PHONEMES
#echo SIL > $OPTIONAL_SILENCES
# No "extra questions" in the input to this setup, as we don't have stress or tone.
#echo -n > $EXTRA_QUESTIONS
#[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
#SEGMENT=$AC_DATA/segments
#TEXT=$LM_DATA/text
#================================"
# Compile the ARPA language model into the grammar FST $LM_DIR/G.fst.
# Fail early when the model is missing instead of producing an empty G.fst.
if [ ! -f "$arpa_lm" ]; then
  echo "No such file $arpa_lm" >&2
  exit 1
fi
echo "$arpa_lm"
# The output of find_arpa_oovs.pl (presumably the ARPA words that are missing
# from words.txt -- confirm in the script) is saved and handed to
# remove_oovs.pl in the pipeline below.
#zcat $arpa_lm | find_arpa_oovs.pl $LM_DIR/words.txt > $LM_DIR/oovsML.txt
find_arpa_oovs.pl "$LM_DIR/words.txt" < "$arpa_lm" > "$LM_DIR/oovsML.txt" || exit 1
#echo $LM_DATA/oovsML.txt
#echo $arpa_lm
# For a gzip-compressed model, feed the pipeline with `gunzip -c "$arpa_lm"`
# instead of the input redirection.
# Lines containing the sentence-boundary pairs "<s> <s>", "</s> <s>" or
# "</s> </s>" are dropped before the model is converted.
grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' < "$arpa_lm" \
  | arpa2fst - \
  | fstprint \
  | remove_oovs.pl "$LM_DIR/oovsML.txt" \
  | eps2disambig.pl \
  | s2eps.pl \
  | fstcompile --isymbols="$LM_DIR/words.txt" --osymbols="$LM_DIR/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon > "$LM_DIR/G.fst"
# Stop here with the status of the pipeline above (its last stage).
# NOTE(review): this unconditional exit makes everything that follows
# unreachable -- presumably deliberate; confirm before removing.
exit
# Optional sanity checks on the compiled FSTs; they run only when CHECK is 1.
# NOTE(review): an unconditional `exit` precedes this section, so it is
# currently unreachable -- presumably disabled on purpose; confirm.
# G.fst is read from $LM_DIR, which is where this script writes it; the
# lexicon FSTs and their symbol tables are still looked up under $LM_DATA.
# `=` is used for the comparison because `==` is not valid in a POSIX sh test.
if [ "$CHECK" = 1 ]; then
  echo "Checking how stochastic G is (the first of these numbers should be small):"
  fstisstochastic "$LM_DIR/G.fst"
  echo "First few lines of lexicon FST:"
  fstprint --isymbols="$LM_DATA/phones.txt" --osymbols="$LM_DATA/words.txt" "$LM_DATA/L.fst" | head
  echo Performing further checks
  # Checking that G.fst is determinizable.
  fstdeterminize "$LM_DIR/G.fst" /dev/null || echo Error determinizing G.
  # Checking that L_disambig.fst is determinizable.
  fstdeterminize "$LM_DATA/L_disambig.fst" /dev/null || echo Error determinizing L.
  # Checking that the composition of L_disambig and G can be determinized.
  fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DIR/G.fst" | \
    fstdeterminizestar >/dev/null || echo Error
  # Checking that LG is stochastic:
  fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DIR/G.fst" | \
    fstisstochastic || echo LG is not stochastic
fi # to avoid redoing the processing