Blame view

Scripts/03_compile_LM_LIA.sh 3.51 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
  #!/bin/sh
  #
  # Compile an ARPA language model into a grammar FST (G.fst) for one experiment.
  #
  # Usage: <this script> EXPE_DIR
  #   EXPE_DIR  experiment root; the LANGUAGE_MODEL/ and ACOUSTIC_MODEL/
  #             sub-directories are created beneath it.
  #
  # The script must be started from a directory in which the relative paths
  # ../LIA_kaldiUtils/path.sh and ../LM_DATABASE resolve.

  #================================"

  EXPE_DIR=$1
  # Without an argument every derived path below would start at "/"
  # (e.g. mkdir -p /ACOUSTIC_MODEL), so refuse to continue.
  if [ -z "$EXPE_DIR" ]; then
  	echo "Usage: $0 EXPE_DIR" >&2
  	exit 1
  fi

  # NOTE(review): path.sh is not visible from here; presumably it puts the
  # FST/LM command-line tools used later on PATH -- confirm.
  . ../LIA_kaldiUtils/path.sh || {
  	echo "Cannot source ../LIA_kaldiUtils/path.sh" >&2
  	exit 1
  }

  #echo $PATH
  # Flag tested by the optional sanity-check section at the end of the script.
  CHECK=0

  # Not referenced anywhere else in this script; kept for sourced/sibling code.
  silprob=0.5
  LM_SOURCE=../LM_DATABASE
  LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
  AC_DIR=$EXPE_DIR/ACOUSTIC_MODEL
  AC_DATA=$EXPE_DIR/ac_Data/
  LM_DATA=$EXPE_DIR/ling_Data/
  mkdir -p "$AC_DIR" "$LM_DIR" || exit 1

  #--------------------------------#
  # Alternative model under LM_SOURCE. It used to be assigned here and was
  # immediately overwritten by the absolute path below, so it never took effect.
  #arpa_lm=$LM_SOURCE/ML_3g.arpa
  arpa_lm=/local_disk/hera2/PERCOL/bigot/KALDI/LM_DATABASE/4G_DATA/ARPA/corpus_oral.DICO.dic.sri.n4.sort.arpa
  #SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
  #KALDI_LEXICON=$LM_DIR/lexicon.txt 
  # Lexicon whose first space-separated column provides the vocabulary.
  KALDI_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
  #--------------------------------#
  #NON_SILENCE_PHONEMES=$LM_DATA/nonsilence_phones.txt
  #SILENCE_PHONEMES=$LM_DATA/silence_phones.txt   
  #OPTIONAL_SILENCES=$LM_DATA/optional_silence.txt 
  #EXTRA_QUESTIONS=$LM_DATA/extra_questions.txt
  #---------------------------------#
  
  #echo 'creating kaldi lexicon' $KALDI_LEXICON from $SPEERAL_LEXICON
  #head $KALDI_LEXICON $SPEERAL_LEXICON
  #awk 'BEGIN{getline}($0 !~ /^#/) { print}' $SPEERAL_LEXICON | sort | awk '($0 !~ /^[:space:]*$/) {print}'  > $KALDI_LEXICON || exit 1;
  
  # Add to the lexicon the silences, noises etc. 
  #echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
  #sort -k1 -u  $KALDI_LEXICON -o $KALDI_LEXICON  
  #sed -i 1i"<UNK> SPN" $KALDI_LEXICON
  
  # Build the word symbol table.
  #   lex.txt   : "<eps>", then the first space-separated field of every
  #               lexicon line (adjacent duplicates collapsed), then "#0".
  #   words.txt : one "<symbol> <id>" pair per line, ids counting up from 0
  #               in lex.txt order.
  if [ ! -r "$KALDI_LEXICON" ]; then
  	echo "Cannot read lexicon $KALDI_LEXICON" >&2
  	exit 1
  fi

  # Log the extraction command with its quoting intact.
  printf '%s\n' "cut -f1 -d\" \" $KALDI_LEXICON | uniq > $LM_DIR/lex.txt"

  # A single grouped redirect replaces the GNU-only `sed -i 1i`, which also
  # inserted nothing when the extracted word list was empty.
  {
  	echo "<eps>"
  	cut -f1 -d" " "$KALDI_LEXICON" | uniq
  	echo "#0"
  } > "$LM_DIR/lex.txt" || exit 1

  # Number every whitespace-separated token. awk keeps the tokenisation of the
  # former `for line in $(cat ...)` loop but does not glob-expand entries such
  # as "*" or "?" and does not reinterpret entries such as "-n" or backslashes
  # the way `echo` can. LC_ALL=C keeps the field splitting byte-wise.
  LC_ALL=C awk 'BEGIN { id = 0 } { for (i = 1; i <= NF; i++) print $i, id++ }' \
  	"$LM_DIR/lex.txt" > "$LM_DIR/words.txt" || exit 1
  
  
  #cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' |  grep -v SIL > $NON_SILENCE_PHONEMES  || exit 1; 
  #(echo SIL; echo SPN; echo NSN; echo LAU ) > $SILENCE_PHONEMES
  #echo SIL > $OPTIONAL_SILENCES 
  
  # No "extra questions" in the input to this setup, as we don't have stress or tone.
  #echo -n > $EXTRA_QUESTIONS 
  #[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
  #SEGMENT=$AC_DATA/segments
  #TEXT=$LM_DATA/text
  #================================"
  # Compile the ARPA language model into $LM_DIR/G.fst.
  #   1. list the ARPA words that are absent from words.txt (oovsML.txt);
  #   2. drop ARPA lines containing "<s> <s>", "</s> <s>" or "</s> </s>",
  #      convert to an FST, strip the out-of-vocabulary entries, run the
  #      eps2disambig.pl / s2eps.pl symbol rewrites, recompile against
  #      words.txt and remove epsilons.

  # Fail early instead of silently producing a G.fst from missing input.
  if [ ! -r "$arpa_lm" ]; then
  	echo "No such file $arpa_lm" >&2
  	exit 1
  fi

  # Plain sh has no pipefail, so a missing tool in the middle of the pipeline
  # would otherwise go unnoticed; check all of them up front.
  for tool in find_arpa_oovs.pl arpa2fst fstprint remove_oovs.pl \
  	eps2disambig.pl s2eps.pl fstcompile fstrmepsilon; do
  	if ! command -v "$tool" >/dev/null 2>&1; then
  		echo "Required tool not found on PATH: $tool" >&2
  		exit 1
  	fi
  done

  echo "$arpa_lm"
  #zcat $arpa_lm  | find_arpa_oovs.pl $LM_DIR/words.txt  > $LM_DIR/oovsML.txt
  find_arpa_oovs.pl "$LM_DIR/words.txt" < "$arpa_lm" > "$LM_DIR/oovsML.txt" || exit 1
  #echo $LM_DATA/oovsML.txt
  #echo $arpa_lm
  #gunzip -c "$arpa_lm" | \

  grep -v '<s> <s>' < "$arpa_lm" | \
  	grep -v '</s> <s>' | \
  	grep -v '</s> </s>' | \
  	arpa2fst - | \
  	fstprint | \
  	remove_oovs.pl "$LM_DIR/oovsML.txt" | \
  	eps2disambig.pl | \
  	s2eps.pl | \
  	fstcompile --isymbols="$LM_DIR/words.txt" --osymbols="$LM_DIR/words.txt" --keep_isymbols=false --keep_osymbols=false | \
  	fstrmepsilon > "$LM_DIR/G.fst" || {
  	echo "Failed to build $LM_DIR/G.fst" >&2
  	exit 1
  }
  
  # Remember the status of the command that ran just before this section so the
  # script can still finish with it, as the former bare `exit` did.
  build_status=$?

  # The checks below are optional. Previously an unconditional `exit` made them
  # unreachable whatever CHECK was set to; now they are skipped only when CHECK
  # is not 1. The comparison uses `=` because `==` is not valid in POSIX `[`.
  if [ "$CHECK" != 1 ]; then
  	exit "$build_status"
  fi

  # NOTE(review): this script writes G.fst and words.txt under $LM_DIR, whereas
  # the checks read G.fst, L.fst, L_disambig.fst, phones.txt and words.txt from
  # $LM_DATA -- confirm that another step places those files there.

  echo  "Checking how stochastic G is (the first of these numbers should be small):"
  fstisstochastic "$LM_DATA/G.fst"

  echo "First few lines of lexicon FST:"
  fstprint --isymbols="$LM_DATA/phones.txt" --osymbols="$LM_DATA/words.txt" "$LM_DATA/L.fst" | head

  echo Performing further checks
  # Checking that G.fst is determinizable.
  fstdeterminize "$LM_DATA/G.fst" /dev/null || echo Error determinizing G.

  # Checking that L_disambig.fst is determinizable.
  fstdeterminize "$LM_DATA/L_disambig.fst" /dev/null || echo Error determinizing L.

  # Checking that the composition of L_disambig and G is determinizable.
  fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DATA/G.fst" | \
  	fstdeterminizestar >/dev/null || echo Error

  # Checking that LG is stochastic:
  fsttablecompose "$LM_DATA/L_disambig.fst" "$LM_DATA/G.fst" | \
  	fstisstochastic || echo LG is not stochastic

  # End of the optional checks (they are skippable to avoid redoing the processing).