Blame view

Scripts/02_lexicon_LIA.sh 2.05 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
  #!/bin/sh
  #
  # Builds the dictionary files of an experiment from the LEXIQUE_V3 lexicon
  # and then runs prepare_lang.sh on them.
  #
  # Usage: 02_lexicon_LIA.sh EXPE_DIR
  #   EXPE_DIR  experiment directory; ling_Data/ and TEMP/ are created under it.
  #
  # The relative paths used below (../LIA_kaldiUtils, ../LM_DATABASE) are
  # resolved against the current working directory, not the script location.
  
  #. 00_init_paths.sh 
  echo "==> $0"
  
  # Refuse to run without an experiment directory: with an empty EXPE_DIR
  # every output path derived from it would be rooted at "/".
  if [ -z "${1:-}" ]; then
    echo "Usage: $0 EXPE_DIR" >&2
    exit 1
  fi
  EXPE_DIR=$1
  
  # NOTE(review): path.sh presumably puts prepare_lang.sh on PATH -- confirm.
  . ../LIA_kaldiUtils/path.sh
  
  # ================================== #
  #LM_SOURCE=../LM_DATABASE
  #LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
  #SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
  #TEXT=$LM_DATA/text
  #CLEAN_TEXT=$LM_DIR/text.no_oov
  # ================================== #
  
  # Output locations, all below the experiment directory. The trailing
  # slashes are kept because these values are echoed and passed on verbatim;
  # the double slashes they produce in derived paths are harmless.
  LM_DATA=$EXPE_DIR/ling_Data/
  DICT=$LM_DATA/dict/
  
  # Nothing below can work without these directories, so stop on failure.
  mkdir -p "$LM_DATA" "$DICT" "$EXPE_DIR/TEMP" || exit 1
  
  # Files generated by this script.
  NON_SILENCE_PHONEMES=$DICT/nonsilence_phones.txt
  SILENCE_PHONEMES=$DICT/silence_phones.txt
  OPTIONAL_SILENCES=$DICT/optional_silence.txt
  EXTRA_QUESTIONS=$DICT/extra_questions.txt
  OOV=$LM_DATA/oov.txt
  
  # Input lexicon. LEXICON is only referenced by a commented-out command in
  # the visible part of this script; LEXICON_PHON is the file actually used.
  LEXICON=../LM_DATABASE/LEXIQUE_V3.lst
  LEXICON_PHON=../LM_DATABASE/LEXIQUE_V3.fmt
  # Print the lexicon path for the log and abort early if it is missing.
  ls "$LEXICON_PHON" || exit 1
  
  #====================================# 
  
  # Keep a copy of the raw lexicon next to the generated files.
  cp "$LEXICON_PHON" "$DICT" || exit 1
  
  # Build the sorted lexicon1.txt from the copied lexicon:
  #   NR > 1   skips the first line unconditionally
  #            (presumably a header line -- TODO confirm)
  #   !/^#/    drops comment lines
  #   NF > 0   drops empty and whitespace-only lines
  # NF is used for the blank-line test instead of a regex: a bare [:space:]
  # is a bracket expression over the characters ':', 's', 'p', 'a', 'c', 'e',
  # not the whitespace class, and [[:space:]] is not supported by every awk.
  awk 'NR > 1 && !/^#/ && NF > 0' "$DICT/LEXIQUE_V3.fmt" |
    sort > "$DICT/lexicon1.txt" || exit 1;
  
  
  #cat $LEXICON | sed 's/(.)//' > $DICT/lexicon.txt
  # Log the number of lexicon entries that were kept.
  wc -l "$DICT/lexicon1.txt"
  
  
  # Collect the distinct phone symbols of the lexicon: every field after the
  # first one on each line is recorded once. Output order is whatever order
  # awk iterates the array in. Lines containing "SIL" are then filtered out.
  # grep exits non-zero when it prints nothing, so an empty phone set also
  # aborts the script here.
  awk '{ for (n = 2; n <= NF; n++) phones[$n] = 1 }
       END { for (p in phones) print p }' "$DICT/lexicon1.txt" |
    grep -v SIL > "$NON_SILENCE_PHONEMES" || exit 1;
  
  
  #cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' |  grep -v SIL > $NON_SILENCE_PHONEMES  || exit 1;
  #head $NON_SILENCE_PHONEMES
  
  # Silence/noise phone list, one symbol per line.
  printf '%s\n' SIL SPN NSN LAU > "$SILENCE_PHONEMES" || exit 1
  # SIL is the only optional silence.
  printf '%s\n' SIL > "$OPTIONAL_SILENCES" || exit 1
  # Symbol written to the OOV file; it is also the word given to
  # prepare_lang.sh at the end of the script.
  printf '%s\n' '<UNK>' > "$OOV" || exit 1
  # No "extra questions" in the input to this setup, as we don't have stress or tone.
  # touch leaves the content of an already existing file untouched.
  touch "$EXTRA_QUESTIONS" || exit 1
  
  # Add to the lexicon the silences, noises etc.
  # Only the <UNK> entry (pronounced as SPN) is prepended to the filtered
  # lexicon; the entries for the other symbols are currently disabled:
  #echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';       
  {
    echo '<UNK> SPN'
    cat "$DICT/lexicon1.txt"
  } > "$DICT/lexicon2.txt" || exit 1;
  
  # Publish the result as lexicon.txt; lexicon2.txt is kept alongside it.
  cp "$DICT/lexicon2.txt" "$DICT/lexicon.txt" || exit 1
  
  #utils/prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
  #prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
  # Log the command, then run it. Arguments, in order: dictionary directory,
  # OOV word, temporary directory, output directory.
  # NOTE(review): argument meaning assumed from the Kaldi prepare_lang.sh
  # convention -- confirm against the prepare_lang.sh found on PATH.
  # As the last command, its exit status becomes the status of this script.
  echo "====> prepare_lang.sh $DICT <UNK> $EXPE_DIR/TEMP $LM_DATA"
  prepare_lang.sh "$DICT" "<UNK>" "$EXPE_DIR/TEMP" "$LM_DATA"