egs/sprakbanken/s5/local/sprak_train_irstlm.sh

  #!/bin/bash
  
  # Copyright 2013-2014  Mirsk Digital Aps (Author: Andreas Kirkedal)
  # Apache 2.0
  
  # This script extends the dictionary in $srcdict with espeak pronunciations
  # for the words in $newtext, trains an N-gram LM on the combined text with
  # IRSTLM, and creates the corresponding data/lang_$lm_suffix and
  # data/lang_test_* directories.
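  #
  # Usage (illustrative; the argument order follows the assignments below):
  #   local/sprak_train_irstlm.sh <src-dict-dir> <new-text> <lm-suffix> <order> <lm-dir>
  # e.g.:
  #   local/sprak_train_irstlm.sh data/local/dict data/local/extra_text ext 3 data/local/ext_lm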
  
  . ./path.sh || exit 1;
  if [ -z "$IRSTLM" ] ; then
    export IRSTLM=$KALDI_ROOT/tools/irstlm/
  fi
  export PATH=${PATH}:$IRSTLM/bin
  if ! command -v ngt >/dev/null 2>&1 ; then
    echo "$0: Error: the IRSTLM is not available or compiled" >&2
    echo "$0: Error: We used to install it by default, but." >&2
    echo "$0: Error: this is no longer the case." >&2
    echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
    echo "$0: Error: and run extras/install_irstlm.sh" >&2
    exit 1
  fi
  
  srcdict=$1    # source dictionary dir, e.g. data/local/dict
  newtext=$2    # extra text used to extend the lexicon and the LM training data
  lm_suffix=$3  # suffix identifying the extended setup
  N=$4          # order of the n-gram LM to train
  lmdir=$5      # output dir for the LM training data and models
  extdict=${srcdict}_$lm_suffix
  lang_tmp=data/local/lang_tmp
  extlang=data/lang_$lm_suffix
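  # With e.g. srcdict=data/local/dict and lm_suffix=ext (illustrative values),
  # extdict becomes data/local/dict_ext and extlang becomes data/lang_ext.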
  
  mkdir -p $lmdir
  
  
  if [ ! -d $extdict ]; then
  
    echo "Creating $extdict based on $srcdict"
  
    # Extend the $srcdict to include the new data
    mkdir -p $extdict
    for f in extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt; do
      cp $srcdict/$f $extdict/
    done
  
    mv $extdict/lexicon.txt $extdict/oldlexicon.txt
  fi
  
  
  if [ ! -f $extdict/transcripts.uniq ]; then
    # Create the text data for LMs and RNNs
    cat $srcdict/transcripts.txt $newtext > $extdict/transcripts.txt
    sort -u $extdict/transcripts.txt > $extdict/transcripts.uniq
  fi
  
  
  # Check that espeak is available on the system. espeak is needed to extend
  # the setup because the original transcriptions were created with espeak
  # and then filtered.

  if ! command -v espeak >/dev/null 2>&1; then
    echo "espeak is not available on your system. Please install espeak before proceeding."
    exit 1;
  fi
  
  
  
  if [ ! -f $extdict/lexicon.txt ]; then
    # Extend lexicon with pronunciations from espeak
    echo "Transcribing $newtext using espeak"

    tr '[:blank:]' '\n' < $newtext | grep -P -v '^[\s?|\.|\!]*$' | sort -u > $extdict/wlist.txt
  
    # Piped so only a number is stored in the variable
    nwords=$(wc -l < $extdict/wlist.txt)
    # +1 keeps nsplit > 0 when the wordlist is short
    nsplit=$((nwords / 8 + 1))

    # Split the wordlist and transcribe the chunks with 8 parallel espeak jobs
    split -l $nsplit $extdict/wlist.txt $extdict/Wtemp_
    for w in $extdict/Wtemp_*; do
      (espeak -q -vda -x < $w > $w.pho) &
    done
    wait
  
    cat $extdict/Wtemp_*.pho > $extdict/plist.txt
    rm -f $extdict/Wtemp_*
  
  
    # Filter transcription
    # Remove diacritics and language annotations ((da), (en), (fr) etc.),
    # insert a space between symbols, remove initial and trailing spaces and
    # collapse 2 or more spaces to one space

    cat $extdict/plist.txt | perl -pe 's/\([a-z]{2}\)//g' | perl -pe 's/(.)/\1 /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $extdict/plist2.txt
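    # Illustrative effect of the pipeline above on one line of hypothetical
    # espeak output (the actual symbols vary with espeak version and voice):
    #   "h'aI (en)"  ->  "h aI"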
  
    # Some question marks are not caught above
    perl -pe 's/ \? / /g' $extdict/plist2.txt > $extdict/plist3.txt

    # Pair each word with its espeak transcription
    paste $extdict/wlist.txt $extdict/plist3.txt > $extdict/lexicon1.txt

    # Remove entries without a transcription
    grep -P "^.+\t.+$" $extdict/lexicon1.txt > $extdict/newlexicon.txt
  
    echo "Combining lexicons"
    # Combine lexicons
    cat $extdict/oldlexicon.txt $extdict/newlexicon.txt > $extdict/templex
    sort -u $extdict/templex > $extdict/lexicon.txt
  fi
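
  # The combined lexicon.txt uses the usual Kaldi lexicon format, one entry
  # per line: <word><TAB><phone1> <phone2> ... e.g. (illustrative) "hej<TAB>h aI"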
  
  
  if [ ! -d $extlang ]; then
    # Create new lang_ext dir
    utils/prepare_lang.sh $extdict "<UNK>" $lang_tmp $extlang || exit 1;
  fi
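
  # (The second argument to utils/prepare_lang.sh, "<UNK>" here, is the
  # lexicon word that out-of-vocabulary words are mapped to.)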
  
  if [ ! -f $lmdir/extra4.ngt ]; then
    echo "Preparing LM data"
  
    # Keep only non-empty lines with at least 4 fields
    grep -P -v '^[\s?|\.|\!]*$' $newtext | \
    awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
  
    # Envelop LM training data in context cues
    add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
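    # (add-start-end.sh ships with IRSTLM; it wraps each line in sentence
    # boundary symbols, e.g. "god morgen" -> "<s> god morgen </s>".)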
  
  
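    # ngt (IRSTLM) collects n-gram statistics up to order 4 (-n=4);
    # -b=yes writes the table in binary form so tlm can load it quickly.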
      echo "Creating new binary ngram table $lmdir/extra4.ngt"
      ngt -i=$lmdir/lm_input -n=4 -o=$lmdir/extra4.ngt -b=yes
  fi
  
  echo "Training ARPA model extra$lm_suffix"
  
  # Randomly chose n=4 as upper bound for the ngram table
  tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix
  
  # Next, create the corresponding FST
  # and the corresponding lang_test_* directory.
  test=data/lang_test_${N}${lm_suffix}
  mkdir -p $test
  
  
  cp -r $extlang/* $test
  
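  # Convert the ARPA LM to G.fst; #0 is the disambiguation symbol that stands
  # in for the backoff epsilon on the input side of the grammar FST.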
  cat $lmdir/extra${N}$lm_suffix | \
    arpa2fst --disambig-symbol=#0 \
             --read-symbol-table=$test/words.txt - $test/G.fst
  
  utils/validate_lang.pl $test || exit 1;
  
  exit 0;