#!/bin/bash
# Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
# Apache 2.0
# This script extends a source dictionary with espeak pronunciations for the
# words in a new text corpus, trains an IRSTLM n-gram language model on that
# text, and creates the corresponding data/lang_$lm_suffix and
# data/lang_test_* directories.
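# Example invocation (the corpus path and suffix here are hypothetical):
#   local/sprak_train_irstlm.sh data/local/dict data/local/extra_text.txt \
#     ext 3 data/local/extra_lm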
. ./path.sh || exit 1;
if [ -z "$IRSTLM" ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v ngt >/dev/null 2>&1 ; then
echo "$0: Error: IRSTLM is not available or not compiled." >&2
echo "$0: Error: We used to install it by default, but this is" >&2
echo "$0: Error: no longer the case. To install it, go to" >&2
echo "$0: Error: $KALDI_ROOT/tools and run extras/install_irstlm.sh." >&2
exit 1
fi
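# Minimal sanity check on the argument count (an added guard); the usage
# string mirrors the positional parameters read below.
if [ $# -ne 5 ]; then
echo "Usage: $0 <src-dict> <new-text> <lm-suffix> <ngram-order> <lm-dir>" >&2
exit 1
fi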
srcdict=$1
newtext=$2
lm_suffix=$3
N=$4
lmdir=$5
extdict=${srcdict}_$lm_suffix
lang_tmp=data/local/lang_tmp
extlang=data/lang_$lm_suffix
mkdir -p $lmdir
if [ ! -d $extdict ];
then
echo "Creating $extdict based on $srcdict"
# Extend the $srcdict to include the new data
mkdir -p $extdict
for f in extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt; do
cp $srcdict/$f $extdict/
done
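# Set aside the existing lexicon: its absence triggers the espeak extension
# below, and the old entries are merged back into the new lexicon.txt.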
mv $extdict/lexicon.txt $extdict/oldlexicon.txt
fi
if [ ! -f $extdict/transcripts.uniq ];
then
# Create the text data for LMs and RNNs
cat $srcdict/transcripts.txt $newtext > $extdict/transcripts.txt
sort -u $extdict/transcripts.txt > $extdict/transcripts.uniq
fi
# Check that espeak is available on the system. espeak is required to extend
# the setup because the original pronunciations were created with espeak and
# then filtered.
if ! command -v espeak >/dev/null 2>&1; then
echo "$0: espeak is not available on your system. Install espeak before proceeding." >&2
exit 1;
fi
if [ ! -f $extdict/lexicon.txt ];
then
# Extend lexicon with pronunciations from espeak
echo "Transcribing $newtext using espeak"
cat $newtext | tr '[:blank:]' '\n' | grep -P -v '^[\s?.!]*$' | sort -u > $extdict/wlist.txt
# Piped so only a number is stored in the variable
nwords=$(wc -l < $extdict/wlist.txt)
nsplit=$((nwords / 8))
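# Guard against a zero chunk size on very small word lists, since
# "split -l 0" would fail (a small added safeguard).
[ $nsplit -lt 1 ] && nsplit=1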
# Split the word list into chunks and run espeak on each chunk in parallel
split -l $nsplit $extdict/wlist.txt $extdict/Wtemp_
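# espeak flags: -q suppresses audio output, -vda selects the Danish voice,
# -x prints phoneme mnemonics to stdout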
for w in $extdict/Wtemp_*; do
(cat $w | espeak -q -vda -x > $w.pho ) &
done
wait
cat $extdict/Wtemp_*.pho > $extdict/plist.txt
rm -f $extdict/Wtemp_*
# Filter transcription
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# initial and trailing spaces and collapse 2 or more spaces to one space
cat $extdict/plist.txt | perl -pe 's/\([a-z]{2}\)//g' | perl -pe 's/(.)/\1 /g' | \
  perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | \
  perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | \
  perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | \
  perl -pe 's/^ +| +$//g' > $extdict/plist2.txt
# Some question marks are not caught above
perl -pe 's/ \? / /g' $extdict/plist2.txt > $extdict/plist3.txt
# Pair each word with its phonetic transcription
paste $extdict/wlist.txt $extdict/plist3.txt > $extdict/lexicon1.txt
# Remove entries without a transcription
grep -P "^.+\t.+$" $extdict/lexicon1.txt > $extdict/newlexicon.txt
echo "Combining lexicons"
# Combine lexicons
cat $extdict/oldlexicon.txt $extdict/newlexicon.txt > $extdict/templex
sort -u $extdict/templex > $extdict/lexicon.txt
fi
if [ ! -d $extlang ];
then
# Create new lang_ext dir
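# (utils/prepare_lang.sh builds L.fst, words.txt, phones/ etc. from the
# extended dictionary; "<UNK>" is the OOV symbol)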
utils/prepare_lang.sh $extdict "<UNK>" $lang_tmp $extlang || exit 1;
fi
if [ ! -f $lmdir/extra4.ngt ];
then
echo "Preparing LM data"
# Keep only non-empty lines with at least 4 tokens
grep -P -v '^[\s?.!]*$' $newtext | \
awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
# Envelop LM training data in context cues
add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
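# add-start-end.sh (shipped with IRSTLM) wraps each line in sentence-boundary
# markers, e.g. a line like "det er godt" becomes "<s> det er godt </s>"
# (illustrative example sentence)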
echo "Creating new binary ngram table $lmdir/extra4.ngt"
ngt -i=$lmdir/lm_input -n=4 -o=$lmdir/extra4.ngt -b=yes
fi
echo "Training ARPA model extra${N}$lm_suffix"
# n=4 above was an arbitrary upper bound for the ngram table, so the
# requested order $N must not exceed 4
tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix
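# (-lm=wb selects Witten-Bell smoothing in IRSTLM's tlm; the output is an
# ARPA-format LM of order $N)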
# Next, create the corresponding FST
# and the corresponding lang_test_* directory.
test=data/lang_test_${N}${lm_suffix}
mkdir -p $test
cp -r $extlang/* $test/
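# arpa2fst compiles the ARPA-format LM into G.fst; #0 is the disambiguation
# symbol used on backoff arcs, and words.txt maps words to integer IDs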
cat $lmdir/extra${N}$lm_suffix | \
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
utils/validate_lang.pl $test || exit 1;
exit 0;