callhome_train_lms.sh
#!/bin/bash
# To be run from one level above this directory
# Generate the text for the LM training
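# Example invocation, run from one level above this directory (the script
# path and the split directory are illustrative; the split directory is
# expected to contain a file named "train" listing the .sph files of the
# training conversations):
#   local/callhome_train_lms.sh data/local/splits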
tmp_dir=data/local/tmp
train_all=data/local/data/train_all
if [ $# -lt 1 ]; then
  echo "Specify the location of the split files"
  exit 1;
fi
splitFile=$1
split=train
# Train only
if [ -d $tmp_dir/$split ]; then
  rm -r $tmp_dir/$split
fi
cp -r $train_all $tmp_dir/$split
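# Keep only the segments that belong to conversations listed in the split
# file.  The first awk pass reads $splitFile/$split (assumed to list one
# .sph file name per line in its first column); the second pass keeps the
# Kaldi segments lines ("<utt-id> <reco-id> <start> <end>") whose
# recording-id, with its trailing channel suffix trimmed and ".sph"
# appended, appears in that list.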
awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \
$splitFile/$split $train_all/segments > $tmp_dir/$split/segments
n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $tmp_dir/$split/segments | sort | uniq | wc -l`
echo "$n conversations left in split $split"
utils/fix_data_dir.sh $tmp_dir/$split
utils/fix_data_dir.sh $tmp_dir/$split
# There is no feature file yet, use --no-feats switch
utils/validate_data_dir.sh --no-feats $tmp_dir/$split
rm $tmp_dir/$split/*.tmp
# Now use this training text
text=$tmp_dir/train/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# The LM-training stage below assumes the data preparation and dictionary
# preparation steps have already been run.  It takes as input the files
#   data/local/tmp/train/text
#   data/local/dict/lexicon.txt
dir=`pwd`/data/local/lm
mkdir -p $dir
export LC_ALL=C # You'll get errors about things not being sorted if you
                # have a different locale.
export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
( # First make sure the kaldi_lm toolkit is installed.
  cd ../../../tools || exit 1;
  if [ -d kaldi_lm ]; then
    echo Not installing the kaldi_lm toolkit since it is already there.
  else
    echo Downloading and installing the kaldi_lm tools
    if [ ! -f kaldi_lm.tar.gz ]; then
      wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
    fi
    tar -xvzf kaldi_lm.tar.gz || exit 1;
    cd kaldi_lm
    make || exit 1;
    echo Done making the kaldi_lm tools
  fi
) || exit 1;
mkdir -p $dir
cleantext=$dir/text.no_oov
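# Map every token that is not in the lexicon to <unk>.  Note that the loop
# starts at field 1, so the utterance-id itself normally becomes <unk> too;
# the commands further down skip field 1 anyway.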
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<unk> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
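# For illustration, each line of unigram.counts has the form "<count> <word>"
# (the output of "sort | uniq -c"), sorted by descending count, e.g.
# "   1234 yes" (hypothetical count).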
# note: we probably won't really make use of <unk> as there aren't any OOVs
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<unk>" > $dir/word_map \
|| exit 1;
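# word_map has one line per word: the original word in column 1 and the
# token that replaces it in train.gz in column 2 (read back below via
# map[$1]=$2).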
# note: the 1st field of each line of $cleantext is ignored; it's the utterance-id.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
{ for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
|| exit 1;
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332
# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz
exit 0
echo "Baseline"
# What follows are some commands for building a baseline LM with SRILM
# (assuming you have it installed).
heldout_sent=158126 # Don't change this if you want results comparable with
                    # the kaldi_lm results above.
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
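# Strip the utterance-ids and split the text: the first $heldout_sent
# sentences go to the held-out set, the rest (from line $heldout_sent
# onwards) to the SRILM training set.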
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $sdir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $sdir/train
cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
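# Build an interpolated, modified Kneser-Ney 3-gram LM restricted to the
# wordlist (-limit-vocab -vocab), with OOVs mapped to <unk> (-unk -map-unk),
# then score the held-out set with it.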
ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
-map-unk "<unk>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258
# Note: the perplexity SRILM assigns to the Kaldi-LM model is similar to what
# kaldi_lm reports above.  The difference seen on WSJ must have been due to
# different treatment of <unk>.
ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614