fsp_prepare_dict.sh
4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env bash
# Copyright 2014 Gaurav Kumar. Apache 2.0
#
# Dictionary preparation: reads the training transcripts under
# data/local/data/train_all and writes the dictionary files to data/local/dict.
#
# Usage: fsp_prepare_dict.sh <lexicon>
#   <lexicon>  directory holding the pronunciation lexicon resources; the later
#              stages look for callhome_spanish_lexicon_970908 inside it.
#
# Must be launched from the recipe directory: every path below is anchored at
# the current working directory and ./path.sh is sourced from it.

. ./path.sh

# The lexicon directory is the only required argument.
if [ $# -lt 1 ]; then
  echo 'Usage fsp_prepare_dict.sh lexicon' >&2
  exit 1
fi

# Stages numbered >= $stage are executed; raise the value to skip early stages.
stage=0

dir="$(pwd)/data/local/dict"                  # final dictionary directory
datadir="$(pwd)/data/local/data/train_all"    # holds the transcript file "text"
local="$(pwd)/local"                          # recipe-specific helper scripts
utils="$(pwd)/utils"                          # shared utility scripts
tmpdir="$(pwd)/data/local/tmp"                # scratch space for intermediate files
lexicon=$1

# The first stage writes straight into $tmpdir, so make sure both output
# locations exist before anything else runs.
mkdir -p "$dir" "$tmpdir" || exit 1
# Stage 0: build the word list.
#   1. Extract the unique tokens of $datadir/text into $tmpdir/uniquewords.
#   2. Make sure es_wordlist.json is present in $tmpdir (download + unpack it
#      when it is missing).
#   3. Run merge_lexicons.py, keep the transcript-only list as
#      uniquewords.small and promote uniquewords64k to uniquewords.
#      (merge_lexicons.py is not shown here; it is expected to leave its
#      result in $tmpdir/uniquewords64k.)
if [ "$stage" -le 0 ]; then
  # Get all unique words, remove punctuation.
  # First expression: drop every run that starts with two digits up to the next
  # whitespace. Second expression: drop the listed punctuation characters.
  # Then one token per line, de-duplicate, and discard empty lines.
  sed -e 's:[0-9][0-9]\S*::g' -e 's:[\.,\?]::g' "$datadir/text" \
    | tr " " "\n" \
    | sort \
    | uniq \
    | awk '{if (NF > 0){ print; }}' > "$tmpdir/uniquewords" || exit 1

  if [ ! -f "${tmpdir}/es_wordlist.json" ]; then
    echo "Could not find the large collection of Spanish words es_wordlist.json"
    echo "Trying to download it via wget"

    if ! command -v wget >/dev/null 2>&1; then
      echo "This script requires you to first install wget"
      exit 1
    fi

    # Run the download in a subshell so the working directory of the script
    # itself never changes, whatever happens inside.
    (
      cd "$tmpdir" || exit 1
      wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz
      wget_status=$?
      # "wget -c" leaves a partial archive behind on failure, so the mere
      # existence of the file does not prove the download succeeded.
      if [ "$wget_status" -ne 0 ] || [ ! -e es_wordlist.json.tgz ]; then
        echo "Download of the large Spanish word list failed"
        exit 1
      fi
      tar -xovzf es_wordlist.json.tgz || exit 1
    ) || exit 1
  fi

  # Merge with gigaword corpus
  # Stop here when the merge fails: renaming the files anyway would throw away
  # the word list built above.
  "$local/merge_lexicons.py" "$tmpdir" "$lexicon" || exit 1
  mv "$tmpdir/uniquewords" "$tmpdir/uniquewords.small" || exit 1
  mv "$tmpdir/uniquewords64k" "$tmpdir/uniquewords" || exit 1
fi
#Then get the list of phones form basic_rules in the lexicon folder
if [ $stage -le 1 ]; then
if [ ! -d "$lexicon/callhome_spanish_lexicon_970908" ]; then
echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder"
exit 1;
fi
# This is a preliminary attempt to get the unique phones from the LDC lexicon
# This will be extended based on our lexicon later
perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir
fi
# Stage 2: get a pronunciation for each word using spron.pl together with the
# "preferences" and "basic_rules" files of the LDC lexicon folder.
#   Input : $tmpdir/uniquewords (fed to spron.pl on stdin)
#   Output: $tmpdir/lexicon_raw
if [ "$stage" -le 2 ]; then
  # Post-processing of the spron.pl output, line by line:
  #   cut  - keep only the first tab-separated field;
  #   sed  - a '#'-prefixed token followed by four more whitespace-separated
  #          tokens is replaced by the last of those tokens (per the original
  #          author: words for which no pronunciation was generated get an
  #          orthographic representation instead);
  #   awk  - keep only the text in front of the first "//".
  "$local/spron.pl" \
      "$lexicon/callhome_spanish_lexicon_970908/preferences" \
      "$lexicon/callhome_spanish_lexicon_970908/basic_rules" \
      < "$tmpdir/uniquewords" \
    | cut -f1 \
    | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \
    | awk -F '[/][/]' '{print $1}' \
    > "$tmpdir/lexicon_raw"
fi
#Break the pronunciation down according to the format required by Kaldi
if [ $stage -le 3 ]; then
# Creates a KALDI compatible lexicon, and extends the phone list
perl $local/isolate_phones.pl $tmpdir
cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1
mv $tmpdir/phones $tmpdir/phones.small
mv $tmpdir/phones_extended.1 $tmpdir/phones
sort $tmpdir/phones -o $tmpdir/phones
paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1
#paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1
fi
if [ $stage -le 4 ]; then
# silence phones, one per line.
for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt
# An extra question will be added by including the silence phones in one class.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \
$dir/extra_questions.txt || exit 1;
# Remove [] chars from phones
cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1
rm $tmpdir/phones
mv $tmpdir/phones.1 $tmpdir/phones
cp $tmpdir/phones $dir/nonsilence_phones.txt
if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi
cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2"
# Add prons for laughter, noise, oov
w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|')
perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2
for w in `grep -v sil $dir/silence_phones.txt`; do
echo "[$w] $w"
done | cat - $tmpdir/lexicon.2 > $tmpdir/lexicon.3 || exit 1;
cat $tmpdir/lexicon.3 \
<( echo "mm m"
echo "<unk> oov" ) > $tmpdir/lexicon.4
# From the lexicon remove _ from the phonetic representation
cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5
cp "$tmpdir/lexicon.5" $dir/lexicon.txt
cat $datadir/text | \
awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
sort -nr > $tmpdir/word_counts
awk '{print $1}' $dir/lexicon.txt | \
perl -e '($word_counts)=@ARGV;
open(W, "<$word_counts")||die "opening word-counts $word_counts";
while(<STDIN>) { chop; $seen{$_}=1; }
while(<W>) {
($c,$w) = split;
if (!defined $seen{$w}) { print; }
} ' $tmpdir/word_counts > $tmpdir/oov_counts.txt
echo "*Highest-count OOVs are:"
head -n 20 $tmpdir/oov_counts.txt
fi
# Final check of the generated dictionary directory. The validator's verdict
# becomes the exit status of this script instead of being discarded, so callers
# notice a broken dictionary right here.
"$utils/validate_dict_dir.pl" "$dir" || exit 1
exit 0