#!/bin/bash
# Copyright 2017 Hossein Hadian

# This script extends the word list by including OOVs from the training
# transcripts. Since no phonemes are involved, we need no G2P models/rules.
# In other words, this script is like wsj_extend_dict.sh, except that it
# deals with characters (i.e. graphemes) instead of phonemes, so it is much
# simpler. Parts of this script are taken from EESEN
# (https://github.com/srvk/eesen).
if [ $# -ne 3 ]; then
  echo "usage: $0 <wsj-corpus-dir> <dict-src-dir> <dict-larger-dir>"
  echo "e.g.: $0 WSJ/13-32.1/ data/local/lang_char data/local/lang_char_larger"
  exit 1;
fi
if [ "`basename $1`" != 13-32.1 ]; then
  echo "Expecting the first argument to this script to end in 13-32.1"
  exit 1
fi
corpusdir=$1
srcdir=$2
dir=$3

mincount=2 # Minimum training-text count for an OOV to be included in the
           # lexicon (with mincount=2, a word must appear at least twice).
mkdir -p $dir
cp $srcdir/lexicon.txt $dir/lexicon.ori.txt
cp $srcdir/nonsilence_phones.txt $dir
cp $srcdir/silence_phones.txt $dir
cp $srcdir/optional_silence.txt $dir

# The original word list (field 1 of each lexicon entry).
cat $dir/lexicon.ori.txt | awk '{print $1}' | sort | uniq > $dir/wordlist.ori
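# (each lexicon line is a word followed by its character sequence,
# e.g. "ABLE A B L E", which is why field 1 gives us the word list)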

# Get the training transcripts.
echo "Getting the training transcripts; this may take some time ..."
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
  # the fully cleaned data comes to about 73MB; if the file is already that
  # size, a previous run has completed and we can skip this step.
  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
  # strip markup lines (starting with "<"), uppercase everything, then use
  # the word list to decide which trailing "."s are sentence breaks:
  gunzip -c $corpusdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
    | awk '/^</{next}{print toupper($0)}' | perl -e '
    open(F, "<$ARGV[0]") || die;
    while(<F>) { chop; $isword{$_} = 1; }
    while(<STDIN>) {
      @A = split(" ", $_);
      for ($n = 0; $n < @A; $n++) {
        $a = $A[$n];
        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) {
          # nonwords that end in "." and have no other "." in them:
          # strip the "." and treat it as a sentence break.
          print "$a";
          if ($n+1 < @A) { print "\n"; }
        } else { print "$a "; }
      }
      print "\n";
    }' $dir/wordlist.ori | gzip -c > $dir/cleaned.gz
fi
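# For example (hypothetical): if "OCT." is absent from the word list, a line
# "... DUE BY OCT. IT ..." is cleaned to "... DUE BY OCT" plus a line break
# before "IT ...", i.e. the trailing "." is treated as a sentence boundary.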

# Get unigram counts and the counts of the OOV words.
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
  awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | \
  sort -nr > $dir/unigrams
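# (each line of $dir/unigrams is "<count> <word>", most frequent first)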
cat $dir/unigrams | awk -v dict=$dir/wordlist.ori \
  'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
  > $dir/oov.counts
echo "Most frequent unseen unigrams are: "
head $dir/oov.counts

# Select the OOVs whose counts are >= $mincount and include them in the lexicon.
cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' > $dir/oovlist
# Turn each selected OOV into a lexicon entry: the word followed by its
# space-separated characters.
cat $dir/oovlist | perl -e 'while(<>){ chop; $str="$_"; foreach $p (split("", $_)) {$str="$str $p"}; print "$str\n";}' > $dir/lexicon.oov.txt
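# e.g. an OOV like "NEWWORD" becomes the entry "NEWWORD N E W W O R D"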

# Filter out OOV words containing characters that are not among the
# non-silence characters in nonsilence_phones.txt.
cat $dir/lexicon.oov.txt | awk -v dict=$dir/nonsilence_phones.txt \
  'BEGIN{while(getline<dict) seen[$1]=1;} {for(i=2;i<=NF;i++) {if(!seen[$i]){break;}}; if (i==(NF+1)){print;}}' > $dir/lexicon.oov.filt.txt
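# e.g. an OOV containing a digit or punctuation mark that does not appear in
# nonsilence_phones.txt is dropped here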

# The final expanded lexicon.
cat $dir/lexicon.ori.txt $dir/lexicon.oov.filt.txt > $dir/lexicon.txt
echo "Number of OOVs we handled is `cat $dir/lexicon.oov.filt.txt | wc -l`"
echo "Created the larger lexicon $dir/lexicon.txt"
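
# Optional sanity check (not part of the original recipe): any duplicated
# entries in the expanded lexicon would show up here, e.g.:
#   sort $dir/lexicon.txt | uniq -d | head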