convert_htk.sh
4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/bin/bash
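# Usage: ./convert_htk.sh [--linear-topology] [--no-cleanup] <rootdir> <mmf> <trees> <outdir>
# E.g. of usage: ./convert_htk.sh ../.. /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm84/MMF /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm10_800_500/cluster.trees convert_dir
# This script takes as input the HTK MMF file and the trees file and creates kaldi.tree
# and kaldi.mdl as output in <outdir> (kaldi.topo, phones.txt and statesyms.txt are left
# there as well).
#   --linear-topology  pass the extracted transitions through make_linear.pl first
#   --no-cleanup       keep the intermediate files in <outdir>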
# Defaults for the two optional leading flags.
cleanup=yes      # --no-cleanup sets this to "no"
make_linear=no   # --linear-topology sets this to "yes"

# The flags may come in either order. Each of the three passes consumes at
# most one --linear-topology followed by at most one --no-cleanup.
for pass in 1 2 3; do
  case "$1" in
    --linear-topology)
      make_linear=yes
      shift
      ;;
  esac
  case "$1" in
    --no-cleanup)
      cleanup=no
      shift
      ;;
  esac
done

# Whatever remains are the four positional arguments.
rootdir=$1 # e.g. /homes/eva/q/qpovey/UBM-ASR/branches/clean/
mmf=$2     # e.g. /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm84/MMF
trees=$3   # e.g. /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm10_800_500/cluster.trees
outdir=$4  # place to put tmp stuff + kaldi models.
# The script changes directory further down, so the three input paths are
# made absolute here, while relative paths still resolve.

# Print "$1" with its directory part replaced by the absolute path reported
# by cd + pwd. If that directory cannot be entered, the directory part is
# printed as given.
abs_path_of() {
  local parent leaf resolved
  parent=$(dirname "$1")
  leaf=$(basename "$1")
  resolved=$(cd "$parent" 2>/dev/null && pwd) || resolved=$parent
  printf '%s/%s\n' "$resolved" "$leaf"
}

rootdir=$(abs_path_of "$rootdir")
trees=$(abs_path_of "$trees")
mmf=$(abs_path_of "$mmf")

# Make the conversion scripts and the gmmbin programs under rootdir callable
# by bare name.
PATH="${PATH}:${rootdir}/misc/htk_conversion/:${rootdir}/src/gmmbin/"
export PATH
# Create the output directory and work inside it from here on. Every later
# step writes to (and the final cleanup deletes *.txt from) the current
# directory, so refuse to continue unless we really are inside outdir.
if [ -z "$outdir" ]; then
  echo "Usage: $0 [--linear-topology] [--no-cleanup] <rootdir> <mmf> <trees> <outdir>" >&2
  exit 1
fi
mkdir -p -- "$outdir" || exit 1
cd -- "$outdir" || exit 1

# Pull the transition information out of the MMF.
mmf2trans.pl < "$mmf" > trans1.txt

# trans2.txt is what the rest of the script consumes: either the transitions
# as extracted, or (with --linear-topology) the output of make_linear.pl.
if [ "$make_linear" == "yes" ]; then
  make_linear.pl < trans1.txt > trans2.txt
else
  cp trans1.txt trans2.txt
fi
# Phones known to the model: second field of every TRANSITION line, sorted.
grep TRANSITION trans2.txt | awk '{print $2}' | sort > phonelist.txt

# Phones mentioned in the tree questions: for each comma-separated item of a
# QS line, take X from a "X-* pattern or from a "*+X" pattern.
# Work out phones that appear in questions but not data (will remove them.)
grep QS "$trees" \
  | perl -ane ' foreach $a ( split(",", $_) ) { $a =~ m:\"(.+)\-\*: && print "$1\n"; $a =~ m:\"\*\+(.+)\": && print "$1\n"; } ' \
  | sort | uniq > phonelist_tree.txt

# Both lists were sorted by the same sort(1), so comm(1) can compare them.
# A phone that the model has but no question mentions is fatal.
model_only=$(comm -23 phonelist.txt phonelist_tree.txt)
if [ -n "$model_only" ]; then
  printf '%s\n' "$model_only" >&2
  echo "Phones seen in model but not tree: error." >&2
  exit 1;
fi

# bad_phones.txt is phones that appear in the tree file but not the model file.
# Will just delete them from the tree
comm -13 phonelist.txt phonelist_tree.txt > bad_phones.txt
# Get initial tree.
parse_trees.pl "$trees" > trees1.txt

# Add stump "trees" for the context-independent phones. This assumes they are macros-- otherwise, we have a problem.
get_ci_phone_trees.pl "$mmf" | cat trees1.txt - > trees2.txt

# Remove "bad" phones (unseen phones) from the trees.
# The phones listed in bad_phones.txt become the perl script's arguments, one
# argument per whitespace-separated word, so the command substitution is left
# unquoted on purpose.
# shellcheck disable=SC2046
perl -e 'foreach $p (@ARGV) { $is_bad{$p} = 1; }; while(<STDIN>) { foreach $a(split(" ",$_))
{ if(!$is_bad{$a}) { print "$a "; } } print "\n"; } ' $(cat bad_phones.txt) < trees2.txt > trees3.txt

# Add eps to every question that has "sil" in it (should produce better end-of-file behaviour.)
# have to treat the first symbol on each line specially (would be the phone whose tree it is).
awk '{for(n=1;n<=NF;n++) { if(n>1 && $n == "sil") { printf("sil <eps> "); } else
{ printf("%s ", $n); } } printf("\n"); }' < trees3.txt > trees4.txt
# Extract the GMMs for each state.
get_hmm_states.pl "$mmf" > states1.txt
# State names: second field of every STATE line, sorted.
grep STATE states1.txt | awk '{print $2}' | sort > statelist.txt

# Make symbol-tables for the phones and states.
# Each table maps a name to its zero-based line number; <eps> is put first in
# the phone table and therefore gets id 0.
{ echo '<eps>'; cat phonelist.txt; } | awk '{ printf("%s %d\n", $1, NR-1); }' > phones.txt
awk '{ printf("%s %d\n", $1, NR-1); }' < statelist.txt > statesyms.txt

# Replace phone names by their integer ids in the transitions.
integerize.pl phones.txt < trans2.txt > trans3.txt

# work out map from phone-id to number of emitting states for that phone.
# (third field of the TRANSITION line minus 2 -- presumably the two
# non-emitting entry/exit states; confirm against mmf2trans.pl.)
grep TRANSITION trans3.txt | awk '{print $2 " " $3-2; }' > phone2len.txt

trans2topo.pl < trans3.txt > kaldi.topo
integerize.pl statesyms.txt < states1.txt > states2.txt
integerize.pl phones.txt statesyms.txt < trees4.txt > trees5.txt
# Sort the integers in the questions (the things between square brackets),
# the Kaldi code requires this.
perl -ane ' chop; @A = (split("\\[",$_)); print shift @A; foreach $x (@A)
{ ($y,$z)=split("\\]",$x);
$y = join(" ", sort{$a <=> $b}(split(" ", $y))); print " [ $y ] $z "; } print "\n"; ' < trees5.txt > trees6.txt

tree_convert.pl phone2len.txt trees6.txt > kaldi.tree

# Feature dimension: second field of the first line of the MMF that contains
# the word MEAN.
dim=$(grep -w MEAN "$mmf" | head -n 1 | awk '{print $2}') # probably 39.
if [ -z "$dim" ]; then
  echo "$0: could not read the feature dimension from a MEAN line in $mmf" >&2
  exit 1
fi

convert_states.pl "$dim" < states2.txt > kaldi.am_gmm

# Stop here on failure so that the intermediate files are kept.
gmm-init-trans --binary=false kaldi.topo kaldi.am_gmm kaldi.tree kaldi.mdl || exit 1;
# clean up:
# Unless --no-cleanup was given, delete the intermediate *.txt files and
# kaldi.am_gmm from the current directory. The two symbol tables, phones.txt
# and statesyms.txt, are skipped in place, so no temporary file names are
# needed and a failed move can never cause them to be deleted.
if [ "$cleanup" == "yes" ]; then
  for txt_file in *.txt; do
    case "$txt_file" in
      phones.txt|statesyms.txt) ;;   # keep the symbol tables
      *) rm -f -- "$txt_file" ;;
    esac
  done
  rm -f -- kaldi.am_gmm
fi
# now, kaldi.tree and kaldi.mdl are a Kaldi-format model; kaldi.topo may
# also be useful for some purposes.