run.sh
2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash

# Note: this TIDIGITS setup has not been tuned at all and has some obvious
# deficiencies; it was created as a starting point for a tutorial.
# We're just using the "adults" data here, not the data from children.

# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.

# You'll want to change cmd.sh to something that will work on your system;
# this relates to the queue.  Later steps pass "$train_cmd" and "$decode_cmd",
# which are not set in this file (presumably cmd.sh provides them), so stop
# right away if it cannot be sourced.
. ./cmd.sh || exit 1

# Root of the TIDIGITS corpus; edit this for your site.
tidigits=/export/corpora5/LDC/LDC93S10
#tidigits=/mnt/matylda2/data/TIDIGITS

# The following command prepares the data/{train,dev,test} directories.
# The corpus path is quoted so that a location containing spaces still works.
local/tidigits_data_prep.sh "$tidigits" || exit 1
local/tidigits_prepare_lang.sh || exit 1

# Note: validate_lang.pl actually does report errors and exits with status 1
# here, but we've checked them and seen that they don't matter (this setup
# doesn't have any disambiguation symbols, and the script doesn't like that).
# The "|| true" makes it explicit that the failure is deliberately ignored.
utils/validate_lang.pl data/lang/ || true
# Feature extraction: MFCCs plus CMVN statistics for each data set.
# mfccdir should point at some place with a largish disk, since that is
# where the MFCC features are stored.
mfccdir=mfcc

for dataset in test train; do
  # Abort the whole run as soon as either step fails for this data set.
  if ! steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \
      "data/$dataset" "exp/make_mfcc/$dataset" "$mfccdir"; then
    exit 1
  fi
  if ! steps/compute_cmvn_stats.sh \
      "data/$dataset" "exp/make_mfcc/$dataset" "$mfccdir"; then
    exit 1
  fi
done
# Monophone system, trained on a subset of the training data (data/train_1k,
# created by asking subset_data_dir.sh for 1000 items from data/train).
# A failure here is fatal: everything below needs data/train_1k and exp/mono0a.
utils/subset_data_dir.sh data/train 1000 data/train_1k || exit 1

# You could try adding --boost-silence 1.25 to some of the scripts below
# (also 1.5, if that helps); the effect may not be clear until we test the
# triphone system.  See the wsj setup for examples (../../wsj/s5/run.sh).
steps/train_mono.sh --nj 4 --cmd "$train_cmd" \
  data/train_1k data/lang exp/mono0a || exit 1

# Build the decoding graph and, only if that succeeded, decode the test set.
# A decoding failure is not treated as fatal, because the later training
# steps do not consume the decode output.
utils/mkgraph.sh data/lang exp/mono0a exp/mono0a/graph && \
  steps/decode.sh --nj 10 --cmd "$decode_cmd" \
    exp/mono0a/graph data/test exp/mono0a/decode
# Align the full training set with the monophone model; the alignments in
# exp/mono0a_ali are the input to the next training step, so stop on failure.
steps/align_si.sh --nj 4 --cmd "$train_cmd" \
  data/train data/lang exp/mono0a exp/mono0a_ali || exit 1

# Train the tri1 system from those alignments.  300 and 3000 are size
# settings passed to train_deltas.sh (presumably the tree-leaf and Gaussian
# budgets; check the script's usage message to confirm).
steps/train_deltas.sh --cmd "$train_cmd" \
  300 3000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1

# Build the decoding graph and decode the test set only if the graph was
# built, mirroring how the mono0a graph/decode pair is chained.
utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph && \
  steps/decode.sh --nj 10 --cmd "$decode_cmd" \
    exp/tri1/graph data/test exp/tri1/decode
# Example of looking at the output.
# utils/int2sym.pl -f 2- data/lang/words.txt exp/tri1/decode/scoring/19.tra | sed "s/ $//" | sort | diff - data/test/text
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep SER $x/wer_* | utils/best_wer.sh; done
#exp/mono0a/decode/wer_17:%SER 3.67 [ 319 / 8700 ]
#exp/tri1/decode/wer_19:%SER 2.64 [ 230 / 8700 ]