Blame view

Scripts/steps/tandem/train_mono.sh 5.8 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
  #!/bin/bash
  # Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
  #                 Korbinian Riedhammer
  # Apache 2.0
  
  
  # To be run from ..
  # Flat start and monophone training, with delta-delta features.
  # This script applies cepstral mean normalization (per speaker).
  
  # Begin configuration section.
  # Defaults only; the usage message advertises these as --<name> options.

  # Number of parallel jobs.
  nj=4
  # Job launcher (run.pl, or queue.pl plus its queue options).
  cmd=run.pl
  # Scaling options handed to gmm-align-compiled during re-alignment.
  scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
  # Number of iterations of training.
  num_iters=40
  # Last iteration on which the number of Gaussians is increased.
  max_iter_inc=30
  # Target number of Gaussians.
  totgauss=1000
  # Factor by which to boost silence likelihoods in alignment.
  boost_silence=1.0
  # Iterations on which the data is re-aligned.
  realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38"
  # Name of config file (empty by default).
  config=
  # Lowest-numbered stage to run; steps guarded by a smaller number are skipped.
  stage=-4
  # Exponent to determine number of Gaussians from occurrence counts.
  power=0.2
  # Apply CMVN to the second feature stream?  Typically, the tandem features
  # will already be normalized due to PCA.
  normft2=true
  # End configuration section.
  
  # Log the invocation: script name followed by every argument.
  echo "$0 $@"

  # Source the local environment file when there is one, then hand the command
  # line to parse_options.sh (presumably it consumes the --option arguments and
  # leaves only the positional ones in "$@" -- confirm against that script).
  [ ! -f path.sh ] || . ./path.sh
  . parse_options.sh || exit 1

  # Exactly four positional arguments must remain; otherwise print help and stop.
  if [ $# -ne 4 ]; then
    printf '%s\n' \
      "Usage: steps/tandem/train_mono.sh [options] <data1-dir> <data2-dir> <lang-dir> <exp-dir>" \
      " e.g.: steps/tandem/train_mono.sh {mfcc,bottleneck}/data/train.1k data/lang exp/mono" \
      "main options (for others, see top of script file)" \
      "  --config <config-file>                           # config containing options" \
      "  --nj <nj>                                        # number of parallel jobs" \
      "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
      "  --normft2 (true|false)                           # apply CMVN to second features?"
    exit 1
  fi
  
  # Positional arguments: one data directory per feature stream, the lang
  # directory, and the experiment (output) directory.
  data1=$1
  data2=$2
  lang=$3
  dir=$4

  # Symbol that out-of-vocabulary words are mapped to; it is passed to
  # sym2int.pl --map-oov when the training graphs are compiled.
  oov_sym=$(cat "$lang/oov.int") || exit 1

  # Create the output and log directories up front and record the job count.
  # Both are checked: every later step writes below $dir.
  mkdir -p "$dir/log" || exit 1
  echo "$nj" > "$dir/num_jobs" || exit 1
  
  
  # Set up features.

  # Per-job split directories.  The data is (re)split unless the split
  # directory exists and feats.scp is older than it.
  sdata1=$data1/split$nj
  sdata2=$data2/split$nj
  [[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh "$data1" "$nj" || exit 1
  [[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh "$data2" "$nj" || exit 1

  # Use deltas on the first stream (most likely this will be MFCCs or alike),
  # after per-speaker mean normalization.
  feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"

  # Second stream will most likely be bottleneck or posteriors, so normalize
  # if desired
  feats2="scp:$sdata2/JOB/feats.scp"
  if [ "$normft2" == "true" ]; then
    feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
  fi

  # paste features: both streams are combined by paste-feats into one pipeline.
  # JOB stays a placeholder inside the string.
  feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
  # The same pipeline restricted to job 1.  $feats must be expanded here;
  # inside single quotes the result would be the literal text '$feats'.
  # (Not referenced elsewhere in the visible script.)
  example_feats=${feats//JOB/1}

  # get dimension
  # Replacing JOB by ".." turns <data>/split<nj>/JOB/<file> into
  # <data>/split<nj>/../<file>, i.e. the files of the unsplit data directory.
  allfeats=${feats//JOB/..}
  # The dimension is a required argument of gmm-init-mono, so stop here with a
  # pointer to the log instead of passing an empty value along.
  if ! feat_dim=$(feat-to-dim --print-args=false "$allfeats" - 2> "$dir/log/feat_dim") \
      || [ -z "$feat_dim" ]; then
    echo "$0: error getting feature dimension, see $dir/log/feat_dim" >&2
    exit 1
  fi

  # save stats: record the feature pipeline and the normalization setting in
  # the experiment directory.
  echo "$feats" > "$dir/tandem"
  echo "$normft2" > "$dir/normft2"
  
  echo "$0: Initializing monophone system."

  # sets.int is handed to gmm-init-mono via --shared-phones; say what is
  # missing instead of exiting without a message.
  if [ ! -f "$lang/phones/sets.int" ]; then
    echo "$0: expected file $lang/phones/sets.int to exist" >&2
    exit 1
  fi
  shared_phones_opt="--shared-phones=$lang/phones/sets.int"

  if [ "$stage" -le -3 ]; then
    # $allfeats is the feature pipeline with JOB replaced by "..", so it reads
    # the unsplit data directories: the initial model is built from the whole
    # set rather than one job's share.  JOB=1 is presumably only there to
    # satisfy $cmd's calling convention.
    # NOTE(review): the original note here was cut off mid-sentence; presumably
    # the whole set is used so that every phone is observed -- confirm.
    $cmd JOB=1 $dir/log/init.log \
      gmm-init-mono $shared_phones_opt "--train-feats=$allfeats" $lang/topo $feat_dim \
      $dir/0.mdl $dir/tree || exit 1;
  fi

  # Start from the Gaussian count reported for the initial model and grow by a
  # fixed step per iteration towards $totgauss over $max_iter_inc iterations.
  numgauss=$(gmm-info --print-args=false "$dir/0.mdl" | awk '/gaussians/ {print $NF}')
  if [ -z "$numgauss" ]; then
    echo "$0: could not read the number of Gaussians from $dir/0.mdl" >&2
    exit 1
  fi
  incgauss=$(( (totgauss - numgauss) / max_iter_inc )) # per-iter increment for #Gauss
  
  # Stage -2: compile the training graphs, one invocation per job (JOB=1:$nj),
  # each logging to compile_graphs.JOB.log.  The transcripts come from the
  # first data directory's split ($sdata1/JOB/text); sym2int.pl maps them using
  # $lang/words.txt, with --map-oov $oov_sym for words not in that table.
  # The result is written gzip-compressed to $dir/fsts.JOB.gz.
  # JOB is presumably replaced with the job index by $cmd -- confirm against
  # run.pl/queue.pl.
  if [ $stage -le -2 ]; then
    echo "$0: Compiling training graphs"
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
      compile-train-graphs $dir/tree $dir/0.mdl  $lang/L.fst \
      "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt < $sdata1/JOB/text|" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
  fi
  
  # Stage -1 ("pass 0"): align-equal-compiled reads the compiled graphs and the
  # features and writes alignments as a text archive to stdout; these are fed
  # directly into gmm-acc-stats-ali, which accumulates statistics against 0.mdl
  # into $dir/0.JOB.acc.  No alignment file is kept from this pass.
  # The pipe is written as \| so it reaches $cmd as an argument instead of
  # being interpreted by this shell (presumably $cmd then runs the two programs
  # as a pipeline inside each job -- confirm against run.pl/queue.pl).
  if [ $stage -le -1 ]; then
    echo "$0: Aligning data equally (pass 0)"
    $cmd JOB=1:$nj $dir/log/align.0.JOB.log \
      align-equal-compiled "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" ark,t:-  \| \
      gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \
      $dir/0.JOB.acc || exit 1;
  fi
  
  # The --min-gaussian-occupancy=3 option in the update below is important:
  # without it we fail to estimate "rare" phones and later on, they never
  # align properly.
  
  # Stage 0: sum the per-job pass-0 accumulators and estimate 1.mdl from 0.mdl.
  # Unlike the later updates, this one runs directly in this shell rather than
  # through $cmd, with stderr redirected to update.0.log.  The accumulators are
  # deleted once the new model has been written.
  if [ $stage -le 0 ]; then
    gmm-est --min-gaussian-occupancy=3  --mix-up=$numgauss --power=$power \
      $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl 2> $dir/log/update.0.log || exit 1;
    rm $dir/0.*.acc
  fi
  
  
  # Alignment beam: 6 on the first pass, 10 from the second pass onwards.
  # note: using slightly wider beams for WSJ vs. RM.
  beam=6
  # Passes 1 .. num_iters-1; pass x reads $x.mdl and writes the next model.
  # x is left at num_iters when the loop ends and is used after the loop.
  for (( x = 1; x < num_iters; x++ )); do
    echo "$0: Pass $x"
    next_iter=$(( x + 1 ))
    # Passes below $stage are skipped (resume support); the Gaussian schedule
    # and the beam are still advanced at the bottom of the loop.
    if [ "$stage" -le "$x" ]; then
      # Re-align only on the iterations listed in $realign_iters.
      if echo $realign_iters | grep -w "$x" >/dev/null; then
        echo "$0: Aligning data"
        retry_beam=$(( beam * 4 ))
        # Model is piped through gmm-boost-silence with the optional-silence phones.
        mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
        $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
          gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
          "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \
          || exit 1
      fi

      # Accumulate per-job statistics from the current alignments.
      $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
        gmm-acc-stats-ali $dir/$x.mdl "$feats" "ark:gunzip -c $dir/ali.JOB.gz|" \
        $dir/$x.JOB.acc || exit 1

      # Sum the accumulators and re-estimate, mixing up to the current target.
      $cmd $dir/log/update.$x.log \
        gmm-est --write-occs=$dir/$next_iter.occs --mix-up=$numgauss --power=$power $dir/$x.mdl \
        "gmm-sum-accs - $dir/$x.*.acc|" $dir/$next_iter.mdl || exit 1

      # Drop this pass's inputs; errors are silenced because not every file
      # exists on every pass (no 1.occs is written by the stage-0 update).
      rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
    fi
    # Raise the Gaussian target while still within the mix-up phase.
    if [ "$x" -le "$max_iter_inc" ]; then
      numgauss=$(( $numgauss + $incgauss ))
    fi
    beam=10
  done
  
  # Point final.mdl / final.occs at the outputs of the last pass ($x equals
  # num_iters once the loop above has finished).  This runs in a subshell so the
  # script's own working directory is untouched.  The cd is checked: without the
  # check, a failed cd would make rm and ln operate on the current directory.
  (
    cd "$dir" || exit 1
    rm -f final.mdl final.occs
    ln -s "$x.mdl" final.mdl && ln -s "$x.occs" final.occs
  ) || { echo "$0: failed to create final.mdl/final.occs links in $dir" >&2; exit 1; }

  # Summarize the warnings found in the log directory.
  utils/summarize_warnings.pl $dir/log

  echo "Done training tandem mono-phone system in $dir"
  
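  # example of showing the alignments (intermediate models are removed during
  # training, and per-job alignment files are numbered from 1):
  # show-alignments data/lang/phones.txt $dir/final.mdl "ark:gunzip -c $dir/ali.1.gz|" | head -4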