Blame view

egs/wsj/s5/steps/nnet3/align_lats.sh 6.17 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
  #!/bin/bash
  # Copyright 2012  Brno University of Technology (Author: Karel Vesely)
  #           2013  Johns Hopkins University (Author: Daniel Povey)
  #           2015  Vijayaditya Peddinti
  #           2016  Vimal Manohar
  #           2017  Pegah Ghahremani
  # Apache 2.0

  # Computes training alignments using an nnet3 DNN model, with the output
  # written as lattices (lat.*.gz); optionally also produces per-frame
  # alignments (ali.*.gz) derived from those lattices.

  # Begin configuration section.
  nj=4          # number of parallel jobs
  cmd=run.pl    # command used to dispatch jobs (run.pl, queue.pl, ...)
  stage=-1      # lets you resume a partially-completed run
  # Begin configuration.
  scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"  # graph-compilation scales
  acoustic_scale=0.1   # scale applied to acoustic log-likelihoods in decoding
  beam=20              # decoding beam; also reused below as the lattice beam
  iter=final           # which model to use: $srcdir/$iter.mdl
  frames_per_chunk=50  # number of frames per nnet3 computation chunk
  extra_left_context=0          # extra left context, e.g. for recurrent nets
  extra_right_context=0         # extra right context, e.g. for recurrent nets
  extra_left_context_initial=-1   # -1 means: use the model's default
  extra_right_context_final=-1    # -1 means: use the model's default
  online_ivector_dir=  # if set, directory containing online iVectors
  graphs_scp=          # if set, scp file of pre-compiled training-graph FSTs
  generate_ali_from_lats=false # If true, alignments are generated from lattices.
  # End configuration options.

  echo "$0 $@"  # Print the command line for logging

  [ -f path.sh ] && . ./path.sh # source the path.
  . parse_options.sh || exit 1;
  
  # Exactly four positional arguments are required; otherwise show usage and quit.
  if [[ $# -ne 4 ]]; then
    echo "Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>"
    echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
    echo "main options (for others, see top of script file)"
    echo "  --config <config-file>                           # config containing options"
    echo "  --nj <nj>                                        # number of parallel jobs"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    exit 1
  fi
  
  data=$1     # data directory (feats.scp, text, utt2spk, ...)
  lang=$2     # lang directory (L.fst, words.txt, oov.int, phones.txt, ...)
  srcdir=$3   # directory containing the trained nnet3 model
  dir=$4      # output directory for lattices / alignments

  # Integer id of the OOV word, used when mapping transcripts to integers.
  # (Modern $(...) substitution instead of legacy backticks.)
  oov=$(cat $lang/oov.int) || exit 1;
  mkdir -p $dir/log
  echo $nj > $dir/num_jobs
  sdata=$data/split${nj}
  # Re-split the data only if the split dir is missing or older than feats.scp.
  [[ -d $sdata && $data/feats.scp -ot $sdata ]] || \
     split_data.sh $data $nj || exit 1;

  extra_files=
  if [ ! -z "$online_ivector_dir" ]; then
    # The iVector extractor must match the one the model was trained with.
    steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
    extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
  fi

  # NB: $extra_files is deliberately left unquoted so that it word-splits
  # into zero or more filenames.
  for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done

  cp $srcdir/{tree,${iter}.mdl} $dir || exit 1;

  # The phone sets of the lang dir and the model dir must agree.
  utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
  cp $lang/phones.txt $dir || exit 1;
  ## Set up features.  Note: these are different from the normal features
  ## because we have one rspecifier that has the features for the entire
  ## training set, not separate ones for each batch.
  echo "$0: feature type is raw"

  # Per-model CMVN options; a missing cmvn_opts file just means "no options",
  # so errors are intentionally suppressed.  Use $(...) over legacy backticks.
  cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
  cp $srcdir/cmvn_opts $dir 2>/dev/null

  # Kaldi rspecifier applying per-speaker CMVN to the split features;
  # JOB is substituted with the job index by $cmd at runtime.
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

  ivector_opts=
  if [ ! -z "$online_ivector_dir" ]; then
    ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
    ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
  fi
  
  echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"

  frame_subsampling_opt=
  if [ -f $srcdir/frame_subsampling_factor ]; then
    # e.g. for 'chain' systems
    frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
    frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
    cp $srcdir/frame_subsampling_factor $dir
    if [[ $frame_subsampling_factor -gt 1 ]]; then
      # Assume a chain system; check argument sanity.  Chain models must be
      # aligned with transition-scale, self-loop-scale and acoustic-scale all
      # equal to 1.0, otherwise the resulting lattices are mis-scaled.
      if [[ ! ($scale_opts == *--self-loop-scale=1.0* &&
               $scale_opts == *--transition-scale=1.0* &&
               $acoustic_scale = '1.0') ]]; then
        echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
        echo "... You should pass the following options to this script:"
        echo "  --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
             "--acoustic_scale 1.0"
        # Bug fix: previously the script printed this ERROR but then kept
        # running with the wrong scales; abort instead.
        exit 1;
      fi
    fi
  fi
  
  # Choose the graph-compilation input: either integer transcripts derived
  # from the text, or pre-compiled per-utterance graph FSTs given via
  # --graphs-scp.
  if [ -z "$graphs_scp" ]; then
    # No pre-compiled graphs: map words to integers (OOVs -> $oov) and
    # compile training graphs from the transcripts.
    tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
    prog=compile-train-graphs
  else
    if [ ! -f $graphs_scp ]; then
      echo "Could not find graphs $graphs_scp" && exit 1
    fi
    # Use the supplied graph FSTs, filtered down to this job's utterances.
    tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |"
    prog=compile-train-graphs-fsts
  fi
  
  # Stage 0: compile one training-graph FST per utterance (fsts.JOB.gz).
  if [ $stage -le 0 ]; then
    ## because nnet3-latgen-faster doesn't support adding the transition-probs to the
    ## graph itself, we need to bake them into the compiled graphs.  This means we can't reuse previously compiled graphs,
    ## because the other scripts write them without transition probs.
    # $prog is compile-train-graphs or compile-train-graphs-fsts, and "$tra"
    # the matching rspecifier, both chosen above depending on --graphs-scp.
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
      $prog --read-disambig-syms=$lang/phones/disambig.int \
      $scale_opts \
      $dir/tree $srcdir/${iter}.mdl  $lang/L.fst "$tra" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1
  fi
  
  # Stage 1: decode the compiled training graphs with the nnet3 model,
  # producing one lattice archive per job (lat.JOB.gz).
  if [ $stage -le 1 ]; then
    # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more
    # alignment errors (however, it does have a default min-active=200 so this
    # will tend to reduce alignment errors).
    # --allow_partial=false makes sure we reach the end of the decoding graph.
    # --word-determinize=false makes sure we retain the alternative pronunciations of
    #   words (including alternatives regarding optional silences).
    #  --lattice-beam=$beam keeps all the alternatives that were within the beam,
    #    it means we do no pruning of the lattice (lattices from a training transcription
    #    will be small anyway).
    $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
      nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \
      --frames-per-chunk=$frames_per_chunk \
      --extra-left-context=$extra_left_context \
      --extra-right-context=$extra_right_context \
      --extra-left-context-initial=$extra_left_context_initial \
      --extra-right-context-final=$extra_right_context_final \
      --beam=$beam --lattice-beam=$beam \
      --allow-partial=false --word-determinize=false \
      $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \
      "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
  fi
  
  # Stage 2 (optional): derive conventional alignments (ali.JOB.gz) from the
  # lattices by taking the best path; the transcript output is discarded.
  if [ $stage -le 2 ] && $generate_ali_from_lats; then
    # If --generate-ali-from-lats is true, ali.*.gz is written into this dir.
    $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
      lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
      ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  echo "$0: done generating lattices from training transcripts."