Blame view

egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh 4.61 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
  #!/bin/bash

  # Copyright 2016 Pegah Ghahremani

  # Dumps bottleneck (BNF) features from a model trained with nnet3.
  # CAUTION!  This script isn't very suitable for dumping features from recurrent
  # architectures such as LSTMs, because it doesn't support setting the chunk size
  # and left and right context.  (Those would have to be passed into nnet3-compute).
  # See also chain/get_phone_post.sh.

  # Begin configuration section.
  stage=1          # allows restarting part-way through
  nj=4             # number of parallel feature-dumping jobs
  cmd=queue.pl     # job dispatcher (run.pl | queue.pl ...)
  use_gpu=false    # if true, nnet3-compute runs on GPU
  ivector_dir=     # if set, online i-vectors from this dir are fed to the nnet
  compress=true    # compress dumped archives via copy-feats
  # End configuration options.

  echo "$0 $@"  # Print the command line for logging

  if [ -f path.sh ]; then . ./path.sh; fi # source the path.
  . parse_options.sh || exit 1;

  # Accept between 4 and 6 positional arguments, otherwise print usage and die.
  if [ $# -lt 4 ] || [ $# -gt 6 ]; then
    echo "usage: steps/nnet3/make_bottleneck_features.sh <bnf-node-name> <input-data-dir> <bnf-data-dir> <nnet-dir> [<log-dir> [<bnfdir>] ]"
    echo "e.g.:  steps/nnet3/make_bottleneck_features.sh tdnn_bn.renorm data/train data/train_bnf exp/nnet3/tdnn_bnf exp_bnf/dump_bnf bnf"
    echo "Note: <log-dir> defaults to <bnf-data-dir>/log and <bnfdir> defaults to"
    echo " <bnf-data-dir>/data"
    echo "main options (for others, see top of script file)"
    echo "  --config <config-file>                           # config containing options"
    echo "  --nj <nj>                                        # number of parallel jobs"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --ivector-dir                                    # directory for ivectors"
    exit 1;
  fi
  bnf_name=$1 # the component-node name in nnet3 model used for bottleneck feature extraction
  data=$2     # input data dir (must contain feats.scp)
  bnf_data=$3 # output data dir for the BNF features
  nnetdir=$4  # dir holding final.raw (or final.mdl) and cmvn_opts
  # Optional 5th/6th args; "${N-default}" (not ":-") so an explicitly empty
  # argument is kept, matching the original "$# -gt N" logic exactly.
  logdir=${5-$bnf_data/log}
  bnfdir=${6-$bnf_data/data}

  # Fail early if cmvn_opts is missing or unreadable.  Previously a failed
  # `cat` was silently ignored, leaving cmvn_opts empty and causing a
  # confusing apply-cmvn failure much later in the per-job pipeline.
  cmvn_opts=$(cat $nnetdir/cmvn_opts) || exit 1;

  # Prefer the raw nnet (final.raw); fall back to final.mdl if only that exists.
  bnf_nnet=$nnetdir/final.raw
  if [ ! -f $bnf_nnet ]; then
    if [ -f $nnetdir/final.mdl ]; then
      bnf_nnet=$nnetdir/final.mdl
    else
      echo "$0: No such file $bnf_nnet or $nnetdir/final.mdl";
      exit 1;
    fi
  fi
  
  # Decide the GPU flags for nnet3-compute and (when on GPU) for the queue.
  if ! $use_gpu; then
    echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
    compute_gpu_opt="--use-gpu=no"
  else
    compute_gpu_opt="--use-gpu=yes"
    compute_queue_opt="--gpu 1"
    # Despite the wording "WARNING", requesting a GPU without a CUDA-enabled
    # build is treated as a fatal error here.
    if ! cuda-compiled; then
      echo "$0: WARNING: you are running with one thread but you have not compiled"
      echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
      echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
      exit 1
    fi
  fi
  
  
  ## Set up input features of nnet
  name=`basename $data`
  sdata=$data/split$nj
  
  mkdir -p $logdir
  mkdir -p $bnf_data
  mkdir -p $bnfdir
  echo $nj > $nnetdir/num_jobs
  
  [ ! -f $data/feats.scp ] && echo >&2 "The file $data/feats.scp does not exist!" && exit 1;
  [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
  
  use_ivector=false
  if [ ! -z "$ivector_dir" ];then
    use_ivector=true
    steps/nnet2/check_ivectors_compatible.sh $nnetdir $ivector_dir || exit 1;
  fi
  
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
  ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $ivector_dir/ivector_online.scp |"
  
  # Stage 1: run nnet3-compute in parallel over the $nj splits and dump the
  # bottleneck features as compressed ark/scp pairs under $bnfdir.
  # The quoting in this block is deliberate and fragile: $cmd (queue.pl /
  # run.pl) re-parses its argument string when building each job's command
  # line, so the single quotes inside $ivector_opts and the escaped "\|"
  # pipe must be preserved exactly.
  if [ $stage -le 1 ]; then
    echo "$0: Generating bottleneck (BNF) features using $bnf_nnet model as output of "
    echo "    component-node with name $bnf_name."
    # Redefine the network's output to be the requested bottleneck node,
    # applied on the fly through an nnet3-copy pipe.
    echo "output-node name=output input=$bnf_name" > $bnf_data/output.config
    modified_bnf_nnet="nnet3-copy --nnet-config=$bnf_data/output.config $bnf_nnet - |"
    ivector_opts=
    if $use_ivector; then
      # ivector_period tells nnet3-compute how many frames each online
      # i-vector covers; it must match how the i-vectors were extracted.
      ivector_period=$(cat $ivector_dir/ivector_period) || exit 1;
      ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
    fi
    $cmd $compute_queue_opt JOB=1:$nj $logdir/make_bnf_$name.JOB.log \
      nnet3-compute $compute_gpu_opt $ivector_opts "$modified_bnf_nnet" "$feats" ark:- \| \
      copy-feats --compress=$compress ark:- ark,scp:$bnfdir/raw_bnfeat_$name.JOB.ark,$bnfdir/raw_bnfeat_$name.JOB.scp || exit 1;
  fi
  
  
  N0=$(cat $data/feats.scp | wc -l)
  N1=$(cat $bnfdir/raw_bnfeat_$name.*.scp | wc -l)
  if [[ "$N0" != "$N1" ]]; then
    echo "$0: Error generating BNF features for $name (original:$N0 utterances, BNF:$N1 utterances)"
    exit 1;
  fi
  
  # Concatenate feats.scp into bnf_data
  for n in $(seq $nj); do  cat $bnfdir/raw_bnfeat_$name.$n.scp; done > $bnf_data/feats.scp
  
  for f in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do
    [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f
  done
  
  echo "$0: computing CMVN stats."
  steps/compute_cmvn_stats.sh $bnf_data
  
  echo "$0: done making BNF features."
  
  exit 0;