Blame view

egs/wsj/s5/steps/make_mfcc.sh 5.34 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
  #!/bin/bash
  
  # Copyright 2012-2016  Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0
  # To be run from .. (one directory up from here)
  # see ../run.sh for example
  
  # Begin configuration section.
  # Defaults for the options listed in the usage message; parse_options.sh
  # (sourced below) is expected to override them from the command line.
  nj=4                        # number of parallel jobs
  cmd=run.pl                  # how to run the jobs (see --cmd in the usage text)
  mfcc_config=conf/mfcc.conf  # config file handed to compute-mfcc-feats
  compress=true               # forwarded to copy-feats as --compress=<value>
  write_utt2num_frames=true   # if true, write the utt2num_frames file
  write_utt2dur=true          # if true, write the utt2dur file
  # End configuration section.

  # Print the command line for logging.
  echo "$0 $*"

  # Source path.sh from the working directory when it is present.
  if [ -f path.sh ]; then
    source ./path.sh
  fi
  source parse_options.sh || exit 1
  
  # Argument check: the script accepts between one and three positional
  # arguments. With any other count the usage text below is written to
  # stderr and the script exits with status 1. The here-document delimiter
  # is unquoted, so $0 inside the text expands to the script's name.
  if [ $# -lt 1 ] || [ $# -gt 3 ]; then
    cat >&2 <<EOF
  Usage: $0 [options] <data-dir> [<log-dir> [<mfcc-dir>] ]
   e.g.: $0 data/train
  Note: <log-dir> defaults to <data-dir>/log, and
        <mfcc-dir> defaults to <data-dir>/data.
  Options:
    --mfcc-config <config-file>          # config passed to compute-mfcc-feats.
    --nj <nj>                            # number of parallel jobs.
    --cmd <run.pl|queue.pl <queue opts>> # how to run jobs.
    --write-utt2num-frames <true|false>  # If true, write utt2num_frames file.
    --write-utt2dur <true|false>         # If true, write utt2dur file.
  EOF
     exit 1;
  fi
  
  # Positional arguments: <data-dir> [<log-dir> [<mfcc-dir>]].
  data=$1

  # ${N-default} falls back only when argument N was not supplied at all, so
  # an explicitly passed (even empty) value is kept as given.
  logdir=${2-$data/log}
  mfccdir=${3-$data/data}
  
  # Make $mfccdir an absolute pathname: anything that does not already start
  # with "/" is taken relative to the current working directory. Done with a
  # quoted case statement so the path is never word-split or glob-expanded.
  case "$mfccdir" in
    /*) ;;
    *) mfccdir="$PWD/$mfccdir" ;;
  esac

  # use "name" as part of name of the archive.
  name=$(basename "$data")

  # Create the output and log directories; give up if either cannot be made.
  mkdir -p "$mfccdir" || exit 1
  mkdir -p "$logdir" || exit 1
  
  # An existing feats.scp is not overwritten in place: it is first moved into
  # the hidden .backup directory inside the data directory.
  if [ -f $data/feats.scp ]; then
    backup_dir=$data/.backup
    mkdir -p $backup_dir
    echo "$0: moving $data/feats.scp to $data/.backup"
    mv $data/feats.scp $backup_dir
  fi
  
  # The list of input recordings for this data directory.
  scp=$data/wav.scp

  # Both the wav list and the MFCC config must exist as regular files;
  # the first one found missing is reported and ends the script.
  required="$scp $mfcc_config"
  for f in $required; do
    [ -f $f ] && continue
    echo "$0: no such file $f"
    exit 1
  done

  # Validate the data directory, invoked with --no-text and --no-feats
  # (presumably so that missing text/feats files are not treated as errors).
  utils/validate_data_dir.sh --no-text --no-feats $data || exit 1
  
  # VTLN warp factors are used when the data directory provides them.
  # spk2warp takes precedence (it is combined with utt2spk); otherwise
  # utt2warp is used; with neither file no VTLN options are passed.
  vtln_opts=""
  if [ -f $data/spk2warp ]; then
    echo "$0 [info]: using VTLN warp factors from $data/spk2warp"
    vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk"
  elif [ -f $data/utt2warp ]; then
    echo "$0 [info]: using VTLN warp factors from $data/utt2warp"
    vtln_opts="--vtln-map=ark:$data/utt2warp"
  fi
  
  # Call utils/create_data_link.pl once for every job's output archive.
  # The call does nothing unless $mfccdir/storage/ exists; see
  # utils/create_data_link.pl for more info.
  for job in $(seq $nj); do
    utils/create_data_link.pl $mfccdir/raw_mfcc_$name.$job.ark
  done
  
  
  # Optional per-job side outputs written into $logdir. The literal "JOB" in
  # the file names is left as-is here; the extraction commands are launched
  # through $cmd with JOB=1:$nj, which is expected to fill it in per job.
  # Both options default to empty (nothing extra is written).
  write_num_frames_opt=
  if $write_utt2num_frames; then
    write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB"
  fi

  write_utt2dur_opt=
  if $write_utt2dur; then
    write_utt2dur_opt="--write-utt2dur=ark,t:$logdir/utt2dur.JOB"
  fi
  
  # Clear error markers left over from a previous run. The check that follows
  # feature extraction looks at $logdir/.error.$name, so that file has to be
  # removed here whichever branch below is taken; the older $logdir/.error
  # name is still cleaned up as before.
  rm -f $logdir/.error $logdir/.error.$name

  if [ -f $data/segments ]; then
    echo "$0 [info]: segments file exists: using that."

    # One piece of the segments file per job:
    # $logdir/segments.1 ... $logdir/segments.$nj
    split_segments=
    for n in $(seq $nj); do
      split_segments="$split_segments $logdir/segments.$n"
    done

    utils/split_scp.pl $data/segments $split_segments || exit 1;

    # Per job: extract the segments from the recordings listed in wav.scp,
    # compute MFCCs on them, and write the features as an ark/scp pair
    # (compressed according to $compress). The "\|" keeps each pipe inside
    # the command handed to $cmd instead of being interpreted by this shell.
    $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
      extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \
      compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
        --config=$mfcc_config ark:- ark:- \| \
      copy-feats --compress=$compress $write_num_frames_opt ark:- \
        ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
       || exit 1;

  else
    echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance."

    # One piece of wav.scp per job:
    # $logdir/wav_${name}.1.scp ... $logdir/wav_${name}.$nj.scp
    split_scps=
    for n in $(seq $nj); do
      split_scps="$split_scps $logdir/wav_${name}.$n.scp"
    done

    utils/split_scp.pl $scp $split_scps || exit 1;


    # add ,p to the input rspecifier so that we can just skip over
    # utterances that have bad wave data.

    $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \
      compute-mfcc-feats $vtln_opts $write_utt2dur_opt --verbose=2 \
        --config=$mfcc_config scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \
      copy-feats $write_num_frames_opt --compress=$compress ark:- \
        ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
        || exit 1;
  fi
  
  
  # Stop here when an error marker for this data set is present, printing the
  # tail of the first job's log as a hint before exiting with status 1.
  [ ! -f $logdir/.error.$name ] || {
    echo "$0: Error producing MFCC features for $name:"
    tail $logdir/make_mfcc_${name}.1.log
    exit 1
  }
  
  # Merge the per-job pieces into single files under $data, in job order.
  # A piece that cannot be read, or a destination that cannot be opened,
  # ends the script with status 1.

  # feats.scp: the per-job .scp files written next to the archives.
  {
    for job in $(seq $nj); do
      cat $mfccdir/raw_mfcc_$name.$job.scp || exit 1
    done
  } > $data/feats.scp || exit 1

  # utt2num_frames: only when the per-job files were requested.
  if $write_utt2num_frames; then
    {
      for job in $(seq $nj); do
        cat $logdir/utt2num_frames.$job || exit 1
      done
    } > $data/utt2num_frames || exit 1
  fi

  # utt2dur: only when the per-job files were requested.
  if $write_utt2dur; then
    {
      for job in $(seq $nj); do
        cat $logdir/utt2dur.$job || exit 1
      done
    } > $data/utt2dur || exit 1
  fi
  
  # Store frame_shift and mfcc_config along with features.
  # The first "--frame-shift=<value>" line of the config is scaled by 0.001
  # (milliseconds to seconds). The value may be fractional: whole numbers
  # keep the three-decimal format (10 -> 0.010), while fractional values are
  # printed in full (7.5 -> 0.0075) instead of being cut off at the decimal
  # point. With no such line in the config, 0.01 is written.
  frame_shift=$(perl -ne 'if (/^--frame-shift=(\d+(?:\.\d+)?)/) {
                            my $ms = $1;
                            if ($ms == int($ms)) { printf "%.3f", 0.001 * $ms; }
                            else { print 0.001 * $ms; }
                            exit; }' $mfcc_config)
  echo ${frame_shift:-'0.01'} > $data/frame_shift

  # Keep a copy of the config next to the features. When the config in use
  # already is $data/conf/mfcc.conf the copy is skipped, because cp refuses
  # to copy a file onto itself and would otherwise abort the script here.
  mkdir -p $data/conf || exit 1
  if [ ! "$mfcc_config" -ef "$data/conf/mfcc.conf" ]; then
    cp $mfcc_config $data/conf/mfcc.conf || exit 1
  fi
  
  # Remove the per-job intermediates from $logdir. Patterns for files that
  # were not produced in this run match nothing; rm's complaints about them
  # are discarded.
  rm $logdir/wav_${name}.*.scp $logdir/segments.* \
    $logdir/utt2num_frames.* $logdir/utt2dur.* 2>/dev/null

  # Compare the number of feature entries with the number of utterances.
  nf=$(wc -l < $data/feats.scp)
  nu=$(wc -l < $data/utt2spk)

  # Any mismatch only produces a warning.
  if [ $nf -ne $nu ]; then
    echo "$0: It seems not all of the feature files were successfully procesed" \
         "($nf != $nu); consider using utils/fix_data_dir.sh $data"
  fi

  # Fewer entries than nu minus nu/20 (integer division) is treated as fatal.
  min_nf=$(( nu - nu/20 ))
  if (( nf < min_nf )); then
    echo "$0: Less than 95% the features were successfully generated."\
         "Probably a serious error."
    exit 1
  fi


  echo "$0: Succeeded creating MFCC features for $name"