prep_test_aspire.sh 14.8 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358


#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2015.  Apache 2.0.
# This script generates the ctm files for dev_aspire, test_aspire and eval_aspire 
# for scoring with ASpIRE scoring server.
# It also provides the WER for dev_aspire data.

# ---------------------------------------------------------------------------
# Default values for the script's options.  parse_options.sh (sourced at the
# end of this section; its implementation is not shown here) is expected to
# let a matching command-line option override any of them.
# ---------------------------------------------------------------------------

# Pipeline control.
stage=0                 # compared against the "[ $stage -le N ]" guards below
iter=final              # passed as --iter to the decoding scripts; names the model file
affix=                  # optional tag folded into directory and output file names
tune_hyper=true         # enables the LMWT / word-insertion-penalty selection section
do_segmentation=true    # selects the uniformly segmented data directory name
filter_ctm=true         # stage 8: derive frame weights from the stage-1 ctm

# Feature extraction and uniform segmentation.
mfccdir=mfcc_reverb_submission
window=30               # --window for create_uniform_segments.py and get_ctm.sh
overlap=5               # --overlap for create_uniform_segments.py and get_ctm.sh

# i-vector extraction.
max_count=100           # --max-count for both extract_ivectors*.sh calls
sub_speaker_frames=1500
ivector_scale=1.0       # a value other than 1.0 / 1 triggers the scaling stage
silence_weight=0.00001
weights_file=           # if non-empty, used instead of weights derived from the ctm
pad_frames=0            # this did not seem to be helpful but leaving it as an option.

# Lattice generation.
decode_num_jobs=200
num_jobs=30
acwt=0.1
lattice_beam=8
pass2_decode_opts=      # extra options spliced into the second-pass decode command

# CTM generation and scoring.
LMWT=12
word_ins_penalty=0
min_lmwt=9
max_lmwt=20
word_ins_penalties=0.0,0.25,0.5,0.75,1.0
decode_mbr=true
ctm_beam=6

# Environment: job-submission commands, optional PATH setup, option parsing.
. ./cmd.sh

if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Exactly three positional arguments are required.  Note that <data-dir> is a
# bare directory NAME: the script itself prepends "data/" wherever it is used
# (e.g. data/$data_dir/wav.scp), so passing "data/dev_aspire" would be wrong.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir>"
  echo "  <data-dir> is the name of a directory under data/, one of:"
  echo "             dev_aspire, test_aspire, eval_aspire"
  echo " Options:"
  echo "    --stage (0..13)                 # start the script from part-way through."
  echo "e.g.:"
  echo "$0 dev_aspire data/lang exp/nnet2_multicondition/nnet_ms_a"
  exit 1;
fi

data_dir=$1 # name under data/; select from {dev_aspire, test_aspire, eval_aspire}
lang=$2     # e.g. data/lang
dir=$3      # e.g. exp/nnet2_multicondition/nnet_ms_a

# Names derived from the model directory and the optional --affix value.
model_affix=$(basename $dir)
ivector_dir=$(dirname $dir)
ivector_affix=${affix:+_$affix}_${model_affix}_iter$iter
affix=_${affix}_iter${iter}

# Remember the data set name exactly as given; data_dir itself is re-pointed
# to the "_whole" copy in the default branch below.
act_data_dir=${data_dir}

case "$data_dir" in
  test_aspire)
    out_file=single_dev_test${affix}_$model_affix.ctm
    ;;
  eval_aspire)
    out_file=single_eval${affix}_$model_affix.ctm
    ;;
  *)
    # Any other data set: build data/<name>_whole holding one utterance per
    # recording (no segments file), then compute features and CMVN stats.
    if [ $stage -le 1 ]; then
      echo "Creating the data dir with whole recordings without segmentation"
      unseg_dir=data/${data_dir}_whole
      src_dir=data/$data_dir
      mkdir -p $unseg_dir

      echo "Creating the $unseg_dir/wav.scp file"
      cp $src_dir/wav.scp $unseg_dir

      echo "Creating the $unseg_dir/reco2file_and_channel file"
      # Each recording id doubles as file id (channel "A") and as its own speaker.
      cat $unseg_dir/wav.scp | awk '{print $1, $1, "A";}' > $unseg_dir/reco2file_and_channel
      cat $unseg_dir/wav.scp | awk '{print $1, $1;}' > $unseg_dir/utt2spk
      utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt

      steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" \
        --mfcc-config conf/mfcc_hires.conf \
        $unseg_dir exp/make_mfcc_reverb/${data_dir}_whole $mfccdir || exit 1;
      steps/compute_cmvn_stats.sh \
        $unseg_dir exp/make_mfcc_reverb/${data_dir}_whole $mfccdir || exit 1;
    fi
    data_dir=${data_dir}_whole
    out_file=single_dev${affix}_${model_affix}.ctm
    ;;
esac

# Number of recordings listed in the data set as named on the command line.
num_jobs=$(cat data/${act_data_dir}/wav.scp | wc -l)

# Directory (under data/) that all later stages operate on.  Without
# segmentation this is data_dir itself; with segmentation it is a separate
# copy whose name records the window and overlap used.
segmented_data_dir=${data_dir}
if $do_segmentation; then
  segmented_data_dir=${data_dir}_uniformsegmented_win${window}_over${overlap}
fi

if [ $stage -le 2 ]; then
  if $do_segmentation; then
    echo "Generating uniform segments with length $window and overlap $overlap."
    rm -rf data/$segmented_data_dir
    copy_data_dir.sh --validate-opts "--no-text" data/$data_dir data/$segmented_data_dir || exit 1;
    cp data/$data_dir/reco2file_and_channel data/$segmented_data_dir/ || exit 1;
    python local/multi_condition/create_uniform_segments.py --overlap $overlap --window $window data/$segmented_data_dir  || exit 1;
    # Drop the feature/CMVN lists copied from the source directory.
    for file in cmvn.scp feats.scp; do
      rm -f data/$segmented_data_dir/$file
    done
    utils/validate_data_dir.sh --no-text --no-feats data/$segmented_data_dir || exit 1;
  else
    # segmented_data_dir is data_dir itself here, so running the steps above
    # would "rm -rf" the source data directory and then fail to copy it.
    echo "$0: do_segmentation is false; skipping uniform segmentation and using data/$data_dir as is."
  fi
fi

if [ $stage -le 3 ]; then
  echo "Extracting features for the segments"
  # Features are computed afresh on a "_hires" copy of the segmented data so
  # that they are indexed by utterance and not by recording.
  hires_data=data/${segmented_data_dir}_hires
  hires_log=exp/make_reverb_hires/${segmented_data_dir}

  rm -rf $hires_data
  copy_data_dir.sh --validate-opts "--no-text " \
    data/${segmented_data_dir} $hires_data || exit 1;

  steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf --cmd "$train_cmd" \
    $hires_data $hires_log $mfccdir || exit 1;
  steps/compute_cmvn_stats.sh $hires_data $hires_log $mfccdir || exit 1;

  # The results of these two checks are not acted upon.
  utils/fix_data_dir.sh $hires_data
  utils/validate_data_dir.sh --no-text $hires_data
fi

if [ $stage -le 4 ]; then
  echo "Extracting i-vectors, stage 1"
  steps/online/nnet2/extract_ivectors_online.sh \
    --cmd "$train_cmd" --nj 20 --max-count $max_count \
    data/${segmented_data_dir}_hires \
    $ivector_dir/extractor \
    $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}_stage1 \
    || exit 1;
fi

# Suffix appended to the stage-1 i-vector directory name when the i-vectors
# are scaled; it stays empty for a scale of 1.0 or 1.
ivector_scale_affix=
if [ $ivector_scale != 1.0 ] && [ $ivector_scale != 1 ]; then
  ivector_scale_affix=_scale$ivector_scale
fi

# Write a scaled copy of the stage-1 i-vectors, but only when a non-unit
# scale was requested (i.e. the scale affix is non-empty).
if [ $stage -le 5 ] && [ -n "$ivector_scale_affix" ]; then
  echo "$0: Scaling iVectors, stage 1"
  srcdir=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}_stage1
  outdir=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}${ivector_scale_affix}_stage1
  mkdir -p $outdir
  copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \
    | copy-feats --compress=true ark:- \
        ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp \
    || exit 1;
  # The period file is carried over unchanged next to the scaled archive.
  cp $srcdir/ivector_period $outdir/ivector_period
fi

# Common prefix of the decode directories; the stages below append
# _stage1, _tg and _fg to it.
decode_dir=$dir/decode_${segmented_data_dir}${affix}_pp

# First pass: lattices decoded with the (possibly scaled) stage-1 i-vectors.
if [ $stage -le 6 ]; then
  echo "Generating lattices, stage 1"
  stage1_ivectors=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}${ivector_scale_affix}_stage1
  local/multi_condition/decode.sh \
    --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config \
    --online-ivector-dir $stage1_ivectors \
    --skip-scoring true --iter $iter \
    exp/tri5a/graph_pp data/${segmented_data_dir}_hires ${decode_dir}_stage1 \
    || exit 1;
fi

# CTM produced from the first-pass lattices.
if [ $stage -le 7 ]; then
  echo "$0: generating CTM from stage-1 lattices"
  local/multi_condition/get_ctm_conf.sh \
    --cmd "$decode_cmd" --use-segments false --iter $iter \
    data/${segmented_data_dir}_hires ${lang} ${decode_dir}_stage1 \
    || exit 1;
fi

# Stage 8 (part 1): choose what is handed to the second-pass i-vector
# extractor as its 4th positional argument ($ivector_extractor_input):
#   * filter_ctm=true  and weights_file set   -> the user-supplied weights file
#   * filter_ctm=true  and weights_file empty -> per-frame weights generated
#                                                here from the stage-1 ctm
#   * filter_ctm=false                        -> the stage-1 decode directory
if [ $stage -le 8 ]; then
  if $filter_ctm; then
    if [ ! -z $weights_file ]; then
      echo "$0: Using provided weights file $weights_file"
      ivector_extractor_input=$weights_file
    else
      # The ctm is read from the hard-coded score_10 sub-directory of the
      # stage-1 decode directory.
      ctm=${decode_dir}_stage1/score_10/${segmented_data_dir}_hires.ctm 
      echo "$0: generating weights file from stage-1 ctm $ctm"
      
      # One "<utt> <num-frames>" line per utterance; read by the perl code below.
      feat-to-len scp:data/${segmented_data_dir}_hires/feats.scp ark,t:- >${decode_dir}_stage1/utt.lengths.$affix
      if [ ! -f $ctm ]; then  echo "$0: stage 8: expected ctm to exist: $ctm"; exit 1; fi
      # Pipeline overview:
      #  1. awk keeps ctm lines whose 6th field equals 1.0 and whose 4th field
      #     (used as the duration by the perl code) is below 1.0.
      #     NOTE(review): the 6th field is presumably the word confidence - confirm.
      #  2. the greps drop lines containing the words mm / mhm and the tokens
      #     [noise], [laughter] and <unk>.
      #  3. perl starts every frame of every utterance at $silence_weight, then
      #     sets to 1 the frames covered by each surviving ctm entry (begin and
      #     duration are multiplied by 100 and truncated to integers - presumably
      #     seconds to 10ms frames - and widened by $pad_frames on both sides),
      #     and prints "<utt> [ w0 w1 ... ]" for each utterance in lengths-file
      #     order.  It dies on a ctm line without exactly 6 fields or on an
      #     utterance missing from the lengths file.
      #  4. the result is gzipped to weights${affix}.gz in the stage-1 decode dir.
      cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \
      grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \
      grep -v -F '[laughter]' | grep -v -F '<unk>' | \
      perl -e ' $lengths=shift @ARGV;  $pad_frames=shift @ARGV; $silence_weight=shift @ARGV;
       $pad_frames >= 0 || die "bad pad-frames value $pad_frames";
       open(L, "<$lengths") || die "opening lengths file";
       @all_utts = ();
       $utt2ref = { };
       while (<L>) {
         ($utt, $len) = split(" ", $_);
         push @all_utts, $utt;
         $array_ref = [ ];
         for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; }
         $utt2ref{$utt} = $array_ref;
       }
       while (<STDIN>) {
         @A = split(" ", $_);
         @A == 6 || die "bad ctm line $_";
         $utt = $A[0]; $beg = $A[2]; $len = $A[3];
         $beg_int = int($beg * 100) - $pad_frames; 
         $len_int = int($len * 100) + 2*$pad_frames;
         $array_ref = $utt2ref{$utt};
         !defined $array_ref  && die "No length info for utterance $utt";
         for ($t = $beg_int; $t < $beg_int + $len_int; $t++) {
           if ($t >= 0 && $t < @$array_ref) {
             ${$array_ref}[$t] = 1;
            }
          }
        }
        foreach $utt (@all_utts) {  $array_ref = $utt2ref{$utt};
          print $utt, " [ ", join(" ", @$array_ref), " ]\n";
          } ' ${decode_dir}_stage1/utt.lengths.$affix $pad_frames $silence_weight   | gzip -c >${decode_dir}_stage1/weights${affix}.gz
          ivector_extractor_input=${decode_dir}_stage1/weights${affix}.gz
        fi
      else
        # filter_ctm is false: pass the stage-1 decode directory itself.
        ivector_extractor_input=${decode_dir}_stage1
      fi
fi

# Stage 8 (part 2): second-pass i-vector extraction.  It shares the stage
# number with the block that sets $ivector_extractor_input, so the two always
# run (or are skipped) together.
if [ $stage -le 8 ]; then
  echo "Extracting i-vectors, stage 2 with input $ivector_extractor_input"
  # $ivector_extractor_input is either a frame-weights file or the stage-1
  # decode directory, as chosen just above.
  # Per the original note on this call: --sub-speaker-frames is optional and
  # divides each speaker into "sub-speakers" of at least that many frames,
  # which can help if acoustic conditions drift over time within a speaker's
  # data.
  steps/online/nnet2/extract_ivectors.sh \
    --cmd "$train_cmd" --nj 20 \
    --silence-weight $silence_weight \
    --sub-speaker-frames $sub_speaker_frames \
    --max-count $max_count \
    data/${segmented_data_dir}_hires \
    $lang \
    $ivector_dir/extractor \
    $ivector_extractor_input \
    $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix} \
    || exit 1;
fi

# Stage 9: second-pass lattice generation into ${decode_dir}_tg, using the
# stage-2 i-vectors and the user-selected acoustic weight / lattice beam.
# $pass2_decode_opts is deliberately unquoted so it can carry several options.
if [ $stage -le 9 ]; then
  echo "Generating lattices, stage 2 with --acwt $acwt"
  rm -f ${decode_dir}_tg/.error
  if ! local/multi_condition/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config $pass2_decode_opts \
      --skip-scoring true --iter $iter --acwt $acwt --lattice-beam $lattice_beam \
      --online-ivector-dir $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix} \
      exp/tri5a/graph_pp data/${segmented_data_dir}_hires ${decode_dir}_tg; then
    # Still leave the .error marker behind, but do not let the abort depend on
    # it: if decoding failed before the directory was created, a bare "touch"
    # would fail too and the failure would go unnoticed.
    mkdir -p ${decode_dir}_tg && touch ${decode_dir}_tg/.error
    echo "$0: Error decoding"
    exit 1;
  fi
fi

# Stage 10: rescore the _tg lattices into the _fg decode directory, going
# from the ${lang}_pp_test to the ${lang}_pp_test_fg language directory.
if [ $stage -le 10 ]; then
  echo "Rescoring lattices"
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --skip-scoring true \
    ${lang}_pp_test ${lang}_pp_test_fg \
    data/${segmented_data_dir}_hires \
    ${decode_dir}_tg ${decode_dir}_fg \
    || exit 1;
fi

# Set-up shared by the tuning and final-CTM stages.  Unlike the numbered
# stages, this section runs on every invocation.

# From here on decode_dir refers to the rescored (_fg) decode directory.
decode_dir=${decode_dir}_fg
if [ -z "$iter" ]; then
  model=$decode_dir/../final.mdl # assume model one level up from decoding dir.
else
  model=$decode_dir/../$iter.mdl
fi

mkdir -p $decode_dir/scoring
# Collect, from the glm file, every left-hand side of a "lhs => rhs" rule
# whose right-hand side (up to the first "/") is empty, and join them with "|".
# The snippet uses only syntax that is valid in both Python 2 and Python 3
# (print as a function call), so it works whichever one "python" resolves to.
python -c "
import re
with open('data/${act_data_dir}/glm') as glm:
  lines = [line.strip() for line in glm]
patterns = []
for line in lines:
  if re.search('=>', line) is not None:
    parts = re.split('=>', line.split('/')[0])
    if parts[1].strip() == '':
      patterns.append(parts[0].strip())
print('|'.join(patterns))
" > $decode_dir/scoring/glm_ignore_patterns || exit 1;

ignore_patterns=$(cat $decode_dir/scoring/glm_ignore_patterns)
# Quoted so that patterns such as [laughter] are not glob-expanded by the shell.
echo "$0: Ignoring these patterns from the ctm: $ignore_patterns"

# Generate the ctm filter: it copies argv[1] to argv[2], dropping every line
# whose 5th field is one of the glm patterns or one of the fixed tokens below.
# NOTE(review): $ignore_patterns is pasted into a double-quoted Python string
# literal; a pattern containing a double quote or backslash would break the
# generated script.
cat << EOF > $decode_dir/scoring/filter_ctm.py
import sys
file = open(sys.argv[1])
out_file = open(sys.argv[2], 'w')
ignore_set = "$ignore_patterns".split("|")
ignore_set.append("[noise]")
ignore_set.append("[laughter]")
ignore_set.append("[vocalized-noise]")
ignore_set.append("!SIL")
ignore_set.append("<unk>")
ignore_set.append("%hesitation")
ignore_set = set(ignore_set)
print(ignore_set)
for line in file:
  if line.split()[4] not in ignore_set:
    out_file.write(line)
out_file.close()
EOF

# Trailing space kept: the command string is handed to get_ctm.sh as-is.
filter_ctm_command="python $decode_dir/scoring/filter_ctm.py "

# Selection of LMWT and word_ins_penalty.
#  * dev_aspire (stage <= 11): score every (LMWT, penalty) combination and
#    store the best pair in $decode_dir/scoring/bestLMWT and bestWIP.
#  * test_aspire / eval_aspire: reuse the values stored by an earlier dev run,
#    if the corresponding files exist.
# NOTE(review): for dev_aspire the final "else" branch below still prints
# "Using the default/user-specified values ..." even right after tuning, and
# when the script is started with stage > 11 the stored bestLMWT/bestWIP are
# not read back for dev_aspire - confirm whether that is intended.
if  $tune_hyper ; then
  if [ $stage -le 11 ]; then
    if [ "$act_data_dir" == "dev_aspire" ]; then
      # Turn the comma-separated penalty list into an array so it can be counted.
      wip_string=$(echo $word_ins_penalties | sed 's/,/ /g')
      temp_wips=($wip_string)
      # Nested job submission: the outer $decode_cmd iterates WIP over
      # 1..<number of penalties>; everything escaped with a backslash
      # (\&\&, \$wip, \${wips[WIP]}, \( \)) is left for the shell that runs the
      # job.  There, the array is rebuilt with a dummy element 0 so that index
      # WIP selects the WIP-th penalty, and an inner $decode_cmd iterates LMWT
      # over min_lmwt..max_lmwt, calling get_ctm.sh with the glm/stm files.
      # NOTE(review): this presumes $decode_cmd substitutes the WIP / LMWT job
      # variables into its arguments; $decode_cmd is defined outside this file.
      $decode_cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \
        wips=\(0 $wip_string\) \&\& \
        wip=\${wips[WIP]} \&\& \
        echo \$wip \&\& \
        $decode_cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \
          local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
            --window $window --overlap $overlap \
            --beam $ctm_beam --decode-mbr $decode_mbr \
            --glm data/${act_data_dir}/glm --stm data/${act_data_dir}/stm \
          LMWT \$wip $lang data/${segmented_data_dir}_hires $model $decode_dir || exit 1; 

      # First pass over the .sys files: show the output of best_wer.sh to the
      # user.  eval is needed so that the braces built from variables are
      # brace-expanded.
      eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys"|utils/best_wer.sh 2>/dev/null
      # Second pass: take the last field of the best_wer.sh line as a file
      # path, read the LMWT and penalty off its score_<lmwt>/penalty_<wip>
      # directory components, and write them to bestLMWT / bestWIP.
      eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys" | \
       utils/best_wer.sh 2>/dev/null | python -c "import sys, re
line = sys.stdin.readline()
file_name=line.split()[-1]
parts=file_name.split('/')
penalty = re.sub('penalty_','',parts[-2])
lmwt = re.sub('score_','', parts[-3])
lmfile=open('$decode_dir/scoring/bestLMWT','w')
lmfile.write(str(lmwt))
lmfile.close()
wipfile=open('$decode_dir/scoring/bestWIP','w')
wipfile.write(str(penalty))
wipfile.close()
" || exit 1;
        # Use the freshly tuned values for the remaining stages.
        LMWT=$(cat $decode_dir/scoring/bestLMWT)
        word_ins_penalty=$(cat $decode_dir/scoring/bestWIP)
    fi
  fi
  if [ "$act_data_dir" == "test_aspire" ] || [ "$act_data_dir" == "eval_aspire" ]; then
    # The dev decode directory is located by rewriting the data set name
    # inside the current decode directory path.
    dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g")
    if [ -f $dev_decode_dir/scoring/bestLMWT ]; then
      LMWT=$(cat $dev_decode_dir/scoring/bestLMWT)
      echo "Using the bestLMWT $LMWT value found in  $dev_decode_dir"
    else
      echo "Unable to find the bestLMWT in the  dev decode dir $dev_decode_dir"
      echo "Keeping the default/user-specified value"
    fi
    if [ -f $dev_decode_dir/scoring/bestWIP ]; then
      word_ins_penalty=$(cat $dev_decode_dir/scoring/bestWIP)
      echo "Using the bestWIP $word_ins_penalty value found in  $dev_decode_dir"
    else
      echo "Unable to find the bestWIP in the  dev decode dir $dev_decode_dir"
      echo "Keeping the default/user-specified value"
    fi
  else
    echo "Using the default/user-specified values for LMWT and word_ins_penalty"
  fi
fi

# Stage 12: convert the rescored lattices to a ctm with the selected LMWT and
# word insertion penalty; stderr of get_ctm.sh is captured in a log file
# under the scoring directory.
if [ $stage -le 12 ]; then
  echo "Generating CTMs with LMWT $LMWT and word insertion penalty of $word_ins_penalty"
  final_ctm_log=$decode_dir/scoring/finalctm.LMWT$LMWT.WIP$word_ins_penalty.log
  local/multi_condition/get_ctm.sh \
    --filter-ctm-command "$filter_ctm_command" \
    --beam $ctm_beam --decode-mbr $decode_mbr \
    $LMWT $word_ins_penalty $lang data/${segmented_data_dir}_hires $model $decode_dir \
    2>$final_ctm_log || exit 1;
fi

# Stage 13: assemble the submission file from the filtered ctm.
if [ $stage -le 13 ]; then
  best_score_dir=$decode_dir/score_$LMWT/penalty_$word_ins_penalty

  # Rewrite each ctm line as "<id> 1 <f3> <f4> <f5>", where <id> is the part
  # of the first field before its first "-".
  cat $best_score_dir/ctm.filt \
    | awk '{split($1, parts, "-"); printf("%s 1 %s %s %s\n", parts[1], $3, $4, $5)}' \
    > $out_file

  # Same id extraction applied to wav.scp; the list is handed to
  # fill_missing_recordings.py together with the ctm.
  cat data/${segmented_data_dir}_hires/wav.scp \
    | awk '{split($1, parts, "-"); printf("%s\n", parts[1])}' \
    > $best_score_dir/recording_names

  local/multi_condition/fill_missing_recordings.py \
    $out_file $out_file.submission $best_score_dir/recording_names
  echo "Generated the ctm @ $out_file.submission from the ctm file $best_score_dir/ctm.filt"
fi