Blame view

Scripts/steps/nnet2/get_perturbed_feats.sh 2.85 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  #!/bin/bash
  #
  # Makes one perturbed copy of a data directory's features per
  # (VTLN warp, time warp) pair, combines the copies into a single data
  # directory and writes a utt2uniq map for the combined set.

  # begin configuration section
  # Defaults for the command-line options listed in the usage message.

  feature_type=fbank   # one of fbank, mfcc, plp
  cmd=run.pl           # handed to the feature-making script as --cmd
  cleanup=true         # when true, the per-pair data directories are removed at the end
  stage=0              # NOTE(review): advertised as --stage but not read in the script body seen here
  # Pairs of (VTLN warp factor, time-warp factor), each written <vtln>-<time>.
  pairs='1.1-1.0 1.05-1.2 1.0-0.8 0.95-1.1 0.9-0.9'
  # end configuration section
  
  # Stop at the first failing command.
  set -e
  # Option parser shared across the project; sourced relative to the current
  # directory, so the script has to be started from a directory that contains
  # utils/.  Presumably it maps leading --name value arguments onto the
  # configuration variables and removes them from "$@" -- it is defined outside
  # this file, so confirm there.
  . utils/parse_options.sh

  # Exactly five positional arguments must remain after option parsing.
  if [ $# -ne 5 ]; then
    echo "Usage: $0 [options] <baseline-feature-config> <feature-storage-dir> <log-location> <input-data-dir> <output-data-dir> "
    # The example lists the config file first and the storage directory second,
    # matching the order in the usage line and the order the arguments are read.
    echo "e.g.: $0 conf/fbank_40.conf mfcc exp/perturbed_fbank_train data/train data/train_perturbed_fbank"
    echo "Supported options: "
    echo "--feature-type (fbank|mfcc|plp)  # Type of features we are making"
    echo "--cmd 'command-program'      # Mechanism to run jobs, e.g. run.pl"
    echo "--pairs <pairs>              # Pairs of (vtln-warp, time-warp) factors, "
    echo "                             # default $pairs"
    echo "--stage <stage>              # Use for partial re-run"
    echo "--cleanup (true|false)       # If false, do not clean up temp files (default: true)"
    exit 1;
  fi
  
  # Positional arguments, in the order given by the usage message.
  base_config=$1   # baseline feature config; copied into every per-pair config
  featdir=$2       # feature storage directory handed to the feature-making scripts
  dir=$3           # dir/log* will contain log-files
  inputdata=$4     # input data directory; must contain wav.scp
  data=$5          # output data directory for the combined perturbed set

  # Fail early when a required input file is missing.  The paths are quoted so
  # that an empty argument or one containing whitespace is tested as a single
  # file name instead of being dropped or split into several words.
  for f in "$base_config" "$inputdata/wav.scp"; do
    if [ ! -f "$f" ]; then
      echo "Expected file $f to exist"
      exit 1;
    fi
  done

  # The feature type selects which steps/make_<type>.sh is run later, so only
  # the three values named in the usage message are accepted.
  case "$feature_type" in
    fbank|mfcc|plp)
      ;;
    *)
      echo "$0: Invalid option --feature-type=$feature_type"
      exit 1;
      ;;
  esac
  
  mkdir -p "$featdir"
  mkdir -p "$dir/conf" "$dir/log"

  # Space-separated list of the per-pair data directories.  It is consumed by
  # combine_data.sh below and by the cleanup step at the end of the script.
  # Because the list is a plain string, $data must not contain whitespace.
  all_feature_dirs=""

  # $pairs is intentionally unquoted: it is a whitespace-separated list.
  for pair in $pairs; do
    # A pair is written <vtln-warp>-<time-warp>, e.g. 1.05-1.2.
    vtln_warp=$(echo "$pair" | cut -d- -f1)
    time_warp=$(echo "$pair" | cut -d- -f2)
    # The frame shift is ten times the time-warp factor (1.0 gives 10).  The
    # factor is passed as an argument rather than pasted into the Perl program,
    # so its text can never alter the code being run.
    fs=$(perl -e 'print ($ARGV[0]*10);' -- "$time_warp")
    conf=$dir/conf/$pair.conf
    this_dir=$dir/$pair

    # Per-pair config: the baseline config followed by the two perturbation
    # options.  The bare echo makes sure the added options start on a new line
    # even when the baseline file has no trailing newline.
    ( cat "$base_config"; echo; echo "--frame-shift=$fs"; echo "--vtln-warp=$vtln_warp" ) > "$conf"

    echo "Making ${feature_type} features for VTLN-warp $vtln_warp and time-warp $time_warp"

    feature_data=${data}-$pair
    all_feature_dirs="$all_feature_dirs $feature_data"

    # Copy the input data directory with the pair prepended to every speaker
    # and utterance id, then extract features and CMVN stats for that copy.
    utils/copy_data_dir.sh --spk-prefix "${pair}-" --utt-prefix "${pair}-" "$inputdata" "$feature_data"
    "steps/make_${feature_type}.sh" "--${feature_type}-config" "$conf" --nj 8 --cmd "$cmd" "$feature_data" "$this_dir" "$featdir"

    steps/compute_cmvn_stats.sh "$feature_data" "$this_dir" "$featdir"
  done

  # shellcheck disable=SC2086 -- the directory list must be split into words
  utils/combine_data.sh "$data" $all_feature_dirs
  
  
  # In the combined feature directory, create a file utt2uniq which maps
  # our extended utterance-ids to "unique utterances".  This enables the
  # script steps/nnet2/get_egs.sh to hold out data in a more proper way.
  cat $data/utt2spk | \
     perl -e ' while(<STDIN>){ @A=split; $x=shift @A; $y=$x; 
       foreach $pair (@ARGV) { $y =~ s/^${pair}-// && last; } print "$x $y
  "; } ' $pairs \
    > $data/utt2uniq
  
  # Remove the per-pair data directories after utils/combine_data.sh has been
  # run on them.  The flag is compared as a string rather than executed as a
  # command, and nothing is removed when the directory list is empty.
  if [ "$cleanup" = "true" ] && [ -n "$all_feature_dirs" ]; then
    echo "$0: Cleaning up temporary directories for ${feature_type} features."
    # Note, this just removes the .scp files and so on, not the data which is located in
    # $featdir and which is still needed.
    # shellcheck disable=SC2086 -- the list is space-separated by construction
    rm -r -- $all_feature_dirs
  fi