Blame view

egs/wsj/s5/utils/perturb_data_dir_speed.sh 4.54 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
  #!/bin/bash
  
  # Copyright 2013  Johns Hopkins University (author: Daniel Povey)
  #           2014  Tom Ko
  #           2018  Emotech LTD (author: Pawel Swietojanski)
  # Apache 2.0
  
  # This script operates on a directory, such as in data/train/,
  # that contains some subset of the following files:
  #  wav.scp
  #  spk2utt
  #  utt2spk
  #  text
  #  utt2dur
  #  reco2dur
  #
  # It generates the files which are used for perturbing the speed of the original data.
  
  # Source the shared option parser (utils/parse_options.sh, not shown here).
  # The path is relative, so the script must be started from a directory that
  # contains utils/.
  . utils/parse_options.sh

  # Exactly three positional arguments are required; anything else prints the
  # usage text and stops with status 1.
  if [ $# -ne 3 ]; then
    printf '%s\n' \
      "Usage: perturb_data_dir_speed.sh <warping-factor> <srcdir> <destdir>" \
      "e.g.:" \
      " $0 0.9 data/train_si284 data/train_si284p"
    exit 1
  fi
  
  # Force the C locale for every tool launched below.
  export LC_ALL=C

  factor=$1
  srcdir=$2
  destdir=$3
  label="sp"
  # Speaker and utterance ids receive the same prefix, e.g. "sp0.9-".
  spk_prefix=$label$factor"-"
  utt_prefix=$label$factor"-"

  # The warping factor is used as a divisor further down, so reject anything
  # that does not evaluate to a positive number before any output is written.
  if ! awk -v f="$factor" 'BEGIN { exit !(f + 0 > 0) }'; then
    echo "$0: <warping-factor> must be a positive number, got '$factor'"
    exit 1
  fi

  # Check that sox is on the PATH; the generated wav.scp entries invoke it.
  if ! command -v sox >/dev/null 2>&1; then
    echo "sox: command not found"
    exit 1
  fi

  if [ ! -f "$srcdir/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk"
    exit 1
  fi

  # Writing into the source directory would clobber the files being read.
  if [ "$destdir" = "$srcdir" ]; then
    echo "$0: this script requires <srcdir> and <destdir> to be different."
    exit 1
  fi
  
  # From here on, stop at the first failing command, including one that fails
  # inside a pipeline.
  set -e
  set -o pipefail

  mkdir -p "$destdir"

  # Temporary "<original-id> <prefixed-id>" tables keyed by the first column of
  # utt2spk, spk2utt and wav.scp respectively. They are fed to
  # utils/apply_map.pl in the steps that follow.
  awk -v p="$utt_prefix" '{printf("%s %s%s\n", $1, p, $1);}' \
    <"$srcdir/utt2spk" >"$destdir/utt_map"
  awk -v p="$spk_prefix" '{printf("%s %s%s\n", $1, p, $1);}' \
    <"$srcdir/spk2utt" >"$destdir/spk_map"
  awk -v p="$spk_prefix" '{printf("%s %s%s\n", $1, p, $1);}' \
    <"$srcdir/wav.scp" >"$destdir/reco_map"

  # utt2uniq: "<prefixed-utt-id> <value>". Without a source utt2uniq the value
  # is the original utterance id; otherwise the second column of the source
  # utt2uniq is carried over unchanged.
  if [ ! -f "$srcdir/utt2uniq" ]; then
    awk -v p="$utt_prefix" '{printf("%s%s %s\n", p, $1, $1);}' \
      <"$srcdir/utt2spk" >"$destdir/utt2uniq"
  else
    awk -v p="$utt_prefix" '{printf("%s%s %s\n", p, $1, $2);}' \
      <"$srcdir/utt2uniq" >"$destdir/utt2uniq"
  fi
  
  
  # Pass utt2spk through the utterance map (-f 1) and then the speaker map
  # (-f 2); presumably -f selects the field apply_map.pl rewrites (helper not
  # shown here). spk2utt is then derived from the new utt2spk.
  utils/apply_map.pl -f 1 "$destdir/utt_map" <"$srcdir/utt2spk" |
    utils/apply_map.pl -f 2 "$destdir/spk_map" >"$destdir/utt2spk"

  utils/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt"
  
  # Write $destdir/wav.scp from $srcdir/wav.scp.
  # Arguments: $1 - map file applied to the first column (the entry id).
  # Every entry becomes a piped command ending in "sox ... speed <factor> |".
  # Three kinds of rxfilename are handled: "input piped command",
  # "file offset" and plain "filename".
  perturb_wav_scp() {
    # sed normalises a trailing pipe to " |" so that awk sees it as the last
    # field.
    utils/apply_map.pl -f 1 "$1" <"$srcdir/wav.scp" | sed 's/| *$/ |/' |
      awk -v factor="$factor" '
        {
          # Blanking field 1 rebuilds $0 as " <rest of the entry>".
          wid = $1; $1 = "";
          if ($NF == "|") {
            # Already a piped command: append sox to the pipeline.
            print wid $0 " sox -t wav - -t wav - speed " factor " |"
          } else if (match($0, /:[0-9]+$/)) {
            # Ends in ":<digits>" (file offset): read through wav-copy first.
            print wid " wav-copy" $0 " - | sox -t wav - -t wav - speed " factor " |"
          } else {
            # Plain filename: let sox read the file directly.
            print wid " sox -t wav" $0 " -t wav - speed " factor " |"
          }
        }' >"$destdir/wav.scp"
  }

  if [ -f "$srcdir/segments" ]; then

    # Rename utterance (column 1) and recording (column 2) ids, divide the
    # start/end times by the factor, and drop segments that are not longer
    # than 0.01 after scaling.
    utils/apply_map.pl -f 1 "$destdir/utt_map" <"$srcdir/segments" |
      utils/apply_map.pl -f 2 "$destdir/reco_map" |
      awk -v factor="$factor" \
        '{s=$3/factor; e=$4/factor; if (e > s + 0.01) { printf("%s %s %.2f %.2f\n", $1, $2, s, e);} }' \
        >"$destdir/segments"

    # With segments present, wav.scp is indexed by recording id.
    perturb_wav_scp "$destdir/reco_map"
    if [ -f "$srcdir/reco2file_and_channel" ]; then
      utils/apply_map.pl -f 1 "$destdir/reco_map" \
        <"$srcdir/reco2file_and_channel" >"$destdir/reco2file_and_channel"
    fi

  else # no segments->wav indexed by utterance.
    if [ -f "$srcdir/wav.scp" ]; then
      perturb_wav_scp "$destdir/utt_map"
    fi
  fi
  
  # Optional files: when present in srcdir, copy them to destdir with the ids
  # in the first column renamed (utterance map for text and utt2lang, speaker
  # map for spk2gender).
  if [ -f "$srcdir/text" ]; then
    utils/apply_map.pl -f 1 "$destdir/utt_map" <"$srcdir/text" >"$destdir/text"
  fi
  if [ -f "$srcdir/spk2gender" ]; then
    utils/apply_map.pl -f 1 "$destdir/spk_map" <"$srcdir/spk2gender" >"$destdir/spk2gender"
  fi
  if [ -f "$srcdir/utt2lang" ]; then
    utils/apply_map.pl -f 1 "$destdir/utt_map" <"$srcdir/utt2lang" >"$destdir/utt2lang"
  fi
  
  # Prepare the speed-perturbed utt2dur: same entries under the new utterance
  # ids, with the second column divided by the warping factor.
  if [ ! -f "$srcdir/utt2dur" ]; then
    # Generate utt2dur if it does not exist in srcdir.
    utils/data/get_utt2dur.sh "$srcdir"
  fi
  utils/apply_map.pl -f 1 "$destdir/utt_map" <"$srcdir/utt2dur" |
    awk -v factor="$factor" '{print $1, $2/factor;}' >"$destdir/utt2dur"

  # Prepare the speed-perturbed reco2dur the same way, keyed by recording id.
  if [ ! -f "$srcdir/reco2dur" ]; then
    # Generate reco2dur if it does not exist in srcdir.
    utils/data/get_reco2dur.sh "$srcdir"
  fi
  utils/apply_map.pl -f 1 "$destdir/reco_map" <"$srcdir/reco2dur" |
    awk -v factor="$factor" '{print $1, $2/factor;}' >"$destdir/reco2dur"
  
  # The id maps were only needed while building destdir. -f keeps a map that is
  # already gone from aborting the script under "set -e", and stderr is no
  # longer discarded so that a real removal failure stays visible.
  rm -f -- "$destdir/spk_map" "$destdir/utt_map" "$destdir/reco_map"
  echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir"

  # Final consistency check of the new data directory; its exit status becomes
  # the exit status of this script.
  utils/validate_data_dir.sh --no-feats --no-text "$destdir"