#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies and modifies a data directory while combining
# segments whose duration is lower than a specified minimum segment
# length.
#
# Note: this does not work for the wav.scp, since there is no natural way to
# concatenate segments; you have to operate on directories that already have
# features extracted.
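
# For example (hypothetical utterances): with a minimum segment length of
# 1.55 seconds, two short segments utt1 (0.8s) and utt2 (0.9s) from the same
# speaker could be combined into a single new utterance 'utt1-comb2' of
# duration 1.7s, while segments already longer than the minimum are kept
# as they are.  The actual grouping is decided by choose_utts_to_combine.py
# below.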
  
  
# begin configuration section
cleanup=true
# end configuration section

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage:"
  echo "  $0 [options] <srcdir> <min-segment-length-in-seconds> <dir>"
  echo "e.g.:"
  echo " $0 data/train 1.55 data/train_comb"
  echo "Options:"
  echo "  --cleanup <true|false>   # if true (the default), remove intermediate files."
  exit 1;
fi
  
  
export LC_ALL=C

srcdir=$1
min_seg_len=$2
dir=$3

if [ "$dir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <dir> to be different."
  exit 1
fi

for f in $srcdir/utt2spk $srcdir/feats.scp; do
  [ ! -s $f ] && echo "$0: expected file $f to exist and be nonempty" && exit 1
done

if ! awk '{if (NF != 2) exit(1);}' <$srcdir/feats.scp; then
  echo "$0: could not combine short segments because $srcdir/feats.scp has"
  echo "  entries with too many fields"
  exit 1
fi
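
# (a normal feats.scp entry has exactly two fields, e.g. 'utt1 foo.ark:4315';
# extra fields generally mean the features are produced by a piped command,
# which this script cannot concatenate.)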
  
if ! mkdir -p $dir; then
  echo "$0: could not create directory $dir"
  exit 1;
fi

if ! utils/validate_data_dir.sh $srcdir; then
  echo "$0: failed to validate input directory $srcdir.  If needed, run utils/fix_data_dir.sh $srcdir"
  exit 1
fi

if ! python -c "x=float('$min_seg_len'); assert(x>0.0 and x<100.0);" 2>/dev/null; then
  echo "$0: bad <min-segment-length-in-seconds>: got '$min_seg_len'"
  exit 1
fi
  
set -e
set -o pipefail

# make sure $srcdir/utt2dur exists.
utils/data/get_utt2dur.sh $srcdir

utils/data/internal/choose_utts_to_combine.py --min-duration=$min_seg_len \
  $srcdir/spk2utt $srcdir/utt2dur $dir/utt2utts $dir/utt2spk $dir/utt2dur
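
# choose_utts_to_combine.py writes one line per new (combined) utterance.
# For instance (hypothetical names), a utt2utts line 'utt1-comb2 utt1 utt2'
# maps the new utterance to the original utterances it was built from, while
# utt2spk and utt2dur give the new utterance's speaker and total duration.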
  
utils/utt2spk_to_spk2utt.pl < $dir/utt2spk > $dir/spk2utt
  
# create the feats.scp.
# if a line of utt2utts is like 'utt2-comb2 utt2 utt3', then
# utils/apply_map.pl will create a line that looks like
# 'utt2-comb2 foo.ark:4315 foo.ark:431423'
# and the awk command turns that into a suitable command line like:
# 'utt2-comb2 concat-feats --print-args=false foo.ark:4315 foo.ark:431423 - |'
utils/apply_map.pl -f 2- $srcdir/feats.scp <$dir/utt2utts | \
  awk '{if (NF<=2){print;} else { $1 = $1 " concat-feats --print-args=false"; $NF = $NF " - |"; print; }}' > $dir/feats.scp
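# (concat-feats is a Kaldi binary that appends feature matrices in time, so
# the combined utterance's features are just the original utterances' features,
# one after the other.)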
  
# create $dir/text by concatenating the source 'text' entries for the original
# utts.
utils/apply_map.pl -f 2- $srcdir/text <$dir/utt2utts > $dir/text
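
# For instance (hypothetical entries): if utt2utts contains
# 'utt2-comb2 utt2 utt3', and the source text has 'utt2 HELLO THERE' and
# 'utt3 GOODBYE', the resulting line is 'utt2-comb2 HELLO THERE GOODBYE'.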
  
if [ -f $srcdir/utt2uniq ]; then
  # the utt2uniq file is such that if 2 utts were derived from the same original
  # utt (e.g. by speed perturbing) they map to the same 'uniq' value.  This is
  # so that we can properly hold out validation data for neural net training and
  # know that we're not training on perturbed versions of that utterance.  We
  # need to obtain the utt2uniq file so that if any 2 'new' utts contain any of
  # the same 'old' utts, their 'uniq' values are the same [but otherwise, as far
  # as possible, the 'uniq' values are different].
  #
  # we'll do this by arranging the old 'uniq' values into groups as necessary to
  # capture this property.

  # The following command creates 'uniq_sets', each line of which contains
  # a set of original 'uniq' values, and effectively we assert that they must
  # be grouped together to the same 'uniq' value.
  # the awk command prints the group of original utterance-ids that are
  # combined together into a single new utterance, and the apply_map command
  # converts those into a list of original 'uniq' values.
  awk '{$1 = ""; print;}' < $dir/utt2utts | \
    utils/apply_map.pl $srcdir/utt2uniq > $dir/uniq_sets
  
  # The next command creates $dir/uniq_to_orig_uniq, which is a map from the
  # original 'uniq' values to the 'merged' uniq values.
  # for example, if $dir/uniq_sets were to contain
  # a b
  # b c
  # d
  # then we'd obtain a uniq_to_orig_uniq file that looks like:
  # a a
  # b a
  # c a
  # d d
  # ... because a and b appear together, and b and c appear together,
  # they have to be merged into the same set, and we name that set 'a'
  # (in general, we take the lowest string in lexicographical order).
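
  # (This is effectively a connected-components computation: the inline python
  # script below repeatedly propagates the smallest 'uniq' value across each
  # pair until nothing changes.  A union-find structure would be asymptotically
  # faster, but this simple fixed-point iteration is adequate for typical
  # data-directory sizes.)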
  
  cat $dir/uniq_sets | LC_ALL=C python -c '
import sys
uniq2orig_uniq = dict()
equal_pairs = set()  # set of 2-tuples (a,b) which should have equal orig_uniq
while True:
    line = sys.stdin.readline()
    if line == "": break
    split_line = line.split() # list of uniq strings that should map to the same set
    # initialize uniq2orig_uniq to the identity mapping
    for uniq in split_line: uniq2orig_uniq[uniq] = uniq
    for a in split_line[1:]: equal_pairs.add((split_line[0], a))

changed = True
while changed:
    changed = False
    for a,b in equal_pairs:
        min_orig_uniq = min(uniq2orig_uniq[a], uniq2orig_uniq[b])
        for x in [a,b]:
            if uniq2orig_uniq[x] != min_orig_uniq:
                uniq2orig_uniq[x] = min_orig_uniq
                changed = True

for uniq in sorted(uniq2orig_uniq.keys()):
    print("{0} {1}".format(uniq, uniq2orig_uniq[uniq]))
' > $dir/uniq_to_orig_uniq
  rm $dir/uniq_sets
  
  
  # In the following command, suppose we have a line like:
  # utt1-comb2 utt1 utt2
  # .. the awk command retains only the first original utt, to give
  # utt1-comb2 utt1
  # [we can pick one arbitrarily since we know any of them would map to the same
  # orig_uniq value.]
  # the first apply_map.pl command maps 'utt1' to the 'uniq' value it mapped to
  # in $srcdir, and the second apply_map.pl command maps that to the grouped
  # 'uniq' value obtained by the inline python script above.
  awk '{print $1, $2}' < $dir/utt2utts | utils/apply_map.pl -f 2 $srcdir/utt2uniq | \
    utils/apply_map.pl -f 2 $dir/uniq_to_orig_uniq > $dir/utt2uniq
  rm $dir/uniq_to_orig_uniq
fi
  
# note: the user will have to recompute the cmvn, as the speakers may have changed.
rm $dir/cmvn.scp 2>/dev/null || true
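# (e.g. with steps/compute_cmvn_stats.sh, which regenerates cmvn.scp for the
# new directory.)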
  
utils/validate_data_dir.sh --no-wav $dir
  
if $cleanup; then
  rm $dir/utt2utts
fi