Blame view

egs/wsj/s5/utils/subset_data_dir.sh 7.12 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
  #!/bin/bash
  # Copyright 2010-2011  Microsoft Corporation
  #           2012-2013  Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0
  
  
  # This script operates on a data directory, such as in data/train/.
  # See http://kaldi-asr.org/doc/data_prep.html#data_prep_data
  # for what these directories contain.
  
  # This script creates a subset of that data, consisting of some specified
  # number of utterances.  (The selected utterances are distributed evenly
  # throughout the file, by the program ./subset_scp.pl).
  
  # There are seven options, none compatible with any other.
  
  # If you give the --per-spk option, it will attempt to select the supplied
  # number of utterances for each speaker (typically you would supply a much
  # smaller number in this case).
  
  # If you give the --speakers option, it randomly selects whole speakers until
  # the selected speakers together have at least the supplied number of utterances.
  
  # If you give the --shortest option, it will give you the n shortest utterances.
  
  # If you give the --first option, it will just give you the n first utterances.
  
  # If you give the --last option, it will just give you the n last utterances.
  
  # If you give the --spk-list or --utt-list option, it reads the
  # speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
  # in this case there is no <num-utt> positional parameter; see usage message).
  
  
  # Defaults: no selection mode chosen and three positional arguments
  # (<srcdir> <num-utt> <destdir>) expected.
  expect_args=3
  first_opt=
  spk_list=
  utt_list=
  perspk=false
  shortest=false
  speakers=false

  # Only the first argument is examined, so at most one option takes effect.
  case "$1" in
    --first | --last)
      # Remembered verbatim; it is handed to utils/subset_scp.pl later on.
      first_opt="$1"
      shift
      ;;
    --per-spk)
      perspk=true
      shift
      ;;
    --shortest)
      shortest=true
      shift
      ;;
    --speakers)
      speakers=true
      shift
      ;;
    --spk-list)
      # The list file takes the place of <num-utt>: one positional fewer.
      shift
      spk_list="$1"
      shift
      expect_args=2
      ;;
    --utt-list)
      # Same argument layout as --spk-list.
      shift
      utt_list="$1"
      shift
      expect_args=2
      ;;
    --*)
      echo "$0: invalid option '$1'"
      exit 1
      ;;
  esac
  
  # Print the usage summary and stop with status 1 unless the number of
  # remaining positional arguments equals $expect_args.
  if [ $# != $expect_args ]; then
    printf '%s\n' \
      "Usage:" \
      "  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>" \
      "  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>" \
      "  subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>" \
      "By default, randomly selects <num-utt> utterances from the data directory." \
      "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" \
      "With --per-spk, selects <num-utt> utterances per speaker, if available." \
      "With --first, selects the first <num-utt> utterances" \
      "With --last, selects the last <num-utt> utterances" \
      "With --shortest, selects the shortest <num-utt> utterances." \
      "With --spk-list, reads the speakers to keep from <speaker-list-file>" \
      "With --utt-list, reads the utterances to keep from <utt-list-file>"
    exit 1
  fi
  
  # Give the positional arguments names. With --spk-list or --utt-list there
  # is no <num-utt>, so <destdir> is the second argument instead of the third.
  srcdir="$1"
  case "$spk_list$utt_list" in
    "")
      numutt="$2"
      destdir="$3"
      ;;
    *)
      numutt=
      destdir="$2"
      ;;
  esac
  
  # Every tool started from here on runs in the C locale.
  export LC_ALL=C

  # Sanity checks: each failure prints a message and stops with status 1.

  # The source directory must provide utt2spk.
  [ ! -f $srcdir/utt2spk ] && {
    echo "$0: no such file $srcdir/utt2spk"
    exit 1
  }

  # When a count was given it may not exceed the number of lines in utt2spk.
  if [[ -n $numutt ]]; then
    if [[ $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then
      echo "$0: cannot subset to more utterances than you originally had."
      exit 1
    fi
  fi

  # --shortest measures utterance length from feats.scp, so it must exist.
  if $shortest; then
    [ ! -f $srcdir/feats.scp ] && {
      echo "$0: you selected --shortest but no feats.scp exist."
      exit 1
    }
  fi

  # Create the destination directory, including missing parents.
  if ! mkdir -p $destdir; then
    exit 1
  fi
  
  # Create $destdir/utt2spk and $destdir/spk2utt for the requested subset.
  # Exactly one branch runs, depending on the option that was parsed; the last
  # branch covers --shortest and the default / --first / --last selection.
  if [[ $spk_list ]]; then
    # --spk-list: keep the listed speakers, then derive utt2spk from spk2utt.
    utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
    utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
  elif [[ $utt_list ]]; then
    # --utt-list: keep the listed utterances, then derive spk2utt from utt2spk.
    utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1;
    utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1;
  elif $speakers; then
    # --speakers: shuffle the speakers and keep taking whole speakers while the
    # running utterance count (NF-1 per spk2utt line) is still below $numutt.
    utils/shuffle_list.pl < $srcdir/spk2utt |
      awk -v numutt="$numutt" '{ if (tot < numutt){ print; } tot += (NF-1); }' |
      sort > $destdir/spk2utt
    utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
  elif $perspk; then
    # --per-spk: a count below 1 would make the stride loop below spin forever
    # (n*(skip+1) would never exceed NF-1), so reject it up front.
    if ! [[ $numutt -ge 1 ]]; then
      echo "$0: --per-spk requires <num-utt> to be at least 1, got '$numutt'"
      exit 1
    fi
    # For each speaker keep at most n utterances, stepping through the list
    # with the largest stride "skip" for which n*skip still fits, so the picks
    # are spread over the speaker's utterances. n is passed with -v rather
    # than spliced into the program text.
    awk -v n="$numutt" '{ printf("%s ",$1);
           skip=1; while(n*(skip+1) <= NF-1) { skip++; }
           for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
           printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
    utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
  else
    if $shortest; then
      # Select $numutt shortest utterances: list the length of every
      # utterance, sort numerically on it and keep the first $numutt ids.
      . ./path.sh
      feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
      sort -n -k2 $destdir/tmp.len |
        awk '{print $1}' |
        head -n "$numutt" >$destdir/tmp.uttlist
      utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
      rm $destdir/tmp.uttlist $destdir/tmp.len
    else
      # Select $numutt utterances with utils/subset_scp.pl. $first_opt is left
      # unquoted on purpose: when empty it must vanish, not become an empty
      # argument; otherwise it is --first or --last.
      utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
    fi
    utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
  fi
  
  # Perform filtering. utt2spk and spk2utt files already exist by this point.
  # Filter by utterance: every per-utterance file found in $srcdir is reduced
  # to the utterances listed in $destdir/utt2spk; files that are absent from
  # $srcdir are skipped.
  for per_utt_file in feats.scp vad.scp utt2lang utt2dur utt2num_frames \
                      utt2uniq wav.scp utt2warp text; do
    if [ -f $srcdir/$per_utt_file ]; then
      utils/filter_scp.pl $destdir/utt2spk <$srcdir/$per_utt_file >$destdir/$per_utt_file
    fi
  done
  
  # Filter by speaker: every per-speaker file found in $srcdir is reduced to
  # the speakers listed in $destdir/spk2utt.
  for per_spk_file in spk2warp spk2gender cmvn.scp; do
    [ -f $srcdir/$per_spk_file ] || continue
    utils/filter_scp.pl $destdir/spk2utt <$srcdir/$per_spk_file >$destdir/$per_spk_file
  done
  
  # Filter by recording-id. The ids of the recordings that survive are
  # collected in the scratch file $destdir/reco, used to filter the
  # per-recording files, and removed again at the end.
  if [ -f $srcdir/segments ]; then
    utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
    # Recording-ids are in segments (second field).
    awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco
    # The next line overrides the command above for wav.scp, which would be
    # incorrect: with segments present wav.scp is keyed by recording-id.
    [ -f $srcdir/wav.scp ] &&
      utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
  elif [ -f $destdir/wav.scp ]; then
    # No segments; recording-ids are in wav.scp.
    awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco
  else
    # Neither segments nor wav.scp. Without segments each utterance is its own
    # recording, so the utterance-ids in utt2spk serve as recording-ids. This
    # avoids running awk on a missing wav.scp, which would leave the id list
    # empty and truncate every per-recording file below.
    awk '{print $1}' $destdir/utt2spk | sort | uniq >$destdir/reco
  fi

  [ -f $srcdir/reco2file_and_channel ] &&
    utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
  [ -f $srcdir/reco2dur ] &&
    utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur

  # Filter the STM file for proper sclite scoring.
  # Copy over the comments from STM file (lines starting with ";;") ahead of
  # the entries that belong to the kept recordings.
  [ -f $srcdir/stm ] &&
    (grep "^;;" $srcdir/stm
     utils/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm

  rm $destdir/reco
  
  # Carry the frame_shift file over unchanged when the source has one.
  if [ -f $srcdir/frame_shift ]; then
    cp $srcdir/frame_shift $destdir
  fi

  # Report the utt2spk line counts before and after subsetting, then finish
  # with a success status.
  num_before=$(wc -l <$srcdir/utt2spk)
  num_after=$(wc -l <$destdir/utt2spk)
  printf '%s\n' "$0: reducing #utt from $num_before to $num_after"
  exit 0