Blame view

egs/wsj/s5/utils/split_data.sh 5.11 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
  #!/bin/bash
  # Copyright 2010-2013 Microsoft Corporation
  #                     Johns Hopkins University (Author: Daniel Povey)
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Splitting mode: per speaker unless the first argument is --per-utt, in
  # which case the flag is consumed and splitting is done per utterance.
  split_per_spk=true
  case "$1" in
    --per-utt)
      split_per_spk=false
      shift
      ;;
  esac

  # Exactly two positional arguments must remain; otherwise print the usage
  # text and give up.
  if [[ $# -ne 2 ]]; then
    printf '%s\n' \
      "Usage: $0 [--per-utt] <data-dir> <num-to-split>" \
      "E.g.: $0 data/train 50" \
      "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the " \
      "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}." \
      "" \
      "This script will not split the data-dir if it detects that the output is newer than the input." \
      "By default it splits per speaker (so each speaker is in only one split dir)," \
      "but with the --per-utt option it will ignore the speaker information while splitting."
    exit 1
  fi
  
  # Positional arguments: the data directory and the number of pieces.
  data=$1
  numsplit=$2

  # Accept only values that compare as an integer greater than zero; a value
  # that is not an integer makes the comparison itself fail and is rejected too.
  [ "$numsplit" -gt 0 ] || {
    echo "Invalid num-split argument $numsplit"
    exit 1
  }
  
  # Extra option passed to some of the filter_scps.pl calls further down.
  # It is empty when splitting per speaker.  When splitting per utterance it
  # is --no-warn, to suppress warnings from filter_scps.pl about 'some input
  # lines were output to multiple files'.
  warning_opt=
  $split_per_spk || warning_opt="--no-warn"
  
  # Sanity check: compare the number of lines of utt2spk with that of
  # feats.scp and text (when those files exist) and print a warning on stdout
  # for each mismatch.  The warnings are advisory only; nothing exits here.

  # Usage: warn_on_count_mismatch <file-name> <num-lines>
  # Prints a two-line warning if $data/<file-name> exists and <num-lines>
  # differs from $nu (the number of lines of $data/utt2spk).
  # Globals read: data, nu.
  warn_on_count_mismatch() {
    local name=$1 count=$2
    if [ -f "$data/$name" ] && [ "$nu" -ne "$count" ]; then
      echo "** split_data.sh: warning, #lines in (utt2spk,$name) is ($nu,$count); you can "
      echo "** use utils/fix_data_dir.sh $data to fix this."
    fi
  }

  # 'cat file | wc -l' is kept on purpose: a missing file then counts as zero
  # lines instead of aborting the command substitution.
  nu=$(cat "$data/utt2spk" | wc -l)
  nf=$(cat "$data/feats.scp" 2>/dev/null | wc -l)
  nt=$(cat "$data/text" 2>/dev/null | wc -l) # take it as zero if no such file

  warn_on_count_mismatch feats.scp "$nf"
  warn_on_count_mismatch text "$nt"
  
  
  # Start from the per-utterance settings: no --utt2spk option, and "utt" as
  # the suffix of the split directory name (split<N>utt).
  utt2spk_opt=
  utt="utt"
  # Per-speaker splitting passes utt2spk to the splitter and uses the plain
  # split<N> directory name.
  if $split_per_spk; then
    utt2spk_opt="--utt2spk=$data/utt2spk"
    utt=""
  fi
  
  # Usage: split_is_outdated <data-dir> <first-split-dir>
  # Returns 0 (true) if the split has to be (re)generated: either
  # <first-split-dir> does not exist, or some file present in <data-dir> is
  # missing from <first-split-dir> or older there than in <data-dir>.
  # Returns 1 if the existing split is up to date.
  split_is_outdated() {
    local src=$1 first=$2 f
    [ -d "$first" ] || return 0
    # utt2dur and utt2num_frames are in this list because they are split
    # further down as well; without them, adding one of those files to the
    # data dir after a split would never trigger a re-split.
    for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \
      vad.scp segments reco2file_and_channel utt2lang utt2dur utt2num_frames; do
      if [[ -f $src/$f && ( ! -f $first/$f || $first/$f -ot $src/$f ) ]]; then
        return 0
      fi
    done
    return 1
  }

  # Only the first piece is inspected to decide whether the split is current.
  s1=$data/split${numsplit}${utt}/1
  if split_is_outdated "$data" "$s1"; then
    need_to_split=true
  else
    need_to_split=false
  fi

  # Nothing to do when the existing split is up to date.
  if ! $need_to_split; then
    exit 0;
  fi
  
  # Common prefix of all per-piece output directories.
  split_root=$data/split${numsplit}${utt}

  # One entry per piece (1..numsplit): the utt2spk file of each piece, and
  # the directory of each piece.
  utt2spks=$(for n in $(seq $numsplit); do echo $split_root/$n/utt2spk; done)

  directories=$(for n in $(seq $numsplit); do echo $split_root/$n; done)

  # Create all directories with a single mkdir; if that fails (e.g. because
  # the argument list is too long), create them one at a time instead.
  mkdir -p $directories >/dev/null 2>&1 || {
    for n in $(seq $numsplit); do
      mkdir -p $split_root/$n
    done
  }
  
  # Run 'lockfile' on $data/.split_lock when that utility can be found.  If
  # it is not installed we simply carry on without a lock; it's not a big deal.
  if which lockfile >/dev/null 2>&1; then
    lockfile -l 60 $data/.split_lock
  fi
  # Remove the lock file when the script exits or receives one of these signals.
  trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM
  
  # Distribute utt2spk over the per-piece utt2spk files ($utt2spk_opt is only
  # non-empty when splitting per speaker); abort if the splitter fails.
  if ! utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks; then
    exit 1
  fi

  # Derive the spk2utt of every piece from that piece's own utt2spk.
  for n in $(seq $numsplit); do
    dsn=$data/split${numsplit}${utt}/$n
    if ! utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt; then
      exit 1
    fi
  done
  
  # Without a segments file, wav.scp is indexed per utterance, so it joins
  # the list of per-utterance files below; otherwise it is left out here.
  maybe_wav_scp=
  [ ! -f $data/segments ] && maybe_wav_scp=wav.scp

  # Files indexed by utterance: for each one that exists, filter it against
  # the utt2spk of every piece (JOB stands for the piece number).
  for f in feats.scp text vad.scp utt2lang $maybe_wav_scp utt2dur utt2num_frames; do
    [ -f $data/$f ] || continue
    utils/filter_scps.pl JOB=1:$numsplit \
      $data/split${numsplit}${utt}/JOB/utt2spk $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1;
  done
  
  # Files indexed by speaker: for each one that exists, filter it against
  # the spk2utt of every piece (JOB stands for the piece number).
  job_dir=$data/split${numsplit}${utt}/JOB
  for f in spk2gender spk2warp cmvn.scp; do
    [ -f $data/$f ] || continue
    utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
      $job_dir/spk2utt $data/$f $job_dir/$f || exit 1;
  done
  
  # With a segments file present, split it per piece and then filter the
  # recording-indexed files by the recording-ids each piece refers to.
  if [ -f $data/segments ]; then
    job_dir=$data/split${numsplit}${utt}/JOB
    utils/filter_scps.pl JOB=1:$numsplit \
      $job_dir/utt2spk $data/segments $job_dir/segments || exit 1

    # Per piece, collect the distinct values of the second field of its
    # segments file (the recording-ids) into a temporary tmp.reco file.
    for n in $(seq $numsplit); do
      dsn=$data/split${numsplit}${utt}/$n
      awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco
    done

    # Filter each recording-indexed file that exists against tmp.reco.
    for f in reco2file_and_channel wav.scp; do
      [ -f $data/$f ] || continue
      utils/filter_scps.pl $warning_opt JOB=1:$numsplit \
        $job_dir/tmp.reco $data/$f $job_dir/$f || exit 1
    done

    # The recording-id lists were only needed for the filtering above.
    for f in $data/split${numsplit}${utt}/*/tmp.reco; do rm $f; done
  fi

  exit 0