Blame view

egs/wsj/s5/steps/cleanup/split_long_utterance.sh 4.96 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
  #!/bin/bash
  
  # Copyright 2014  Guoguo Chen
  # Apache 2.0
  
  # Begin configuration section.
  # Defaults for the options listed in the usage message below; presumably
  # parse_options.sh overrides them from --seg-length, --min-seg-length and
  # --overlap-length (confirm against that script).
  seg_length=30        # length of each new segment, in seconds
  min_seg_length=10    # minimal segment length (same unit as seg_length)
  overlap_length=5     # overlap between consecutive segments, in seconds
  # End configuration section.
  
  # Log the command line this script was invoked with.
  echo "$0 $*"
  
  [ -f ./path.sh ] && . ./path.sh
  . parse_options.sh || exit 1;
  
  # Exactly two positional arguments must remain after option parsing.
  if [ $# -ne 2 ]; then
    echo "This script truncates the long audio into smaller overlapping segments"
    echo ""
    echo "Usage: $0 [options] <input-dir> <output-dir>"
    echo " e.g.: $0 data/train_si284_long data/train_si284_split"
    echo ""
    echo "Options:"
    echo "    --min-seg-length        # minimal segment length"
    echo "    --seg-length            # length of segments in seconds."
    echo "    --overlap-length        # length of overlap in seconds."
    exit 1;
  fi
  
  input_dir=$1
  output_dir=$2
  
  # All of these files must exist in the input directory. The path is quoted so
  # that directories containing spaces or glob characters are handled correctly.
  for f in spk2utt text utt2spk wav.scp; do
    if [ ! -f "$input_dir/$f" ]; then
      echo "$0: no such file $input_dir/$f"
      exit 1;
    fi
  done
  
  # Consecutive segments start (seg_length - overlap_length) seconds apart, so
  # the segment length has to be strictly larger than the overlap.
  if [ ! "$seg_length" -gt "$overlap_length" ]; then
    echo "$0: --seg-length should be longer than --overlap-length."
    exit 1;
  fi
  
  # Checks if sox is on the path. "command -v" is the shell builtin replacement
  # for the external "which" program.
  if ! sox=$(command -v sox); then
    echo "$0: sox command not found."
    exit 1;
  fi
  sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  if [ ! -x "$sph2pipe" ]; then
    echo "$0: sph2pipe command not found."
    exit 1;
  fi
  
  mkdir -p "$output_dir" || exit 1;
  # spk2gender is optional: copy it only when the input directory has one.
  if [ -f "$input_dir/spk2gender" ]; then
    cp -f "$input_dir/spk2gender" "$output_dir/spk2gender"
  fi
  # The original transcripts are kept under a different name (text.orig).
  cp -f "$input_dir/text" "$output_dir/text.orig" || exit 1;
  cp -f "$input_dir/wav.scp" "$output_dir/wav.scp" || exit 1;
  
  # We assume the audio length in header is correct and get it from there. It is
  # a little bit annoying that old version of sox does not support the following:
  #   $audio_cmd | sox --i -D
  # we have to put it in the following format for the old versions:
  #   $sox --i -D "|$audio_cmd"
  # Another way is to count all the samples to get the duration, but it takes
  # longer time, so we do not use it here. The command is:
  #   $audio_cmd | sox -t wav - -n stat | grep -P "^Length" | awk '{print $1;}'
  #
  # Note: in the wsj example the process takes couple of minutes because of the
  #       audio file concatenation; in a real case it should be much faster since
  #       it just reads the header.
  #
  # The perl program below reads wav.scp on stdin and takes these arguments:
  #   <utt2spk-in> <utt2spk-out> <segments-in> <segments-out> <orig2utt-out>
  #   <sox> <seg-length> <min-seg-length> <overlap-length>
  # It writes:
  #   utt2spk  : "<new-segment-id> <speaker>" per new segment
  #   segments : "<new-segment-id> <wav-id> <start> <end>" per new segment
  #   orig2utt : "<original-id> <new-segment-id> ..." per original utterance
  perl -e '
    $no_orig_seg = "false";       # Original segment file may or may not exist.
    ($u2s_in, $u2s_out, $seg_in,
     $seg_out, $orig2utt, $sox, $slen, $mslen, $olen) = @ARGV;
    open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
    open(UO, ">$u2s_out") || die "Error: fail to open $u2s_out\n";
    open(SI, "<$seg_in") || ($no_orig_seg = "true");
    open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n";
    open(UMAP, ">$orig2utt") || die "Error: fail to open $orig2utt\n";
    # If the original segment file exists, we have to work out the segment
    # duration from the segment file. Otherwise we work that out from the wav.scp
    # file.
    if ($no_orig_seg eq "false") {
      while (<SI>) {
        chomp;
        @col = split;
        @col == 4 || die "Error: bad line $_\n";
        ($seg_id, $wav_id, $seg_start, $seg_end) = @col;
        $seg2wav{$seg_id} = $wav_id;
        $seg_start{$seg_id} = $seg_start;
        $seg_end{$seg_id} = $seg_end;
      }
    } else {
      while (<STDIN>) {
        chomp;
        @col = split;
        @col >= 2 || die "Error: bad line $_ in wav.scp\n";
        if ((@col > 2) &&  ($col[-1] eq "|")) {
          # Piped entry "<wav-id> <command ...> |": let sox run the command.
          $wav_id = shift @col; pop @col;
          $audio_cmd = join(" ", @col);
          $duration = `$sox --i -D '\''|$audio_cmd'\''`;
        } else {
          # Plain entry "<wav-id> <audio-file>".
          @col == 2 || die "Error: bad line $_ in wav.scp\n";
          $wav_id = $col[0];
          $audio_file = $col[1];
          # Quoted so that the shell run by the backticks does not interpret
          # special characters in the file name.
          $duration = `$sox --i -D '\''$audio_file'\''`;
        }
        chomp($duration);
        if ($duration eq "") {
          # Without a duration no segment is generated for this recording.
          print STDERR "Warning: failed to get the duration of $wav_id\n";
        }
        # Without a segments file each recording is a single segment that
        # covers the whole audio.
        $seg2wav{$wav_id} = $wav_id;
        $seg_start{$wav_id} = 0;
        $seg_end{$wav_id} = $duration;
      }
    }
    while (<UI>) {
      chomp;
      @col = split;
      @col == 2 || die "Error: bad line $_\n";
      $utt2spk{$col[0]} = $col[1];
    }
    # Distance between the start times of two consecutive new segments.
    $step = $slen - $olen;
    foreach $seg (sort keys %seg2wav) {
      $index = 0;
      print UMAP "$seg";
      while ($seg_start{$seg} + $index * $step < $seg_end{$seg}) {
        $new_seg = $seg . "_" . sprintf("%05d", $index);
        $start = $seg_start{$seg} + $index * $step;
        $end = $start + $slen;
        defined($utt2spk{$seg}) || die "Error: speaker not found for $seg\n";
        print UO "$new_seg $utt2spk{$seg}\n";
        print UMAP " $new_seg";
        $index += 1;
        # The next segment would start at $end - $olen. If no more than $mslen
        # seconds remain from there, stretch (or clip) this segment to the end
        # of the original one and stop, so that no overly short segment is
        # produced.
        $is_last = ($end - $olen + $mslen >= $seg_end{$seg});
        $end = $seg_end{$seg} if $is_last;
        print SO "$new_seg $seg2wav{$seg} $start $end\n";
        last if $is_last;
      }
      print UMAP "\n";
    }' "$input_dir/utt2spk" "$output_dir/utt2spk" \
      "$input_dir/segments" "$output_dir/segments" "$output_dir/orig2utt" \
      "$sox" "$seg_length" "$min_seg_length" "$overlap_length" \
      < "$output_dir/wav.scp" || exit 1;
  
  # CAVEAT: We are not dealing with channels here. Each channel should have a
  # unique file name in wav.scp.
  # Every recording id (first field of wav.scp) is mapped to itself on channel
  # "A". awk splits on any whitespace, so tab-separated wav.scp files are
  # handled as well.
  awk '{print $1" "$1" A";}' "$output_dir/wav.scp" \
    > "$output_dir/reco2file_and_channel"
  
  # Propagate a failure of the final clean-up step instead of always
  # reporting success.
  utils/fix_data_dir.sh "$output_dir" || exit 1;
  
  exit 0;