Blame view
egs/wsj/s5/steps/cleanup/split_long_utterance.sh
4.96 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
#!/bin/bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Splits the (long) utterances of a data directory into shorter, overlapping
# segments and writes a new data directory.
#
# Inputs  (in <input-dir>):  spk2utt, text, utt2spk, wav.scp are required;
#                            segments and spk2gender are used when present.
# Outputs (in <output-dir>): wav.scp (copied), text.orig (copy of text),
#                            spk2gender (copied if present), utt2spk, segments,
#                            orig2utt (original id followed by its new
#                            segment ids) and reco2file_and_channel.
# Requires sox on the PATH, sph2pipe under $KALDI_ROOT/tools, and
# utils/fix_data_dir.sh relative to the current directory.

# Begin configuration section.
seg_length=30
min_seg_length=10
overlap_length=5
# End configuration section.

echo "$0 $*"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  echo "This script truncates the long audio into smaller overlapping segments"
  echo ""
  echo "Usage: $0 [options] <input-dir> <output-dir>"
  echo " e.g.: $0 data/train_si284_long data/train_si284_split"
  echo ""
  echo "Options:"
  echo " --min-seg-length # minimal segment length"
  echo " --seg-length # length of segments in seconds."
  echo " --overlap-length # length of overlap in seconds."
  exit 1;
fi

input_dir=$1
output_dir=$2

for f in spk2utt text utt2spk wav.scp; do
  if [ ! -f "$input_dir/$f" ]; then
    echo "$0: no such file $input_dir/$f" >&2
    exit 1;
  fi
done

# The step between consecutive segments is seg_length - overlap_length; if it
# is not strictly positive the splitting loop below may never terminate.  The
# comparison is done in awk so that fractional values are compared correctly
# and so that a malformed value fails the check instead of slipping through.
if ! awk -v s="$seg_length" -v o="$overlap_length" \
     'BEGIN { exit !(s + 0 > o + 0) }'; then
  echo "$0: --seg-length should be longer than --overlap-length." >&2
  exit 1;
fi

# Checks if sox is on the path.
if ! sox=$(command -v sox); then
  echo "$0: sox command not found." >&2
  exit 1;
fi

# sph2pipe is not called here directly; presumably it is needed by the piped
# commands found in wav.scp, which sox executes to read the audio header.
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "$0: sph2pipe command not found." >&2
  exit 1;
fi

mkdir -p "$output_dir" || exit 1;
# spk2gender is optional, so a failure to copy it is deliberately ignored.
cp -f "$input_dir/spk2gender" "$output_dir/spk2gender" 2>/dev/null
cp -f "$input_dir/text" "$output_dir/text.orig" || exit 1;
cp -f "$input_dir/wav.scp" "$output_dir/wav.scp" || exit 1;

# We assume the audio length in header is correct and get it from there. It is
# a little bit annoying that old versions of sox do not support the following:
#   $audio_cmd | sox --i -D
# we have to put it in the following format for the old versions:
#   $sox --i -D "|$audio_cmd"
# Another way is to count all the samples to get the duration, but it takes
# longer, so we do not use it here. The command is:
#   $audio_cmd | sox -t wav - -n stat | grep -P "^Length" | awk '{print $1;}'
#
# Note: in the wsj example the process takes a couple of minutes because of the
# audio file concatenation; in a real case it should be much faster since
# it just reads the header.
#
# NOTE: the perl program below lives inside a single-quoted shell string, so
# it must not contain a bare single quote (the sox call uses the usual
# close-escape-reopen sequence to embed one).
perl -e '
  $no_orig_seg = "false";       # Original segment file may or may not exist.
  ($u2s_in, $u2s_out, $seg_in,
   $seg_out, $orig2utt, $sox, $slen, $mslen, $olen) = @ARGV;
  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
  open(UO, ">$u2s_out") || die "Error: fail to open $u2s_out\n";
  open(SI, "<$seg_in") || ($no_orig_seg = "true");
  open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n";
  open(UMAP, ">$orig2utt") || die "Error: fail to open $orig2utt\n";

  # If the original segment file exists, we have to work out the segment
  # duration from the segment file. Otherwise we work that out from the
  # wav.scp file, which is read from the standard input.
  if ($no_orig_seg eq "false") {
    while (<SI>) {
      chomp;
      @col = split;
      @col == 4 || die "Error: bad line $_\n";
      ($seg_id, $wav_id, $seg_start, $seg_end) = @col;
      $seg2wav{$seg_id} = $wav_id;
      $seg_start{$seg_id} = $seg_start;
      $seg_end{$seg_id} = $seg_end;
    }
  } else {
    while (<STDIN>) {
      chomp;
      @col = split;
      @col >= 2 || die "Error: bad line $_\n";
      if ((@col > 2) && ($col[-1] eq "|")) {
        # Piped entry: "<wav-id> <command ...> |".  Strip the id and the
        # trailing pipe and let sox run the command to read the header.
        $wav_id = shift @col;
        pop @col;
        $audio_cmd = join(" ", @col);
        $duration = `$sox --i -D '\''|$audio_cmd'\''`;
      } else {
        # Plain entry: "<wav-id> <audio-file>".
        @col == 2 || die "Error: bad line $_ in wav.scp";
        $wav_id = $col[0];
        $audio_file = $col[1];
        $duration = `$sox --i -D $audio_file`;
      }
      chomp($duration);
      # An empty or non-numeric duration means sox could not read the header;
      # the loop further down then produces no segments for this recording,
      # so at least tell the user about it.
      ($duration =~ /^[0-9.]+$/) ||
        print STDERR "Warning: could not get the duration of $wav_id " .
                     "from sox, no segments will be created for it\n";
      $seg2wav{$wav_id} = $wav_id;
      $seg_start{$wav_id} = 0;
      $seg_end{$wav_id} = $duration;
    }
  }

  while (<UI>) {
    chomp;
    @col = split;
    @col == 2 || die "Error: bad line $_\n";
    $utt2spk{$col[0]} = $col[1];
  }

  foreach $seg (sort keys %seg2wav) {
    $index = 0;
    $step = $slen - $olen;      # Distance between two segment start times.
    print UMAP "$seg";
    while ($seg_start{$seg} + $index * $step < $seg_end{$seg}) {
      $new_seg = $seg . "_" . sprintf("%05d", $index);
      $start = $seg_start{$seg} + $index * $step;
      $end = $start + $slen;
      defined($utt2spk{$seg}) || die "Error: speaker not found for $seg\n";
      print UO "$new_seg $utt2spk{$seg}\n";
      print UMAP " $new_seg";
      $index += 1;
      if ($end - $olen + $mslen >= $seg_end{$seg}) {
        # The next segment would have less than $mslen seconds of new audio,
        # so extend this one to the end of the original and stop.
        $end = $seg_end{$seg};
        print SO "$new_seg $seg2wav{$seg} $start $end\n";
        last;
      } else {
        print SO "$new_seg $seg2wav{$seg} $start $end\n";
      }
    }
    print UMAP "\n";
  }' "$input_dir/utt2spk" "$output_dir/utt2spk" \
  "$input_dir/segments" "$output_dir/segments" "$output_dir/orig2utt" \
  "$sox" "$seg_length" "$min_seg_length" "$overlap_length" \
  < "$output_dir/wav.scp" || exit 1;

# CAVEAT: We are not dealing with channels here. Each channel should have a
# unique file name in wav.scp.
# Each line is "<recording-id> <recording-id> A"; taking the id with awk works
# for both space- and tab-separated wav.scp files.
awk '{print $1" "$1" A";}' "$output_dir/wav.scp" \
  > "$output_dir/reco2file_and_channel" || exit 1;

utils/fix_data_dir.sh "$output_dir" || exit 1;

exit 0;