Blame view
egs/wsj/s5/steps/cleanup/make_segmentation_data_dir.sh
6.28 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
#!/bin/bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Takes the ctm file that corresponds to a data directory created by
# steps/cleanup/split_long_utterance.sh, works out a new segmentation and
# writes a new data directory for that segmentation.

# Begin configuration section.
max_seg_length=10
min_seg_length=2
min_sil_length=0.5
time_precision=0.05
special_symbol="<***>"
separator=";"
wer_cutoff=-1
# End configuration section.

set -e

echo "$0 $@"

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "This script takes the ctm file that corresponds to the data directory"
  echo "created by steps/cleanup/split_long_utterance.sh, works out a new"
  echo "segmentation and creates a new data directory for the new segmentation."
  echo ""
  echo "Usage: $0 [options] <ctm-file> <old-data-dir> <new-data-dir>"
  echo " e.g.: $0 train_si284_split.ctm \\"
  echo " data/train_si284_split data/train_si284_reseg"
  echo "Options:"
  echo " --wer-cutoff # ignore segments with WER higher than the"
  echo " # specified value. -1 means no segment will"
  echo " # be ignored."
  echo " --max-seg-length # maximum length of new segments"
  echo " --min-seg-length # minimum length of new segments"
  echo " --min-sil-length # minimum length of silence as split point"
  echo " --time-precision # precision for determining \"same time\""
  echo " --special-symbol # special symbol to be aligned with"
  echo " # inserted or deleted words"
  echo " --separator # separator for aligned pairs"
  exit 1;
fi

ctm=$1
old_data_dir=$2
new_data_dir=$3

# All of these inputs are required by the steps below.
for f in "$ctm" "$old_data_dir/text.orig" "$old_data_dir/utt2spk" \
  "$old_data_dir/wav.scp" "$old_data_dir/segments"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected $f to exist"
    exit 1;
  fi
done

mkdir -p "$new_data_dir/tmp/"
cp -f "$old_data_dir/wav.scp" "$new_data_dir"
# spk2gender is optional. The test must look inside $old_data_dir; testing the
# literal path "old_data_dir/spk2gender" would never find the file.
if [ -f "$old_data_dir/spk2gender" ]; then
  cp -f "$old_data_dir/spk2gender" "$new_data_dir"
fi

# Removes the overlapping region (in steps/cleanup/split_long_utterance.sh we
# create the segmentation with overlapping region).
#
# Note that for each audio file, we expect its segments have been sorted in time
# ascending order (if we ignore the overlap).
cat "$ctm" | perl -e '
  # Reads ctm entries "<wav> <channel> <start> <duration> <word> ..." from
  # stdin and prints the first five fields of every entry that is kept.
  # Entries that fall inside the overlap between consecutive segments of the
  # same recording are dropped. $ARGV[0] is the tolerance (in the time unit of
  # the ctm) used when deciding that two entries are the same word.
  use strict;
  use warnings;

  my $precision = $ARGV[0];
  my @ctm = ();             # Kept entries of the recording being processed.
  my %processed_ids = ();   # Recordings already seen, to detect unsorted input.
  my $previous_id = "";

  # Prints the buffered entries, one per line.
  sub flush_ctm {
    foreach my $line (@ctm) {
      print join(" ", @{$line}[0 .. 4]), "\n";
    }
  }

  while (<STDIN>) {
    chomp;
    my @current = split;
    @current >= 5 || die "Error: bad line $_\n";
    my $id = join("_", ($current[0], $current[1]));

    # Start of a new audio file.
    if ($previous_id ne $id) {
      # Prints existing information.
      flush_ctm();

      # Checks if the ctm file is sorted.
      if (defined($processed_ids{$id})) {
        die "Error: \"$current[0] $current[1]\" has already been processed\n";
      }
      $processed_ids{$id} = 1;

      @ctm = (\@current);
      $previous_id = $id;
      next;
    }

    # @ctm always holds at least one entry here.
    my @previous = @{$ctm[-1]};
    my $new_start = sprintf("%.2f", $previous[2] + $previous[3]);
    if ($new_start > $current[2]) {
      # The current entry starts before the last kept entry ends, so we are in
      # an overlapping region: scan backwards for a splice point, i.e. a kept
      # entry with the same word and (within the tolerance) the same start
      # time and duration.
      my $index = -1;
      while (defined($ctm[$index]) &&
             $ctm[$index]->[2] + $ctm[$index]->[3] > $current[2]) {
        if ($ctm[$index]->[4] eq $current[4] &&
            abs($ctm[$index]->[2] - $current[2]) < $precision &&
            abs($ctm[$index]->[3] - $current[3]) < $precision) {
          # Drops everything kept after the matching entry. The current entry
          # itself is not pushed, since the matching entry represents it.
          pop @ctm for 2..abs($index);
          last;
        } else {
          $index -= 1;
        }
      }
      # If no splice point was found the current entry is simply dropped.
    } else {
      push(@ctm, \@current);
    }
  }

  # Prints the entries of the last recording.
  flush_ctm();
' "$time_precision" > "$new_data_dir/tmp/ctm"

# Creates a text file from the ctm, which will be used in Levenshtein alignment.
# Note that we remove <eps> in the text file.
cat "$new_data_dir/tmp/ctm" | perl -e '
  # Reads ctm entries "<wav> <channel> <start> <duration> <word>" from stdin
  # and prints one line "<wav> <word1> <word2> ..." per recording, in input
  # order, leaving out every <eps> entry.
  use strict;
  use warnings;

  my $previous_wav;
  my $previous_channel;
  my @words = ();

  while (<STDIN>) {
    chomp;
    my @col = split;
    @col >= 5 || die "Error: bad line $_\n";

    if (defined($previous_wav) && $previous_wav eq $col[0]) {
      $previous_channel eq $col[1] ||
        die "Error: more than one channels detected\n";
    } else {
      # A new recording starts: emit the text of the previous one.
      if (defined($previous_wav)) {
        print join(" ", $previous_wav, @words), "\n";
      }
      @words = ();
      $previous_wav = $col[0];
      $previous_channel = $col[1];
    }

    # The <eps> filter is applied to every entry, including the first entry of
    # a recording. A recording that has only <eps> entries is printed as a
    # bare key with an empty word list.
    if ($col[4] ne "<eps>") {
      push(@words, $col[4]);
    }
  }

  if (defined($previous_wav)) {
    print join(" ", $previous_wav, @words), "\n";
  }
' > "$new_data_dir/tmp/text"

# Computes the Levenshtein alignment.
# The option values are quoted so that the default special symbol "<***>" is
# never subject to pathname expansion.
align-text --special-symbol="$special_symbol" --separator="$separator" \
  ark:"$old_data_dir/text.orig" ark:"$new_data_dir/tmp/text" \
  ark,t:"$new_data_dir/tmp/aligned.txt"

# Creates new segmentation.
steps/cleanup/create_segments_from_ctm.pl \
  --max-seg-length "$max_seg_length" --min-seg-length "$min_seg_length" \
  --min-sil-length "$min_sil_length" \
  --separator "$separator" --special-symbol "$special_symbol" \
  --wer-cutoff "$wer_cutoff" \
  "$new_data_dir/tmp/ctm" "$new_data_dir/tmp/aligned.txt" \
  "$new_data_dir/segments" "$new_data_dir/text"

# Now creates the new utt2spk and spk2utt file.
cat "$old_data_dir/utt2spk" | perl -e '
  # Reads the old utt2spk from stdin. Arguments: the old segments file, the
  # new segments file and the utt2spk file to write. Every new segment gets
  # the speaker of the wav file it was cut from, as found through the old
  # segments file.
  use strict;
  use warnings;

  my ($old_seg_file, $new_seg_file, $utt2spk_file_out) = @ARGV;
  open(my $old_segs, "<", $old_seg_file) ||
    die "Error: fail to open $old_seg_file\n";
  open(my $new_segs, "<", $new_seg_file) ||
    die "Error: fail to open $new_seg_file\n";
  open(my $utt2spk_out, ">", $utt2spk_file_out) ||
    die "Error: fail to open $utt2spk_file_out\n";

  # Old utterance id -> speaker id.
  my %utt2spk = ();
  while (<STDIN>) {
    chomp;
    my @col = split;
    @col == 2 || die "Error: bad line $_\n";
    $utt2spk{$col[0]} = $col[1];
  }

  # Wav id -> speaker id, derived from the old segments.
  my %wav2spk = ();
  while (<$old_segs>) {
    chomp;
    my @col = split;
    @col == 4 || die "Error: bad line $_\n";
    my $spk = $utt2spk{$col[0]};
    # Old segments without a speaker entry contribute nothing.
    next unless defined($spk);
    if (defined($wav2spk{$col[1]})) {
      # Speaker ids are strings, so they must be compared with "eq"; a
      # numeric comparison treats all non-numeric ids as equal.
      $wav2spk{$col[1]} eq $spk ||
        die "Error: multiple speakers detected for wav file $col[1]\n";
    } else {
      $wav2spk{$col[1]} = $spk;
    }
  }
  close($old_segs);

  while (<$new_segs>) {
    chomp;
    my @col = split;
    @col == 4 || die "Error: bad line $_\n";
    defined($wav2spk{$col[1]}) ||
      die "Error: could not find speaker for wav file $col[1]\n";
    print $utt2spk_out "$col[0] $wav2spk{$col[1]}\n";
  }
  close($new_segs);

  # Buffered write errors only surface when the handle is closed.
  close($utt2spk_out) || die "Error: fail to close $utt2spk_file_out\n";
' "$old_data_dir/segments" "$new_data_dir/segments" "$new_data_dir/utt2spk"

utils/utt2spk_to_spk2utt.pl "$new_data_dir/utt2spk" > "$new_data_dir/spk2utt"

utils/fix_data_dir.sh "$new_data_dir"

exit 0;