Blame view
Scripts/utils/split_scp.pl
8.3 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
#!/usr/bin/perl -w
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each piece.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can.  If you use
# the utt2spk option it will make sure these chunks coincide with
# speaker boundaries.  In this case, if there are more chunks
# than speakers (and in some other circumstances), some of the
# resulting chunks will be empty; the script then prints an error
# message for each empty chunk and, after writing all the files,
# exits with status 1.

# You will normally call this like:
#  split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
#  split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the script like:
#  split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Command-line parsing.
# ---------------------------------------------------------------------------

my $num_jobs     = 0;     # 0 means "-j was not given".
my $job_id       = 0;
my $utt2spk_file = "";    # "" means "--utt2spk was not given".

# "-j" and "--utt2spk=" may appear in either order, hence two passes.
for my $pass (1 .. 2) {
    if (@ARGV > 0 && $ARGV[0] eq "-j") {
        shift @ARGV;
        $num_jobs = shift @ARGV;
        $job_id   = shift @ARGV;
        # Guard against "-j" being the last argument(s); otherwise the
        # numeric comparisons below would operate on undef.
        if (!defined $num_jobs || !defined $job_id) {
            die "Option -j requires two arguments: num-jobs and job-id\n";
        }
        if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
            die "Invalid num-jobs and job-id: $num_jobs and $job_id";
        }
    }
    # Anchored so that only an argument that *starts* with --utt2spk= is
    # treated as the option.
    if (@ARGV > 0 && $ARGV[0] =~ m/^--utt2spk=(.+)/) {
        $utt2spk_file = $1;
        shift @ARGV;
    }
}

if (($num_jobs == 0 && @ARGV < 2)
    || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
    die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n"
      . " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp] \n"
      . " ... where 0 <= job-id < num-jobs.\n";
}

my $inscp = shift @ARGV;

# Build the list of output destinations, one per split.
my @OUTPUTS;
if ($num_jobs == 0) {
    # Without the -j option: every remaining argument is an output file.
    @OUTPUTS = @ARGV;
}
else {
    # With -j: only split number $job_id is kept (written to the given file,
    # or to standard output via "-"); every other split goes to /dev/null.
    for (my $j = 0; $j < $num_jobs; $j++) {
        if ($j == $job_id) {
            push @OUTPUTS, (@ARGV > 0 ? $ARGV[0] : "-");
        }
        else {
            push @OUTPUTS, "/dev/null";
        }
    }
}

my $error = $utt2spk_file ne ""
    ? split_by_speaker($inscp, $utt2spk_file, @OUTPUTS)
    : split_evenly($inscp, @OUTPUTS);

exit($error ? 1 : 0);

# ---------------------------------------------------------------------------
# Helpers.
# ---------------------------------------------------------------------------

# Opens $path with $mode ('<' or '>') using three-argument open, so that no
# character in the file name can be interpreted as an open mode.  The name
# "-" is kept as an alias for standard input / standard output, which the
# -j mode of this script relies on.
# Returns the file handle, or nothing on failure (with $! set by open).
sub open_path {
    my ($mode, $path) = @_;
    my $fh;
    if ($path eq "-") {
        my $std = ($mode eq "<") ? \*STDIN : \*STDOUT;
        open($fh, "$mode&", $std) or return;
    }
    else {
        open($fh, $mode, $path) or return;
    }
    return $fh;
}

# Splits $inscp into the files named in @outputs such that all utterances
# of one speaker land in the same output, keeping the outputs as balanced
# in utterance count as possible.  $utt2spk_file has "utterance speaker" on
# each line; the first field of each line of $inscp is the utterance-id.
# Returns 1 if any output ended up empty (an error is printed for each),
# 0 otherwise.  Dies on I/O failures or malformed input.
sub split_by_speaker {
    my ($inscp, $utt2spk_file, @outputs) = @_;
    my $error = 0;

    # Read the utterance -> speaker map.
    my %utt2spk;
    my $utt2spk_fh = open_path("<", $utt2spk_file)
        or die "Failed to open utt2spk file $utt2spk_file: $!";
    while (my $line = <$utt2spk_fh>) {
        my @fields = split ' ', $line;
        @fields == 2 || die "Bad line $line in utt2spk file $utt2spk_file";
        my ($utt, $spk) = @fields;
        $utt2spk{$utt} = $spk;
    }
    close($utt2spk_fh);

    # Group the input lines by speaker, remembering the order in which
    # speakers were first seen.
    my @spkrs;
    my %spk_count;    # speaker -> number of input lines
    my %spk_data;     # speaker -> concatenated input lines
    my $in_fh = open_path("<", $inscp)
        or die "Opening input scp file $inscp: $!";
    while (my $line = <$in_fh>) {
        my @fields = split ' ', $line;
        if (@fields == 0) {
            die "Empty or space-only line in scp file $inscp";
        }
        my $utt = $fields[0];
        my $spk = $utt2spk{$utt};
        if (!defined $spk) {
            die "No such utterance $utt in utt2spk file $utt2spk_file";
        }
        if (!defined $spk_count{$spk}) {
            push @spkrs, $spk;
            $spk_count{$spk} = 0;
            $spk_data{$spk}  = "";
        }
        $spk_count{$spk}++;
        $spk_data{$spk} .= $line;
    }
    close($in_fh);

    # Now split as equally as possible ..
    # First allocate speakers to files by allocating an approximately
    # equal number of speakers to each.
    my $numspks = @spkrs;      # number of speakers.
    my $numscps = @outputs;    # number of output files.
    my @scparray;              # per output: array ref of its speakers
    my @scpcount;              # per output: number of utterances
    for my $scpidx (0 .. $numscps - 1) {
        $scparray[$scpidx] = [];
        # Explicit zero so that outputs which get no speaker still have a
        # defined count in the balancing arithmetic below.
        $scpcount[$scpidx] = 0;
    }
    for my $spkidx (0 .. $numspks - 1) {
        my $scpidx = int(($spkidx * $numscps) / $numspks);
        my $spk    = $spkrs[$spkidx];
        push @{ $scparray[$scpidx] }, $spk;
        $scpcount[$scpidx] += $spk_count{$spk};
    }

    # Now try to reassign beginning + ending speakers to neighbouring
    # scp's and see if it gets more balanced.
    # Suppose the objf we're minimizing is
    #   sum_i (num utts in scp[i] - average)^2.
    # We can show that if considering changing just 2 scp's, we minimize
    # this by minimizing the squared difference in sizes.  This is
    # equivalent to minimizing the absolute difference in sizes.  This
    # shows this method is bound to converge.
    my $changed = 1;
    while ($changed) {
        $changed = 0;
        for my $scpidx (0 .. $numscps - 1) {
            # First try to move the last speaker of this scp to the next.
            if ($scpidx < $numscps - 1) {
                my $sz = @{ $scparray[$scpidx] };
                if ($sz > 0) {
                    my $spk   = $scparray[$scpidx]->[ $sz - 1 ];
                    my $count = $spk_count{$spk};
                    my $nutt1 = $scpcount[$scpidx];
                    my $nutt2 = $scpcount[ $scpidx + 1 ];
                    if (abs(($nutt2 + $count) - ($nutt1 - $count))
                        < abs($nutt2 - $nutt1)) {
                        # Would decrease size-diff by reassigning spk...
                        $scpcount[ $scpidx + 1 ] += $count;
                        $scpcount[$scpidx]       -= $count;
                        pop @{ $scparray[$scpidx] };
                        unshift @{ $scparray[ $scpidx + 1 ] }, $spk;
                        $changed = 1;
                    }
                }
            }
            # Then try to move the first speaker of this scp to the
            # previous one.
            if ($scpidx > 0 && @{ $scparray[$scpidx] } > 0) {
                my $spk   = $scparray[$scpidx]->[0];
                my $count = $spk_count{$spk};
                my $nutt1 = $scpcount[ $scpidx - 1 ];
                my $nutt2 = $scpcount[$scpidx];
                if (abs(($nutt2 - $count) - ($nutt1 + $count))
                    < abs($nutt2 - $nutt1)) {
                    # Would decrease size-diff by reassigning spk...
                    $scpcount[ $scpidx - 1 ] += $count;
                    $scpcount[$scpidx]       -= $count;
                    shift @{ $scparray[$scpidx] };
                    push @{ $scparray[ $scpidx - 1 ] }, $spk;
                    $changed = 1;
                }
            }
        }
    }

    # Now print out the files...
    for my $scpidx (0 .. $numscps - 1) {
        my $scpfn  = $outputs[$scpidx];
        my $out_fh = open_path(">", $scpfn)
            or die "Could not open scp file $scpfn for writing: $!";
        if (@{ $scparray[$scpidx] } == 0) {
            # The (empty) file is still created; the problem is reported
            # through the return value.
            print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?) \n";
            $error = 1;
        }
        else {
            my $count = 0;
            for my $spk (@{ $scparray[$scpidx] }) {
                print {$out_fh} $spk_data{$spk};
                $count += $spk_count{$spk};
            }
            if ($count != $scpcount[$scpidx]) {
                die "Count mismatch [code error]";
            }
        }
        # Buffered write errors only surface at close, so check it.
        close($out_fh) || die "Closing scp file $scpfn: $!";
    }

    return $error;
}

# The "normal" case where there is no --utt2spk option: splits the lines of
# $inscp into the files named in @outputs in consecutive, nearly equal
# chunks (the first few outputs get one extra line when the division is not
# exact).  Dies if there are fewer lines than outputs, or on I/O failure.
# Returns the error flag (1 if the input was empty, although in that case
# the "too many pieces" check below dies before returning).
sub split_evenly {
    my ($inscp, @outputs) = @_;
    my $error = 0;

    my $in_fh = open_path("<", $inscp)
        or die "Opening input scp file $inscp: $!";
    my @lines = <$in_fh>;
    close($in_fh);

    my $numscps  = @outputs;
    my $numlines = @lines;
    if ($numlines == 0) {
        print STDERR "split_scp.pl: error: empty input scp file $inscp\n";
        $error = 1;
    }

    my $linesperscp = int($numlines / $numscps);    # the "whole part"..
    $linesperscp >= 1 || die "You are splitting into too many pieces!";
    my $remainder = $numlines - ($linesperscp * $numscps);
    ($remainder >= 0 && $remainder < $numlines)
        || die "bad remainder $remainder";
    # [just doing int() rounds down].

    my $n = 0;    # index of the next line to be written
    for my $scpidx (0 .. $#outputs) {
        my $scpfile = $outputs[$scpidx];
        my $out_fh  = open_path(">", $scpfile)
            or die "Opening output scp file $scpfile: $!";
        # The first $remainder outputs take one extra line each.
        my $nlines = $linesperscp + ($scpidx < $remainder ? 1 : 0);
        print {$out_fh} @lines[ $n .. $n + $nlines - 1 ];
        $n += $nlines;
        close($out_fh) || die "Closing scp file $scpfile: $!";
    }
    $n == $numlines || die "split_scp.pl: code error., $n != $numlines";

    return $error;
}