Blame view
egs/wsj/s5/utils/subset_scp.pl
2.72 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
#!/usr/bin/env perl
use strict;
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program selects a subset of N elements in the scp.
# By default, it selects them evenly from throughout the scp, in order to avoid
# selecting too many from the same speaker.  It prints them on the standard
# output.
# With the option --first, it just selects the N first utterances.
# With the option --last, it just selects the N last utterances.
# With the option --quiet, asking for more lines than the scp contains is not
# an error: every line is printed instead.

# Last modified by JHU & HKUST @2013

my $quiet = 0;
my $first = 0;
my $last  = 0;

# Consume the leading option flags.  They are accepted in any order (the
# documented order "--quiet" then "--first|--last" keeps working).  If both
# --first and --last are given, --first takes precedence.
while (@ARGV > 0 && $ARGV[0] =~ /\A--(quiet|first|last)\z/) {
    my $option = $1;
    shift @ARGV;
    if ($option eq 'quiet') {
        $quiet = 1;
    }
    elsif ($option eq 'first') {
        $first = 1;
    }
    else {
        $last = 1;
    }
}

if (@ARGV < 2) {
    # The trailing newline keeps perl from appending "at ... line N." to
    # the usage text.
    die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n"
      . " --quiet causes it to not die if N > num lines in scp.\n"
      . " --first and --last make it equivalent to head or tail.\n"
      . "See also: filter_scp.pl\n";
}

# N must be a plain positive decimal integer.  Surrounding whitespace is
# tolerated (e.g. the output of "wc -l" on some systems); negative numbers,
# fractions and trailing garbage are rejected rather than silently producing
# empty or odd output.
my $n_arg = shift @ARGV;
my ($N) = $n_arg =~ /\A\s*(\d+)\s*\z/;
if (!defined $N || $N == 0) {
    die "First command-line parameter to subset_scp.pl must be a positive integer, got \"$n_arg\"";
}
$N += 0;    # normalise e.g. "007" to the number 7

# Read the whole scp into memory, one element per line (newline included).
# "-" means standard input; this keeps what the previous two-argument open
# of "<-" did.
my $inscp = shift @ARGV;
my $in;
if ($inscp eq '-') {
    $in = \*STDIN;
}
else {
    open($in, '<', $inscp) or die "Opening input scp file $inscp: $!";
}
my @lines = <$in>;
close($in) if $inscp ne '-';

my $numlines = @lines;

if ($N > $numlines) {
    if ($quiet) {
        $N = $numlines;
    }
    else {
        die "You requested from subset_scp.pl more elements than available: $N > $numlines";
    }
}

# select_n($start, $end, $num_needed)
#
# Returns, in increasing order, $num_needed indices drawn from the half-open
# range [$start, $end).  The range and the quota are both halved recursively
# (the second half receives the extra one when either is odd), which spreads
# the chosen indices across the whole range instead of clustering them.
#
# Dies with "select_n: code error" if more indices are requested than the
# range contains.
sub select_n {
    my ($start, $end, $num_needed) = @_;
    my $diff = $end - $start;
    if ($num_needed > $diff) {
        die "select_n: code error";
    }

    # Nothing wanted from this range: stop here.  This prunes empty subtrees
    # and also guarantees termination on an empty range.
    return () if $num_needed <= 0;

    # A single-element range with a non-zero quota yields that element.
    return ($start) if $diff == 1;

    my $halfdiff   = int($diff / 2);
    my $halfneeded = int($num_needed / 2);
    return (
        select_n($start, $start + $halfdiff, $halfneeded),
        select_n($start + $halfdiff, $end, $num_needed - $halfneeded),
    );
}

# At this point 0 <= $N <= $numlines, so all the slices below are in range
# (and empty when $N is 0).
if ($first) {
    # --first option: same as head.
    print @lines[0 .. $N - 1];
}
elsif ($last) {
    # --last option: same as tail.
    print @lines[$numlines - $N .. $numlines - 1];
}
elsif ($N > 0) {
    print @lines[select_n(0, $numlines, $N)];
}