Blame view
egs/fisher_callhome_spanish/s5/local/fsp_ideal_data_partitions.pl
2.39 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
#!/usr/bin/env perl
#
# Johns Hopkins University (Author : Gaurav Kumar)
#
# This script should be run from one directory above the current one
#
# Rough partitions that are needed are :
#
# ASR Train : 120k utterances
# ASR tune : 20k utterances
# ASR eval : 20k utterances
# MT train : 105k utterances
# MT tune : Same as the ASR eval (20k utterances)
# MT eval : 20k utterances
#
# This script tries to find the closest possible matches so that conversations
# belong in one single partition and hence there is no speaker/conversation
# overlap between data partitions
#
# Input  : data/local/data/train_all/text, one utterance per line. The first
#          whitespace-separated field is the utterance id, and the part of
#          that id before the first '-' is taken as the conversation id.
# Output : data/local/tmp/split-<N>, one conversation id per line. The
#          conversation ids are also echoed to STDOUT.

use strict;
use warnings;

my $textfile = "data/local/data/train_all/text";
my $tmp      = "data/local/tmp";

# Utterance-count thresholds. Split N is closed at the first conversation
# boundary after its utterance count exceeds $splitNumbers[N]. Whatever is
# left once every threshold has been used goes into one final split.
my @splitNumbers = (17455, 20000, 100000, 20000, 100000);

open(my $text_fh, "<", $textfile)
    or die "Can't open text file $textfile: $!";

my $ongoingConv = "";    # conversation id of the previous line
my %tmpSplits   = ();    # conversation id => utterance count, current split
my $splitId     = 0;     # index of the split currently being filled
my %splits      = ();    # split index => array ref of conversation ids

while (my $line = <$text_fh>) {
    # Skip blank lines and lines without a usable conversation id so that an
    # undefined id is never used as a hash key.
    next unless $line =~ /^\s*([^-\s]+)/;
    my $currentConv = $1;

    if ($currentConv ne $ongoingConv) {
        # New conversation initiated: first check if there are enough entries
        # in the current split. Only close it while a threshold is still
        # defined for it; otherwise a missing threshold would compare as 0
        # and every remaining conversation would become a split of its own.
        my $entries = get_entries_hash(\%tmpSplits);
        if ($splitId < @splitNumbers && $entries > $splitNumbers[$splitId]) {
            print "Finished processing split " . $splitId . ". It contains " . $entries . " entries. \n";
            $splits{$splitId} = [ sort keys %tmpSplits ];
            %tmpSplits = ();
            $splitId += 1;
        }
        $ongoingConv = $currentConv;
    }

    # Count this utterance. Incrementing (instead of resetting to 1) keeps the
    # count right even if a conversation reappears non-contiguously.
    $tmpSplits{$currentConv} += 1;
}
close($text_fh);

# Put final tmpsplits in the right partition
$splits{$splitId} = [ sort keys %tmpSplits ];
print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n";

# Write splits to file (in numeric split order so the output is reproducible)
foreach my $key (sort { $a <=> $b } keys %splits) {
    my $splitfile = "$tmp/split-$key";
    open(my $split_fh, ">", $splitfile)
        or die "Can't open splitfile to write $splitfile: $!";
    foreach my $file (@{ $splits{$key} }) {
        print $file, "\n";
        # Low-precedence 'or' so the check applies to print's return value
        # rather than to the (always true) string being printed.
        print {$split_fh} "$file\n"
            or die "Error writing to file $splitfile: $!";
    }
    # Buffered write errors only surface at close time.
    close($split_fh)
        or die "Error writing to file $splitfile: $!";
}

# get_entries_hash(\%counts)
#   Returns the sum of all values in the referenced hash (0 for an empty
#   hash). Used to get the number of utterances collected in a split.
sub get_entries_hash {
    my $inputHashRef = shift;
    my $total = 0;
    $total += $_ for values %{$inputHashRef};
    return $total;
}