fsp_ideal_data_partitions.pl
2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env perl
#
# Johns Hopkins University (Author : Gaurav Kumar)
#
# This script should be run from one directory above the current one
#
# Rough partitions that are needed are :
#
# ASR Train : 120k utterances
# ASR tune : 20k utterances
# ASR eval : 20k utterances
# MT train : 105k utterances
# MT tune : Same as the ASR eval (20k utterances)
# MT eval : 20k utterances
#
# This script tries to find the closest possible matches so that conversations
# belong in one single partition and hence there is no speaker/conversation
# overlap between data partitions
use Storable 'dclone';
# Partition the conversations found in the training transcript into
# consecutive splits so that no conversation spans two splits.
#
# Input : $textfile, one utterance per line; the first whitespace-separated
#         field is the utterance id and everything before its first '-' is
#         taken as the conversation id. Utterances of one conversation are
#         expected to be contiguous in the file.
# Output: one file per split, "$tmp/split-<N>", listing the conversation ids
#         that belong to split N (also echoed to STDOUT).
my $textfile = "data/local/data/train_all/text";
my $tmp      = "data/local/tmp";

# Utterance count that a split has to exceed before it is closed. A split is
# only closed on a conversation boundary, so the real sizes overshoot these
# numbers slightly. Whatever remains after the last threshold is used up
# forms one final split.
my @splitNumbers = (17455, 20000, 100000, 20000, 100000);

my $ongoingConv = "";
my %tmpSplits   = ();    # conversation id => utterance count (current split)
my $splitId     = 0;
my %splits      = ();    # split id => array ref of conversation ids

open(my $text_fh, "<", $textfile)
    or die "Can't open text file $textfile: $!";

while (my $line = <$text_fh>) {
    # Skip blank or malformed lines instead of counting them under an
    # empty conversation id.
    my ($uttId) = split(/\s/, $line);
    next unless defined $uttId && length $uttId;
    my ($currentConv) = split('-', $uttId);
    next unless defined $currentConv && length $currentConv;

    if ($currentConv eq $ongoingConv) {
        # Same conversation, add to current hash
        $tmpSplits{$ongoingConv} += 1;
        next;
    }

    # New conversation initiated: first check whether the current split has
    # collected enough utterances. Once every threshold has been consumed
    # there is nothing left to compare against, so the split stays open and
    # absorbs the rest of the data (an undefined threshold would otherwise
    # compare as 0 and turn every remaining conversation into its own split).
    if ($splitId < @splitNumbers) {
        my $entries = get_entries_hash(\%tmpSplits);
        if ($entries > $splitNumbers[$splitId]) {
            print "Finished processing split " . $splitId . ". It contains " . $entries . " entries. \n";
            # Store a fresh, sorted list so the output is reproducible and
            # independent of the hash that is about to be reset.
            $splits{$splitId} = [ sort keys %tmpSplits ];
            %tmpSplits = ();
            $splitId += 1;
        }
    }

    $ongoingConv = $currentConv;
    $tmpSplits{$ongoingConv} = 1;
}
close($text_fh) or die "Error reading text file $textfile: $!";

# Put final tmpsplits in the right partition
$splits{$splitId} = [ sort keys %tmpSplits ];
print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n";

# Write splits to file, in numeric split order
foreach my $key (sort { $a <=> $b } keys %splits) {
    my $splitFile = "$tmp/split-$key";
    open(my $split_fh, ">", $splitFile)
        or die "Can't open splitfile to write $splitFile: $!";
    foreach my $file (@{ $splits{$key} }) {
        # Echo each conversation id to STDOUT as well as to the split file.
        print $file, "\n";
        # 'or' (not '||') so that the check applies to print's return value
        # rather than to the always-true string being printed.
        print {$split_fh} "$file\n"
            or die "Error writing to file $splitFile: $!";
    }
    # Buffered write errors only surface on close.
    close($split_fh) or die "Error writing to file $splitFile: $!";
}
# Sum the values of a hash of counts.
#
# Arguments:
#   $inputHashRef - reference to a hash whose values are numbers
# Returns:
#   the sum of all values; 0 for an empty hash or when the argument is not
#   a hash reference
#
# Declared without a prototype: the sub takes one argument, and an empty "()"
# prototype would make any call compiled after this definition a
# "Too many arguments" error.
sub get_entries_hash {
    my ($inputHashRef) = @_;

    # Nothing to sum if no usable hash reference was handed in.
    return 0 unless ref($inputHashRef) eq 'HASH';

    # Lexical accumulator, so no package-level $total is touched.
    my $total = 0;
    $total += $_ for values %{$inputHashRef};
    return $total;
}