Blame view

egs/fisher_callhome_spanish/s5/local/fsp_ideal_data_partitions.pl 2.39 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  #!/usr/bin/env perl
  #
  # Johns Hopkins University (Author : Gaurav Kumar)
  #
  # This script should be run from one directory above the current one
  #
  # Rough partitions that are needed are :
  #
  # ASR Train : 120k utterances
  # ASR tune : 20k utterances
  # ASR eval : 20k utterances
  # MT train : 105k utterances
  # MT tune : Same as the ASR eval (20k utterances)
  # MT eval : 20k utterances
  #
  # This script tries to find the closest possible matches so that conversations
  # belong in one single partition and hence there is no speaker/conversation
  # overlap between data partitions
  
  use Storable 'dclone';
  
  # Input transcript. Each non-blank line is expected to begin with an
  # utterance id of the form "<conversation>-<rest>", followed by whitespace.
  my $textfile = "data/local/data/train_all/text";

  # Directory that receives one "split-<N>" file per partition.
  my $tmp = "data/local/tmp";

  # Utterance-count thresholds, one per split. A split is closed at the first
  # conversation boundary after its utterance count exceeds its threshold, so a
  # conversation is never divided between two splits. Whatever is left after
  # the last threshold has been used goes into one final split.
  # NOTE(review): how these numbers map onto the ASR/MT partitions named in
  # the header comment is not visible in this script -- confirm with the caller.
  my @splitNumbers = (17455, 20000, 100000, 20000, 100000);

  open(my $textHandle, "<", $textfile) or die "Can't open text file";

  my $ongoingConv = "";   # conversation id of the previous line
  my %tmpSplits   = ();   # conversation id => utterance count, current split
  my $splitId     = 0;    # index of the split currently being filled
  my %splits      = ();   # split index => array ref of conversation ids

  while (my $line = <$textHandle>) {
    # split(' ') skips leading whitespace, so the first field is always the
    # utterance id; blank lines yield no fields and are ignored.
    my ($utteranceId) = split(' ', $line);
    next unless defined $utteranceId;

    my @idParts = split('-', $utteranceId);
    my $currentConv = defined $idParts[0] ? $idParts[0] : "";

    if ($currentConv eq $ongoingConv) {
      # Same conversation: one more utterance for it.
      $tmpSplits{$ongoingConv} += 1;
      next;
    }

    # New conversation initiated: first check whether the current split has
    # collected enough utterances. Once every threshold has been consumed the
    # remaining conversations all stay in the final split (an undefined
    # threshold would otherwise compare as 0 and close a split per
    # conversation).
    my $entries = get_entries_hash(\%tmpSplits);
    if ($splitId < @splitNumbers && $entries > $splitNumbers[$splitId]) {
      print "Finished processing split " . $splitId . ". It contains " . $entries . " entries. \n";
      # Sorted so that the split files are reproducible between runs.
      $splits{$splitId} = [ sort keys %tmpSplits ];
      %tmpSplits = ();
      $splitId += 1;
    }
    $ongoingConv = $currentConv;
    $tmpSplits{$ongoingConv} = 1;
  }
  close($textHandle);

  # Put the final tmpSplits in the last partition.
  $splits{$splitId} = [ sort keys %tmpSplits ];
  print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n";

  # Write splits to file: one conversation id per line in $tmp/split-<N>.
  foreach my $key ( sort { $a <=> $b } keys %splits ) {
    open(my $splitHandle, ">", "$tmp/split-$key") or die "Can't open splitfile to write";
    foreach my $file ( @{ $splits{$key} } ) {
      print $file, "\n";
      # Low-precedence "or" so that the check applies to print's return value
      # rather than to the (always true) string being printed.
      print {$splitHandle} "$file\n" or die "Error writing to file";
    }
    # Buffered write errors only surface when the handle is closed.
    close($splitHandle) or die "Error writing to file";
  }
  
  # Sum the values of a hash of counters.
  #
  # Arguments:
  #   $inputHashRef - reference to a hash whose values are numbers (the script
  #                   passes utterance counts keyed by conversation id).
  # Returns:
  #   The numeric total of all values; 0 when the hash is empty.
  #
  # Declared without a prototype: the sub takes one argument, and an empty
  # "()" prototype would reject any call compiled after this definition.
  sub get_entries_hash {
    my ($inputHashRef) = @_;
    # Lexical accumulator so that no package-level $total is overwritten.
    my $total = 0;
    $total += $_ foreach values %{$inputHashRef};
    return $total;
  }