Blame view

egs/lre07/v2/local/dnn/fisher_fix_speakerid.pl 2.79 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
  #!/usr/bin/perl -w
  
  # Author: Peng Qi (pengqi@cs.stanford.edu)
  # This script maps Fisher speaker IDs to the true physical speakers
  # and fixes the utterance IDs accordingly. Expected to be run one level of
  # directory above.
  
  # Return a copy of the first argument with leading and trailing
  # whitespace removed.  The caller's value is not modified.
  sub trim {
      my ($text) = @_;
      $text =~ s/^\s+|\s+$//g;
      return $text;
  }
  
  # Exactly two arguments are required: the call-data table and the data
  # directory.  On a wrong argument count, print the usage to STDERR and stop
  # with a non-zero status instead of carrying on with undefined arguments.
  if (@ARGV != 2) {
    print STDERR "Usage: fisher_fix_speakerid.pl <fisher-calldata-tbl-file> <data-dir>\n";
    print STDERR "E.g.:  fisher_fix_speakerid.pl data/local/train/combined-calldata.tbl data/train_all\n";
    exit 1;
  }
  
  # Command-line arguments: the call-data table to read and the data
  # directory whose files are rewritten by the steps below.
  my $tab_file = $ARGV[0];
  my $dir = $ARGV[1];

  # Maps a conversation side ('fe_03_<call-id>-A' or 'fe_03_<call-id>-B') to
  # the speaker ID listed in the table for that side.
  my %conv_to_spk = ();

  open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";

  while (my $line = <$conv_tab>) {
    chomp $line;
    next if $line =~ /^\s*$/;    # nothing to parse on a blank line

    my @fields = split ",", $line;

    # Column 0 is the call ID, column 5 the A-side speaker and column 10 the
    # B-side speaker.  A shorter row cannot supply both speakers, so it is
    # skipped rather than recorded with an empty speaker ID.
    if (@fields < 11) {
      warn "Warning: skipping line $. of '$tab_file': expected at least 11 comma-separated fields\n";
      next;
    }

    # The call ID is used verbatim; only the speaker fields are trimmed.
    $conv_to_spk{'fe_03_' . $fields[0] . '-A'} = trim($fields[5]);
    $conv_to_spk{'fe_03_' . $fields[0] . '-B'} = trim($fields[10]);
  }

  close($conv_tab);
  
  # fix utt2spk
  #
  # Reads <data-dir>/utt2spk and writes <data-dir>/utt2spk.new.  When the
  # first 13 characters of an utterance ID are a known conversation side, the
  # speaker becomes 'fe_03_<speaker>' and the utterance ID becomes that
  # speaker ID, a '-', and the old utterance ID minus its first 6 characters.
  # Lines for unknown conversations are copied with their first two fields.

  # Conversation IDs that had no entry in the table (used as a set).
  my %missingconv = ();

  open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
  open(my $utt2spk_new, '>', $dir . '/utt2spk.new') or die "Could not open '$dir/utt2spk.new' $!\n";

  while (my $line = <$utt2spk>) {
    chomp $line;

    my @fields = split " ", $line;
    if (@fields < 2) {
      # Blank or incomplete line: nothing to remap, pass it through as is.
      print $utt2spk_new "$line\n";
      next;
    }

    my $convid = substr $fields[0], 0, 13;

    if (exists $conv_to_spk{$convid}) {
      my $spkid = "fe_03_" . $conv_to_spk{$convid};
      my $newuttid = $spkid . '-' . (substr $fields[0], 6);

      print $utt2spk_new "$newuttid $spkid\n";
    } else {
      # Keep only the 5 characters after the 6-character prefix for the
      # warning printed once the file has been processed.
      $missingconv{substr($convid, 6, 5)} = 1;

      print $utt2spk_new "$fields[0] $fields[1]\n";
    }
  }

  close($utt2spk);
  # Buffered write errors only surface at close, so check it.
  close($utt2spk_new) or die "Could not close '$dir/utt2spk.new' $!\n";

  foreach my $conv (keys %missingconv) {
    print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n";
  }
  
  # fix spk2gender
  #
  # Reads <data-dir>/spk2gender and writes <data-dir>/spk2gender.new,
  # replacing the first field with 'fe_03_<speaker>' when it is a known
  # conversation side.  The step is skipped if spk2gender cannot be opened.

  if (open(my $spk2gender, '<', $dir . '/spk2gender')) {
    open(my $spk2gender_new, '>', $dir . '/spk2gender.new') or die "Could not open '$dir/spk2gender.new' $!\n";

    while (my $line = <$spk2gender>) {
      chomp $line;

      my @fields = split " ", $line;
      if (@fields < 2) {
        # Blank or incomplete line: nothing to remap, pass it through as is.
        print $spk2gender_new "$line\n";
        next;
      }

      # The whole first field is the lookup key here (no substr needed).
      my $spkid = $fields[0];
      if (exists $conv_to_spk{$spkid}) {
        $spkid = "fe_03_" . $conv_to_spk{$spkid};
      }

      print $spk2gender_new "$spkid $fields[1]\n";
    }

    close($spk2gender);
    # Buffered write errors only surface at close, so check it.
    close($spk2gender_new) or die "Could not close '$dir/spk2gender.new' $!\n";
  }
  
  # fix segments and text
  #
  # For each of <data-dir>/segments and <data-dir>/text, writes a '.new'
  # copy.  When the first 13 characters of a line are a known conversation
  # side, the line's first 6 characters are replaced by 'fe_03_<speaker>-';
  # every other line is copied unchanged.

  foreach my $file ('segments', 'text') {
    open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
    open(my $newfile, '>', "$dir/$file.new") or die "Could not open '$dir/$file.new' $!\n";

    while (my $line = <$oldfile>) {
      chomp $line;

      my $convid = substr $line, 0, 13;
      if (exists $conv_to_spk{$convid}) {
        my $spkid = $conv_to_spk{$convid};
        print $newfile "fe_03_$spkid-" . (substr $line, 6) . "\n";
      } else {
        print $newfile "$line\n";
      }
    }

    close($oldfile);
    # Buffered write errors only surface at close, so check it.
    close($newfile) or die "Could not close '$dir/$file.new' $!\n";
  }