Blame view

egs/callhome_egyptian/s5/local/isolate_phones.pl 1.64 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
  #!/usr/bin/env perl
  # Once the phonetic representation for words is generated by the LDC lexicon,
  # this script converts it into a Kaldi-compatible format.
  # In addition, it extends the list of phonemes to consider based on the
  # orthographic representations of those words which do not have stressed vowels.
  #
  # Usage: isolate_phones.pl <tmpdir>
  #
  # Reads:
  #   <tmpdir>/lexicon_raw         one entry per line; lines starting with "#"
  #                                are copied to the output unchanged
  #   <tmpdir>/phones              one known phone per line
  # Writes:
  #   <tmpdir>/lexicon_one_column  each entry split into space-separated phones
  #   <tmpdir>/phones_extended     the known phones plus every single character
  #                                that had to be added while splitting
  #
  # Splitting is greedy: at each position the two-character substring is tried
  # first; if it is not a known phone, the single character is emitted (and
  # registered as a new phone when it was not known yet).
  
  use strict;
  use warnings;
  use utf8;
  
  my ($tmpdir) = @ARGV;
  die "Usage: $0 <tmpdir>\n" unless defined $tmpdir && length $tmpdir;
  
  my $lexicon_path    = "$tmpdir/lexicon_raw";
  my $phones_path     = "$tmpdir/phones";
  my $one_column_path = "$tmpdir/lexicon_one_column";
  my $extended_path   = "$tmpdir/phones_extended";
  
  # The ":utf8" layer matches the binmode() calls of the original script, so all
  # substr()/length() operations below work on characters rather than bytes.
  open(my $lexicon_in, '<:utf8', $lexicon_path)
    or die "Can't open raw lexicon $lexicon_path: $!";
  open(my $phones_in, '<:utf8', $phones_path)
    or die "Can't open phone file $phones_path: $!";
  open(my $one_column_out, '>:utf8', $one_column_path)
    or die "Can't open text file $one_column_path for writing: $!";
  open(my $extended_out, '>:utf8', $extended_path)
    or die "Can't open ex-phone file $extended_path for writing: $!";
  
  # Get all phones
  my %phones;
  while (my $phone = <$phones_in>) {
    $phone =~ s/\r?\n\z//;
    next if $phone eq '';    # a blank line is not a phone
    $phones{$phone} = 1;
  }
  close($phones_in);
  
  while (my $line = <$lexicon_in>) {
    # Comment lines are passed through verbatim (including their line ending).
    if (substr($line, 0, 1) eq "#") {
      print {$one_column_out} $line;
      next;
    }
  
    # Strip the line terminator first; otherwise it would be scanned like any
    # other character and end up registered as a phone.
    $line =~ s/\r?\n\z//;
  
    my $len = length $line;
    my $pos = 0;
    my @pieces;
    while ($pos < $len) {
      # First check for two char codes
      my $pair = substr($line, $pos, 2);
      if (exists $phones{$pair}) {
        push @pieces, $pair;
        $pos += 2;
      }
      else {
        # Fall back to a single character, adding it to the phone set if it
        # was not known before.
        my $single = substr($line, $pos, 1);
        $phones{$single} = 1;
        push @pieces, $single;
        $pos += 1;
      }
    }
  
    my $split_word = join(" ", @pieces);
    # Whitespace characters inside the entry are emitted as pieces too, so trim
    # whatever of them landed at either end.
    $split_word =~ s/^\s*(.*?)\s*$/$1/;
    print {$one_column_out} $split_word, "\n";
  }
  
  # Now write the phones to the extended phone file, sorted so that the output
  # is reproducible between runs (hash order is not).
  foreach my $key (sort keys %phones) {
    print {$extended_out} $key, "\n";
  }
  
  close($lexicon_in);
  # Buffered write errors only surface at close time, so check them.
  close($one_column_out) or die "Error writing $one_column_path: $!";
  close($extended_out)   or die "Error writing $extended_path: $!";