isolate_phones.pl 1.68 KB
#!/usr/bin/env perl
# Copyright 2014  Gaurav Kumar.   Apache 2.0
# Once the phonetic representations of the words have been generated from the
# LDC lexicon, this script converts them into a Kaldi-compatible format.
# In addition, it extends the list of phonemes to consider, based on the
# orthographic representations of those words which do not have stressed vowels.

use utf8;

use strict;
use warnings;

# Split one lexicon entry into phones and return them joined by single spaces.
#
# Arguments:
#   $entry        - one line of the raw lexicon, without its line terminator.
#   $known_phones - reference to the phone inventory hash (phone => 1).
#
# The entry is scanned left to right. At each position a two-character phone
# from %$known_phones is preferred; otherwise the single character at that
# position is emitted and recorded in %$known_phones, so the caller's
# inventory grows whenever a previously unseen symbol is encountered.
# Leading and trailing whitespace is trimmed from the returned string.
sub split_into_phones {
    my ($entry, $known_phones) = @_;

    my @pieces;
    my $length   = length $entry;
    my $position = 0;
    while ($position < $length) {
        # First check for two-character codes. The length test keeps the final
        # character of an entry from being treated as a "pair".
        my $pair = substr($entry, $position, 2);
        if (length($pair) == 2 && exists $known_phones->{$pair}) {
            push @pieces, $pair;
            $position += 2;
            next;
        }

        # Otherwise consume one character. Every value in the inventory is 1,
        # so this assignment is a no-op for phones that are already known and
        # registers the character as a new phone when it is not.
        my $char = substr($entry, $position, 1);
        $known_phones->{$char} = 1;
        push @pieces, $char;
        $position += 1;
    }

    my $split = join ' ', @pieces;
    $split =~ s/^\s+|\s+$//g;
    return $split;
}

# Main flow.
#
# Usage: isolate_phones.pl <tmpdir>
#
# Reads   <tmpdir>/lexicon_raw and <tmpdir>/phones.
# Writes  <tmpdir>/lexicon_one_column (each entry split into space-separated
#         phones; lines starting with '#' are copied through unchanged) and
#         <tmpdir>/phones_extended (the original phones plus every extra
#         symbol met while splitting, one per line, sorted).
my $tmpdir = $ARGV[0];
defined $tmpdir or die "Usage: $0 <tmpdir>\n";

my $lexicon_path    = "$tmpdir/lexicon_raw";
my $phones_path     = "$tmpdir/phones";
my $one_column_path = "$tmpdir/lexicon_one_column";
my $extended_path   = "$tmpdir/phones_extended";

open(my $lexicon_in, '<:encoding(UTF-8)', $lexicon_path)
    or die "Can't open raw lexicon $lexicon_path: $!";
open(my $phones_in, '<:encoding(UTF-8)', $phones_path)
    or die "Can't open phone file $phones_path: $!";
open(my $one_column_out, '>:encoding(UTF-8)', $one_column_path)
    or die "Can't open text file for writing $one_column_path: $!";
open(my $phones_out, '>:encoding(UTF-8)', $extended_path)
    or die "Can't open ex-phone file for writing $extended_path: $!";

# Get all phones. Blank lines are skipped so that an empty string never ends
# up in the phone inventory.
my %phones;
while (my $phone = <$phones_in>) {
    chomp $phone;
    next if $phone eq '';
    $phones{$phone} = 1;
}

while (my $line = <$lexicon_in>) {
    # Strip the line terminator first; otherwise the newline itself would be
    # tokenised below and recorded as a phone.
    chomp $line;

    # Comment lines are passed through untouched.
    if ($line =~ /\A#/) {
        print {$one_column_out} $line, "\n";
        next;
    }

    print {$one_column_out} split_into_phones($line, \%phones), "\n";
}

# Now write the phones to the extended phone file. Sorting makes the output
# reproducible from run to run instead of following hash order.
foreach my $phone (sort keys %phones) {
    print {$phones_out} $phone, "\n";
}

close($lexicon_in);
close($phones_in);
# Buffered write errors only surface at close, so check the output handles.
close($one_column_out) or die "Can't close $one_column_path: $!";
close($phones_out)     or die "Can't close $extended_path: $!";