#!/usr/bin/env perl
# Copyright 2014  Gaurav Kumar.   Apache 2.0
# Once the phonetic representation for words has been generated from the LDC
# lexicon, this script converts it into a Kaldi-compatible format.
# In addition, it extends the list of phonemes to consider, based on the
# orthographic representations of those words which do not have stressed vowels.

use utf8;

use strict;
use warnings;

# Usage: isolate_phones.pl <tmpdir>
#
# Inputs  (read as UTF-8):
#   <tmpdir>/lexicon_raw         one entry per line; lines starting with '#'
#                                are copied through unchanged
#   <tmpdir>/phones              one known phone symbol per line
# Outputs (written as UTF-8):
#   <tmpdir>/lexicon_one_column  each entry split into space-separated symbols
#   <tmpdir>/phones_extended     the known phones plus every single character
#                                that had to be emitted on its own

# Split one lexicon line into symbols, scanning left to right.
#
# At each position the two-character substring is tried first; if it is a
# known phone it is consumed as one symbol.  Otherwise a single character is
# consumed, and it is added to the phone table if it was not already there.
#
#   $line    - the raw line, exactly as read (not chomped)
#   $phones  - hash ref of known phones; extended in place
#
# Returns the list of symbols in order.
sub split_into_phones {
    my ($line, $phones) = @_;

    my $length   = length $line;
    my $position = 0;
    my @symbols;

    while ($position < $length) {
        # Near the end of the line this may be only one character long.
        my $pair = substr($line, $position, 2);
        if (exists $phones->{$pair}) {
            push @symbols, $pair;
            $position += 2;
            next;
        }

        my $single = substr($line, $position, 1);
        $phones->{$single} = 1 unless exists $phones->{$single};
        push @symbols, $single;
        $position += 1;
    }

    return @symbols;
}

my ($tmpdir) = @ARGV;
defined $tmpdir
    or die "Usage: $0 <tmpdir>\n";

# Inputs are opened before the outputs so that a missing input does not
# leave freshly truncated output files behind.
open(my $lexicon_in, '<:utf8', "$tmpdir/lexicon_raw")
    or die "Can't open raw lexicon $tmpdir/lexicon_raw: $!";
open(my $phones_in, '<:utf8', "$tmpdir/phones")
    or die "Can't open phone file $tmpdir/phones: $!";
open(my $one_column_out, '>:utf8', "$tmpdir/lexicon_one_column")
    or die "Can't open text file for writing $tmpdir/lexicon_one_column: $!";
open(my $phones_out, '>:utf8', "$tmpdir/phones_extended")
    or die "Can't open ex-phone file for writing $tmpdir/phones_extended: $!";

# Get all phones
my %phones;
while (my $phone = <$phones_in>) {
    chomp $phone;
    $phones{$phone} = 1;
}

while (my $line = <$lexicon_in>) {
    # Comment lines are passed through verbatim.
    if (substr($line, 0, 1) eq '#') {
        print {$one_column_out} $line;
        next;
    }

    # The line is intentionally not chomped: its trailing newline is scanned
    # like any other character, so it is recorded in %phones and ends up in
    # phones_extended.  The trim below removes it from the lexicon output.
    # NOTE(review): this is inherited behavior -- confirm whether downstream
    # consumers of phones_extended rely on it before chomping here.
    my $split_word = join ' ', split_into_phones($line, \%phones);
    $split_word =~ s/^\s*(.*?)\s*$/$1/;
    print {$one_column_out} $split_word, "\n";
}

# Now write the phones to the extended phone file.  The keys are sorted so
# that the output is reproducible from run to run.
foreach my $key (sort keys %phones) {
    print {$phones_out} $key, "\n";
}

close($lexicon_in);
close($phones_in);
# Buffered write errors only surface at close time, so check them.
close($one_column_out)
    or die "Can't close $tmpdir/lexicon_one_column: $!";
close($phones_out)
    or die "Can't close $tmpdir/phones_extended: $!";