Blame view
egs/callhome_egyptian/s5/local/isolate_phones.pl
1.64 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
#!/usr/bin/env perl

# Once the phonetic representation for words has been generated from the LDC
# lexicon, this script converts it into a Kaldi-compatible format.
# In addition, it extends the list of phonemes to consider, based on the
# orthographic representations of those words which do not have stressed
# vowels.
#
# Usage: isolate_phones.pl <tmpdir>
#
# Reads:
#   <tmpdir>/lexicon_raw         read line by line; lines starting with '#'
#                                are copied through unchanged
#   <tmpdir>/phones              known phone symbols, one per line
# Writes:
#   <tmpdir>/lexicon_one_column  every other line, split into phone symbols
#                                separated by single spaces
#   <tmpdir>/phones_extended     the known phones plus every single character
#                                that had to be added while splitting (sorted)
#
# NOTE(review): both output files are written one entry per line; confirm
# that this matches what the downstream consumers expect.

use strict;
use warnings;
use utf8;

my ($tmpdir) = @ARGV;
defined $tmpdir or die "Usage: $0 <tmpdir>\n";

my $lexicon_path  = "$tmpdir/lexicon_raw";
my $phones_path   = "$tmpdir/phones";
my $column_path   = "$tmpdir/lexicon_one_column";
my $extended_path = "$tmpdir/phones_extended";

# Three-argument open with lexical handles; the :utf8 layer is applied at open
# time so every read and write is character (not byte) oriented.
open(my $lexicon_in, '<:utf8', $lexicon_path)
    or die "Can't open raw lexicon $lexicon_path: $!";
open(my $phones_in, '<:utf8', $phones_path)
    or die "Can't open phone file $phones_path: $!";
open(my $column_out, '>:utf8', $column_path)
    or die "Can't open text file $column_path for writing: $!";
open(my $extended_out, '>:utf8', $extended_path)
    or die "Can't open ex-phone file $extended_path for writing: $!";

# Get all phones.
my %phones;
while (my $phone = <$phones_in>) {
    chomp $phone;
    $phones{$phone} = 1;
}

while (my $line = <$lexicon_in>) {
    # Comment lines are passed through verbatim, line terminator included.
    if (substr($line, 0, 1) eq "#") {
        print {$column_out} $line;
        next;
    }

    # Drop the line terminator first; otherwise it would be segmented as a
    # one-character "phone" and end up in phones_extended.
    chomp $line;

    my $split_word = join(" ", split_into_phones($line, \%phones));

    # A line may itself begin or end with whitespace characters, which come
    # out of the segmentation as symbols; trim them from both ends.
    $split_word =~ s/^\s*(.*?)\s*$/$1/;
    print {$column_out} $split_word, "\n";
}

# Now write the phones to the extended phone file. Sorting keeps the output
# stable between runs (plain hash key order is randomized by perl).
for my $phone (sort keys %phones) {
    print {$extended_out} $phone, "\n";
}

close($lexicon_in);
close($phones_in);
# Buffered write errors only surface at close, so check the output handles.
close($column_out)   or die "Can't close $column_path: $!";
close($extended_out) or die "Can't close $extended_path: $!";

# Greedily segment $text into phone symbols, scanning left to right.
#
# At each position the next two characters are tried first: if they form a
# symbol already present in %$phones, that symbol is taken. Otherwise the
# single character at that position is taken and registered in %$phones
# (so the caller's hash grows as unseen characters are met).
#
# Parameters:
#   $text   - the string to segment
#   $phones - reference to the hash whose keys are the known phone symbols;
#             modified in place
# Returns the list of symbols in order of appearance.
sub split_into_phones {
    my ($text, $phones) = @_;

    my @symbols;
    my $length   = length $text;
    my $position = 0;

    while ($position < $length) {
        # First check for two-character codes.
        my $pair = substr($text, $position, 2);
        if (exists $phones->{$pair}) {
            push @symbols, $pair;
            $position += 2;
            next;
        }

        # Fall back to a single character, adding it to the phone set.
        my $char = substr($text, $position, 1);
        $phones->{$char} = 1;
        push @symbols, $char;
        $position += 1;
    }

    return @symbols;
}