Blame view
egs/rm/s5/local/make_rm_lm.pl
2.61 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
#!/usr/bin/env perl
# Copyright 2010-2011 Yanmin Qian Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This file takes as input the file wp_gram.txt that comes with the RM
# distribution, and creates the language model as an acceptor in FST form.

# make_rm_lm.pl wp_gram.txt > G.txt

use strict;
use warnings;

# Read the grammar from the open filehandle $fh.
#
# A line starting with '>' introduces a header word; each following line
# (until the next '>' line) names one word that may follow that header word.
# Lines before the first '>' line are ignored.
#
# Returns two references:
#   - an array of header words in file order (a repeated header appears
#     once per occurrence);
#   - a hash mapping each header word to the array of its successors (a
#     repeated header keeps only the successors of its last occurrence).
sub _read_grammar {
    my ($fh) = @_;

    my @headers;
    my %successors_of;
    my $current;    # header word being filled in; undef before the first '>'

    while (my $line = <$fh>) {
        # Strip every whitespace character.  This replaces chop(), which
        # truncated the last word of a file lacking a final newline and left
        # a trailing CR on CRLF input.
        $line =~ s/\s+//g;

        if ($line =~ /^>/) {
            $line =~ s/>//g;
            $current = $line;
            push @headers, $current;
            $successors_of{$current} = [];
        }
        elsif (defined $current && length $line) {
            # Blank lines are skipped: an empty successor would yield an arc
            # with missing labels and would skew the uniform weights.
            push @{ $successors_of{$current} }, $line;
        }
    }

    return (\@headers, \%successors_of);
}

# Turn the parsed grammar into the lines of a text-format FST acceptor.
#
# Arguments are the two references returned by _read_grammar().
# Returns the list of output lines, each terminated by "\n":
#   - "0 DEST WORD WORD WEIGHT" for every successor of SENTENCE-END
#     (state 0 is the start state);
#   - "SRC DEST WORD WORD WEIGHT" for every other header/successor pair,
#     with <eps> as the label when the successor is SENTENCE-END;
#   - a last line "FINAL 0" naming the state that stands for SENTENCE-END.
# WEIGHT is -log(1/n), n being the number of successors of the source word.
#
# State ids: successors of SENTENCE-END get 1..k in listing order, every
# other word gets the next free id when first seen, and the SENTENCE-END
# state gets the number of header words.
# NOTE(review): if a successor never appears as a header word, the running
# id can reach the SENTENCE-END id and two states would share a number.
sub _fst_lines {
    my ($headers, $successors_of) = @_;

    my $end_word = 'SENTENCE-END';
    my %state_of;    # word => state id; missing or 0 means "not assigned yet"
    my @lines;

    $state_of{$end_word} = scalar @{$headers};

    # Arcs leaving the start state: one per word that may begin a sentence.
    my $starts  = $successors_of->{$end_word} || [];
    my $next_id = 0;
    if (@{$starts}) {
        my $weight = -log(1 / scalar @{$starts});
        for my $i (0 .. $#{$starts}) {
            my $word = $starts->[$i];
            $state_of{$word} = $i + 1;
            push @lines, "0 $state_of{$word} $word $word $weight\n";
        }
        $next_id = scalar @{$starts};
    }

    # Arcs leaving every other header word.
    for my $word (@{$headers}) {
        next if $word eq $end_word;

        $state_of{$word} = ++$next_id if !$state_of{$word};

        my $nexts = $successors_of->{$word} || [];
        next if !@{$nexts};

        my $weight = -log(1 / scalar @{$nexts});
        for my $next (@{$nexts}) {
            $state_of{$next} = ++$next_id if !$state_of{$next};

            # Reaching SENTENCE-END consumes no word, hence the epsilon label.
            my $label = $next eq $end_word ? '<eps>' : $next;
            push @lines,
                "$state_of{$word} $state_of{$next} $label $label $weight\n";
        }
    }

    push @lines, "$state_of{$end_word} 0\n";

    return @lines;
}

# Entry point.  Expects exactly one argument, the path of the grammar file,
# and prints the FST to the currently selected output handle (STDOUT by
# default).  Returns the process exit status: 0 on success, 1 on a usage
# error.  Dies if the file cannot be opened.
sub main {
    my @args = @_;

    if (@args != 1) {
        # STDOUT is normally redirected into G.txt, so the usage text goes to
        # STDERR and the status is non-zero to let callers notice the failure.
        print STDERR "usage: make_rm_lm.pl wp_gram.txt > G.txt\n";
        return 1;
    }

    my ($path) = @args;
    open(my $in, '<', $path) or die "can't open $path: $!";
    my ($headers, $successors_of) = _read_grammar($in);
    close($in);

    my @lines = _fst_lines($headers, $successors_of);
    print @lines;

    return 0;
}

# Run only when executed as a script, not when loaded with require/do.
exit(main(@ARGV)) unless caller;

1;