Blame view

egs/rm/s5/local/make_rm_lm.pl 2.61 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
  #!/usr/bin/env perl
  
  # Copyright 2010-2011 Yanmin Qian  Microsoft Corporation
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # This file takes as input the file wp_gram.txt that comes with the RM
  # distribution, and creates the language model as an acceptor in FST form.
  
  # Usage: make_rm_lm.pl wp_gram.txt > G.txt
  
  # Expect exactly one argument: the path of the grammar file to read.
  # The usage text goes to STDERR because STDOUT is normally redirected into
  # G.txt, and the exit status is non-zero so a calling script can detect the
  # misuse.
  if (@ARGV != 1) {
      print STDERR "usage: make_rm_lm.pl  wp_gram.txt > G.txt\n";
      exit(1);
  }

  # Three-argument open: the file name is only ever treated as a name, never
  # as a mode or a pipe.  IN_FILE is kept as a bareword handle because the
  # rest of the script reads from and closes it under that name.
  open(IN_FILE, '<', $ARGV[0])
      or die "can't open $ARGV[0]: $!";
  
  
  # Parse the grammar read from IN_FILE.
  #
  # A line starting with '>' names a head word; every following line, up to
  # the next '>' line, is stored as a successor of that head.  Lines seen
  # before the first head are ignored.  All spaces are removed from each line
  # before it is interpreted.
  #
  # Results are left in package globals for the code that prints the FST:
  #   @LineArray      head words, in order of appearance
  #   $hash{$w}[$k]   k-th successor stored for head word $w
  #   $hashcnt{$w}    number of successors stored for $w
  #   $hashwrd{$w}    set to 0 here for every head word; the printing code
  #                   treats 0 as "no state id assigned yet"
  $flag = 0;        # becomes 1 once the first head line has been seen
  $count_wrd = 0;   # number of head lines read so far
  $cnt_ends = 0;    # successor lines containing SENTENCE-END (not read by
                    # any code visible in this script; kept as a global)
  $init = "";       # head word whose successors are currently collected
  $i = 0;           # number of successors stored for $init so far

  while ($line = <IN_FILE>)
  {
      # chomp strips only a trailing newline.  chop would strip the last
      # character unconditionally and so truncate a final line that has no
      # newline.
      chomp($line);
      $line =~ s/ //g;

      # A blank line is not a word: storing it would add an empty-label
      # successor and inflate the successor count of the current head.
      next if $line eq '';

      if ($line =~ /^>/)
      {
          $flag = 1;
          $line =~ s/>//g;
          $hashcnt{$init} = $i;     # close the previous head's successor list
          $init = $line;
          $i = 0;
          $count_wrd++;
          $LineArray[$count_wrd - 1] = $init;
          $hashwrd{$init} = 0;
      }
      elsif ($flag != 0)
      {
          $hash{$init}[$i] = $line;
          $i++;
          $cnt_ends++ if $line =~ /SENTENCE-END/;
      }
  }

  # Close the successor list of the last head word.
  $hashcnt{$init} = $i;
  
  # Print the grammar as text, one arc per line:
  #   <source state>    <destination state>    <label>    <label>    <weight>
  # followed by a last line "<state of SENTENCE-END>    0" (presumably the
  # final-state line of the text FST format -- confirm against the consumer).
  #
  # State 0 is the source of the arcs for the words listed under
  # SENTENCE-END.  Every other word gets a state id the first time it is
  # needed, so the numbering depends on the order of the input.  The weight of
  # an arc is -log(1/N), where N is the number of successors stored for the
  # source word.
  #
  # Lines are terminated with "\n" rather than a raw line break inside the
  # string, so the output never picks up the indentation of this source file.
  $num = 0;          # highest state id handed out so far
  $weight = 0;
  $init_wrd = "SENTENCE-END";
  # Array in scalar context: SENTENCE-END gets the number of head words as id.
  $hashwrd{$init_wrd} = @LineArray;

  # Arcs leaving state 0.  The weight is the same for all of them, so it is
  # computed once; the guard avoids dividing by zero when there are none.
  my $start_cnt = $hashcnt{$init_wrd} || 0;
  $weight = -log(1/$start_cnt) if $start_cnt > 0;
  for ($i = 0; $i < $start_cnt; $i++)
  {
      my $word = $hash{$init_wrd}[$i];
      $hashwrd{$word} = $i + 1;
      print "0    $hashwrd{$word}    $word    $word    $weight\n";
  }
  $num = $i;

  # Arcs leaving every other head word.
  foreach my $head (@LineArray)
  {
      next if $head eq 'SENTENCE-END';

      # Give the head a state id if it has none yet (0 means unassigned).
      if ($hashwrd{$head} == 0)
      {
          $num++;
          $hashwrd{$head} = $num;
      }

      my $succ_cnt = $hashcnt{$head} || 0;
      next if $succ_cnt == 0;
      $weight = -log(1/$succ_cnt);

      for my $j (0 .. $succ_cnt - 1)
      {
          my $succ = $hash{$head}[$j];

          # A successor that has no id yet (undefined or 0) gets the next one.
          if (!$hashwrd{$succ})
          {
              $num++;
              $hashwrd{$succ} = $num;
          }

          # The arc into SENTENCE-END carries <eps> instead of the word.
          my $label = ($succ eq 'SENTENCE-END') ? '<eps>' : $succ;
          print "$hashwrd{$head}    $hashwrd{$succ}    $label    $label    $weight\n";
      }
  }

  print "$hashwrd{$init_wrd}    0\n";
  close(IN_FILE);