generate_stm.pl
1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/perl
#
# generate_stm.pl - build a normalized .stm file (and the matching .txt) for
# every .trs transcription found in a directory.
#
# Usage: ./generate_stm.pl rep_in_trs rep_out_stm
#   rep_in_trs   directory holding the input *.trs files
#   rep_out_stm  existing directory that receives <name>.stm and <name>.txt
#
# The helper programs are invoked through the relative path "bin/", so the
# script has to be started from the directory that contains "bin".
#
# Exit status: 0 when every file went through all steps, 1 on a usage or
# setup error, or when at least one file failed.
use strict;
use warnings;

# Quote a string so that a POSIX shell reads it back as one literal word.
sub shell_quote {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# Run one shell command line. Returns 1 on success; otherwise warns with the
# reason and returns 0.
# NOTE: for a pipeline the shell reports the status of the LAST command only,
# so a failure in an earlier stage may go unnoticed.
sub run_step {
    my ($label, $command) = @_;
    my $status = system($command);
    return 1 if $status == 0;

    if ($status == -1) {
        warn "$label: unable to start the command: $!\n";
    }
    elsif ($status & 127) {
        warn sprintf("%s: command killed by signal %d\n", $label, $status & 127);
    }
    else {
        warn sprintf("%s: command exited with status %d\n", $label, $status >> 8);
    }
    return 0;
}

if (scalar(@ARGV) != 2) {
    print STDERR "USAGE: ./generate_stm.pl rep_in_trs rep_out_stm\n";
    exit 1;
}

my $REP_TRS     = $ARGV[0];
my $REP_STM_OUT = $ARGV[1];
# my $REP_TRS = "/local_disk/hera2/REPERE/Databases/Phase1/train/trs";
# my $REP_STM_OUT = "stm";

# Site-specific location of the reacc_win2bdlex tool used in the last step.
my $TOOLS_BDLEX = "/users/ferreira/TOOLS/OTMEDIA_TOOLS/lia_ltbox/lia_tagg/bin";

die "Input directory '$REP_TRS' does not exist or is not a directory\n"
    unless -d $REP_TRS;
die "Output directory '$REP_STM_OUT' does not exist or is not a directory\n"
    unless -d $REP_STM_OUT;

# List the regular, non-hidden *.trs files directly instead of going through
# `ls | grep`: no shell is involved and the extension is matched literally.
opendir(my $dh, $REP_TRS) or die "Cannot open directory '$REP_TRS': $!\n";
my @files = sort grep { !/^\./ && /\.trs\z/ && -f "$REP_TRS/$_" } readdir($dh);
closedir($dh);

my $failures = 0;

foreach my $file (@files) {
    $file =~ s/\.trs\z//;
    print "Generation du .stm pour le fichier $file\n";

    # Unquoted paths are kept for unlink(); quoted copies go into the shell
    # command lines so that spaces or metacharacters stay harmless.
    my $tmp_path = "$REP_STM_OUT/$file.trsTmp";
    my $raw_path = "$REP_STM_OUT/$file.raw.stm";

    my $trs   = shell_quote("$REP_TRS/$file.trs");
    my $tmp   = shell_quote($tmp_path);
    my $raw   = shell_quote($raw_path);
    my $stm   = shell_quote("$REP_STM_OUT/$file.stm");
    my $txt   = shell_quote("$REP_STM_OUT/$file.txt");
    my $reacc = shell_quote("$TOOLS_BDLEX/reacc_win2bdlex");

    my @steps = (
        # First step: trs cleaning plus pronunciation replacement
        # (ex: 1,5 milliards => 1 milliard 5)
        [ 'trs cleaning',
          "bin/preprocess_trs.pl < $trs | bin/replacePronTrs.pl > $tmp" ],
        # Second step: generating a raw stm from the trs file
        [ 'trs to raw stm',
          "bin/trs2stm.pl -c -f stm $tmp > $raw" ],
        # Third step: normalizing the stm
        [ 'stm normalization',
          "bin/normalize.v0.55 -if stm -n 2 -l 1 -d bin/normalize.v2.0.dic -i $raw -o $stm" ],
        # Last step: generate the .txt associated to the .stm (essentially for LM)
        [ 'stm to txt',
          "bin/stm2txt.pl < $stm | $reacc > $txt" ],
    );

    # Stop at the first failing step: later steps would only consume the
    # incomplete output of the failed one.
    foreach my $step (@steps) {
        my ($label, $command) = @{$step};
        next if run_step("$file ($label)", $command);
        $failures++;
        last;
    }

    # Removing the unused files (missing files are silently ignored, like rm -f)
    unlink($tmp_path, $raw_path);
}

exit($failures ? 1 : 0);