generate_stm.pl 1.3 KB
#!/usr/bin/perl
use strict;

# Exactly two arguments are required: the directory holding the input .trs
# files and the directory that receives the generated files.
# On misuse the usage line goes to STDERR and the script exits with a
# non-zero status, so a calling script or Makefile can detect the failure.
if (@ARGV != 2) {
	print STDERR "USAGE: ./generate_stm.pl rep_in_trs rep_out_stm\n";
	exit 1;
}

# Directory containing the input .trs files (first command-line argument).
my $REP_TRS = $ARGV[0];
# Directory receiving the generated files (second command-line argument).
my $REP_STM_OUT = $ARGV[1];

# Example values kept for reference:
# my $REP_TRS = "/local_disk/hera2/REPERE/Databases/Phase1/train/trs";
# my $REP_STM_OUT = "stm";

# Directory holding the external reacc_win2bdlex tool used by the last step.
my $TOOLS_BDLEX = "/users/ferreira/TOOLS/OTMEDIA_TOOLS/lia_ltbox/lia_tagg/bin";

# Collect the names (with their .trs suffix) of the entries to process.
# The directory is read directly instead of going through `ls | grep`, so the
# path is never interpreted by a shell and only names that really end in
# ".trs" are kept (an unescaped "." would also accept e.g. "footrs").
# Dotfiles are skipped and the result is sorted, as `ls` would do.
opendir(my $trs_dir, $REP_TRS)
	or die "Cannot open directory $REP_TRS: $!\n";
my @files = sort grep { !/^\./ && /\.trs$/ } readdir($trs_dir);
closedir($trs_dir);

# Quote a string so /bin/sh treats it as one literal word, whatever spaces or
# metacharacters it contains.
sub shell_quote {
	my ($word) = @_;
	$word =~ s/'/'\\''/g;
	return "'$word'";
}

# Run one shell command line for the given step and file.
# Returns 1 on success; otherwise emits a warning describing the failure and
# returns 0. For a pipeline, the shell reports the status of its last command.
sub run_step {
	my ($step, $file, $command) = @_;
	system($command);
	return 1 if $? == 0;

	my $reason;
	if ($? == -1) {
		$reason = "could not execute: $!";
	}
	elsif ($? & 127) {
		$reason = "killed by signal " . ($? & 127);
	}
	else {
		$reason = "exit status " . ($? >> 8);
	}
	warn "generate_stm.pl: step '$step' failed for $file ($reason)\n";
	return 0;
}

# Process every .trs file: each one goes through four external steps. The
# helper programs are referenced through the relative "bin/" path, so the
# script has to be started from the directory that contains it.
foreach my $file (@files) {
	chomp($file);
	$file =~ s/\.trs$//;
	print "Generation du .stm pour le fichier $file\n";

	# Intermediate files, removed once the file has been processed.
	my $trs_tmp_path = "$REP_STM_OUT/$file.trsTmp";
	my $raw_stm_path = "$REP_STM_OUT/$file.raw.stm";

	# Shell-quoted forms of every path placed on a command line.
	my $trs_in  = shell_quote("$REP_TRS/$file.trs");
	my $trs_tmp = shell_quote($trs_tmp_path);
	my $raw_stm = shell_quote($raw_stm_path);
	my $stm     = shell_quote("$REP_STM_OUT/$file.stm");
	my $txt     = shell_quote("$REP_STM_OUT/$file.txt");
	my $reacc   = shell_quote("$TOOLS_BDLEX/reacc_win2bdlex");

	# A failing step short-circuits the remaining ones for this file, so
	# later steps never run on missing or truncated input.
	my $ok =
		# First step: trs cleaning plus pronunciation replacement
		# (ex: 1,5 milliards => 1 milliard 5)
		run_step("preprocess", $file,
			"bin/preprocess_trs.pl < $trs_in | bin/replacePronTrs.pl > $trs_tmp")
		# Second step: generating a raw stm from the trs file
		&& run_step("trs2stm", $file,
			"bin/trs2stm.pl -c -f stm $trs_tmp > $raw_stm")
		# Third step: normalizing the stm
		&& run_step("normalize", $file,
			"bin/normalize.v0.55 -if stm -n 2 -l 1 -d bin/normalize.v2.0.dic -i $raw_stm -o $stm")
		# Last step: generate the .txt associated to the .stm (essentially for LM)
		&& run_step("stm2txt", $file,
			"bin/stm2txt.pl < $stm | $reacc > $txt");

	# Removing the intermediate files, whether or not every step succeeded.
	# A file that was never created is simply ignored by unlink.
	unlink($trs_tmp_path, $raw_stm_path);

	warn "generate_stm.pl: skipped remaining steps for $file\n" unless $ok;
}