Blame view

LIA_kaldiUtils/generate_stm.pl 1.3 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
  #!/usr/bin/perl
  use strict;
  
  # The script needs exactly two arguments: the directory holding the input
  # .trs files and the directory receiving the generated files.
  # A wrong invocation is an error: report it on STDERR and exit with a
  # non-zero status so that calling scripts can detect it.
  if (@ARGV != 2) {
  	print STDERR "USAGE: ./generate_stm.pl rep_in_trs rep_out_stm\n";
  	exit 1;
  }
  
  # Input directory (scanned for *.trs files) and output directory (receives
  # the generated files), both taken from the command line.
  my $REP_TRS = $ARGV[0];
  my $REP_STM_OUT = $ARGV[1];

  # Example values that were once hard-coded here:
  # my $REP_TRS = "/local_disk/hera2/REPERE/Databases/Phase1/train/trs";
  # my $REP_STM_OUT = "stm";

  # Directory containing the external reacc_win2bdlex binary used when
  # producing the .txt output.
  my $TOOLS_BDLEX = "/users/ferreira/TOOLS/OTMEDIA_TOOLS/lia_ltbox/lia_tagg/bin";

  # Fail early instead of letting every later shell redirection fail one by one.
  -d $REP_STM_OUT
  	or die "Output directory $REP_STM_OUT does not exist or is not a directory\n";

  # List the .trs files of the input directory without going through the shell:
  #  - the dot of the extension is matched literally (an unescaped "." in a
  #    grep pattern also accepts names such as "footrs");
  #  - directory names containing spaces or shell metacharacters are safe;
  #  - hidden entries are skipped and names are sorted, as a plain `ls` does.
  opendir(my $trs_dir, $REP_TRS)
  	or die "Cannot open directory $REP_TRS: $!\n";
  my @files = sort grep { !/\A\./ && /\.trs\z/ } readdir($trs_dir);
  closedir($trs_dir);
  
  # Quote a string so that /bin/sh reads it as one literal word, whatever
  # characters (spaces, quotes, metacharacters) it contains.
  sub _shell_quote {
  	my ($word) = @_;
  	$word =~ s/'/'\\''/g;
  	return "'$word'";
  }

  # Run one shell command line. Returns 1 on success; otherwise emits a
  # warning naming the step and returns 0.
  # Note: for a pipeline the shell reports the status of its LAST command only.
  sub _run_step {
  	my ($label, $command) = @_;
  	my $status = system($command);
  	return 1 if $status == 0;
  	if ($status == -1) {
  		warn "$label: could not run command: $!\n";
  	}
  	elsif ($status & 127) {
  		warn sprintf("%s: command killed by signal %d\n", $label, $status & 127);
  	}
  	else {
  		warn sprintf("%s: command exited with status %d\n", $label, $status >> 8);
  	}
  	return 0;
  }

  # Number of input files whose conversion did not complete.
  my $nb_failed = 0;

  foreach my $file (@files) {
  	# Entries may still carry the trailing newline of a command output.
  	chomp($file);
  	$file =~ s/\.trs$//;
  	print "Generation du .stm pour le fichier $file\n";

  	# Unquoted paths (for unlink) and their shell-quoted counterparts.
  	my $tmp_path = "$REP_STM_OUT/$file.trsTmp";
  	my $raw_path = "$REP_STM_OUT/$file.raw.stm";

  	my $trs   = _shell_quote("$REP_TRS/$file.trs");
  	my $tmp   = _shell_quote($tmp_path);
  	my $raw   = _shell_quote($raw_path);
  	my $stm   = _shell_quote("$REP_STM_OUT/$file.stm");
  	my $txt   = _shell_quote("$REP_STM_OUT/$file.txt");
  	my $reacc = _shell_quote("$TOOLS_BDLEX/reacc_win2bdlex");

  	my @steps = (
  		# First step: trs cleaning and pronunciation replacement
  		# (ex: 1,5 milliards => 1 milliard 5)
  		[ "$file: trs preprocessing",
  		  "bin/preprocess_trs.pl < $trs | bin/replacePronTrs.pl > $tmp" ],

  		# Second step: generating a raw stm from the trs file
  		[ "$file: trs to stm conversion",
  		  "bin/trs2stm.pl -c -f stm $tmp > $raw" ],

  		# Third step: normalizing the stm
  		[ "$file: stm normalization",
  		  "bin/normalize.v0.55 -if stm -n 2 -l 1 -d bin/normalize.v2.0.dic -i $raw -o $stm" ],

  		# Last step: generate the .txt associated to the .stm (essentially for LM)
  		[ "$file: txt generation",
  		  "bin/stm2txt.pl < $stm | $reacc > $txt" ],
  	);

  	# Stop at the first failing step: the later steps would only consume
  	# incomplete output. The remaining files are still processed.
  	foreach my $step (@steps) {
  		my ($label, $command) = @$step;
  		next if _run_step($label, $command);
  		$nb_failed++;
  		last;
  	}

  	# Removing the unused files (a missing file is not an error, as with rm -f)
  	unlink($tmp_path, $raw_path);
  }

  # Let callers know that at least one file could not be converted.
  exit 1 if $nb_failed;