#!/usr/bin/perl -w
#------------------------------------------
# Author : Emmanuel FERREIRA
# Contact: emmanuel.ferreira0194@gmail.com
# Date : 30/09/11
#------------------------------------------
use strict;
use Getopt::Long;
use Pod::Usage;
use Switch;
use lib "$ENV{'OTMEDIA_HOME'}/tools/scripts/perlmod";
use Utils;
#-------------------------------------------
# MAIN
#-------------------------------------------
# options variables
my $help = 0;
my $format = 'ctm';          # output format: ctm | trs | trn | txt | usf
my $inExt = 'res';           # extension of the ASR result files
my $confidenceExt = 'res';   # NOTE(review): not referenced in the visible code
my $inDir = '.';             # directory scanned for result files
my $out;                     # output file; STDOUT when not set
my $ignFile;                 # file listing the words to ignore
my $trsConfigFile;
my $usfConfigFile;
my $bdlex;                   # NOTE(review): not referenced in the visible code
# useful variables
my @filesList;
my @filesContent;
my $fileFilterRegex='';
my %trsConfig;
my %usfConfig;
# Parse options and print usage if there is a syntax
# error, or if usage was explicitly requested
GetOptions('help|?' => \$help,
           'format=s' => \$format,
           'dir=s' => \$inDir,
           'ext=s' => \$inExt,
           'out=s' => \$out,
           'ignore=s' => \$ignFile,
           'trs_config=s' => \$trsConfigFile,
           'usf_config=s' => \$usfConfigFile) or pod2usage(2);
# Options test
pod2usage(1) if($help);
pod2usage({-msg =>"\nERROR : you must specify a trs config file (--trs_config) if you want to obtain a TRS formatted output\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if((lc($format) eq "trs") && (!$trsConfigFile));
pod2usage({-msg =>"\nERROR : you must specify a usf config file (--usf_config) if you want to obtain a USF formatted output\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if((lc($format) eq "usf") && (!$usfConfigFile));
# The extension is handed to Utils::fill_array_from_dir with an escaped
# leading dot.
if($inExt){
    $inExt = '\.'.$inExt;
}
Utils::fill_array_from_dir($inDir, $inExt, \@filesList);
pod2usage({-msg =>"\nERROR : there is no ASR result file ($inExt) located in $inDir\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(@filesList == 0);
# Build the filter regex from the ignore list: any listed item surrounded
# by whitespace.
if($ignFile){
    my @ignItems;
    Utils::fill_array_from_file($ignFile, '', \@ignItems);
    chomp @ignItems;
    $fileFilterRegex = "\\s(".formattedRegex(join("|", @ignItems)).")\\s";
}
# Accumulate the content of every result file.
foreach my $file (@filesList){
    my $filePath = $inDir."/".$file;
    my @fileContent;
    # Package-qualified like every other Utils call of this script, so the
    # call does not depend on Utils exporting the function.
    Utils::fill_array_from_file($filePath, $fileFilterRegex, \@fileContent);
    push(@filesContent, @fileContent);
}
pod2usage({-msg =>"\nERROR : Bad res format, it must be 'showName#begin:duration#sex#speakerID 1 word_begin word_duration word'\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!checkResFormat(@filesContent));
if($trsConfigFile){
    Utils::fill_hash_from_file($trsConfigFile,' ', \%trsConfig);
}
if($usfConfigFile){
    Utils::fill_hash_from_file($usfConfigFile,' ', \%usfConfig);
}
# Dispatch on the requested output format.
switch(lc($format)){
    case("ctm") {to_CTM($out, @filesContent);}
    case("trs") {to_TRS($out, \%trsConfig, @filesContent);}
    case("trn") {to_TRN($out, @filesContent);}
    case("txt") {to_TXT($out, @filesContent);}
    case("usf") {to_USF($out, \%usfConfig, @filesContent);}
    else {
        # An unknown format used to end the script silently with no output.
        pod2usage({-msg =>"\nERROR : unknown output format '$format' (expected ctm, trs, trn, txt or usf)\n", -exitval => 1, -verbose => 0, -output => \*STDERR});
    }
}
#-------------------------------------------
# SUBROUTINES
#-------------------------------------------
#
#
# \brief check format of the first line of the ASR results
# \param resFileContent array holding the ASR results
#
sub checkResFormat
{
    # Checks the first line of the ASR results only; the remaining lines
    # passed by the caller are ignored.
    #
    # A line is accepted when it has at least five space-separated fields,
    # its first field has four or five '#'-separated parts, and the second
    # of those parts has exactly two ':'-separated values.
    #
    # Returns 1 when the line is accepted, 0 otherwise (including when no
    # line is given at all).
    my ($firstLine) = @_;
    # Nothing to check: reject instead of splitting an undefined value.
    return 0 if(!defined($firstLine));
    my @field = split(/ /, $firstLine);
    return 0 if(@field < 5);
    my @fileNameField = split(/#/, $field[0]);
    return 0 if(@fileNameField != 4 && @fileNameField != 5);
    my @timeField = split(/:/, $fileNameField[1]);
    return (@timeField == 2) ? 1 : 0;
}
#
# \brief escape the regex metacharacters / ( ) * + { } . in the string passed as argument
# \param string
# \return formatted string
#
sub formattedRegex
{
    # Returns a copy of the argument in which every occurrence of
    # / ( ) * + { } . is preceded by a backslash. Any other character,
    # '|' included, is left untouched.
    my ($pattern) = @_;
    # Single pass: the inserted backslashes are never matched again.
    $pattern =~ s/([\/()*+{}.])/\\$1/g;
    return $pattern;
}
#
# \brief convert ASR results to the ctm format
#
sub to_CTM
{
    # Sorts the result lines with ctm_sort, rewrites the '#'-separated
    # first field of each line, then writes the lines to $out through
    # Utils::fill_file_from_array, or prints them when $out is not set.
    my ($out, @filesContent) = @_;
    my @ctmLines = sort ctm_sort @filesContent;
    foreach my $ctmLine (@ctmLines)
    {
        # "a#b#c#d rest" becomes "a rest"
        $ctmLine =~ s/(\S+)#(\S+)#(\S+)#(\S+) (.+)/$1 $5/g;
    }
    if(!$out)
    {
        return print @ctmLines;
    }
    return Utils::fill_file_from_array($out, @ctmLines);
}
#
# \brief convert ASR results to the TRS format
#
sub to_TRS
{
    # Writes the ASR results, grouped in turns (one turn per distinct first
    # field of the result lines), to $out or to STDOUT when $out is not set.
    #
    # \param out          output file path (false value => STDOUT)
    # \param trsConfig    reference to the TRS config hash; the keys read
    #                     here are name, fileName, fileExt/extFile, segFile
    # \param filesContent ASR result lines
    #
    # Dies when the output file cannot be opened or closed.
    #
    # NOTE(review): many print statements below emit only newlines, and
    # locals such as $authorName, $fileExt, $endTime, $speaker or $channel
    # are computed but never written. The markup these literals presumably
    # carried looks to be missing from this copy of the script - confirm
    # against the upstream version before relying on the output.
    my ($out, $trsConfig, @filesContent) = @_;
    my $authorName = $$trsConfig{"name"};
    my $fileName = $$trsConfig{"fileName"};
    # Read from the hash reference received as argument (the previous code
    # read the file-scoped %trsConfig by accident). The usage text documents
    # the key as "extFile", so that spelling is accepted as a fallback.
    my $fileExt = exists($$trsConfig{"fileExt"}) ? $$trsConfig{"fileExt"} : $$trsConfig{"extFile"};
    my $segFile = $$trsConfig{"segFile"};
    my $outstream = *STDOUT;
    my $file;
    my @segFileContent;
    chomp @filesContent;
    @filesContent = sort ctm_sort @filesContent;
    if($out){
        # Three-argument open: the file name is never parsed for a mode.
        open($file, '>', $out) or die("Cannot open file : $out : $!");
        $outstream = $file;
    }
    #-----------
    # XML Header
    #----------
    print $outstream "\n";
    print $outstream "\n";
    print $outstream "\n";
    #-----------
    # Speakers
    #-----------
    print $outstream "\n";
    #
    # List each speakers
    #
    Utils::fill_array_from_file($segFile, '', \@segFileContent);
    # Keep the segmentation lines mentioning the file name, matched
    # literally rather than as a regex.
    @segFileContent = grep (/\Q$fileName\E/, @segFileContent);
    # Reduce each segmentation line to its fields 4, 5 and 7.
    foreach my $segLine (@segFileContent)
    {
        my @splittedValue = split(/ /, $segLine);
        $segLine = $splittedValue[4].' '.$splittedValue[5].' '.$splittedValue[7];
    }
    # Drop duplicates, keeping the first occurrence of each speaker line.
    my %seen = ();
    my @uniqSegFileContent = grep { ! $seen{$_} ++ } @segFileContent;
    foreach my $line (@uniqSegFileContent)
    {
        chomp $line;
        my @splittedLine = split(/ /, $line);
        my $codesex = $splittedLine[0];
        my $type = $splittedLine[1];
        my $loc = $splittedLine[2];
        my $sex;
        if($codesex eq "F"){
            $sex = "female";
        } else{
            $sex = "male";
        }
        print $outstream "\n";
    }
    print $outstream "\n";
    #---------
    # Episode
    #---------
    #
    # In order to calculate the ending time: begin and duration of the last
    # (sorted) line, both divided by 100
    #
    my $lastLine = $filesContent[$#filesContent];
    my @splittedLastLine = split(/ /, $lastLine);
    my @splittedLastFileName = split(/#/, $splittedLastLine[0]);
    my @timesLastFile = split(/:/, $splittedLastFileName[1]);
    my $endTime = ($timesLastFile[0]/100) + ($timesLastFile[1]/100);
    print $outstream "\n\n";
    #
    # Fill with content => it's organized in turn (a turn corresponding
    # to a specific features file)
    #
    my $currentFile = '';
    my ($sex, $speaker, $begin, $end, $channel, $sentence) = ('', '', '', '','','');
    foreach my $content (@filesContent){
        my @splittedContent = split(/ /, $content);
        #
        # If it's a new file
        #
        if($currentFile ne $splittedContent[0])
        {
            #
            # not the first time: flush the previous turn
            #
            if($currentFile){
                print $outstream "\n";
                print $outstream "\n$sentence\n\n";
            }
            #
            # initialize values
            #
            $currentFile = $splittedContent[0];
            my @splittedFileName = split(/#/, $splittedContent[0]);
            $sex = $splittedFileName[2];
            $speaker = $splittedFileName[3];
            # A speaker id starting with "S" selects the studio channel.
            my $type = substr($speaker,0,1);
            if($type eq "S"){
                $channel = "studio";
            } else{
                $channel = "telephone";
            }
            my @times = split(/:/, $splittedFileName[1]);
            $begin = $times[0]/100;
            my $duration = $times[1]/100;
            $end = $begin + $duration;
            $sentence = $splittedContent[4];
        } else {
            #
            # Continue the current sentence
            #
            $sentence .= ' '.$splittedContent[4];
        }
    }
    #
    # For the last turn
    #
    print $outstream "\n";
    print $outstream "\n$sentence\n\n\n\n";
    #----------
    # END OF XML
    #-----------
    print $outstream "";
    if($file){
        # Buffered write errors only surface at close time.
        close($file) or die("Cannot close file : $out : $!");
    }
}
#
# \brief convert ASR results to the USF format
#
sub to_USF
{
    # Writes the ASR results word by word (one entry per result line) to
    # $out, or to STDOUT when $out is not set.
    #
    # \param out          output file path (false value => STDOUT)
    # \param usfConfig    reference to the USF config hash; the keys read
    #                     here are fileName, name, type and segFile
    # \param filesContent ASR result lines; field 5 of each line is
    #                     formatted as the word confidence
    #
    # Dies when the output file cannot be opened or closed.
    #
    # NOTE(review): many print statements below emit only newlines, and
    # locals such as $authorName, $speakerId, $begin, $duration or
    # $confidence are computed but never written. The markup these literals
    # presumably carried looks to be missing from this copy of the script -
    # confirm against the upstream version before relying on the output.
    my ($out, $usfConfig, @filesContent) = @_;
    my $fileName = $$usfConfig{'fileName'};
    my $authorName = $$usfConfig{'name'};
    my $type = $$usfConfig{'type'}; #posteriors
    my $segFile = $$usfConfig{"segFile"};
    my $outstream = *STDOUT;
    my $file;
    my @segFileContent;
    chomp @filesContent;
    @filesContent = sort ctm_sort @filesContent;
    if($out){
        # Three-argument open: the file name is never parsed for a mode.
        open($file, '>', $out) or die("Cannot open file : $out : $!");
        $outstream = $file;
    }
    print $outstream "\n\n";
    #-------------
    # SPEAKERS
    #-------------
    print $outstream "\n";
    #
    # list each speakers
    #
    Utils::fill_array_from_file($segFile, '', \@segFileContent);
    # Keep the segmentation lines mentioning the file name, matched
    # literally rather than as a regex.
    @segFileContent = grep (/\Q$fileName\E/, @segFileContent);
    # Reduce each segmentation line to its fields 4, 5 and 7.
    foreach my $segLine (@segFileContent)
    {
        my @splittedValue = split(/ /, $segLine);
        $segLine = $splittedValue[4].' '.$splittedValue[5].' '.$splittedValue[7];
    }
    # Drop duplicates, keeping the first occurrence of each speaker line.
    my %seen = ();
    my @uniqSegFileContent = grep { ! $seen{$_} ++ } @segFileContent;
    foreach my $line (@uniqSegFileContent)
    {
        chomp $line;
        my @splittedLine = split(/ /, $line);
        my $codesex = $splittedLine[0];
        my $type = $splittedLine[1];
        my $loc = $splittedLine[2];
        my $sex;
        if($codesex eq "F"){
            $sex = "female";
        } else{
            $sex = "male";
        }
        print $outstream "\n";
    }
    print $outstream "\n";
    #-------------
    # SUBTITLES TAG
    #-------------
    print $outstream "\n";
    #
    # fill with content => it's organized in subtitle tag (each subtitle tag corresponding
    # to a specific word with its confidence)
    #
    foreach my $content (@filesContent){
        #if($type eq "posteriors"){
        my @splittedContent = split(/\s+/, $content);
        my @splittedFileName = split(/#/, $splittedContent[0]);
        my $sexLoc=$splittedFileName[2];
        my $idLoc=$splittedFileName[3];
        my $speakerId = $sexLoc.$idLoc;
        #my @times = split(/:/, $splittedFileName[1]);
        #my $begin = $times[0]/100;
        #my $duration = $times[1]/100;
        # Word timing comes from the result line itself, not from the
        # begin:duration part of its first field.
        my $begin = $splittedContent[2];
        my $duration = $splittedContent[3];
        my $word = $splittedContent[4];
        my $confidence = sprintf("%0.3f", $splittedContent[5]);
        print $outstream "\n";
        print $outstream "$word\n";
        print $outstream "\n";
        #}
    }
    print $outstream "\n";
    #-----------
    # END OF XML
    #-----------
    print $outstream "";
    if($file){
        # Buffered write errors only surface at close time.
        close($file) or die("Cannot close file : $out : $!");
    }
}
#
# \brief convert ASR results to the TRN format
#
sub to_TRN
{
    # Writes the words of the ASR results (field 4 of each line), each
    # followed by a space, and closes every run of lines sharing the same
    # first field with "(first field)". Runs are separated by a newline;
    # the last one is not newline-terminated. Lines are taken in the order
    # given.
    #
    # \param out          output file path (false value => STDOUT)
    # \param filesContent ASR result lines
    #
    # Dies when the output file cannot be opened or closed.
    my ($out, @filesContent) = @_;
    my $currentId = '';
    my $outstream = *STDOUT;
    my $file;
    chomp @filesContent;
    if($out){
        # Three-argument open: the file name is never parsed for a mode.
        open($file, '>', $out) or die("Cannot open file : $out : $!");
        $outstream = $file;
    }
    foreach my $line (@filesContent)
    {
        my @splittedLine = split(/ /, $line);
        # first time
        if(!$currentId)
        {
            $currentId = $splittedLine[0];
        }
        # if file change: terminate the previous utterance
        elsif($currentId ne $splittedLine[0])
        {
            print $outstream "($currentId)\n";
            $currentId = $splittedLine[0];
        }
        print $outstream "$splittedLine[4] ";
    }
    print $outstream "($currentId)";
    if($file)
    {
        # Close explicitly, like the other writers of this script, so that
        # buffered write errors are reported.
        close($file) or die("Cannot close file : $out : $!");
    }
}
#
# \brief convert ASR results to txt
#
sub to_TXT
{
    # Writes the words of the ASR results (field 4 of each line, after
    # sorting the lines with ctm_sort) as plain text, each followed by a
    # space, with a newline after every 21st word.
    #
    # \param out          output file path (false value => STDOUT)
    # \param filesContent ASR result lines
    #
    # Dies when the output file cannot be opened or closed.
    my ($out, @filesContent) = @_;
    my $outstream = *STDOUT;
    my $id = 0;
    my $file;
    @filesContent = sort ctm_sort @filesContent;
    chomp @filesContent;
    if($out){
        # Three-argument open: the file name is never parsed for a mode.
        open($file, '>', $out) or die("Cannot open file : $out : $!");
        $outstream = $file;
    }
    foreach my $line (@filesContent)
    {
        my @splittedLine = split(/ /, $line);
        print $outstream "$splittedLine[4] ";
        # $id counts 0..20, so the line break follows the 21st word.
        if($id++ == 20)
        {
            print $outstream "\n";
            $id = 0;
        }
    }
    if($file)
    {
        # Buffered write errors only surface at close time.
        close($file) or die("Cannot close file : $out : $!");
    }
}
#
# \brief ctm sort
#
# equivalent to "sort +0 -1 +1 -2 +2nb -3"
#
sub ctm_sort
{
    # Comparison routine for sort: $a and $b are the two result lines
    # being compared. The keys are, in order: show_name() of the first
    # space-separated field (string), the second field (string), and the
    # third field (numeric).
    my ($nameA, $channelA, $startA) = (split(/ /, $a))[0, 1, 2];
    my ($nameB, $channelB, $startB) = (split(/ /, $b))[0, 1, 2];
    my $order = show_name($nameA) cmp show_name($nameB);
    $order = $channelA cmp $channelB if($order == 0);
    $order = $startA <=> $startB if($order == 0);
    return $order;
}
#
# \brief get the show name from the full filename
# \param full filename
# \return show name
#
# full filename format : show_name#begin:duration#speakerType#speakerId
#
sub show_name
{
    # Returns the part of the full filename located before the first '#'.
    #
    # The argument is unpacked in list context: a scalar assignment from
    # @_ would store the number of arguments instead of the filename.
    my ($fullFileName) = @_;
    my ($showName) = split(/#/, $fullFileName);
    return $showName;
}
sub goodFormattedHour
{
    # Formats a number of seconds as "HH:MM:SS.mmm".
    #
    # \param time seconds, optionally with a decimal part ("3723.45")
    # \return formatted string; the decimal part is always rendered on
    #         exactly three digits (right-padded with zeros or truncated)
    my ($time) = @_;
    my (undef, $fraction) = split(/\./, $time);
    $fraction = '' unless defined($fraction);
    # Padding with a single "0" was only right for two-digit fractions:
    # "1.5" must give ".500", not ".50".
    my $millis = substr($fraction.'000', 0, 3);
    my $hour = int($time/3600);
    # % works on the integer part, so the decimals are dropped here.
    my $rest = $time % 3600;
    my $min = int($rest/60);
    my $sec = $rest % 60;
    return sprintf("%02d:%02d:%02d.%s", $hour, $min, $sec, $millis);
}
__END__
=head1 NAME
res2out.pl - convert ASR results to other formats
=head1 SYNOPSIS
res2out.pl [options]
Options:
-help|? brief help message
-dir input dir containing ASR results files (default .)
-format trs | ctm | txt | trn | usf
-ext file extension of the ASR results files (default res)
-ignore file containing words to ignore in the ASR results
-out output file (default STDOUT)
-trs_config file containing the config of the trs transformation
name
fileName
extFile
segFile
Example :
name emmanuel
fileName fr3_1900_2000
extFile wav
segFile /home/emmanuel/ASR/test/fr3_1900_2000.seg
-usf_config file containing the config of the usf transformation
name
fileName
segFile
Example :
name emmanuel
fileName fr3_1900_2000
segFile /home/emmanuel/ASR/test/fr3_1900_2000.seg