Jean-François Rey / otmedia

Blame view

tools/scripts/res2out.pl 13.8 KB
  #!/usr/bin/perl -w
  
  #------------------------------------------
  # Author : Emmanuel FERREIRA
  # Contact: emmanuel.ferreira0194@gmail.com
  # Date : 30/09/11
  #------------------------------------------
  
  use strict;
  use Getopt::Long;
  use Pod::Usage;
  use Switch;
  
  use lib "$ENV{'OTMEDIA_HOME'}/tools/scripts/perlmod";
  use Utils;
  
  #-------------------------------------------
  # MAIN
  #-------------------------------------------
  
  # Option variables (the defaults can be overridden on the command line)
  my $help = 0;
  my $format = 'ctm';
  my $inExt = 'res';
  my $confidenceExt = 'res';
  my $inDir = '.';
  my $out;
  my $ignFile;
  my $trsConfigFile;
  my $usfConfigFile;
  my $bdlex;
  
  # Useful variables
  my @filesList;
  my @filesContent;
  my $fileFilterRegex='';
  my %trsConfig;
  my %usfConfig;
  
  # Output formats this script knows how to produce
  my %knownFormat = map { $_ => 1 } ('ctm', 'trs', 'trn', 'txt', 'usf');
  
  # Parse options and print usage if there is a syntax
  # error, or if usage was explicitly requested.
  # GetOptions returns false on a syntax error, so its result is checked.
  GetOptions('help|?' => \$help, 
  	   'format=s' => \$format,
  	   'dir=s' => \$inDir,
  	   'ext=s' => \$inExt,
  	   'out=s' => \$out,
  	   'ignore=s' => \$ignFile,
  	   'trs_config=s' => \$trsConfigFile,
  	   'usf_config=s' => \$usfConfigFile) or pod2usage(2);
  
  # Options test
  pod2usage(1) if($help);
  # Reject an unknown format up front instead of silently producing no output
  pod2usage({-msg => "\nERROR : unknown output format '$format' (expected trs | ctm | txt | trn | usf)\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!$knownFormat{lc($format)});
  pod2usage({-msg =>"
  ERROR : you must specify a trs config file (--trs_config) if you want to obtain a TRS formatted output
  ", -exitval => 1, -verbose => 0, -output => \*STDERR}) if((lc($format) eq "trs") && (!$trsConfigFile));
  pod2usage({-msg =>"
  ERROR : you must specify a usf config file (--usf_config) if you want to obtain a USF formatted output
  ", -exitval => 1, -verbose => 0, -output => \*STDERR}) if((lc($format) eq "usf") && (!$usfConfigFile));
  
  # The extension is handed over as a pattern fragment, hence the escaped dot
  if($inExt){
  	$inExt = '\.'.$inExt;
  }
  
  Utils::fill_array_from_dir($inDir, $inExt, \@filesList);
  
  pod2usage({-msg =>"
  ERROR : there is no ASR result file ($inExt) located in $inDir
  ", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(@filesList == 0);
  
  # Build the filter pattern from the list of words to ignore:
  # each word must be surrounded by whitespace to match
  if($ignFile){
  	my @ignItems;
  	Utils::fill_array_from_file($ignFile, '', \@ignItems);
  	chomp @ignItems;
  	$fileFilterRegex = "\\s(".formattedRegex(join("|", @ignItems)).")\\s";
  }
  
  # Gather the content of every result file into one array
  foreach my $file (@filesList){
  	my $filePath = $inDir."/".$file;
  	my @fileContent;
  	# Fully qualified like every other call to the Utils helpers
  	Utils::fill_array_from_file($filePath, $fileFilterRegex, \@fileContent);
  	push(@filesContent, @fileContent);
  }
  
  pod2usage({-msg =>"
  ERROR : Bad res format, it must be 'showName#begin:duration#sex#speakerID 1 word_begin word_duration word'", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!checkResFormat(@filesContent));
  
  if($trsConfigFile){
  	Utils::fill_hash_from_file($trsConfigFile,' ', \%trsConfig);
  }
  
  if($usfConfigFile){
  	Utils::fill_hash_from_file($usfConfigFile,' ', \%usfConfig);
  }
  
  # Run the converter matching the requested format
  # (unknown formats were rejected during the options test)
  my $wantedFormat = lc($format);
  if($wantedFormat eq "ctm"){
  	to_CTM($out, @filesContent);
  }
  elsif($wantedFormat eq "trs"){
  	to_TRS($out, \%trsConfig, @filesContent);
  }
  elsif($wantedFormat eq "trn"){
  	to_TRN($out, @filesContent);
  }
  elsif($wantedFormat eq "txt"){
  	to_TXT($out, @filesContent);
  }
  elsif($wantedFormat eq "usf"){
  	to_USF($out, \%usfConfig, @filesContent);
  }
  
  #-------------------------------------------
  # SUBROUTINES
  #-------------------------------------------
  #
  
  # 
  # \brief check the format of the first line of the ASR results
  # \param resFileContent array holding the ASR results
  # \return 1 when the first line looks like
  #         'showName#begin:duration#sex#speakerID 1 word_begin word_duration word',
  #         0 otherwise (including when there is no line at all)
  #
  # The checks are purely structural: at least 5 space separated fields,
  # a first field made of 4 or 5 '#' separated parts, and a second part
  # made of exactly 2 ':' separated values.
  #
  sub checkResFormat
  {
  	# Only the first line is inspected, so it is read straight from @_
  	# rather than copying the whole content of the result files
  	my $firstLine = $_[0];
  	return 0 if(!defined($firstLine));
  
  	my @field = split(/ /, $firstLine);
  	return 0 if(@field < 5);
  
  	my @fileNameField = split(/#/, $field[0]);
  	return 0 if(@fileNameField != 4 && @fileNameField != 5);
  
  	my @timeField = split(/:/, $fileNameField[1]);
  	return (@timeField == 2) ? 1 : 0;
  }
  
  #
  # \brief backslash-escape a fixed set of regex metacharacters in a string
  # \param string
  # \return the string with every / ( ) * + { } and . preceded by a backslash
  #
  # Any other character (the '|' alternation sign included) is left untouched.
  #
  sub formattedRegex
  {
  	my ($pattern) = @_;
  	# A single pass is enough: the inserted backslashes are not part of the set
  	$pattern =~ s/([\/\(\)\*\+\{\}\.])/\\$1/g;
  	return $pattern;
  }
  
  #
  # \brief convert ASR results to the ctm format
  # \param out output file path; when false the lines are printed instead
  # \param filesContent ASR result lines
  #
  # The lines are ordered with ctm_sort, then the first field is reduced
  # to what precedes its last three '#' separated parts.
  #
  sub to_CTM
  {
  	my ($out, @filesContent) = @_;
  	my @ctmLines = sort ctm_sort @filesContent;
  	foreach my $ctmLine (@ctmLines)
  	{
  		$ctmLine =~ s/(\S+)#(\S+)#(\S+)#(\S+) (.+)/$1 $5/g;
  	}
  	if(!$out)
  	{
  		print @ctmLines;
  	}
  	else
  	{
  		Utils::fill_file_from_array($out, @ctmLines);
  	}
  }
  
  #
  # \brief convert ASR results to the TRS format
  # \param out output file path; when false the document goes to STDOUT
  # \param trsConfig reference to the TRS config hash; the keys read here
  #        are "name", "fileName", "fileExt" and "segFile"
  # \param filesContent ASR result lines
  #        ('showName#begin:duration#sex#speakerID 1 word_begin word_duration word')
  #
  # The document holds the list of speakers found in the segmentation
  # file, then one <Turn> per distinct first field of the result lines.
  # Dies when the output file cannot be opened or closed.
  #
  sub to_TRS
  {
  	my ($out, $trsConfig, @filesContent) = @_;
  	
  	my $authorName = $$trsConfig{"name"};
  	my $fileName =  $$trsConfig{"fileName"};
  	# Read through the reference received as argument, like the other
  	# keys, and not through the file-level hash
  	my $fileExt =  $$trsConfig{"fileExt"};
  	my $segFile = $$trsConfig{"segFile"};
  	my $outstream = *STDOUT;
  	my $file;
  	my @segFileContent;
  	
  	chomp @filesContent;
  	@filesContent = sort ctm_sort @filesContent;
  	if($out){
  		# Three-argument open: the path is never parsed for a mode
  		open($file, '>', $out) or die ("Cannot open file : $out ($!)");
  		$outstream = $file;
  	}
  	#-----------
  	# XML Header
  	#----------
  	print $outstream "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
  ";
  	print $outstream "<!DOCTYPE Trans SYSTEM \"trans-14.dtd\">
  ";
  	print $outstream "<Trans scribe=\"".$authorName."\" audio_filename=\"".$fileName.".".$fileExt."\" version=\"1\" version_date=\"051107\">
  ";
  	#-----------
  	# Speakers
  	#-----------
  	print $outstream "<Speakers>
  ";
  	#
  	# List each speaker: keep the segmentation lines mentioning the
  	# file name (matched literally), reduce them to their fields
  	# 4, 5 and 7 (0-based) and drop the duplicates
  	#
  	Utils::fill_array_from_file($segFile, '', \@segFileContent);
  	@segFileContent = grep (/\Q$fileName\E/, @segFileContent);
  	for(my $i = 0; $i <= $#segFileContent; $i++) 
  	{
  		my @splittedValue = split(/ /, $segFileContent[$i]);
  		$segFileContent[$i] = $splittedValue[4].' '.$splittedValue[5].' '.$splittedValue[7];
  	}
  	my %seen = ();
  	my @uniqSegFileContent = grep { ! $seen{$_} ++ } @segFileContent;
  	foreach my $line (@uniqSegFileContent)
  	{
  		chomp $line;
  		my @splittedLine = split(/ /, $line);
  		my $codesex = $splittedLine[0];
  		my $loc = $splittedLine[2];
  		# Anything but "F" is reported as male
  		my $sex;
  		if($codesex eq "F"){
  			$sex = "female";
  		} else{
  			$sex = "male";
  		}
  		print $outstream "<Speaker id=\"$loc$codesex\" name=\"$loc$codesex\" check=\"yes\" type=\"$sex\" dialect=\"native\" accent=\"\" scope=\"local\"/>
  ";
  	}
  	print $outstream "</Speakers>
  ";
  	#---------
  	# Episode
  	#---------
  	#
  	# The ending time is begin + duration of the last sorted line,
  	# both divided by 100 (presumably hundredths of a second - to confirm)
  	#
  	my $lastLine = $filesContent[$#filesContent];
  	my @splittedLastLine = split(/ /, $lastLine);
  	my @splittedLastFileName = split(/#/, $splittedLastLine[0]);
  	my @timesLastFile = split(/:/, $splittedLastFileName[1]);
  	my $endTime = ($timesLastFile[0]/100) + ($timesLastFile[1]/100);
  	print $outstream "<Episode>
  <Section type=\"report\" startTime=\"0\" endTime=\"$endTime\">
  ";
  	#
  	# Fill with content => it's organized in turns (a turn corresponding 
  	# to a specific features file)
  	#
  	my $currentFile = '';
  	my ($sex, $speaker, $begin, $end, $channel, $sentence) = ('', '', '', '','','');
  	foreach my $content (@filesContent){
  		my @splittedContent = split(/ /, $content);
  		
  		#
  		# If it's a new file
  		# 	
  		if($currentFile ne $splittedContent[0])
  		{
  			#
  			# not the first time: flush the turn collected so far
  			#
  			if($currentFile ne ''){
  				print $outstream "<Turn speaker=\"$speaker$sex\" startTime=\"$begin\" endTime=\"$end\" mode=\"\" fidelity=\"\" channel=\"$channel\">
  ";
  				print $outstream "<Sync time=\"$begin\"/>
  $sentence
  </Turn>
  ";
  			}
  			#
  			# initialize values
  			#
  			$currentFile = $splittedContent[0];
  			my @splittedFileName = split(/#/, $splittedContent[0]);
  			$sex = $splittedFileName[2];
  			$speaker = $splittedFileName[3];
  			# A speaker id starting with "S" is tagged studio, any other telephone
  			my $type = substr($speaker,0,1);
  			if($type eq "S"){
  				$channel = "studio";
  			} else{	
  				$channel = "telephone";
  			}
  			my @times =  split(/:/, $splittedFileName[1]);
  			$begin = $times[0]/100;
  			my $duration = $times[1]/100;
  			$end = $begin + $duration;
  			$sentence = $splittedContent[4];
  		} else {
  			#
  			# Continue the current sentence
  			#
  			$sentence .= ' '.$splittedContent[4];
  		}
  	}
  	#
  	# For the last turn 
  	#
  	print $outstream "<Turn speaker=\"$speaker$sex\" startTime=\"$begin\" endTime=\"$end\" mode=\"\" fidelity=\"\" channel=\"$channel\">
  ";
  	print $outstream "<Sync time=\"$begin\"/>
  $sentence
  </Turn>
  </Section>
  </Episode>
  ";
  	#----------
  	# END OF XML
  	#-----------
  	print $outstream "</Trans>";
  	if($file){
  		# Buffered write errors only show up when closing
  		close($file) or die ("Cannot close file : $out ($!)");
  	}
  }
  
  #
  # \brief convert ASR results to the USF format
  # \param out output file path; when false the document goes to STDOUT
  # \param utfConfig reference to the USF config hash; the keys read here
  #        are 'fileName' and "segFile"
  # \param filesContent ASR result lines; the fields used are the first
  #        one (showName#begin:duration#sex#speakerID), the word begin,
  #        the word duration, the word and a sixth field used as confidence
  #
  # The document holds the list of speakers found in the segmentation
  # file, then one <subtitle> per result line.
  # Dies when the output file cannot be opened or closed.
  #
  sub to_USF
  {	
  	my ($out, $utfConfig, @filesContent) = @_;
  	my $fileName = $$utfConfig{'fileName'};
  	my $segFile = $$utfConfig{"segFile"};
  	my $outstream = *STDOUT;
  	my $file;
  	my @segFileContent;
  	
  	chomp @filesContent;
  	@filesContent = sort ctm_sort @filesContent;
  	if($out){
  		# Three-argument open: the path is never parsed for a mode
  		open($file, '>', $out) or die ("Cannot open file : $out ($!)");
  		$outstream = $file;
  	}
  	print $outstream "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
  <USFSubtitles  xmlns:lia=\"http://www.lia.univ-avignon.fr/index.htm#mediaspeech\" xmlns:exa=\"http://schemas.exalead.com/exa\" version=\"1.0\">
  ";
  	#-------------
  	# SPEAKERS
  	#-------------
  	print $outstream "<lia:speakers>
  ";
  	#
  	# List each speaker: keep the segmentation lines mentioning the
  	# file name (matched literally), reduce them to their fields
  	# 4, 5 and 7 (0-based) and drop the duplicates
  	#
  	Utils::fill_array_from_file($segFile, '', \@segFileContent);
  	@segFileContent = grep (/\Q$fileName\E/, @segFileContent);
  	for(my $i = 0; $i <= $#segFileContent; $i++) 
  	{
  		my @splittedValue = split(/ /, $segFileContent[$i]);
  		$segFileContent[$i] = $splittedValue[4].' '.$splittedValue[5].' '.$splittedValue[7];
  	}
  	my %seen = ();
  	my @uniqSegFileContent = grep { ! $seen{$_} ++ } @segFileContent;
  	foreach my $line (@uniqSegFileContent)
  	{
  		chomp $line;
  		my @splittedLine = split(/ /, $line);
  		my $codesex = $splittedLine[0];
  		my $loc = $splittedLine[2];
  		# Anything but "F" is reported as male
  		my $sex;
  		if($codesex eq "F"){
  			$sex = "female";
  		} else{
  			$sex = "male";
  		}
  		print $outstream "<lia:speaker gender=\"$sex\" speaker_id=\"$codesex$loc\" language=\"fre\" language_confidence=\"1.00\"/>
  ";
  	}
  	print $outstream "</lia:speakers>
  ";
  	#-------------
  	# SUBTITLES TAG
  	#-------------
  	print $outstream "<subtitles>
  ";
  	#
  	# Fill with content => it's organized in subtitle tags (each subtitle
  	# tag corresponding to a specific word with its confidence)
  	#
  	foreach my $content (@filesContent){
  		my @splittedContent = split(/\s+/, $content);
  		my @splittedFileName = split(/#/, $splittedContent[0]);
  		my $sexLoc=$splittedFileName[2];
  		my $idLoc=$splittedFileName[3];
  		my $speakerId = $sexLoc.$idLoc;
  		# Timings are those of the word itself, not of the whole segment
  		my $begin = $splittedContent[2];
  		my $duration = $splittedContent[3];
  		my $word = $splittedContent[4];
  		# A line without a confidence field is reported as 0.000
  		my $rawConfidence = defined($splittedContent[5]) ? $splittedContent[5] : 0;
  		my $confidence = sprintf("%0.3f", $rawConfidence);
  		
  		print $outstream "<subtitle start=\"$begin\" duration=\"$duration\" lia:speaker_id=\"$speakerId\" exa:segment_id='0'>
  ";
  		print $outstream "<text lia:confidence=\"$confidence\" lia:language=\"fre\" lia:language_confidence=\"1.00\">$word</text>
  ";
  		print $outstream "</subtitle>
  ";
  	}
  	print $outstream "</subtitles>
  ";
  	#-----------
  	# END OF XML
  	#-----------
  	print $outstream "</USFSubtitles>";
  	if($file){
  		# Buffered write errors only show up when closing
  		close($file) or die ("Cannot close file : $out ($!)");
  	}
  }
  
  #
  # \brief convert ASR results to the TRN format
  # \param out output file path; when false the text goes to STDOUT
  # \param filesContent ASR result lines
  #        ('showName#begin:duration#sex#speakerID 1 word_begin word_duration word')
  #
  # The lines are processed in the order received (no sorting is done
  # here). The words (fifth field) of consecutive lines sharing the same
  # first field are written one after the other, followed by that first
  # field between parentheses.
  # Dies when the output file cannot be opened or closed.
  #
  sub to_TRN
  {
  	my ($out, @filesContent) = @_;
  	my $currentId = '';
  	my $outstream = *STDOUT;
  	my $file;
  	
  	chomp @filesContent;
  	if($out){
  		# Three-argument open: the path is used exactly as given
  		open($file, '>', $out) or die("Cannot open file : $out ($!)");
  		$outstream = $file;
  	}	
  	foreach my $line (@filesContent)
  	{
  		my @splittedLine = split(/ /, $line);
  		# first time
  		if(!$currentId)
  		{
  			$currentId = $splittedLine[0];
  		}
  		# if file change: end the utterance collected so far
  		elsif($currentId ne $splittedLine[0])
  		{
  			print $outstream "($currentId)
  ";
  			$currentId = $splittedLine[0];
  		}
  		print $outstream "$splittedLine[4] ";
  	}
  	print $outstream "($currentId)";
  	if($file)
  	{
  		# Close explicitly, like the other writers, so that buffered
  		# write errors are reported
  		close($file) or die("Cannot close file : $out ($!)");
  	}
  }
  
  #
  # \brief convert ASR results to txt
  # \param out output file path; when false the text goes to STDOUT
  # \param filesContent ASR result lines
  #        ('showName#begin:duration#sex#speakerID 1 word_begin word_duration word')
  #
  # The lines are ordered with ctm_sort, then the words (fifth field) are
  # written separated by a blank, with a line break after every 21 words.
  # Dies when the output file cannot be opened or closed.
  #
  sub to_TXT
  {
  	my ($out, @filesContent) = @_;
  	my $outstream = *STDOUT;
  	my $id = 0;
  	my $file;
  	
  	@filesContent = sort ctm_sort @filesContent;
  	chomp @filesContent;
  	if($out){
  		# Three-argument open: the path is never parsed for a mode
  		open($file, '>', $out) or die("Cannot open file : $out ($!)");
  		$outstream = $file;
  	}
  	foreach my $line (@filesContent)
  	{
  		my @splittedLine = split(/ /, $line);
  		print $outstream "$splittedLine[4] ";
  		# $id counts the words already written on the current line
  		if($id++ == 20)
  		{
  			print $outstream "
  ";
  			$id = 0;
  		}
  	} 
  	if($file)
  	{
  		# Buffered write errors only show up when closing
  		close($file) or die("Cannot close file : $out ($!)");
  	}
  }
  
  #
  # \brief ctm sort
  #
  # equivalent to "sort +0 -1 +1 -2 +2nb -3"
  #
  # Comparison routine for sort (works on $a and $b): orders two lines by
  # the show name of their first field, then by their second field as a
  # string, then by their third field as a number.
  #
  sub ctm_sort
  {
  	my ($leftId, $leftChannel, $leftBegin) = split(/ /, $a);
  	my ($rightId, $rightChannel, $rightBegin) = split(/ /, $b);
  
  	my $order = show_name($leftId) cmp show_name($rightId);
  	# Later keys only matter while the earlier ones are tied
  	$order = $leftChannel cmp $rightChannel if(!$order);
  	$order = $leftBegin <=> $rightBegin if(!$order);
  	return $order;
  }
  
  #
  # \brief get the show name from the full filename
  # \param full filename
  # \return show name, i.e. what precedes the first '#'
  #         (the whole string when it holds no '#')
  #
  # full filename format : show_name#begin:duration#speakerType#speakerId
  #
  sub show_name
  {
  	# List assignment: a scalar assignment from @_ would yield the
  	# number of arguments instead of the filename
  	my ($fullFileName) = @_;
  	my @splittedFileName = split(/#/, $fullFileName);
  	return $splittedFileName[0];
  }
  
  #
  # \brief format a number of seconds as HH:MM:SS.mmm
  # \param time number of seconds, with an optional fractional part
  # \return formatted string; the fractional part is always written with
  #         three digits (padded with zeros or cut, never rounded)
  #
  sub goodFormattedHour
  {
  	my ($time) = @_;
  
  	# The fractional part is handled as text to avoid any rounding
  	my (undef, $fraction) = split(/\./, $time);
  	$fraction = '' if(!defined($fraction));
  	my $millis = substr($fraction.'000', 0, 3);
  
  	my $hour = int($time/3600);
  	# The % operator works on the integer part of its operands
  	my $remaining = $time % 3600;
  	my $min = int($remaining/60);
  	my $sec = $remaining % 60;
  
  	return sprintf("%02d:%02d:%02d", $hour, $min, $sec).".".$millis;
  }
  __END__
  
  =head1 NAME
  
  res2out.pl - convert ASR results to another format
  
  =head1 SYNOPSIS
  
  res2out.pl [options] 
  
  Options:
  	-help|?		brief help message
  
  	-dir		input dir containing ASR results files (default .)
  
  	-format		trs | ctm | txt | trn | usf
  
  	-ext		file extension of the ASR results files (default res)
  
  	-ignore		file containing words to ignore in the ASR results
  
  	-out		output file (default STDOUT)
  
  	-trs_config	file containing the config of the trs transformation
  
  			name <author_name>
  			fileName <sound_file_name>
  			fileExt <sound_file_extension>
  			segFile <segmentation_file>
  
  			Example :
  	
  			name emmanuel
  			fileName fr3_1900_2000
  			fileExt wav
  			segFile	/home/emmanuel/ASR/test/fr3_1900_2000.seg
  
  	-usf_config	file containing the config of the usf transformation
  
  			name <author_name>
  			fileName <sound_file_name>
  			segFile <segmentation_file>
  
  			Example :
  			
  			name emmanuel
  			fileName fr3_1900_2000
  			segFile /home/emmanuel/ASR/test/fr3_1900_2000.seg