#!/usr/bin/perl -w

#------------------------------------------
# Author : Emmanuel FERREIRA
# Contact: emmanuel.ferreira0194@gmail.com
# Date : 30/09/11
#------------------------------------------

use strict;
use Getopt::Long;
use Pod::Usage;
use Switch;

use lib "$ENV{'OTMEDIA_HOME'}/tools/scripts/perlmod";
use Utils;

#-------------------------------------------
# MAIN
#-------------------------------------------

# options variables
my $help = 0;
my $format = 'ctm';
my $inExt = 'res';
my $confidenceExt = 'res';	# NOTE(review): not referenced in the visible code
my $inDir = '.';
my $out;
my $ignFile;
my $trsConfigFile;
my $usfConfigFile;
my $bdlex;			# NOTE(review): not referenced in the visible code

# useful variables
my @filesList;
my @filesContent;
my $fileFilterRegex='';
my %trsConfig;
my %usfConfig;

# Parse options and print usage if there is a syntax
# error, or if usage was explicitly requested
GetOptions('help|?' => \$help, 
	   'format=s' => \$format,
	   'dir=s' => \$inDir,
	   'ext=s' => \$inExt,
	   'out=s' => \$out,
	   'ignore=s' => \$ignFile,
	   'trs_config=s' => \$trsConfigFile,
	   'usf_config=s' => \$usfConfigFile) or pod2usage(2);

# Output format (lowercased) => conversion routine
my $formatName = lc($format);
my %converterFor = (
	"ctm" => sub { to_CTM($out, @filesContent); },
	"trs" => sub { to_TRS($out, \%trsConfig, @filesContent); },
	"trn" => sub { to_TRN($out, @filesContent); },
	"txt" => sub { to_TXT($out, @filesContent); },
	"usf" => sub { to_USF($out, \%usfConfig, @filesContent); },
);

# Options test
pod2usage(1) if($help);
# An unknown format is reported instead of silently producing no output
pod2usage({-msg =>"\nERROR : unknown output format '$format', it must be one of ctm, trn, trs, txt, usf\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!exists($converterFor{$formatName}));
pod2usage({-msg =>"\nERROR : you must specify a trs config file (--trs_config) if you want to obtain a TRS formatted output\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(($formatName eq "trs") && (!$trsConfigFile));
pod2usage({-msg =>"\nERROR : you must specify a usf config file (--usf_config) if you want to obtain a USF formatted output\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(($formatName eq "usf") && (!$usfConfigFile));

# The extension is handed over as a regex fragment, hence the escaped dot
if($inExt){
	$inExt = '\.'.$inExt;
}

Utils::fill_array_from_dir($inDir, $inExt, \@filesList);

pod2usage({-msg =>"\nERROR : there is no ASR result file ($inExt) located in $inDir\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(@filesList == 0);

# Build the filter regex from the ignore list: one alternative per item,
# surrounded by whitespace
if($ignFile){
	my @ignItems;
	Utils::fill_array_from_file($ignFile, '', \@ignItems);
	chomp @ignItems;
	# a blank entry would add an empty alternative to the regex
	@ignItems = grep { defined($_) && length($_) } @ignItems;
	if(@ignItems){
		$fileFilterRegex = "\\s(".formattedRegex(join("|", @ignItems)).")\\s";
	}
}

# Gather the content of every result file in a single array
foreach my $file (@filesList){
	my $filePath = $inDir."/".$file;
	my @fileContent;
	Utils::fill_array_from_file($filePath, $fileFilterRegex, \@fileContent);
	push(@filesContent, @fileContent);
}

pod2usage({-msg =>"\nERROR : Bad res format, it must be 'showName#begin:duration#sex#speakerID 1 word_begin word_duration word'", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!checkResFormat(@filesContent));

if($trsConfigFile){
	Utils::fill_hash_from_file($trsConfigFile,' ', \%trsConfig);
}

if($usfConfigFile){
	Utils::fill_hash_from_file($usfConfigFile,' ', \%usfConfig);
}

# Run the conversion selected by --format
$converterFor{$formatName}->();

#-------------------------------------------
# SUBROUTINES
#-------------------------------------------
#

# 
# \brief check format of the first line of the ASR results
# \param resFileContent array holding the ASR results
# \return 1 when the first line has at least 5 space-separated fields and its
#         first field holds 4 or 5 '#'-separated parts whose second part is
#         'begin:duration'; 0 otherwise (including when there is no line at all)
#
sub checkResFormat
{
	my @resFileContent = @_;

	# nothing to parse: an empty result set cannot be well formatted
	return 0 unless(@resFileContent && defined($resFileContent[0]));

	# only the first line is inspected
	my @field = split(/ /, $resFileContent[0]);
	return 0 if(@field < 5);

	# identifier field: 4 or 5 parts separated by '#'
	my @fileNameField = split(/#/, $field[0]);
	return 0 unless(@fileNameField == 4 || @fileNameField == 5);

	# second part of the identifier: exactly two values separated by ':'
	my @timeField = split(/:/, $fileNameField[1]);
	return (@timeField == 2) ? 1 : 0;
}

#
# \brief escape the regex metacharacters of the string passed in argument so
#        that they are matched literally
# \param string
# \return formatted string
#
# The '|' character is deliberately left untouched: the caller joins the
# items to ignore with it to build an alternation.
#
sub formattedRegex
{
	my ($regex) = @_;
	# one pass over the string: each of / ( ) * + { } . [ ] ? ^ $ \ gets a
	# leading backslash (a single pass also avoids escaping twice)
	$regex =~ s/([\/()*+{}.\[\]?^\$\\])/\\$1/g;
	return $regex;
}

#
# \brief convert ASR results to the ctm format
# \param out output file; the lines are printed on STDOUT when it is not set
# \param filesContent lines of the ASR results
#
sub to_CTM
{
	my ($out, @filesContent) = @_;

	my @ctmLines = sort ctm_sort @filesContent;
	foreach my $ctmLine (@ctmLines)
	{
		# keep the first capture of the '#'-separated identifier and the
		# text following the identifier
		$ctmLine =~ s/(\S+)#(\S+)#(\S+)#(\S+) (.+)/$1 $5/g;
	}

	if(!$out)
	{
		print @ctmLines;
	}
	else
	{
		Utils::fill_file_from_array($out, @ctmLines);
	}
}

#
# \brief convert ASR results to the TRS format
# \param out output file; the XML is printed on STDOUT when it is not set
# \param trsConfig reference to the trs config hash; the keys read here are
#        name, fileName, fileExt (or extFile) and segFile
# \param filesContent lines of the ASR results
#
sub to_TRS
{
	my ($out, $trsConfig, @filesContent) = @_;
	
	my $authorName = $$trsConfig{"name"};
	my $fileName =  $$trsConfig{"fileName"};
	# The extension is read through the reference received in argument, like the
	# other keys; 'extFile' is accepted as an alternative spelling of 'fileExt'.
	my $fileExt = defined($$trsConfig{"fileExt"}) ? $$trsConfig{"fileExt"} : $$trsConfig{"extFile"};
	my $segFile = $$trsConfig{"segFile"};
	my $outstream = *STDOUT;
	my $file;
	my @segFileContent;
	
	chomp @filesContent;
	@filesContent = sort ctm_sort @filesContent;
	if($out){
		# three-argument open: $out is always taken as a plain file name
		open($file, '>', $out) or die ("Cannot open file : $out ($!)");
		$outstream = $file;
	}
	#-----------
	# XML Header
	#----------
	print $outstream "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
	print $outstream "<!DOCTYPE Trans SYSTEM \"trans-14.dtd\">\n";
	print $outstream "<Trans scribe=\"".$authorName."\" audio_filename=\"".$fileName.".".$fileExt."\" version=\"1\" version_date=\"051107\">\n";
	#-----------
	# Speakers
	#-----------
	print $outstream "<Speakers>\n";
		#
		# List each speakers
		#
		Utils::fill_array_from_file($segFile, '', \@segFileContent);
		# keep the segmentation lines containing the sound file name (literal match)
		@segFileContent = grep (/\Q$fileName\E/, @segFileContent);
		# reduce each line to its fields 4, 5 and 7 (used below as sex code,
		# type and speaker label)
		foreach my $segLine (@segFileContent)
		{
			my @splittedValue = split(/ /, $segLine);
			$segLine = $splittedValue[4].' '.$splittedValue[5].' '.$splittedValue[7];
		}
		# one <Speaker> tag per distinct reduced line
		my %seen = ();
		my @uniqSegFileContent = grep { ! $seen{$_} ++ } @segFileContent;
		foreach my $line (@uniqSegFileContent)
		{
			chomp $line;
			my @splittedLine = split(/ /, $line);
			my $codesex = $splittedLine[0];
			my $loc = $splittedLine[2];
			my $sex;
			# every code other than "F" is reported as male
			if($codesex eq "F"){
				$sex = "female";
			} else{
				$sex = "male";
			}
			print $outstream "<Speaker id=\"$loc$codesex\" name=\"$loc$codesex\" check=\"yes\" type=\"$sex\" dialect=\"native\" accent=\"\" scope=\"local\"/>\n";
		}
	print $outstream "</Speakers>\n";
	#---------
	# Episode
	#---------
	#
	# In order to calculate the ending time: begin + duration found in the
	# identifier of the last sorted line, both divided by 100
	#
	my $lastLine = $filesContent[$#filesContent];
	my @splittedLastLine = split(/ /, $lastLine);
	my @splittedLastFileName = split(/#/, $splittedLastLine[0]);
	my @timesLastFile = split(/:/, $splittedLastFileName[1]);
	my $endTime = ($timesLastFile[0]/100) + ($timesLastFile[1]/100);
	print $outstream "<Episode>\n<Section type=\"report\" startTime=\"0\" endTime=\"$endTime\">\n";
	#
	# Fill with content => it's organized in turn (a turn corresponding 
	# to a specific features file)
	#
	my $currentFile = '';
	my ($sex, $speaker, $begin, $end, $channel, $sentence) = ('', '', '', '','','');
	foreach my $content (@filesContent){
		my @splittedContent = split(/ /, $content);
		
		#
		# If it's a new file
		# 	
		if($currentFile ne $splittedContent[0])
		{
			#
			# not the first time: flush the turn accumulated so far
			#
			if($currentFile){
				print $outstream "<Turn speaker=\"$speaker$sex\" startTime=\"$begin\" endTime=\"$end\" mode=\"\" fidelity=\"\" channel=\"$channel\">\n";
 				print $outstream "<Sync time=\"$begin\"/>\n$sentence\n</Turn>\n";
			}
			#
			# initialize values from the identifier of the new file
			#
			$currentFile = $splittedContent[0];
			my @splittedFileName = split(/#/, $splittedContent[0]);
			$sex = $splittedFileName[2];
			$speaker = $splittedFileName[3];
			# a speaker label starting with "S" is reported as studio,
			# anything else as telephone
			my $type = substr($speaker,0,1);
			if($type eq "S"){
				$channel = "studio";
			} else{	
				$channel = "telephone";
			}
			my @times =  split(/:/, $splittedFileName[1]);
			$begin = $times[0]/100;
			my $duration = $times[1]/100;
			$end = $begin + $duration;
			$sentence = $splittedContent[4];
		} else {
			#
			# Continue the current sentence
			#
			$sentence .= ' '.$splittedContent[4];
		}
	}
	#
	# For the last turn 
	#
	print $outstream "<Turn speaker=\"$speaker$sex\" startTime=\"$begin\" endTime=\"$end\" mode=\"\" fidelity=\"\" channel=\"$channel\">\n";
	print $outstream "<Sync time=\"$begin\"/>\n$sentence\n</Turn>\n</Section>\n</Episode>\n";
	#----------
	# END OF XML
	#-----------
	print $outstream "</Trans>";
	if($file){
		# buffered write errors only show up when the handle is closed
		close($file) or die ("Cannot close file : $out ($!)");
	}
}

#
# \brief convert ASR results to the USF format
# \param out output file; the XML is printed on STDOUT when it is not set
# \param utfConfig reference to the usf config hash; the keys read here are
#        fileName and segFile
# \param filesContent lines of the ASR results
#
sub to_USF
{	
	my ($out, $utfConfig, @filesContent) = @_;
	my $fileName = $$utfConfig{'fileName'};
	my $segFile = $$utfConfig{"segFile"};
	my $outstream = *STDOUT;
	my $file;
	my @segFileContent;
	
	chomp @filesContent;
	@filesContent = sort ctm_sort @filesContent;
	if($out){
		# three-argument open: $out is always taken as a plain file name
		open($file, '>', $out) or die ("Cannot open file : $out ($!)");
		$outstream = $file;
	}
	print $outstream "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<USFSubtitles  xmlns:lia=\"http://www.lia.univ-avignon.fr/index.htm#mediaspeech\" xmlns:exa=\"http://schemas.exalead.com/exa\" version=\"1.0\">\n";
	#-------------
	# SPEAKERS
	#-------------
	print $outstream "<lia:speakers>\n";
		#
		# list each speakers
		# 	
		Utils::fill_array_from_file($segFile, '', \@segFileContent);
		# keep the segmentation lines containing the sound file name (literal match)
		@segFileContent = grep (/\Q$fileName\E/, @segFileContent);
		# reduce each line to its fields 4, 5 and 7 (fields 4 and 7 are used
		# below as sex code and speaker label)
		foreach my $segLine (@segFileContent)
		{
			my @splittedValue = split(/ /, $segLine);
			$segLine = $splittedValue[4].' '.$splittedValue[5].' '.$splittedValue[7];
		}
		# one <lia:speaker> tag per distinct reduced line
		my %seen = ();
		my @uniqSegFileContent = grep { ! $seen{$_} ++ } @segFileContent;
		foreach my $line (@uniqSegFileContent)
		{
			chomp $line;
			my @splittedLine = split(/ /, $line);
			my $codesex = $splittedLine[0];
			my $loc = $splittedLine[2];
			my $sex;
			# every code other than "F" is reported as male
			if($codesex eq "F"){
				$sex = "female";
			} else{
				$sex = "male";
			}
			print $outstream "<lia:speaker gender=\"$sex\" speaker_id=\"$codesex$loc\" language=\"fre\" language_confidence=\"1.00\"/>\n";
		}
 	print $outstream "</lia:speakers>\n";
	#-------------
	# SUBTITLES TAG
	#-------------
	print $outstream "<subtitles>\n";
		#
		# fill with content => it's organized in subtitle tag (each subtitle tag corresponding 
		# to a specific word with its confidence)
		#
		foreach my $content (@filesContent){
			my @splittedContent = split(/\s+/, $content);
			my @splittedFileName = split(/#/, $splittedContent[0]);
			my $sexLoc=$splittedFileName[2];
			my $idLoc=$splittedFileName[3];
			my $speakerId = $sexLoc.$idLoc;
			# begin and duration are the values carried by the line itself
			# (fields 2 and 3), not the times embedded in the identifier
			my $begin = $splittedContent[2];
			my $duration = $splittedContent[3];
			my $word = $splittedContent[4];
			# a line without a 6th field gets a confidence of 0.000
			my $rawConfidence = defined($splittedContent[5]) ? $splittedContent[5] : 0;
			my $confidence = sprintf("%0.3f", $rawConfidence);
			
			print $outstream "<subtitle start=\"$begin\" duration=\"$duration\" lia:speaker_id=\"$speakerId\" exa:segment_id='0'>\n";
				print $outstream "<text lia:confidence=\"$confidence\" lia:language=\"fre\" lia:language_confidence=\"1.00\">$word</text>\n";
			print $outstream "</subtitle>\n";
		}
	print $outstream "</subtitles>\n";
	#-----------
	# END OF XML
	#-----------
	print $outstream "</USFSubtitles>";
	if($file){
		# buffered write errors only show up when the handle is closed
		close($file) or die ("Cannot close file : $out ($!)");
	}
}

#
# \brief convert ASR results to the TRN format
# \param out output file; the text is printed on STDOUT when it is not set
# \param filesContent lines of the ASR results
#
# The words (5th field of each line) of consecutive lines sharing the same
# identifier (1st field) are printed on one line, followed by the identifier
# between parentheses. The lines are processed in the order received.
#
sub to_TRN
{
	my ($out, @filesContent) = @_;
	my $currentId = '';
	my $outstream = *STDOUT;
	my $file;
	
	chomp @filesContent;
	if($out){
		# three-argument open: $out is always taken as a plain file name
		open($file, '>', $out) or die("Cannot open file : $out ($!)");
		$outstream = $file;
	}	
	foreach my $line (@filesContent)
	{
		my @splittedLine = split(/ /, $line);
		# first time
		if(!$currentId)
		{
			$currentId = $splittedLine[0];
		}
		# if file change: close the previous utterance with its identifier
		elsif($currentId ne $splittedLine[0])
		{
			print $outstream "($currentId)\n";
			$currentId = $splittedLine[0];
		}
		print $outstream "$splittedLine[4] ";
	}
	# identifier of the last utterance (no trailing newline)
	print $outstream "($currentId)";
	if($file){
		# buffered write errors only show up when the handle is closed
		close($file) or die("Cannot close file : $out ($!)");
	}
}

#
# \brief convert ASR results to txt
# \param out output file; the text is printed on STDOUT when it is not set
# \param filesContent lines of the ASR results
#
# Prints the word (5th field) of each sorted line followed by a space, with a
# line break after every 21st word.
#
sub to_TXT
{
	my ($out, @filesContent) = @_;
	my $outstream = *STDOUT;
	my $id = 0;
	my $file;
	
	@filesContent = sort ctm_sort @filesContent;
	chomp @filesContent;
	if($out){
		# three-argument open: $out is always taken as a plain file name
		open($file, '>', $out) or die("Cannot open file : $out ($!)");
		$outstream = $file;
	}
	foreach my $line (@filesContent)
	{
		my @splittedLine = split(/ /, $line);
		print $outstream "$splittedLine[4] ";
		# $id counts the words already printed on the current output line
		if($id++ == 20)
		{
			print $outstream "\n";
			$id = 0;
		}
	} 
	if($file)
	{
		# buffered write errors only show up when the handle is closed
		close($file) or die("Cannot close file : $out ($!)");
	}
}

#
# \brief ctm sort
#
# Comparison routine meant to be given to sort(): it compares the two lines
# held in $a and $b on the value show_name() returns for their 1st field,
# then on their 2nd field as strings, then on their 3rd field as numbers.
#
sub ctm_sort
{
	my ($nameA, $secondA, $thirdA) = split(/ /, $a);
	my ($nameB, $secondB, $thirdB) = split(/ /, $b);

	# each further key is only consulted while the previous ones are equal
	my $order = show_name($nameA) cmp show_name($nameB);
	$order = $secondA cmp $secondB unless($order);
	$order = $thirdA <=> $thirdB unless($order);
	return $order;
}

#
# \brief get the show name from the full filename
# \param full filename
# \return show name (text located before the first '#'), or an empty string
#         when there is nothing to extract
#
# full filename format : show_name#begin:duration#speakerType#speakerId
#
sub show_name
{
	# list assignment: take the first argument itself, not the argument count
	my ($fullFileName) = @_;
	return '' unless(defined($fullFileName));

	my ($showName) = split(/#/, $fullFileName);
	return defined($showName) ? $showName : '';
}

sub goodFormattedHour
{
	# Formats a number of seconds as "HH:MM:SS.<fraction>".
	# \param time number of seconds, optionally with a decimal part
	# \return the formatted string; the fraction is the text found after the
	#         first '.' followed by "0", or "000" when that text is missing or false
	my ($time) = @_;

	# text located between the first and the second '.' (if any)
	my (undef, $fraction) = split(/\./, $time);

	my $hours = int($time / 3600);
	# '%' works on integers, so the decimal part is dropped from here on
	my $remaining = $time % 3600;
	my $minutes = int($remaining / 60);
	my $seconds = $remaining % 60;

	my $suffix = $fraction ? $fraction."0" : "000";
	return sprintf("%02d:%02d:%02d.%s", $hours, $minutes, $seconds, $suffix);
}
__END__

=head1 NAME

res2out.pl - convert ASR results to other formats

=head1 SYNOPSIS

res2out.pl [options] 

Options:
	-help|?		brief help message

	-dir		input dir containing ASR results files (default .)

	-format		trs | ctm | txt | trn | usf

	-ext		file extension of the ASR results files (default res)

	-ignore		file containing words to ignore in the ASR results

	-out		output file (default STDOUT)

	-trs_config	file containing the config of the trs transformation

			name <author_name>
			fileName <sound_file_name>
			fileExt <sound_file_extension>
			segFile <segmentation_file>

			Example :
	
			name emmanuel
			fileName fr3_1900_2000
			fileExt wav
			segFile	/home/emmanuel/ASR/test/fr3_1900_2000.seg

	-usf_config	file containing the config of the usf transformation

			name <author_name>
			fileName <sound_file_name>
			segFile <segmentation_file>

			Example :
			
			name emmanuel
			fileName fr3_1900_2000
			segFile /home/emmanuel/ASR/test/fr3_1900_2000.seg
