ConvertSuperCTMtoDataSVM.pl 5.79 KB
#!/usr/bin/perl -w

 
use strict;
use warnings;

# Comparator for sort(): orders two values numerically, smallest first.
# Reads the package variables $a and $b that sort() sets for each comparison.
sub par_num
{
	my $ordre = ($a <=> $b);
	return $ordre;
}

# Main program: converts a classified SuperCTM file into one feature vector per
# word id and prints every vector of the expected size on STDOUT through
# PrintFeatures.
#
# Command line (five arguments are required, otherwise the usage is printed):
#   $ARGV[0]  input file ("-" reads STDIN)
#   $ARGV[1]  output format; only the value "boost" changes the behaviour here
#   $ARGV[2]  order N, optionally containing a "+" (one extra position);
#             a vector holds ((N-1)*2 + 1 + extra) * <number of features> values
#   $ARGV[3]  0 = with the "boost" format, missing features are padded with "?"
#   $ARGV[4]  1 = build the file information that PrintFeatures prints as "ref="
#
# Input lines, as parsed below: three whitespace-separated columns (numeric id,
# numeric order, class), then tab-separated columns; the 4th and 6th
# tab-separated columns are read as file information and word start, and the
# features are "name=value" columns delimited by tabs.
# Example line (column whitespace may not reflect the real tab layout):
#52	0	HV	 19990628_1900_1920_inter 1      0       10      des     LogProbUni=-2.14847     Dur=10  win=2   ppl=-6.47632    LMProb=-2.16605 NgramP=0        Ngram=2 NgramS=2            GaussRatio=8.51 LogVrais=3.2842e+01     LogVraisFrame=3.2842e+00                Conf=1.369455   DAP1=3.28       DAP2=7.57       nodes=7 posterior=0.781947      min=    8.34465e-07 max=0.781947    mean=0.142857   var=0.496395    svar=0.704553   nullBef=0       nullAf=0
if (@ARGV < 5)
{
	print "<SuperCTM classifie> <svm|boost> <ordre> <only complete vectors : 0|1> <add file info 0:1>\n"
}
else
{
	my ($NomFichier, $Format, $TailleVecteur, $OnlyComplete, $AddFileInfo) = @ARGV;

	# Feature names in vector order: the name at position k (1-based) is stored
	# under key k + <number of features> * order.
	my @NomsFeatures = qw(
		LogProbUni Dur win ppl LMProb NgramP Ngram NgramS GaussRatio
		LogVrais LogVraisFrame Conf DAP1 DAP2 nodes posterior
		min max mean var svar nullBef nullAf
		Mot Genre Type NgramP0 NgramS2 ML
	);
	my $NbFeatures = scalar @NomsFeatures;

	# One precompiled pattern per feature. Every value runs up to the next tab,
	# except nullAf which runs up to the end of the line.
	# NOTE(review): this presumably assumes nullAf is the last column - confirm
	# against the real data.
	my @Motifs = map { $_ eq 'nullAf' ? qr/\tnullAf=(.*)/ : qr/\t\Q$_\E=(.*?)\t/ } @NomsFeatures;

	# A "+" in the order requests one extra position in the vector.
	my $Add = 0;
	if ($TailleVecteur =~ s/\+//)
	{
		$Add = 1;
	}
	$TailleVecteur -= 1;
	my $NbAttendu = ($TailleVecteur*2+1+$Add)*$NbFeatures;

	my %Features;            # feature key => value for the group being read
	my @Mots;                # word of every line of the group being read
	my $Classe;              # class column of the latest line of the group
	my $IdPrec;              # id of the group being read (undef before line 1)
	my $FileInfo = 0;        # 0 means "no file information" for PrintFeatures
	my $DureeMotCourant = 0; # Dur value of the order-0 line of the group

	# Emits the group accumulated so far, if any. $Signaler tells whether a
	# vector of unexpected size is reported: the original script reported it
	# for every group except the last one of the file, which is kept as is.
	my $Flush = sub
	{
		my ($Signaler) = @_;

		return unless %Features;

		if ($OnlyComplete == 0 && $Format eq "boost")
		{
			# "?" stands for a missing value.
			for my $i (1 .. $NbAttendu)
			{
				$Features{$i} = "?" unless exists $Features{$i};
			}
		}

		my $Nb = scalar(keys(%Features));
		if ($Nb == $NbAttendu)
		{
			if ($AddFileInfo)
			{
				my $newdur = $DureeMotCourant/100;
				$FileInfo .= "_$newdur";
			}
			PrintFeatures(\%Features, $Classe, $Format, \@Mots, $FileInfo);
		}
		elsif ($Signaler)
		{
			print "ERROR : nb features : $Nb , expected : $NbAttendu\n";
		}
		return;
	};

	# Three-argument open on a lexical handle: the file name can no longer be
	# interpreted as a mode or a command. "-" is still accepted for STDIN, as
	# the former two-argument open allowed.
	my $CTM;
	if ($NomFichier eq '-')
	{
		$CTM = \*STDIN;
	}
	else
	{
		open($CTM, '<', $NomFichier) or die "Ouverture de $NomFichier loupée ! ($!)";
	}

	# Read line by line instead of loading the whole file in memory.
	LIGNE: while (my $Ligne = <$CTM>)
	{
		chomp $Ligne;
		next LIGNE if $Ligne =~ /^\s*$/;

		# Lines without the "<id> <order> <class>" header used to be processed
		# with captures left over from an earlier match; skip them instead.
		unless ($Ligne =~ /^\s*([0-9]+)\s+([0-9]+)\s+(\S+)(?:\s|$)/)
		{
			warn "WARNING : line $. skipped (no <id> <order> <class> header)\n";
			next LIGNE;
		}
		my ($Id, $Ordre, $ClasseLigne) = ($1, $2, $3);

		# Word of the line; empty when the line carries no "Mot=" column.
		my $Mot = ($Ligne =~ /Mot=(\S*)/) ? $1 : "";

		# A new id closes the previous group.
		if (!defined $IdPrec || $Id != $IdPrec)
		{
			$Flush->(1);
			%Features = ();
			@Mots = ();
			$IdPrec = $Id;
		}

		# Updated only after the flush, so that the flushed group is printed
		# with its own class and not with the class of the next group.
		$Classe = $ClasseLigne;

		my $IdFeature = $NbFeatures*$Ordre;

		push @Mots, $Mot;

		for my $Rang (1 .. $NbFeatures)
		{
			next unless $Ligne =~ $Motifs[$Rang-1];
			my $Valeur = $1;

			if ($NomsFeatures[$Rang-1] eq 'Dur' && $Ordre == 0)
			{
				$DureeMotCourant = $Valeur;

				if ($AddFileInfo == 1)
				{
					my @Colonnes = split /\t/, $Ligne;
					$FileInfo = defined $Colonnes[3] ? $Colonnes[3] : "";
					my $DebutMot = defined $Colonnes[5] ? $Colonnes[5] : 0;

					# Add the number that precedes the last ":" of the file
					# information, when there is one. The whole number is
					# taken (the former greedy pattern kept its last digit
					# only) and nothing is added when there is no such number
					# (a stale capture used to be added).
					# NOTE(review): presumably a segment offset in the same
					# unit as the word start - confirm.
					if ($FileInfo =~ /(?:.*[^0-9])?([0-9]+):/)
					{
						$DebutMot += $1;
					}

					$DebutMot /= 100;

					$FileInfo .= "_$DebutMot";
				}
			}

			# Empty values are treated as missing features.
			my $Cle = $Rang+$IdFeature;
			if ($Valeur eq "")
			{
				delete $Features{$Cle};
			}
			else
			{
				$Features{$Cle} = $Valeur;
			}
		}
	}

	if ($NomFichier ne '-')
	{
		close($CTM);
	}

	# Added so that the last word is not forgotten: flush the final group,
	# without reporting a size mismatch.
	$Flush->(0);
}



# Prints one feature vector on STDOUT as a single line.
#
# Parameters:
#   $Table     hash reference, numeric feature key => value
#   $Classe    accepted but not used
#   $Format    accepted but not used
#   $Mots      array reference; only its first element is printed
#   $FileInfo  file information, or 0 when there is none
#
# The line is made of the values of %$Table ordered by ascending numeric key
# and joined with ", ", then ".", then - unless $FileInfo is 0 -
# "ref=<FileInfo>_<first element of @$Mots>".
sub PrintFeatures
{
	my ($Table, $Classe, $Format, $Mots, $FileInfo)=@_;

	my @Valeurs = map { $Table->{$_} } sort { $a <=> $b } keys %$Table;

	my $Sortie = join(", ", @Valeurs) . ".";

	unless ($FileInfo eq 0)
	{
		$Sortie .= "ref=${FileInfo}_$Mots->[0]";
	}

	print "$Sortie\n";
}