scoredCtmAndTaggedLem2All.pl 1.17 KB
#!/usr/bin/perl
#
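# Merge a scored CTM stream (read from STDIN) with a tagger/lemmatizer
# output file, printing "basename time word tag lemma confidence" lines.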
#

use strict;
use warnings;

# Usage: cat <scored_ctm file> | <this script> <taggerlem file>
#
# STDIN lines are split on single spaces; field 0 is used as the basename,
# field 2 as the time, field 4 as the word token and field 5 as the
# confidence.  A word token may join several words with "_"; each part is
# matched, in order, against the next entry ("word tag lemma") of the
# tagger file.  For every part one line is printed on STDOUT:
#
#   <basename> <time> <word> <tag> <lemma> <confidence>   when the words agree
#   ERROR <tagger word> <ctm word>                        when they differ
#
# Dies when the argument count is wrong or the tagger file cannot be opened.

my $tagLemFile = "";
if (@ARGV == 1) {
    $tagLemFile = $ARGV[0];
}
else {
    # $0 keeps the usage text correct whatever the script is called.
    die "BAD USAGE : cat <scored_ctm file> | $0 <taggerlem file>\n";
}

# Three-argument open with a lexical handle: the file name taken from the
# command line can never be interpreted as an open mode or a pipe.
open(my $taglem_fh, '<', $tagLemFile)
    or die "Cannot open '$tagLemFile': $!\n";

# Set once the tagger file has run dry, so the problem is reported only once.
my $warned_exhausted = 0;

CTM_LINE:
while (my $line = <STDIN>) {
    chomp $line;
    my @fields = split / /, $line;
    my ($basename, $time, $words, $conf) = @fields[0, 2, 4, 5];

    # Blank or truncated line: there is no word column to align.
    next CTM_LINE unless defined $words;

    # Tokens containing markup such as <s>, </s> or <foo:bar> are skipped
    # without consuming any tagger entry.
    next CTM_LINE if $words =~ m/\<\/?[a-zA-Z:]+\>/;

    # A missing confidence column is printed as an empty field.
    $conf = '' unless defined $conf;

    for my $ctm_word (split /_/, $words) {
        my ($tword, $ttag, $tlem) = next_taglem_entry($taglem_fh);

        unless (defined $tword) {
            # No tagger entry left for this word.  Nothing more can be
            # printed, but STDIN is still read to the end so the upstream
            # process of the pipeline is not cut off.
            warn "WARNING: '$tagLemFile' ended before CTM word "
               . "'$ctm_word' ($basename $time); "
               . "remaining CTM words are dropped\n"
                unless $warned_exhausted++;
            next CTM_LINE;
        }

        if ($tword eq $ctm_word) {
            print "$basename $time $tword $ttag $tlem $conf\n";
        }
        else {
            print "ERROR $tword $ctm_word\n";
        }
    }
}

close $taglem_fh;

# Read the next usable entry from the tagger file handle.
#
# Entries whose first field contains a "#<digits>-<digits>#" marker are
# skipped.  Returns (word, tag, lemma), with missing columns replaced by
# empty strings, or an empty list once the file is exhausted.
sub next_taglem_entry {
    my ($fh) = @_;

    while (defined(my $entry = <$fh>)) {
        # Chomp before splitting so the newline never ends up inside a field.
        chomp $entry;
        my ($word, $tag, $lemma) = split / /, $entry;

        for my $column ($word, $tag, $lemma) {
            $column = '' unless defined $column;
        }

        next if $word =~ /#[0-9]+-[0-9]+#/;
        return ($word, $tag, $lemma);
    }

    return;
}