# 2.75 KB
import subprocess
import os
from BaseProcessor import baseProcessor
import nltk
import re

class Tagger(baseProcessor):
    """Call the LIA_TAGG tools to clean, POS-tag and lemmatize text.

    NOTE(review): ``p`` and ``p2`` used below are not defined anywhere in the
    visible code. Presumably they are process handles (objects exposing
    ``communicate``) for the LIA_TAGG cleaning and tagging scripts; they must
    be provided for ``clean`` and ``tagg`` to run -- confirm against the
    original setup.
    """

    def clean(self, dirtyString):
        """Clean a string so that it can be fed to lia_tagg.

        The text is converted to ISO-8859-1 and piped through ``p``, which is
        expected to put one word per line and to separate sentences with
        <s> </s>.

        dirtyString -- text to clean. It is encoded directly, so a unicode
                       string is presumably expected (TODO confirm callers).

        Returns the output of ``p`` re-encoded as a UTF-8 byte string.
        """
        cleanString, _ = p.communicate(input=dirtyString.encode('iso8859-1'))
        return cleanString.decode("iso8859-1").encode("utf8")

    def tagg(self, cleanString):
        """POS-tag and lemmatize a string that comes from ``clean``.

        cleanString -- UTF-8 encoded byte string, as returned by ``clean``.

        Returns the output of ``p2`` re-encoded as a UTF-8 byte string.
        """
        # lia_tagg deals with ISO-8859-1 only: convert on the way in and
        # convert the result back to UTF-8 on the way out.
        taggedString, _ = p2.communicate(
            input=cleanString.decode("utf8").encode('iso8859-1'))
        return taggedString.decode('iso8859-1').encode("utf8")

    def lemm(self, cleanString):
        """Run the POS tagger and return only the lemmas.

        cleanString -- UTF-8 encoded byte string, as returned by ``clean``.

        Returns a unicode string made of the third whitespace-separated field
        of every tagged line, joined with single spaces, with the "<s> " and
        " </s>" sentence markers removed.

        Raises IndexError when a non-blank tagged line has fewer than three
        fields.
        """
        # ``tagg`` returns UTF-8 bytes. Decode once here so that the unicode
        # separators below never trigger an implicit ASCII decode, which
        # fails on accented characters.
        taggedString = self.tagg(cleanString).decode("utf8")
        # Keep only the lemma column (cuts the markup and the original word).
        # This could be dropped with a better use of lia_tagg.
        lemmas = [
            line.split()[2]
            for line in taggedString.rstrip().split(u"\n")
            if line.strip()  # skip empty and whitespace-only lines
        ]
        joined = u" ".join(lemmas)
        return re.sub(r' </s>', u'', re.sub(r'<s> ', u'', joined))

    def isReady(self):
        """Tell whether the Tagger can be used (it depends on LIA_TAGG).

        No actual check is performed: this always returns True.
        """
        return True

class Phoner(baseProcessor):
    """Call the LIA phoner (lia_phon) on text.

    NOTE(review): ``p`` and ``p2`` used below are not defined anywhere in the
    visible code. Presumably they are process handles (objects exposing
    ``communicate``) for the lia_phon cleaning and phonetization scripts;
    they must be provided for ``clean`` and ``phon`` to run -- confirm
    against the original setup.
    """

    def clean(self, dirtyString):
        """Clean a string so that it can be fed to ``phon``.

        dirtyString -- UTF-8 encoded byte string to clean.

        Returns the output of ``p`` re-encoded as a UTF-8 byte string.
        """
        # The input is converted to ISO-8859-1 before being piped through
        # ``p``; the result is converted back to UTF-8.
        cleanString, _ = p.communicate(
            input=dirtyString.decode("utf8").encode('iso8859-1'))
        return cleanString.decode("iso8859-1").encode("utf8")

    def phon(self, cleanString):
        """Pipe a cleaned string through ``p2`` and return its output.

        cleanString -- UTF-8 encoded byte string, as returned by ``clean``.

        Returns the output of ``p2`` re-encoded as a UTF-8 byte string.
        """
        # lia_phon deals with ISO-8859-1 only: the UTF-8 input is converted
        # before the call and the output is converted back to UTF-8 after it.
        phonString, _ = p2.communicate(
            input=cleanString.decode("utf8").encode('iso8859-1'))
        return phonString.decode('iso8859-1').encode("utf8")

    def isReady(self):
        """Tell whether the Phoner can be used.

        No actual check is performed: this always returns True.
        """
        return True

class StopWord(baseProcessor):
    """Filter French stop words out of whitespace-separated text."""

    def isReady(self):
        """Report availability; this processor always answers True."""
        return True

    def RemoveStopList(self, rowstring):
        """Drop the French stop words from a string.

        The string is split on whitespace into a set of words, every word
        found in NLTK's French stop-word list is removed, and the remaining
        words are joined with single spaces. Because sets are used, duplicate
        words collapse and the original word order is not preserved.
        """
        words = set(rowstring.split())
        stoplist = set(nltk.corpus.stopwords.words("french"))
        remaining = list(words - stoplist)
        return u" ".join([unicode(word) for word in remaining])