Blame view
processor/LiaTools.py
2.16 KB
5492de487 ajout du processor |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import os
import re
import subprocess

import nltk

from BaseProcessor import baseProcessor


def _pipe_through(command, data):
    """Run *command*, write *data* to its stdin and return its raw stdout."""
    process = subprocess.Popen(command,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    # stderr is not piped, so the second item returned by communicate()
    # is always None and is deliberately ignored.
    output, _ = process.communicate(input=data)
    return output


class Tagger(baseProcessor):
    """Call the LIA_TAGG scripts found under ``$LIA_TAGG/script``."""

    def clean(self, dirtyString):
        """Pipe *dirtyString* through the ``lia_clean`` script.

        The text is encoded to ISO-8859-1 before being sent; characters that
        cannot be encoded are replaced by backslash escapes.

        Returns the raw bytes written by the script on stdout.
        Raises KeyError when the ``LIA_TAGG`` environment variable is unset.
        """
        script = os.path.join(os.environ["LIA_TAGG"], 'script', 'lia_clean')
        return _pipe_through(
            [script], dirtyString.encode('iso8859-1', 'backslashreplace'))

    def tagg(self, cleanString):
        """Pipe *cleanString* through ``lia_tagg+lemm -guess``.

        *cleanString* is sent as-is, so it is expected to be the byte string
        returned by :meth:`clean`.

        Returns the script output re-encoded as UTF-8.
        Raises KeyError when the ``LIA_TAGG`` environment variable is unset.
        """
        script = os.path.join(os.environ["LIA_TAGG"], 'script',
                              'lia_tagg+lemm')
        taggedString = _pipe_through([script, '-guess'], cleanString)
        # lia_tagg deals with ISO-8859 only, so its output is converted
        # back to UTF-8 here.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return the lemmas of *cleanString* joined by single spaces.

        The third whitespace-separated field of every non-blank line of the
        tagger output is kept, then every ``'<s> '`` marker is removed from
        the joined result.

        Raises IndexError when a non-blank output line has fewer than three
        fields.
        """
        taggedString = self.tagg(cleanString)
        lemmas = []
        # NOTE(review): assumes lia_tagg+lemm emits one "word tag lemma"
        # record per line -- TODO confirm against the LIA_TAGG documentation.
        for line in taggedString.rstrip().split("\n"):
            fields = line.split()
            if not fields:
                # Blank line: nothing to extract.
                continue
            lemmas.append(fields[2])
        return re.sub(r'<s> ', '', " ".join(lemmas))

    def isReady(self):
        """Return True when the ``LIA_TAGG`` environment variable is set.

        Raises KeyError when it is not set.
        """
        os.environ["LIA_TAGG"]  # lookup only: raises KeyError when missing
        return True


class Phoner(baseProcessor):
    """Call the LIA phonetizer scripts found under ``$LIA_PHON_REP/script``."""

    def clean(self, dirtyString):
        """Pipe *dirtyString* through the ``lia_nett`` script.

        The text is encoded to ISO-8859-1 before being sent; characters that
        cannot be encoded are replaced by backslash escapes.

        Returns the raw bytes written by the script on stdout.
        Raises KeyError when the ``LIA_PHON_REP`` environment variable is
        unset.
        """
        script = os.path.join(os.environ["LIA_PHON_REP"], 'script',
                              'lia_nett')
        return _pipe_through(
            [script], dirtyString.encode('iso8859-1', 'backslashreplace'))

    def phon(self, cleanString):
        """Pipe *cleanString* through the ``lia_lex2phon`` script.

        *cleanString* is sent as-is, so it is expected to be the byte string
        returned by :meth:`clean`.

        Returns the script output re-encoded as UTF-8.
        Raises KeyError when the ``LIA_PHON_REP`` environment variable is
        unset.
        """
        script = os.path.join(os.environ["LIA_PHON_REP"], 'script',
                              'lia_lex2phon')
        phonetized = _pipe_through([script], cleanString)
        # lia_phon deals with ISO-8859 only, so its output is converted
        # back to UTF-8 here.
        return phonetized.decode('iso8859').encode("utf8")

    def isReady(self):
        """Return True when the ``LIA_PHON_REP`` environment variable is set.

        Raises KeyError when it is not set.
        """
        os.environ["LIA_PHON_REP"]  # lookup only: raises KeyError when missing
        return True


class StopWord(baseProcessor):
    """Remove French stop words using the NLTK stop-word corpus."""

    def isReady(self):
        """Return True; this processor has no precondition checked here."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the words of *rowstring* that are not French stop words.

        *rowstring* is split on whitespace into a set of words, the NLTK
        French stop words are subtracted from it, and the remaining words are
        joined with single spaces. Because a set is used, duplicate words are
        collapsed and the original word order is not preserved.
        """
        stop_words = set(nltk.corpus.stopwords.words("french"))
        kept_words = set(rowstring.split()) - stop_words
        # `unicode` is the Python 2 builtin; this module targets Python 2.
        return u" ".join(unicode(word) for word in kept_words)