Blame view
processor/LiaTools.py
2.32 KB
5492de487 ajout du processor |
1 2 3 4 5 |
import subprocess import os from BaseProcessor import baseProcessor import nltk import re |
673721ec0 ajout phon il ma... |
6 |
|
5492de487 ajout du processor |
7 8 9 10 11 12 13 14 15 16 17 18 19 |
class Tagger(baseProcessor):
    """Thin wrapper around the LIA_TAGG command-line scripts.

    Every script is looked up under the directory named by the ``LIA_TAGG``
    environment variable and is fed through stdin / read back from stdout.
    """

    def clean(self, dirtyString):
        """Pipe *dirtyString* through ``lia_clean`` and return its stdout.

        The input is encoded to ISO-8859-1 first; characters that do not fit
        are written as backslash escapes.  The output is returned exactly as
        the script produced it (no decoding is done here).

        Raises ``KeyError`` when ``LIA_TAGG`` is not set.
        """
        process = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half of the pair is useful.
        cleanString = process.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))[0]
        return cleanString

    def tagg(self, cleanString):
        """Pipe *cleanString* through ``lia_tagg+lemm -guess``.

        Returns the script output re-encoded from ISO-8859 to UTF-8.

        Raises ``KeyError`` when ``LIA_TAGG`` is not set.
        """
        process = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        taggedString = process.communicate(input=cleanString)[0]
        # lia_tagg deals with ISO-8859 only, so convert its output to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return the lemmas of *cleanString* as one space-separated string.

        The text is tagged with :meth:`tagg`, the third whitespace-separated
        field of every non-empty record is kept, the fields are joined with
        single spaces, and the ``<s> `` / `` </s>`` markers are removed.

        Raises ``IndexError`` when a record has fewer than three fields.
        """
        taggedString = self.tagg(cleanString)
        # NOTE(review): assumes lia_tagg+lemm prints one "word tag lemma"
        # record per line -- confirm against the tool's output.  Splitting on
        # a single space instead leaves every piece with one field, so the
        # index below could never succeed.
        lemmas = [record.rstrip().split()[2]
                  for record in taggedString.rstrip().split("\n")
                  if record]
        joined = " ".join(lemmas)
        return re.sub(r' </s>', '', re.sub(r'<s> ', '', joined))

    def isReady(self):
        """Return True when ``LIA_TAGG`` is set; raise ``KeyError`` if not."""
        os.environ["LIA_TAGG"]  # lookup only: fails loudly when unset
        return True
5492de487 ajout du processor |
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
class Phoner(baseProcessor):
    """Front-end for the LIA_PHON scripts.

    The scripts are resolved under the directory named by the
    ``LIA_PHON_REP`` environment variable.
    """

    def clean(self, dirtyString):
        """Send *dirtyString* to ``lia_nett`` and hand back what it prints.

        The text is encoded to ISO-8859-1 (unencodable characters become
        backslash escapes) before it is written to the script's stdin.  The
        script's stdout is returned without any decoding.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_nett'
        nett = subprocess.Popen([script],
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        payload = dirtyString.encode('iso8859-1', 'backslashreplace')
        return nett.communicate(input=payload)[0]

    def phon(self, cleanString):
        """Send *cleanString* to ``lia_lex2phon`` and return UTF-8 output."""
        script = os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'
        lex2phon = subprocess.Popen([script],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
        raw_output = lex2phon.communicate(input=cleanString)[0]
        # lia_phon deals with ISO-8859 only; convert the output back to UTF-8.
        as_text = raw_output.decode('iso8859')
        return as_text.encode("utf8")

    def isReady(self):
        """Return True when ``LIA_PHON_REP`` is set; ``KeyError`` if not."""
        os.environ["LIA_PHON_REP"]  # lookup only: raises when the variable is unset
        return True
5492de487 ajout du processor |
46 47 |
class StopWord(baseProcessor):
    """Processor that strips French stop words from a string."""

    def isReady(self):
        """Report readiness; this processor always answers True."""
        return True

    def RemoveStopList(self, rowstring):
        """Drop the French stop words from *rowstring*.

        The string is split on whitespace into a set of words, every word
        present in NLTK's French stop-word list is removed, and the remaining
        words are joined with single spaces into a unicode string.

        Because a set is used, repeated words are kept once and the order of
        the words in the result is not the order of the input.
        """
        candidates = set(rowstring.split())
        stop_words = set(nltk.corpus.stopwords.words("french"))
        kept = candidates - stop_words
        return u" ".join(unicode(word) for word in kept)