Blame view
processor/LiaTools.py
2.64 KB
5492de487 ajout du processor |
1 2 3 4 5 |
import subprocess import os from BaseProcessor import baseProcessor import nltk import re |
673721ec0 ajout phon il ma... |
6 |
|
5492de487 ajout du processor |
7 8 9 |
class Tagger(baseProcessor):
    """Wrapper around the external lia_tagg scripts.

    The scripts are looked up under the directory named by the
    ``LIA_TAGG`` environment variable.
    """

    def clean(self, dirtyString):
        """Clean a string so that it can be fed to lia_tagg.

        The text is encoded to ISO-8859-1 and piped through the
        ``lia_clean`` script, which puts one word per line and separates
        sentences with <s> </s>.

        Returns the script's standard output as-is (it is not decoded here).
        Raises KeyError when ``LIA_TAGG`` is not set.
        """
        cleaner = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # Characters with no ISO-8859-1 representation are sent as
        # backslash escapes instead of making the encoding fail.
        encoded = dirtyString.encode('iso8859-1', 'backslashreplace')
        # stderr is not piped, so the second item of the pair is always None.
        cleanString, _ = cleaner.communicate(input=encoded)
        return cleanString

    def tagg(self, cleanString):
        """POS-tag and lemmatize a string produced by ``clean``.

        Pipes ``cleanString`` through ``lia_tagg+lemm -guess`` and returns
        the output re-encoded as UTF-8.
        Raises KeyError when ``LIA_TAGG`` is not set.
        """
        tagger = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        taggedString, _ = tagger.communicate(input=cleanString)
        # lia_tagg only deals with ISO-8859 text, so its output is converted
        # back to UTF-8 before being handed to the caller.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Use the POS tagger to lemmatize words and return the lemmas only.

        Takes a string produced by ``clean``, runs it through ``tagg`` and
        returns the lemmas joined by single spaces, with the "<s> " and
        " </s>" sentence markers removed.
        Raises IndexError if a non-blank output line has fewer than three
        whitespace-separated fields.
        """
        taggedString = self.tagg(cleanString)
        # Keep only the third field of every non-blank line, i.e. drop the
        # original word and its tag (presumably lines look like
        # "word tag lemma" -- confirm against the lia_tagg output format).
        # The output must be walked line by line: splitting it on single
        # spaces yields fragments that never hold three fields.
        # This could be removed with a better use of lia_tagg.
        lemmas = [line.split()[2]
                  for line in taggedString.splitlines()
                  if line.strip()]
        joined = " ".join(lemmas)
        without_opening = re.sub(r'<s> ', '', joined)
        return re.sub(r' </s>', '', without_opening)

    def isReady(self):
        """Check if the Tagger can be used (depends on ``LIA_TAGG``).

        Returns True when the variable is set; the lookup itself raises
        KeyError when it is missing.
        """
        os.environ["LIA_TAGG"]
        return True
5492de487 ajout du processor |
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
class Phoner(baseProcessor):
    """Processor that calls the LIA phoner scripts.

    The scripts are looked up under the directory named by the
    ``LIA_PHON_REP`` environment variable.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through the ``lia_nett`` script.

        The input is encoded to ISO-8859-1 (unencodable characters become
        backslash escapes) and the script's standard output is returned
        as-is.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_nett'
        payload = dirtyString.encode('iso8859-1', 'backslashreplace')
        process = subprocess.Popen(
            [script],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half of the pair is useful.
        output = process.communicate(input=payload)[0]
        return output

    def phon(self, cleanString):
        """Pipe ``cleanString`` through ``lia_lex2phon``; return UTF-8 output."""
        script = os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'
        process = subprocess.Popen(
            [script],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        output = process.communicate(input=cleanString)[0]
        # lia_phon only deals with ISO-8859 text, so the output is converted
        # back to UTF-8 before being returned.
        decoded = output.decode('iso8859')
        return decoded.encode("utf8")

    def isReady(self):
        """Return True when ``LIA_PHON_REP`` is set (KeyError otherwise)."""
        os.environ["LIA_PHON_REP"]
        return True
5492de487 ajout du processor |
51 52 |
class StopWord(baseProcessor):
    """Processor that strips French stop words from a string."""

    def isReady(self):
        """Report the processor as usable; always returns True."""
        return True

    def RemoveStopList(self, rowstring):
        """Remove the stop-list words from a string.

        The string is split on whitespace into a set of words, every word
        found in the NLTK French stop-word list is removed, and the
        remaining words are joined with single spaces. Because a set is
        used, duplicates collapse and the original word order is not kept.
        """
        words = set(rowstring.split())
        stop_words = set(nltk.corpus.stopwords.words("french"))
        remaining = words - stop_words
        return u" ".join(unicode(word) for word in remaining)