Commit 5492de487a52e11ecf26bc12fe443ffbd07039a7
1 parent
8aff910d0e
Exists in
master
and in
1 other branch
ajout du processor
Showing 5 changed files with 49 additions and 0 deletions Side-by-side Diff
processor/BaseProcessor.py
processor/BaseProcessor.pyc
No preview for this file type
processor/LiaTools.py
| 1 | +import subprocess | |
| 2 | +import os | |
| 3 | +from BaseProcessor import baseProcessor | |
| 4 | +import nltk | |
| 5 | +import re | |
class Tagger(baseProcessor):
    """Wrapper around the external lia_tagg command-line scripts.

    Every method that launches a script reads the ``LIA_TAGG`` environment
    variable to locate it and raises ``KeyError`` when it is not set.
    """

    def clean(self, dirtyString):
        """Run ``dirtyString`` through the ``lia_clean`` script.

        The text is sent encoded as ISO-8859-1; characters that have no
        ISO-8859-1 representation are sent as backslash escapes instead of
        raising. Returns the script's standard output exactly as received
        (it is not re-encoded here).
        """
        p = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # stderr is not piped, so the second element of the pair is None.
        cleanString, _ = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def tagg(self, cleanString):
        """Run ``cleanString`` through ``lia_tagg+lemm -guess``.

        ``cleanString`` is passed to the script unchanged, so it must
        already be in the encoding the script expects (the output of
        ``clean`` is). Returns the script's standard output re-encoded as
        UTF-8.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        taggedString, _ = p2.communicate(input=cleanString)
        # lia_tagg deals with ISO-8859 only, so convert its output to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return the third field of every tagged line, joined by spaces.

        The text is tagged with ``tagg``; for each output line the third
        whitespace-separated field is kept (presumably the lemma -- the
        output layout of lia_tagg+lemm is not visible here, confirm).
        Every ``'<s> '`` occurrence is then removed from the joined result.
        Lines with fewer than three fields (for example the single empty
        line produced by empty output) are skipped instead of raising
        ``IndexError``.
        """
        # Fixed: the original called the non-existent ``self.taff``.
        taggedString = self.tagg(cleanString)
        thirdFields = []
        for line in taggedString.rstrip().split("\n"):
            fields = line.split()
            if len(fields) > 2:
                thirdFields.append(fields[2])
        return re.sub(r'<s> ', '', " ".join(thirdFields))

    def isReady(self):
        """Return True when ``LIA_TAGG`` is set; raise ``KeyError`` if not."""
        os.environ["LIA_TAGG"]  # lookup only: raises KeyError when unset
        return True
| 24 | + | |
class Phoner(baseProcessor):
    """Wrapper around the external lia_phon command-line scripts.

    Every method that launches a script reads the ``LIA_PHON_REP``
    environment variable to locate it and raises ``KeyError`` when it is
    not set.
    """

    def clean(self, dirtyString):
        """Run ``dirtyString`` through the ``lia_nett`` script.

        The text is sent encoded as ISO-8859-1; characters that have no
        ISO-8859-1 representation are sent as backslash escapes instead of
        raising. Returns the script's standard output exactly as received
        (it is not re-encoded here).
        """
        p = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # stderr is not piped, so the second element of the pair is None.
        cleanString, _ = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def phon(self, cleanString):
        """Run ``cleanString`` through the ``lia_lex2phon`` script.

        ``cleanString`` is passed to the script unchanged, so it must
        already be in the encoding the script expects (the output of
        ``clean`` is). Returns the script's standard output re-encoded as
        UTF-8.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        taggedString, _ = p2.communicate(input=cleanString)
        # lia_phon deals with ISO-8859 only, so convert its output back
        # to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def isReady(self):
        """Return True when ``LIA_PHON_REP`` is set; raise ``KeyError`` if not."""
        os.environ["LIA_PHON_REP"]  # lookup only: raises KeyError when unset
        return True
class StopWord(baseProcessor):
    """Remove French stop words using the NLTK stop word list."""

    def isReady(self):
        """Return True unconditionally; nothing is checked here."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the words of ``rowstring`` that are not French stop words.

        ``rowstring`` is split on whitespace and compared with
        ``nltk.corpus.stopwords.words("french")``. The comparison is done
        on sets, so repeated words are collapsed to one occurrence and the
        order of the words in the result is not that of the input. The
        remaining words are joined with single spaces into a unicode
        string (``unicode`` is the Python 2 builtin).
        """
        # Fixed: the original read an undefined name ``test`` here instead
        # of the ``rowstring`` parameter.
        words = set(rowstring.split())
        keptWords = words - set(nltk.corpus.stopwords.words("french"))
        return u" ".join(unicode(value) for value in keptWords)
processor/LiaTools.pyc
No preview for this file type
processor/__init__.pyc
No preview for this file type