Commit 5492de487a52e11ecf26bc12fe443ffbd07039a7
1 parent
8aff910d0e
Exists in
master
and in
1 other branch
ajout du processor
Showing 5 changed files with 49 additions and 0 deletions Side-by-side Diff
processor/BaseProcessor.py
processor/BaseProcessor.pyc
No preview for this file type
processor/LiaTools.py
1 | +import subprocess | |
2 | +import os | |
3 | +from BaseProcessor import baseProcessor | |
4 | +import nltk | |
5 | +import re | |
class Tagger(baseProcessor):
    """Wrapper around the external lia_tagg command-line scripts.

    The scripts are looked up under the directory named by the
    ``LIA_TAGG`` environment variable.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through ``lia_clean`` and return its stdout.

        The input is encoded to ISO-8859-1 before being sent; characters
        that cannot be encoded are replaced by backslash escapes.  The
        output is returned exactly as the script produced it (not decoded).

        Raises KeyError if ``LIA_TAGG`` is not set.
        """
        p = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so the second element is always None.
        (cleanString, _) = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def tagg(self, cleanString):
        """Pipe ``cleanString`` through ``lia_tagg+lemm -guess``.

        ``cleanString`` is passed to the script unchanged.  The script
        output is converted from ISO-8859 to UTF-8 before being returned.

        Raises KeyError if ``LIA_TAGG`` is not set.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        (taggedString, _) = p2.communicate(input=cleanString)
        # lia_tagg deals with ISO-8859 only, so convert its output to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return the third field of every tagged line, joined by spaces.

        ``cleanString`` is tagged with :meth:`tagg`; the third
        whitespace-separated field of each output line is kept (presumably
        the lemma column -- confirm against the lia_tagg output format),
        and every ``'<s> '`` marker is removed from the joined result.

        Raises IndexError if an output line has fewer than three fields.
        """
        # Fixed: the original called the non-existent ``self.taff``.
        taggedString = self.tagg(cleanString)
        fields = [line.split()[2]
                  for line in taggedString.rstrip().split("\n")]
        return re.sub(r'<s> ', '', " ".join(fields))

    def isReady(self):
        """Return True when the ``LIA_TAGG`` environment variable is set.

        Raises KeyError if the variable is missing.
        """
        # The lookup is kept for its side effect: it raises KeyError when
        # the variable is not defined.
        os.environ["LIA_TAGG"]
        return True
24 | + | |
class Phoner(baseProcessor):
    """Wrapper around the external lia_phon command-line scripts.

    The scripts are looked up under the directory named by the
    ``LIA_PHON_REP`` environment variable.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through ``lia_nett`` and return its stdout.

        The input is encoded to ISO-8859-1 before being sent; characters
        that cannot be encoded are replaced by backslash escapes.  The
        output is returned exactly as the script produced it (not decoded).

        Raises KeyError if ``LIA_PHON_REP`` is not set.
        """
        p = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so the second element is always None.
        (cleanString, _) = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def phon(self, cleanString):
        """Pipe ``cleanString`` through ``lia_lex2phon``.

        ``cleanString`` is passed to the script unchanged.  The script
        output is converted from ISO-8859 to UTF-8 before being returned.

        Raises KeyError if ``LIA_PHON_REP`` is not set.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        (taggedString, _) = p2.communicate(input=cleanString)
        # lia_phon deals with ISO-8859 only, so convert its output back
        # to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def isReady(self):
        """Return True when the ``LIA_PHON_REP`` environment variable is set.

        Raises KeyError if the variable is missing.
        """
        # The lookup is kept for its side effect: it raises KeyError when
        # the variable is not defined.
        os.environ["LIA_PHON_REP"]
        return True
class StopWord(baseProcessor):
    """Processor that strips French stop words using the NLTK corpus."""

    def isReady(self):
        """Always return True."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the words of ``rowstring`` that are not French stop words.

        ``rowstring`` is split on whitespace and turned into a set, the
        NLTK French stop-word list is subtracted from it, and the
        remaining words are joined with single spaces into a unicode
        string.  Because a set is used, duplicate words are collapsed and
        the order of the words in the result is arbitrary.
        """
        # Fixed: the original read the undefined name ``test`` instead of
        # the ``rowstring`` parameter.
        stopwords = set(nltk.corpus.stopwords.words("french"))
        kept = set(rowstring.split()) - stopwords
        return u" ".join(unicode(value) for value in kept)
processor/LiaTools.pyc
No preview for this file type
processor/__init__.pyc
No preview for this file type