Commit 5492de487a52e11ecf26bc12fe443ffbd07039a7
1 parent
8aff910d0e
Exists in
master
and in
1 other branch
ajout du processor
Showing 5 changed files with 49 additions and 0 deletions Inline Diff
processor/BaseProcessor.py
class baseProcessor(object):
    """Common base class for the text processors.

    Subclasses are expected to override isReady(); the implementation
    provided here always raises.
    """

    def isReady(self):
        """Report whether the processor can be used.

        Raises:
            NameError: always, because the base class is not backed by
                any concrete module.
        """
        # NameError is kept (rather than NotImplementedError) so that
        # existing callers catching it keep working.
        raise NameError('You are using a module that does not exist')
4 | |||
5 |
processor/BaseProcessor.pyc
No preview for this file type
processor/LiaTools.py
File was created | 1 | import subprocess | |
2 | import os | ||
3 | from BaseProcessor import baseProcessor | ||
4 | import nltk | ||
5 | import re | ||
class Tagger(baseProcessor):
    """Wrapper around the external lia_tagg scripts.

    The scripts are looked up under the directory named by the LIA_TAGG
    environment variable and are fed through stdin / read from stdout.
    """

    def clean(self, dirtyString):
        """Run the ``lia_clean`` script on dirtyString.

        dirtyString is encoded to ISO-8859-1 before being sent; characters
        that cannot be encoded are replaced by backslash escapes.

        Returns:
            The script's standard output, exactly as produced (not decoded).

        Raises:
            KeyError: if the LIA_TAGG environment variable is not set.
        """
        p = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half of the result matters.
        cleanString = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))[0]
        return cleanString

    def tagg(self, cleanString):
        """Run ``lia_tagg+lemm -guess`` on cleanString.

        cleanString is sent to the script unchanged (presumably the output
        of clean() -- verify against callers).

        Returns:
            The script's output re-encoded from ISO-8859 to UTF-8.

        Raises:
            KeyError: if the LIA_TAGG environment variable is not set.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        taggedString = p2.communicate(input=cleanString)[0]
        # lia_tagg deals with ISO-8859 only, so convert its output to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return the third field of every tagged line, space-joined.

        The text is tagged with tagg(); for each output line the third
        whitespace-separated field is kept (presumably the lemma -- confirm
        against the lia_tagg output format), and every ``<s> `` marker is
        removed from the joined result.
        """
        taggedString = self.tagg(cleanString)
        lemmas = []
        for line in taggedString.rstrip().split("\n"):
            fields = line.split()
            # Skip blank or truncated lines instead of failing on them.
            if len(fields) > 2:
                lemmas.append(fields[2])
        return re.sub(r'<s> ', '', " ".join(lemmas))

    def isReady(self):
        """Return True when the LIA_TAGG environment variable is set.

        Raises:
            KeyError: if LIA_TAGG is not defined in the environment.
        """
        # The lookup is only there to fail loudly when the variable is absent.
        os.environ["LIA_TAGG"]
        return True
24 | |||
class Phoner(baseProcessor):
    """Wrapper around the external LIA phonetizer scripts.

    The scripts are looked up under the directory named by the
    LIA_PHON_REP environment variable and are fed through stdin / read
    from stdout.
    """

    def clean(self, dirtyString):
        """Run the ``lia_nett`` script on dirtyString.

        dirtyString is encoded to ISO-8859-1 before being sent; characters
        that cannot be encoded are replaced by backslash escapes.

        Returns:
            The script's standard output, exactly as produced (not decoded).

        Raises:
            KeyError: if the LIA_PHON_REP environment variable is not set.
        """
        p = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half of the result matters.
        cleanString = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))[0]
        return cleanString

    def phon(self, cleanString):
        """Run the ``lia_lex2phon`` script on cleanString.

        cleanString is sent to the script unchanged (presumably the output
        of clean() -- verify against callers).

        Returns:
            The script's output re-encoded from ISO-8859 to UTF-8.

        Raises:
            KeyError: if the LIA_PHON_REP environment variable is not set.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        taggedString = p2.communicate(input=cleanString)[0]
        # lia_phon deals with ISO-8859 only, so the output is converted
        # back to UTF-8 before being returned.
        return taggedString.decode('iso8859').encode("utf8")

    def isReady(self):
        """Return True when the LIA_PHON_REP environment variable is set.

        Raises:
            KeyError: if LIA_PHON_REP is not defined in the environment.
        """
        # The lookup is only there to fail loudly when the variable is absent.
        os.environ["LIA_PHON_REP"]
        return True
class StopWord(baseProcessor):
    """Processor that strips French stop words using the NLTK corpus."""

    def isReady(self):
        """Return True; this processor needs no external configuration."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the distinct non-stop words of rowstring as one string.

        rowstring is split on whitespace; every word found in the NLTK
        French stop-word list is dropped and duplicates are collapsed.
        The remaining words are joined with single spaces, in order of
        first appearance.
        """
        stop_words = set(nltk.corpus.stopwords.words("french"))
        kept = []
        seen = set()
        for word in rowstring.split():
            # Same result set as a set difference, but with a stable order.
            if word in stop_words or word in seen:
                continue
            seen.add(word)
            kept.append(word)
        return u" ".join(unicode(word) for word in kept)
46 |
processor/LiaTools.pyc
No preview for this file type
processor/__init__.pyc
No preview for this file type