Commit 673721ec077645b585c9d15e60a6b7eab3c2362a
1 parent: e0e4926982
Exists in master and in 1 other branch
ajout phon il manque plus que le soap
Showing 3 changed files with 27 additions and 9 deletions (side-by-side diff)
README.md
... | ... | @@ -19,9 +19,10 @@ |
19 | 19 | ## Requirement |
20 | 20 | |
21 | 21 | * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) |
22 | +* [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) | |
22 | 23 | * Python > 2.5 |
23 | 24 | * Flask |
24 | - | |
25 | +* nltk | |
25 | 26 | ## Advise |
26 | 27 | |
27 | 28 | * Virtualenv |
processor/LiaTools.py
... | ... | @@ -3,6 +3,7 @@ |
3 | 3 | from BaseProcessor import baseProcessor |
4 | 4 | import nltk |
5 | 5 | import re |
6 | + | |
6 | 7 | class Tagger(baseProcessor): |
7 | 8 | """ a calling to lia_tagg class""" |
8 | 9 | def clean(self,dirtyString): |
9 | 10 | |
... | ... | @@ -16,11 +17,15 @@ |
16 | 17 | # This is used beceause lia_tagg deal with iso8859 only |
17 | 18 | return taggedString.decode('iso8859').encode("utf8") |
18 | 19 | def lemm(self,cleanString): |
19 | - taggedString = self.taff(cleanString) | |
20 | - return re.sub(r'<s> ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")])) | |
20 | + print " cleannnnn " + cleanString | |
21 | + taggedString = self.tagg(cleanString) | |
22 | + print "taggs full " + taggedString | |
23 | + sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x]))) | |
24 | + print " subbbbb" + sub | |
25 | + return sub | |
21 | 26 | def isReady(self): |
22 | 27 | os.environ["LIA_TAGG"] |
23 | - return true | |
28 | + return True | |
24 | 29 | |
25 | 30 | class Phoner(baseProcessor): |
26 | 31 | """ a class which call the lia phoner """ |
27 | 32 | |
28 | 33 | |
... | ... | @@ -36,11 +41,12 @@ |
36 | 41 | return taggedString.decode('iso8859').encode("utf8") |
37 | 42 | def isReady(self): |
38 | 43 | os.environ["LIA_PHON_REP"] |
39 | - return true | |
44 | + return True | |
45 | + | |
40 | 46 | class StopWord(baseProcessor): |
41 | 47 | def isReady(self): |
42 | - return true | |
48 | + return True | |
43 | 49 | def RemoveStopList(self,rowstring): |
44 | 50 | """ Remove from set of word (splited String ) each words in the stoplist and join all of the other in a string """ |
45 | - return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french")))) | |
51 | + return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french")))) |
webtagger.py
... | ... | @@ -13,12 +13,23 @@ |
13 | 13 | @app.route("/tagger",methods=['POST']) |
14 | 14 | def cleaner(): |
15 | 15 | tagger = Tagger() |
16 | + tagger.isReady() | |
17 | + phoner = Phoner() | |
18 | + phoner.isReady() | |
19 | + stoplist = StopWord() | |
20 | + stoplist.isReady() | |
16 | 21 | # Receive String from post parametre Raw text ( Json ) |
17 | 22 | dirtyString= request.json[u'string'] |
18 | 23 | # send the String throught LIA_TAGG script thank's to pip |
19 | 24 | # lia_clean split a word by line et markup the sentences |
20 | - cleanString= tagger.clean(dirtyString) | |
21 | - taggedString= tagger.tagg(cleanString) | |
25 | + dirtyString = stoplist.RemoveStopList(dirtyString) | |
26 | + print " stop list " + dirtyString | |
27 | + lemm = tagger.lemm(tagger.clean(dirtyString)) | |
28 | + print 'les lemm '+ lemm | |
29 | + dirtyString = dirtyString+" "+ lemm | |
30 | + cleanString= phoner.clean(dirtyString) | |
31 | + taggedString= phoner.phon(cleanString) | |
32 | + print taggedString | |
22 | 33 | return taggedString |
23 | 34 | if __name__ == '__main__': |
24 | 35 | app.debug = True |