Commit 673721ec077645b585c9d15e60a6b7eab3c2362a
1 parent
e0e4926982
Exists in
master
and in
1 other branch
ajout phon il manque plus que le soap
Showing 3 changed files with 27 additions and 9 deletions Side-by-side Diff
README.md
| ... | ... | @@ -19,9 +19,10 @@ |
| 19 | 19 | ## Requirement |
| 20 | 20 | |
| 21 | 21 | * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) |
| 22 | +* [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) | |
| 22 | 23 | * Python > 2.5 |
| 23 | 24 | * Flask |
| 24 | - | |
| 25 | +* nltk | |
| 25 | 26 | ## Advise |
| 26 | 27 | |
| 27 | 28 | * Virtualenv |
processor/LiaTools.py
| ... | ... | @@ -3,6 +3,7 @@ |
| 3 | 3 | from BaseProcessor import baseProcessor |
| 4 | 4 | import nltk |
| 5 | 5 | import re |
| 6 | + | |
| 6 | 7 | class Tagger(baseProcessor): |
| 7 | 8 | """ a calling to lia_tagg class""" |
| 8 | 9 | def clean(self,dirtyString): |
| 9 | 10 | |
| ... | ... | @@ -16,11 +17,15 @@ |
| 16 | 17 | # This is needed because lia_tagg deals with iso8859 only |
| 17 | 18 | return taggedString.decode('iso8859').encode("utf8") |
| 18 | 19 | def lemm(self,cleanString): |
| 19 | - taggedString = self.taff(cleanString) | |
| 20 | - return re.sub(r'<s> ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")])) | |
| 20 | + print " cleannnnn " + cleanString | |
| 21 | + taggedString = self.tagg(cleanString) | |
| 22 | + print "taggs full " + taggedString | |
| 23 | + sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x]))) | |
| 24 | + print " subbbbb" + sub | |
| 25 | + return sub | |
| 21 | 26 | def isReady(self): |
| 22 | 27 | os.environ["LIA_TAGG"] |
| 23 | - return true | |
| 28 | + return True | |
| 24 | 29 | |
| 25 | 30 | class Phoner(baseProcessor): |
| 26 | 31 | """ a class which call the lia phoner """ |
| 27 | 32 | |
| 28 | 33 | |
| ... | ... | @@ -36,11 +41,12 @@ |
| 36 | 41 | return taggedString.decode('iso8859').encode("utf8") |
| 37 | 42 | def isReady(self): |
| 38 | 43 | os.environ["LIA_PHON_REP"] |
| 39 | - return true | |
| 44 | + return True | |
| 45 | + | |
| 40 | 46 | class StopWord(baseProcessor): |
| 41 | 47 | def isReady(self): |
| 42 | - return true | |
| 48 | + return True | |
| 43 | 49 | def RemoveStopList(self,rowstring): |
| 44 | 50 | """ Remove every word found in the stop list from the set of words (the split string) and join the remaining words into a string """ |
| 45 | - return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french")))) | |
| 51 | + return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french")))) |
webtagger.py
| ... | ... | @@ -13,12 +13,23 @@ |
| 13 | 13 | @app.route("/tagger",methods=['POST']) |
| 14 | 14 | def cleaner(): |
| 15 | 15 | tagger = Tagger() |
| 16 | + tagger.isReady() | |
| 17 | + phoner = Phoner() | |
| 18 | + phoner.isReady() | |
| 19 | + stoplist = StopWord() | |
| 20 | + stoplist.isReady() | |
| 16 | 21 | # Receive the raw text string from the POST parameter (JSON) |
| 17 | 22 | dirtyString= request.json[u'string'] |
| 18 | 23 | # send the string through the LIA_TAGG script thanks to a pipe |
| 19 | 24 | # lia_clean splits the text into one word per line and marks up the sentences |
| 20 | - cleanString= tagger.clean(dirtyString) | |
| 21 | - taggedString= tagger.tagg(cleanString) | |
| 25 | + dirtyString = stoplist.RemoveStopList(dirtyString) | |
| 26 | + print " stop list " + dirtyString | |
| 27 | + lemm = tagger.lemm(tagger.clean(dirtyString)) | |
| 28 | + print 'les lemm '+ lemm | |
| 29 | + dirtyString = dirtyString+" "+ lemm | |
| 30 | + cleanString= phoner.clean(dirtyString) | |
| 31 | + taggedString= phoner.phon(cleanString) | |
| 32 | + print taggedString | |
| 22 | 33 | return taggedString |
| 23 | 34 | if __name__ == '__main__': |
| 24 | 35 | app.debug = True |