diff --git a/README.md b/README.md index 3d9bdf9..498e021 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,10 @@ LIA's webTagger : Web api for the LIA POS TAGGER ## Requirement * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) +* [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) * Python > 2.5 * Flask - +* nltk ## Advise * Virtualenv diff --git a/processor/LiaTools.py b/processor/LiaTools.py index e9611d6..3de290b 100644 --- a/processor/LiaTools.py +++ b/processor/LiaTools.py @@ -3,6 +3,7 @@ import os from BaseProcessor import baseProcessor import nltk import re + class Tagger(baseProcessor): """ a calling to lia_tagg class""" def clean(self,dirtyString): @@ -16,11 +17,15 @@ class Tagger(baseProcessor): # This is used beceause lia_tagg deal with iso8859 only return taggedString.decode('iso8859').encode("utf8") def lemm(self,cleanString): - taggedString = self.taff(cleanString) - return re.sub(r' ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")])) + print " cleannnnn " + cleanString + taggedString = self.tagg(cleanString) + print "taggs full " + taggedString + sub = re.sub(r' ','',re.sub(r' ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x]))) + print " subbbbb" + sub + return sub def isReady(self): os.environ["LIA_TAGG"] - return true + return True class Phoner(baseProcessor): """ a class which call the lia phoner """ @@ -36,10 +41,11 @@ class Phoner(baseProcessor): return taggedString.decode('iso8859').encode("utf8") def isReady(self): os.environ["LIA_PHON_REP"] - return true + return True + class StopWord(baseProcessor): def isReady(self): - return true + return True def RemoveStopList(self,rowstring): """ Remove from set of word (splited String ) each words in the stoplist and join all of the other in a string """ - return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french")))) + return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french")))) diff --git a/webtagger.py b/webtagger.py index eb600ac..7535a1f 100644 --- a/webtagger.py +++ b/webtagger.py @@ -13,12 +13,23 @@ def docs(): @app.route("/tagger",methods=['POST']) def cleaner(): tagger = Tagger() + tagger.isReady() + phoner = Phoner() + phoner.isReady() + stoplist = StopWord() + stoplist.isReady() # Receive String from post parametre Raw text ( Json ) dirtyString= request.json[u'string'] # send the String throught LIA_TAGG script thank's to pip # lia_clean split a word by line et markup the sentences - cleanString= tagger.clean(dirtyString) - taggedString= tagger.tagg(cleanString) + dirtyString = stoplist.RemoveStopList(dirtyString) + print " stop list " + dirtyString + lemm = tagger.lemm(tagger.clean(dirtyString)) + print 'les lemm '+ lemm + dirtyString = dirtyString+" "+ lemm + cleanString= phoner.clean(dirtyString) + taggedString= phoner.phon(cleanString) + print taggedString return taggedString if __name__ == '__main__': app.debug = True