Commit 673721ec077645b585c9d15e60a6b7eab3c2362a

Authored by Killian
1 parent e0e4926982
Exists in master and in 1 other branch soap

ajout phon il manque plus que le soap

Showing 3 changed files with 27 additions and 9 deletions Side-by-side Diff

... ... @@ -19,9 +19,10 @@
19 19 ## Requirement
20 20  
21 21 * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
  22 +* [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
22 23 * Python > 2.5
23 24 * Flask
24   -
  25 +* nltk
25 26 ## Advice
26 27  
27 28 * Virtualenv
processor/LiaTools.py
... ... @@ -3,6 +3,7 @@
3 3 from BaseProcessor import baseProcessor
4 4 import nltk
5 5 import re
  6 +
6 7 class Tagger(baseProcessor):
7 8 """ A class that calls lia_tagg """
8 9 def clean(self,dirtyString):
9 10  
... ... @@ -16,11 +17,15 @@
16 17 # This is used because lia_tagg deals with iso8859 only
17 18 return taggedString.decode('iso8859').encode("utf8")
18 19 def lemm(self,cleanString):
19   - taggedString = self.taff(cleanString)
20   - return re.sub(r'<s> ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")]))
  20 + print " cleannnnn " + cleanString
  21 + taggedString = self.tagg(cleanString)
  22 + print "taggs full " + taggedString
  23 + sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x])))
  24 + print " subbbbb" + sub
  25 + return sub
21 26 def isReady(self):
22 27 os.environ["LIA_TAGG"]
23   - return true
  28 + return True
24 29  
25 30 class Phoner(baseProcessor):
26 31 """ A class which calls the LIA phoner """
27 32  
28 33  
... ... @@ -36,11 +41,12 @@
36 41 return taggedString.decode('iso8859').encode("utf8")
37 42 def isReady(self):
38 43 os.environ["LIA_PHON_REP"]
39   - return true
  44 + return True
  45 +
40 46 class StopWord(baseProcessor):
41 47 def isReady(self):
42   - return true
  48 + return True
43 49 def RemoveStopList(self,rowstring):
44 50 """ Remove from the set of words (the split string) every word in the stop list and join the remaining words into a string """
45   - return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french"))))
  51 + return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french"))))
... ... @@ -13,12 +13,23 @@
13 13 @app.route("/tagger",methods=['POST'])
14 14 def cleaner():
15 15 tagger = Tagger()
  16 + tagger.isReady()
  17 + phoner = Phoner()
  18 + phoner.isReady()
  19 + stoplist = StopWord()
  20 + stoplist.isReady()
16 21 # Receive the string from the POST parameter: raw text (JSON)
17 22 dirtyString= request.json[u'string']
18 23 # send the string through the LIA_TAGG script thanks to a pipe
19 24 # lia_clean puts one word per line and marks up the sentences
20   - cleanString= tagger.clean(dirtyString)
21   - taggedString= tagger.tagg(cleanString)
  25 + dirtyString = stoplist.RemoveStopList(dirtyString)
  26 + print " stop list " + dirtyString
  27 + lemm = tagger.lemm(tagger.clean(dirtyString))
  28 + print 'les lemm '+ lemm
  29 + dirtyString = dirtyString+" "+ lemm
  30 + cleanString= phoner.clean(dirtyString)
  31 + taggedString= phoner.phon(cleanString)
  32 + print taggedString
22 33 return taggedString
23 34 if __name__ == '__main__':
24 35 app.debug = True