Commit 673721ec077645b585c9d15e60a6b7eab3c2362a

Authored by Killian
1 parent e0e4926982
Exists in master and in 1 other branch soap

ajout phon il manque plus que le soap

Showing 3 changed files with 27 additions and 9 deletions Inline Diff

1 LIA's webTagger : Web api for the LIA POS TAGGER 1 LIA's webTagger : Web api for the LIA POS TAGGER
2 ================================================= 2 =================================================
3 3
4 ## LIA's WebTagger lets you 4 ## LIA's WebTagger lets you
5 5
6 * Tag each word of a sentence with its morphosyntactic function 6 * Tag each word of a sentence with its morphosyntactic function
7 * Get the corresponding lemma of each word 7 * Get the corresponding lemma of each word
8 8
9 ## LIA's WebTagger is 9 ## LIA's WebTagger is
10 10
11 * Powered by Python 11 * Powered by Python
12 * Free and open source (licence CeCILL) 12 * Free and open source (licence CeCILL)
13 13
14 ## Resources 14 ## Resources
15 15
16 * source code : gitlia.univ-avignon.fr/public 16 * source code : gitlia.univ-avignon.fr/public
17 * contact : killian.janod@alumni.univ-avignon.fr 17 * contact : killian.janod@alumni.univ-avignon.fr
18 18
19 ## Requirements 19 ## Requirements
20 20
21 * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) 21 * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
22 * [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
22 * Python > 2.5 23 * Python > 2.5
23 * Flask 24 * Flask
24 25 * nltk
25 ## Recommended 26 ## Recommended
26 27
27 * Virtualenv 28 * Virtualenv
28 * gunicorn 29 * gunicorn
29 * Bower ( for js/css in the online demo ) 30 * Bower ( for js/css in the online demo )
30 31
31 ## Installation 32 ## Installation
32 33
33 TODO 34 TODO
34 35
processor/LiaTools.py
1 import subprocess 1 import subprocess
2 import os 2 import os
3 from BaseProcessor import baseProcessor 3 from BaseProcessor import baseProcessor
4 import nltk 4 import nltk
5 import re 5 import re
6
class Tagger(baseProcessor):
    """Wrapper around the LIA_TAGG command-line scripts.

    Every method shells out to a script located under the directory named
    by the ``LIA_TAGG`` environment variable and exchanges data with it
    through stdin/stdout.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through ``$LIA_TAGG/script/lia_clean``.

        The text is encoded to ISO-8859-1 before being sent; characters that
        cannot be encoded are replaced by backslash escapes.

        Returns the raw stdout of the script (not re-decoded here).
        Raises ``KeyError`` if ``LIA_TAGG`` is not set.
        """
        p = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        # stderr is not piped, so the second element of the result is None.
        cleanString, _ = p.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def tagg(self, cleanString):
        """Pipe ``cleanString`` through ``lia_tagg+lemm -guess``.

        Returns the script output re-encoded as UTF-8.
        Raises ``KeyError`` if ``LIA_TAGG`` is not set.
        """
        p2 = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        taggedString, _ = p2.communicate(input=cleanString)
        # lia_tagg only deals with ISO-8859 data, so convert its output
        # back to UTF-8 for the rest of the application.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return the lemmas of ``cleanString`` joined by single spaces.

        The text is tagged with :meth:`tagg`; the third whitespace-separated
        field of every output line is kept (presumably the lemma column --
        verify against the lia_tagg+lemm output format).  Lines with fewer
        than three fields are skipped instead of raising ``IndexError``.
        The ``<s>`` / ``</s>`` sentence markers are stripped from the result.
        """
        # Function-scope import so this class does not depend on a
        # module-level import that may not be present.
        import logging
        log = logging.getLogger(__name__)

        log.debug("lemm input: %r", cleanString)
        taggedString = self.tagg(cleanString)
        log.debug("lemm tagged output: %r", taggedString)

        lemmas = []
        for line in taggedString.rstrip().split("\n"):
            fields = line.split()
            # Blank or truncated lines carry no third column; ignore them.
            if len(fields) >= 3:
                lemmas.append(fields[2])

        joined = " ".join(lemmas)
        without_open = re.sub(r'<s> ', '', joined)
        sub = re.sub(r' </s>', '', without_open)
        log.debug("lemm result: %r", sub)
        return sub

    def isReady(self):
        """Return True when ``LIA_TAGG`` is set; raise ``KeyError`` otherwise."""
        # The lookup is done only for its side effect: it raises KeyError
        # when the variable is missing.
        os.environ["LIA_TAGG"]
        return True
24 29
class Phoner(baseProcessor):
    """Wrapper around the LIA_PHON command-line scripts.

    The scripts are looked up under the directory named by the
    ``LIA_PHON_REP`` environment variable.
    """

    def clean(self, dirtyString):
        """Send ``dirtyString`` through ``lia_nett`` and return its raw stdout.

        The input is encoded to ISO-8859-1, with backslash escapes for
        characters that cannot be represented.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_nett'
        proc = subprocess.Popen(
            [script], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        payload = dirtyString.encode('iso8859-1', 'backslashreplace')
        # stderr is not piped, so only the first element carries data.
        result = proc.communicate(input=payload)
        return result[0]

    def phon(self, cleanString):
        """Send ``cleanString`` through ``lia_lex2phon``; return UTF-8 output."""
        script = os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'
        proc = subprocess.Popen(
            [script], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        result = proc.communicate(input=cleanString)
        raw_output = result[0]
        # lia_phon only deals with ISO-8859 data; convert the output back
        # to UTF-8 before handing it to the caller.
        as_text = raw_output.decode('iso8859')
        return as_text.encode("utf8")

    def isReady(self):
        """Return True when ``LIA_PHON_REP`` is set; raise ``KeyError`` otherwise."""
        # Lookup performed only so that a missing variable raises KeyError.
        os.environ["LIA_PHON_REP"]
        return True
45
class StopWord(baseProcessor):
    """Processor that removes French stop words from a string."""

    def isReady(self):
        """Always ready: this processor has no external prerequisite check."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the words of ``rowstring`` that are not French stop words.

        ``rowstring`` is split on whitespace.  Each distinct word that is not
        in NLTK's French stop-word list is kept once, in order of first
        appearance, and the survivors are joined with single spaces into a
        unicode string.

        The de-duplication matches the historical set-based behaviour; the
        ordering is now deterministic instead of depending on set iteration
        order, which matters because the result is fed to order-sensitive
        tools downstream.
        """
        stop_words = set(nltk.corpus.stopwords.words("french"))
        seen = set()
        kept = []
        for word in rowstring.split():
            if word in stop_words or word in seen:
                continue
            seen.add(word)
            kept.append(unicode(word))
        return u" ".join(kept)
46 52
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 import subprocess 2 import subprocess
3 import os 3 import os
4 import json 4 import json
5 from flask import Flask, request, render_template 5 from flask import Flask, request, render_template
6 from processor.LiaTools import * 6 from processor.LiaTools import *
7 app = Flask(__name__) 7 app = Flask(__name__)
8 8
@app.route("/")
def docs():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
12 12
@app.route("/tagger",methods=['POST'])
def cleaner():
    """Handle ``POST /tagger``: stop-word removal, lemmatisation, phonetisation.

    Expects a JSON object with a ``string`` field.  The text has its French
    stop words removed, is lemmatised with the tagger, and the filtered text
    followed by its lemmas is then cleaned and phonetised with the phoner.

    Returns the phonetiser output, or a 400 response when the request body
    is not a JSON object containing ``string``.
    """
    # isReady() on Tagger/Phoner raises KeyError when the corresponding
    # environment variable is missing, so a misconfigured server fails here
    # before any text is processed.
    tagger = Tagger()
    tagger.isReady()
    phoner = Phoner()
    phoner.isReady()
    stoplist = StopWord()
    stoplist.isReady()

    # Receive the raw text from the JSON POST body.
    payload = request.json
    if not isinstance(payload, dict) or u'string' not in payload:
        return "Expected a JSON object with a 'string' field", 400
    dirtyString = payload[u'string']

    dirtyString = stoplist.RemoveStopList(dirtyString)
    # %r keeps logging safe for non-ASCII text whatever the stream encoding.
    app.logger.debug("after stop list: %r", dirtyString)

    # lia_clean splits the text one word per line and marks up sentences
    # before the tagger computes the lemmas.
    lemm = tagger.lemm(tagger.clean(dirtyString))
    app.logger.debug("lemmas: %r", lemm)

    # The tagger returns UTF-8 encoded bytes while dirtyString is unicode;
    # decode explicitly instead of relying on the implicit ASCII decoding,
    # which fails on accented characters.
    if isinstance(lemm, bytes):
        lemm = lemm.decode("utf8")
    dirtyString = dirtyString + u" " + lemm

    cleanString = phoner.clean(dirtyString)
    taggedString = phoner.phon(cleanString)
    app.logger.debug("phonetised output: %r", taggedString)
    return taggedString
if __name__ == '__main__':
    # The server listens on every interface, and Flask's interactive debugger
    # lets anyone who can reach it execute arbitrary Python.  Debug mode is
    # therefore opt-in: set WEBTAGGER_DEBUG=1 for local development only.
    app.debug = os.environ.get("WEBTAGGER_DEBUG", "") == "1"
    app.run(host='0.0.0.0')
26 37