Commit 673721ec077645b585c9d15e60a6b7eab3c2362a
1 parent
e0e4926982
Exists in
master
and in
1 other branch
ajout phon il manque plus que le soap
Showing 3 changed files with 27 additions and 9 deletions Inline Diff
README.md
1 | LIA's webTagger : Web api for the LIA POS TAGGER | 1 | LIA's webTagger : Web api for the LIA POS TAGGER |
2 | ================================================= | 2 | ================================================= |
3 | 3 | ||
4 | ## LIA's WebTagger lets you | 4 | ## LIA's WebTagger lets you |
5 | 5 | ||
6 | * Tag each word of a sentence with its morphosyntactic function | 6 | * Tag each word of a sentence with its morphosyntactic function |
7 | * Get the lemma corresponding to each word | 7 | * Get the lemma corresponding to each word |
8 | 8 | ||
9 | ## LIA's WebTagger is | 9 | ## LIA's WebTagger is |
10 | 10 | ||
11 | * Powered by Python | 11 | * Powered by Python |
12 | * Free and open source (licence CeCILL) | 12 | * Free and open source (licence CeCILL) |
13 | 13 | ||
14 | ## Resources | 14 | ## Resources |
15 | 15 | ||
16 | * source code : gitlia.univ-avignon.fr/public | 16 | * source code : gitlia.univ-avignon.fr/public |
17 | * contact : killian.janod@alumni.univ-avignon.fr | 17 | * contact : killian.janod@alumni.univ-avignon.fr |
18 | 18 | ||
19 | ## Requirements | 19 | ## Requirements |
20 | 20 | ||
21 | * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) | 21 | * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) |
22 | * [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html) | ||
22 | * Python > 2.5 | 23 | * Python > 2.5 |
23 | * Flask | 24 | * Flask |
24 | 25 | * nltk | |
25 | ## Recommended | 26 | ## Recommended |
26 | 27 | ||
27 | * Virtualenv | 28 | * Virtualenv |
28 | * gunicorn | 29 | * gunicorn |
29 | * Bower ( for js/css in the online demo ) | 30 | * Bower ( for js/css in the online demo ) |
30 | 31 | ||
31 | ## Installation | 32 | ## Installation |
32 | 33 | ||
33 | TODO | 34 | TODO |
34 | 35 |
processor/LiaTools.py
1 | import subprocess | 1 | import subprocess |
2 | import os | 2 | import os |
3 | from BaseProcessor import baseProcessor | 3 | from BaseProcessor import baseProcessor |
4 | import nltk | 4 | import nltk |
5 | import re | 5 | import re |
6 | |||
6 | class Tagger(baseProcessor): | 7 | class Tagger(baseProcessor): |
7 | """ a calling to lia_tagg class""" | 8 | """ a calling to lia_tagg class""" |
8 | def clean(self,dirtyString): | 9 | def clean(self,dirtyString): |
9 | p=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_clean'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) | 10 | p=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_clean'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) |
10 | (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace')) | 11 | (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace')) |
11 | return cleanString | 12 | return cleanString |
12 | 13 | ||
13 | def tagg(self,cleanString): | 14 | def tagg(self,cleanString): |
14 | p2=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_tagg+lemm','-guess'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) | 15 | p2=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_tagg+lemm','-guess'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) |
15 | (taggedString,err) =p2.communicate(input=cleanString) | 16 | (taggedString,err) =p2.communicate(input=cleanString) |
16 | # This is used beceause lia_tagg deal with iso8859 only | 17 | # This is used beceause lia_tagg deal with iso8859 only |
17 | return taggedString.decode('iso8859').encode("utf8") | 18 | return taggedString.decode('iso8859').encode("utf8") |
18 | def lemm(self,cleanString): | 19 | def lemm(self,cleanString): |
19 | taggedString = self.taff(cleanString) | 20 | print " cleannnnn " + cleanString |
20 | return re.sub(r'<s> ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")])) | 21 | taggedString = self.tagg(cleanString) |
22 | print "taggs full " + taggedString | ||
23 | sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x]))) | ||
24 | print " subbbbb" + sub | ||
25 | return sub | ||
21 | def isReady(self): | 26 | def isReady(self): |
22 | os.environ["LIA_TAGG"] | 27 | os.environ["LIA_TAGG"] |
23 | return true | 28 | return True |
24 | 29 | ||
25 | class Phoner(baseProcessor): | 30 | class Phoner(baseProcessor): |
26 | """ a class which call the lia phoner """ | 31 | """ a class which call the lia phoner """ |
27 | def clean(self,dirtyString): | 32 | def clean(self,dirtyString): |
28 | p=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_nett'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) | 33 | p=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_nett'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) |
29 | (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace')) | 34 | (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace')) |
30 | return cleanString | 35 | return cleanString |
31 | def phon(self,cleanString): | 36 | def phon(self,cleanString): |
32 | p2=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_lex2phon'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) | 37 | p2=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_lex2phon'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) |
33 | (taggedString,err) =p2.communicate(input=cleanString) | 38 | (taggedString,err) =p2.communicate(input=cleanString) |
34 | # This is used beceause lia_phon deal with iso8859 only | 39 | # This is used beceause lia_phon deal with iso8859 only |
35 | # We reconverte the output to utf8 back | 40 | # We reconverte the output to utf8 back |
36 | return taggedString.decode('iso8859').encode("utf8") | 41 | return taggedString.decode('iso8859').encode("utf8") |
37 | def isReady(self): | 42 | def isReady(self): |
38 | os.environ["LIA_PHON_REP"] | 43 | os.environ["LIA_PHON_REP"] |
39 | return true | 44 | return True |
45 | |||
40 | class StopWord(baseProcessor): | 46 | class StopWord(baseProcessor): |
41 | def isReady(self): | 47 | def isReady(self): |
42 | return true | 48 | return True |
43 | def RemoveStopList(self,rowstring): | 49 | def RemoveStopList(self,rowstring): |
44 | """ Remove from set of word (splited String ) each words in the stoplist and join all of the other in a string """ | 50 | """ Remove from set of word (splited String ) each words in the stoplist and join all of the other in a string """ |
45 | return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french")))) | 51 | return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french")))) |
46 | 52 |
webtagger.py
1 | # -*- coding: utf-8 -*- | 1 | # -*- coding: utf-8 -*- |
2 | import subprocess | 2 | import subprocess |
3 | import os | 3 | import os |
4 | import json | 4 | import json |
5 | from flask import Flask, request, render_template | 5 | from flask import Flask, request, render_template |
6 | from processor.LiaTools import * | 6 | from processor.LiaTools import * |
7 | app = Flask(__name__) | 7 | app = Flask(__name__) |
8 | 8 | ||
9 | @app.route("/") | 9 | @app.route("/") |
10 | def docs(): | 10 | def docs(): |
11 | return render_template('index.html') | 11 | return render_template('index.html') |
12 | 12 | ||
13 | @app.route("/tagger",methods=['POST']) | 13 | @app.route("/tagger",methods=['POST']) |
14 | def cleaner(): | 14 | def cleaner(): |
15 | tagger = Tagger() | 15 | tagger = Tagger() |
16 | tagger.isReady() | ||
17 | phoner = Phoner() | ||
18 | phoner.isReady() | ||
19 | stoplist = StopWord() | ||
20 | stoplist.isReady() | ||
16 | # Receive String from post parametre Raw text ( Json ) | 21 | # Receive String from post parametre Raw text ( Json ) |
17 | dirtyString= request.json[u'string'] | 22 | dirtyString= request.json[u'string'] |
18 | # send the String throught LIA_TAGG script thank's to pip | 23 | # send the String throught LIA_TAGG script thank's to pip |
19 | # lia_clean split a word by line et markup the sentences | 24 | # lia_clean split a word by line et markup the sentences |
20 | cleanString= tagger.clean(dirtyString) | 25 | dirtyString = stoplist.RemoveStopList(dirtyString) |
21 | taggedString= tagger.tagg(cleanString) | 26 | print " stop list " + dirtyString |
27 | lemm = tagger.lemm(tagger.clean(dirtyString)) | ||
28 | print 'les lemm '+ lemm | ||
29 | dirtyString = dirtyString+" "+ lemm | ||
30 | cleanString= phoner.clean(dirtyString) | ||
31 | taggedString= phoner.phon(cleanString) | ||
32 | print taggedString | ||
22 | return taggedString | 33 | return taggedString |
23 | if __name__ == '__main__': | 34 | if __name__ == '__main__': |
24 | app.debug = True | 35 | app.debug = True |
25 | app.run(host='0.0.0.0') | 36 | app.run(host='0.0.0.0') |
26 | 37 |