Commit 7ff5cc7f92813b58647a98f7bb4d8cdde0bd1d81
1 parent
1db021bcb8
Exists in
master
and in
1 other branch
edit raw text + comment
Showing 3 changed files with 16 additions and 21 deletions Inline Diff
processor/LiaTools.py
1 | import subprocess | 1 | import subprocess |
2 | import os | 2 | import os |
3 | from BaseProcessor import baseProcessor | 3 | from BaseProcessor import baseProcessor |
4 | import nltk | 4 | import nltk |
5 | import re | 5 | import re |
6 | 6 | ||
class Tagger(baseProcessor):
    """Thin wrapper around the external LIA_TAGG command-line scripts.

    Every script is looked up under the directory named by the ``LIA_TAGG``
    environment variable, fed through stdin and read back from stdout.
    """

    def clean(self, dirtyString):
        """Prepare a string for lia_tagg by piping it through ``lia_clean``.

        The text is encoded to ISO-8859-1 first (characters that cannot be
        encoded are backslash-escaped).  According to the original author's
        note, lia_clean emits one word per line and wraps sentences in
        ``<s>`` / ``</s>`` markers.

        Returns the raw stdout of the script (still ISO-8859-1 bytes).
        Raises KeyError when ``LIA_TAGG`` is not set.
        """
        process = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so only the stdout half of the pair is useful.
        return process.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))[0]

    def tagg(self, cleanString):
        """POS-tag and lemmatise a string produced by :meth:`clean`.

        Runs ``lia_tagg+lemm -guess`` and returns its output re-encoded as
        UTF-8.  Raises KeyError when ``LIA_TAGG`` is not set.
        """
        process = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        tagged = process.communicate(input=cleanString)[0]
        # lia_tagg only deals with ISO-8859, so convert its output to UTF-8.
        return tagged.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Return only the lemmas of a cleaned string, separated by spaces.

        The tagger output is read line by line and the third
        whitespace-separated field of each line is kept.  Lines with fewer
        than three fields are skipped instead of raising IndexError.  The
        ``<s>`` / ``</s>`` sentence markers are then stripped from the
        joined result.
        """
        taggedString = self.tagg(cleanString)
        lemmas = []
        for line in taggedString.rstrip().split("\n"):
            fields = line.split()
            if len(fields) > 2:
                lemmas.append(fields[2])
        # Cut the sentence markup; could go away with better use of lia_tagg.
        without_opening = re.sub(r'<s> ', '', " ".join(lemmas))
        return re.sub(r' </s>', '', without_opening)

    def isReady(self):
        """Check that the Tagger can be used (depends on ``LIA_TAGG``).

        Returns True when the variable is defined; raises KeyError otherwise.
        """
        os.environ["LIA_TAGG"]
        return True
29 | 34 | ||
class Phoner(baseProcessor):
    """Front end for the LIA phonetizer scripts located under ``$LIA_PHON_REP``."""

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through ``lia_nett`` and return the script's stdout.

        The input is sent as ISO-8859-1, with unencodable characters
        backslash-escaped.  Raises KeyError when ``LIA_PHON_REP`` is not set.
        """
        command = [os.environ["LIA_PHON_REP"] + '/script/lia_nett']
        nett = subprocess.Popen(command,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        latin1_text = dirtyString.encode('iso8859-1', 'backslashreplace')
        output, _unused = nett.communicate(input=latin1_text)
        return output

    def phon(self, cleanString):
        """Run ``lia_lex2phon`` on a cleaned string and return its output as UTF-8."""
        command = [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon']
        lex2phon = subprocess.Popen(command,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
        output, _unused = lex2phon.communicate(input=cleanString)
        # lia_phon only deals with ISO-8859, so the output is converted
        # back to UTF-8 before being handed to the caller.
        as_text = output.decode('iso8859')
        return as_text.encode("utf8")

    def isReady(self):
        """Return True when ``LIA_PHON_REP`` is defined; raise KeyError otherwise."""
        os.environ["LIA_PHON_REP"]
        return True
45 | 50 | ||
class StopWord(baseProcessor):
    """Processor that strips French stop words from a string."""

    def isReady(self):
        """Report readiness; this processor performs no check and returns True."""
        return True

    def RemoveStopList(self, rowstring):
        """Drop every word found in NLTK's French stop list and re-join the rest.

        The input is split on whitespace.  Each surviving word is kept once,
        at the position of its first occurrence, so the result is
        deterministic.  The set of words returned is unchanged from the
        previous set-difference implementation, which returned them in
        arbitrary order.

        Returns a unicode string of the remaining words separated by spaces.
        """
        stop_words = set(nltk.corpus.stopwords.words("french"))
        already_kept = set()
        kept = []
        for word in rowstring.split():
            if word in stop_words or word in already_kept:
                continue
            already_kept.add(word)
            kept.append(unicode(word))
        return u" ".join(kept)
static/js/application.js
1 | // Some general UI pack related JS | 1 | // Some general UI pack related JS |
// Install the tagging click handler as soon as the DOM is ready.
$(function () {
    tagging();
});
5 | 5 | ||
6 | 6 | ||
7 | 7 | ||
/**
 * Bind the "#go" button: on click, POST the content of "#data" to the
 * "tagger" endpoint as a form field named "string" and append the escaped
 * plain-text response to "#result".
 */
function tagging() {
    $('#go').click(function () {
        // Declared with `var` so the payload no longer leaks as a global.
        var payload = { "string": $('#data').val() };
        console.log(payload);
        $.ajax({
            type: "POST",
            url: "tagger",
            data: payload,
            dataType: "text",
            success: function (response) {
                // Escape once and reuse for both the page and the console.
                var escaped = _.escape(response);
                $('#result').append(escaped);
                console.log(escaped);
                console.log("resultat");
            },
            error: function () {
                alert("error");
            }
        });
    });
}
38 | 29 |
webtagger.py
1 | # -*- coding: utf-8 -*- | 1 | # -*- coding: utf-8 -*- |
2 | import subprocess | 2 | import subprocess |
3 | import os | 3 | import os |
4 | import json | 4 | import json |
5 | from flask import Flask, request, render_template | 5 | from flask import Flask, request, render_template |
6 | from processor.LiaTools import * | 6 | from processor.LiaTools import * |
app = Flask(__name__)


@app.route("/")
def docs():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
12 | 12 | ||
@app.route("/tagger", methods=['POST'])
def cleaner():
    """Handle ``POST /tagger``: strip stop words, lemmatise, then phonetise.

    Reads the ``string`` request value, removes French stop words, appends
    the lemma of every remaining word (their phonemes are wanted as well)
    and returns the phonetizer output as raw text for the client to parse.
    """
    # Load the processors and check that they are usable (the aim is to
    # load them dynamically later).  The return values are not inspected;
    # Tagger.isReady and Phoner.isReady raise KeyError when their
    # environment variable is missing.
    tagger = Tagger()
    tagger.isReady()
    phoner = Phoner()
    phoner.isReady()
    stoplist = StopWord()
    stoplist.isReady()

    # Receive the raw text from the POST parameter.
    dirtyString = request.values[u'string']

    # Processing.
    dirtyString = stoplist.RemoveStopList(dirtyString)
    lemm = tagger.lemm(tagger.clean(dirtyString))
    # Tagger.lemm returns UTF-8 encoded bytes, while the stop-word output is
    # unicode.  Decode explicitly: on Python 2 the implicit ASCII coercion
    # raises UnicodeDecodeError as soon as a lemma contains an accent.
    if isinstance(lemm, bytes):
        lemm = lemm.decode("utf8")

    # Add the lemma of each word because we want their phonemes too.
    dirtyString = dirtyString + u" " + lemm
    cleanString = phoner.clean(dirtyString)
    taggedString = phoner.phon(cleanString)

    # Return raw text to be parsed on the client side.
    return taggedString
if __name__ == '__main__':
    # NOTE(review): debug mode is enabled while listening on every
    # interface; confirm this entry point is only used for local development.
    every_interface = '0.0.0.0'
    app.debug = True
    app.run(host=every_interface)