Commit 7ff5cc7f92813b58647a98f7bb4d8cdde0bd1d81

Authored by Killian
1 parent 1db021bcb8
Exists in master and in 1 other branch soap

edit raw text + comment

Showing 3 changed files with 16 additions and 21 deletions Inline Diff

processor/LiaTools.py
1 import subprocess 1 import subprocess
2 import os 2 import os
3 from BaseProcessor import baseProcessor 3 from BaseProcessor import baseProcessor
4 import nltk 4 import nltk
5 import re 5 import re
6 6
7 class Tagger(baseProcessor): 7 class Tagger(baseProcessor):
8 """ a calling to lia_tagg class""" 8 """ a calling to lia_tagg class"""
9 def clean(self,dirtyString): 9 def clean(self,dirtyString):
10 """ Clean a string for use with lia_tagg
11
12 Change text to iso and clean it to one word per line, separating sentences with <s> </s>"""
10 p=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_clean'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 13 p=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_clean'],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
11 (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace')) 14 (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace'))
12 return cleanString 15 return cleanString
13 16
14 def tagg(self,cleanString): 17 def tagg(self,cleanString):
18 """POS-tag and lemmatize a string which comes from clean"""
15 p2=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_tagg+lemm','-guess'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 19 p2=subprocess.Popen([os.environ["LIA_TAGG"]+'/script/lia_tagg+lemm','-guess'],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
16 (taggedString,err) =p2.communicate(input=cleanString) 20 (taggedString,err) =p2.communicate(input=cleanString)
17 # This is used because lia_tagg deals with iso8859 only 21 # This is used because lia_tagg deals with iso8859 only
18 return taggedString.decode('iso8859').encode("utf8") 22 return taggedString.decode('iso8859').encode("utf8")
23
19 def lemm(self,cleanString): 24 def lemm(self,cleanString):
20 print " cleannnnn " + cleanString 25 """ use the POS tagger to lemmatize words and return the lemmas only"""
21 taggedString = self.tagg(cleanString) 26 taggedString = self.tagg(cleanString)
22 print "taggs full " + taggedString 27 # substitute in the string to keep only the lemmas (cut markup and original word). Can be deleted with better use of lia_tagg
23 sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x]))) 28 sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x])))
24 print " subbbbb" + sub
25 return sub 29 return sub
26 def isReady(self): 30 def isReady(self):
31 """ Check if the Tagger can be used ( depends on LIA_TAGG )"""
27 os.environ["LIA_TAGG"] 32 os.environ["LIA_TAGG"]
28 return True 33 return True
29 34
30 class Phoner(baseProcessor): 35 class Phoner(baseProcessor):
31 """ a class which calls the lia phoner """ 36 """ a class which calls the lia phoner """
32 def clean(self,dirtyString): 37 def clean(self,dirtyString):
33 p=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_nett'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 38 p=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_nett'],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
34 (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace')) 39 (cleanString, err) = p.communicate(input=dirtyString.encode('iso8859-1','backslashreplace'))
35 return cleanString 40 return cleanString
36 def phon(self,cleanString): 41 def phon(self,cleanString):
37 p2=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_lex2phon'],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 42 p2=subprocess.Popen([os.environ["LIA_PHON_REP"]+'/script/lia_lex2phon'],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
38 (taggedString,err) =p2.communicate(input=cleanString) 43 (taggedString,err) =p2.communicate(input=cleanString)
39 # This is used because lia_phon deals with iso8859 only 44 # This is used because lia_phon deals with iso8859 only
40 # We convert the output back to utf8 45 # We convert the output back to utf8
41 return taggedString.decode('iso8859').encode("utf8") 46 return taggedString.decode('iso8859').encode("utf8")
42 def isReady(self): 47 def isReady(self):
43 os.environ["LIA_PHON_REP"] 48 os.environ["LIA_PHON_REP"]
44 return True 49 return True
45 50
46 class StopWord(baseProcessor): 51 class StopWord(baseProcessor):
47 def isReady(self): 52 def isReady(self):
48 return True 53 return True
49 def RemoveStopList(self,rowstring): 54 def RemoveStopList(self,rowstring):
50 """ Remove from the set of words (split string) each word in the stoplist and join all the others into a string """ 55 """ Remove from the set of words (split string) each word in the stoplist and join all the others into a string """
51 return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french")))) 56 return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french"))))
static/js/application.js
1 // Some general UI pack related JS 1 // Some general UI pack related JS
2 $(document).ready(function() { 2 $(document).ready(function() {
3 tagging(); 3 tagging();
4 }); 4 });
5 5
6 6
7 7
8 function tagging(){ 8 function tagging(){
9 $('#go').click(function(){ 9 $('#go').click(function(){
10 data=JSON.stringify({ "string" : $('#data').val() }); 10 data={"string" :$('#data').val()};
11 console.log(data); 11 console.log(data);
12 $.ajax({ 12 $.ajax({
13 type: "POST", 13 type: "POST",
14 url: "tagger", 14 url: "tagger",
15 data: data, 15 data: data,
16 success: function(data){ 16 success: function(data){
17 //data = JSON.parse(data);
18 result=$('#result'); 17 result=$('#result');
19
20 //_.each(data, function(element, index, list){
21 // console.log(element);
22 // result.append(_.escape(element["word"])+" ");
23 // result.append(_.escape(element["markup"]+" "));
24 // result.append(_.escape(element["lemm"]+"\n"));
25 //});
26 result.append(_.escape(data)); 18 result.append(_.escape(data));
27 console.log(_.escape(data)); 19 console.log(_.escape(data));
28 console.log("resultat"); 20 console.log("resultat");
29 }, 21 },
30 error: function(){ 22 error: function(){
31 alert("error"); 23 alert("error");
32 }, 24 },
33 dataType: "text", 25 dataType: "text"
34 contentType:"application/json; charset=UTF-8"
35 }); 26 });
36 }); 27 });
37 } 28 }
38 29
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 import subprocess 2 import subprocess
3 import os 3 import os
4 import json 4 import json
5 from flask import Flask, request, render_template 5 from flask import Flask, request, render_template
6 from processor.LiaTools import * 6 from processor.LiaTools import *
7 app = Flask(__name__) 7 app = Flask(__name__)
8 8
9 @app.route("/") 9 @app.route("/")
10 def docs(): 10 def docs():
11 return render_template('index.html') 11 return render_template('index.html')
12 12
13 @app.route("/tagger",methods=['POST']) 13 @app.route("/tagger",methods=['POST'])
14 def cleaner(): 14 def cleaner():
15 # Loading the processors and checking that they are ready (the aim is to load them dynamically later)
15 tagger = Tagger() 16 tagger = Tagger()
16 tagger.isReady() 17 tagger.isReady()
17 phoner = Phoner() 18 phoner = Phoner()
18 phoner.isReady() 19 phoner.isReady()
19 stoplist = StopWord() 20 stoplist = StopWord()
20 stoplist.isReady() 21 stoplist.isReady()
21 # Receive string from POST parameter, raw text (JSON) 22 # Receive string from POST parameter, raw text
22 dirtyString= request.json[u'string'] 23 dirtyString= request.values[u'string']
23 # send the string through the LIA_TAGG script thanks to a pipe 24 # Processing
24 # lia_clean splits the text to one word per line and marks up the sentences
25 dirtyString = stoplist.RemoveStopList(dirtyString) 25 dirtyString = stoplist.RemoveStopList(dirtyString)
26 print " stop list " + dirtyString
27 lemm = tagger.lemm(tagger.clean(dirtyString)) 26 lemm = tagger.lemm(tagger.clean(dirtyString))
28 print 'les lemm '+ lemm 27 # Adding the lemma of each word because we want their phonemes too
29 dirtyString = dirtyString+" "+ lemm 28 dirtyString = dirtyString+" "+ lemm
30 cleanString= phoner.clean(dirtyString) 29 cleanString= phoner.clean(dirtyString)
31 taggedString= phoner.phon(cleanString) 30 taggedString= phoner.phon(cleanString)
32 print taggedString 31 # Returning raw text to be parsed client side
33 return taggedString 32 return taggedString
34 if __name__ == '__main__': 33 if __name__ == '__main__':
35 app.debug = True 34 app.debug = True
36 app.run(host='0.0.0.0') 35 app.run(host='0.0.0.0')