Commit f8f94203e7db209ef1e607db3ad0bdded8fb466e
1 parent
b65eb4cd11
Exists in
soap
Correction du unicode(orkis) qui ne gérait pas les accents et les lemmes
Showing 2 changed files with 7 additions and 11 deletions Inline Diff
processor/Orkis.py
1 | # -*- coding: utf-8 -*- | 1 | # -*- coding: utf-8 -*- |
2 | from BaseProcessor import baseProcessor | 2 | from BaseProcessor import baseProcessor |
3 | import nltk | 3 | import nltk |
4 | import re | 4 | import re |
5 | from LiaTools import * | 5 | from LiaTools import * |
6 | class Orkis(baseProcessor): | 6 | class Orkis(baseProcessor): |
7 | """ Processor for Orkis """ | 7 | """ Processor for Orkis """ |
8 | def __init__(self,dirtyString): | 8 | def __init__(self,dirtyString): |
9 | self.tagger=Tagger() | 9 | self.tagger=Tagger() |
10 | self.phoner=Phoner() | 10 | self.phoner=Phoner() |
11 | self.dico ={} | 11 | self.dico ={} |
12 | self.string=dirtyString | 12 | self.string=dirtyString |
13 | print self.string | 13 | print self.string |
14 | def isReady(self): | 14 | def isReady(self): |
15 | self.phoner.isReady() | 15 | self.phoner.isReady() |
16 | self.tagger.isReady() | 16 | self.tagger.isReady() |
17 | def __unicode__(self): | 17 | def __unicode__(self): |
18 | string = u"" | 18 | string = u"" |
19 | for word in self.dico: | 19 | for word in self.dico: |
20 | print(isinstance(string, unicode)) | 20 | string += ( unicode(word.decode("utf-8")))+unicode (u";") |
21 | print(isinstance(unicode(word.decode("utf-8")),unicode)) | ||
22 | print(word) | ||
23 | print(string) | ||
24 | string += ( unicode(word.decode("utf-8"))) | ||
25 | for lemWord in self.dico[word][0]: | 21 | for lemWord in self.dico[word][0]: |
26 | string += (unicode(lemWord.decode("utf-8"))) #+ unicode(u" ")) | 22 | string += (unicode(lemWord.decode("utf-8"))+ unicode(u" ")) |
27 | string +=u";" | 23 | string +=u";" |
28 | for phonWord in self.dico[word][1]: | 24 | for phonWord in self.dico[word][1]: |
29 | string += (unicode(phonWord.decode("utf-8"))) #+ unicode(u" ")) | 25 | string += (unicode(phonWord.decode("utf-8"))+ unicode(u" ")) |
30 | string+=u"\n" | 26 | string+=u"\n" |
31 | return string | 27 | return string |
32 | def clean(self): | 28 | def clean(self): |
33 | stopword=StopWord() | 29 | stopword=StopWord() |
34 | self.string=stopword.RemoveStopList(self.string) | 30 | self.string=stopword.RemoveStopList(self.string) |
35 | def insertLem(self): | 31 | def insertLem(self): |
36 | self.cleanString=self.tagger.clean(self.string) | 32 | self.cleanString=self.tagger.clean(self.string) |
37 | taggedString=self.tagger.tagg(self.cleanString) | 33 | taggedString=self.tagger.tagg(self.cleanString) |
38 | self.tableLem = taggedString.rstrip().split("\n") | 34 | self.tableLem = taggedString.rstrip().split("\n") |
39 | for line in taggedString.rstrip().split("\n"): | 35 | for line in taggedString.rstrip().split("\n"): |
40 | if not re.match(r's>',line): | 36 | if not re.match(r's>',line): |
41 | table = line.rstrip().split(" ") | 37 | table = line.rstrip().split(" ") |
42 | if not table[0] in self.dico : | 38 | if not table[0] in self.dico : |
43 | self.dico[table[0]]=[set(),set()] | 39 | self.dico[table[0]]=[set(),set()] |
44 | self.dico[table[0]][0].add(table[2]) | 40 | self.dico[table[0]][0].add(table[2]) |
45 | def insertPhon(self): | 41 | def insertPhon(self): |
46 | phonedString=self.phoner.phon(self.cleanString) | 42 | phonedString=self.phoner.phon(self.cleanString) |
47 | self.tablephon= phonedString.rstrip().split("\n") | 43 | self.tablephon= phonedString.rstrip().split("\n") |
48 | for line in phonedString.rstrip().split("\n"): | 44 | for line in phonedString.rstrip().split("\n"): |
49 | if not re.match(r's>',line): | 45 | if not re.match(r's>',line): |
50 | table = line.rstrip().split(" ") | 46 | table = line.rstrip().split(" ") |
51 | if table[0] in self.dico: | 47 | if table[0] in self.dico: |
52 | self.dico[table[0]][1].add(table[1]) | 48 | self.dico[table[0]][1].add(table[1]) |
53 | def getDico(self): | 49 | def getDico(self): |
54 | self.clean() | 50 | self.clean() |
55 | self.insertLem() | 51 | self.insertLem() |
56 | self.insertPhon() | 52 | self.insertPhon() |
57 | table=[] | 53 | table=[] |
58 | for i in self.dico: | 54 | for i,v in self.dico.iteritems(): |
59 | if not re.match(r"<s>",i): | 55 | if not re.match(r"<s>",i): |
60 | list=[] | 56 | list=[] |
61 | list.append(i) | 57 | list.append(i) |
62 | for indice in self.dico[i][0]: | 58 | for indice in v[0]: |
63 | list.append(indice) | 59 | list.append(indice) |
64 | for indice in self.dico[i][1]: | 60 | for indice in v[1]: |
65 | list.append(indice) | 61 | list.append(indice) |
66 | ligne= " ".join(list) | 62 | ligne= " ".join(list) |
67 | 63 | ||
68 | table.append(ligne) | 64 | table.append(ligne) |
69 | return "\n".join(table) | 65 | return "\n".join(table) |
70 | 66 |
test/functional/testLiaSoap.py
1 | from suds.client import Client | 1 | from suds.client import Client |
2 | import time | 2 | import time |
3 | import threading | 3 | import threading |
4 | 4 | ||
5 | ### TODO : Tester en parallele x4 un million de fois pour voir ### | 5 | ### TODO : Tester en parallele x4 un million de fois pour voir ### |
6 | url = 'http://lrc2-kija.univ-avignon.fr:8000/?wsdl' | 6 | url = 'http://194.57.216.156:8181/?wsdl' |
7 | client = Client(url) | 7 | client = Client(url) |
8 | filename = "data.txt" | 8 | filename = "data.txt" |
9 | file = open(filename, "r") | 9 | file = open(filename, "r") |
10 | nb_times=4 | 10 | nb_times=4 |
11 | # Exp 1 | 11 | # Exp 1 |
12 | debut =time.time() | 12 | debut =time.time() |
13 | contents = file.read().decode("utf8").encode("ascii", errors='ignore').rstrip() | 13 | contents = file.read().decode("utf8").encode("ascii", errors='ignore').rstrip() |
14 | client.service.get_phon(contents) | 14 | client.service.get_phon(contents) |
15 | duree= time.time()- debut | 15 | duree= time.time()- debut |
16 | print (" Exper 1 : " + str(duree)) | 16 | print (" Exper 1 : " + str(duree)) |
17 | # Exp 2 | 17 | # Exp 2 |
18 | debut =time.time() | 18 | debut =time.time() |
19 | file.seek(0) | 19 | file.seek(0) |
20 | lines = file.readlines() | 20 | lines = file.readlines() |
21 | for line in lines: | 21 | for line in lines: |
22 | line = line.decode("utf8").encode("ascii", errors='ignore').rstrip() | 22 | line = line.decode("utf8").encode("ascii", errors='ignore').rstrip() |
23 | if line is not None: | 23 | if line is not None: |
24 | try: | 24 | try: |
25 | client.service.get_phon(line) | 25 | client.service.get_phon(line) |
26 | except: | 26 | except: |
27 | continue | 27 | continue |
28 | duree= time.time()- debut | 28 | duree= time.time()- debut |
29 | print (" Exper 2 line by line 1 corpus : " + str(duree)) | 29 | print (" Exper 2 line by line 1 corpus : " + str(duree)) |
30 | ##Exp 3 | 30 | ##Exp 3 |
31 | debut = time.time() | 31 | debut = time.time() |
32 | file.seek(0) | 32 | file.seek(0) |
33 | contents = file.read().decode("utf8").encode("ascii", errors='ignore').rstrip() | 33 | contents = file.read().decode("utf8").encode("ascii", errors='ignore').rstrip() |
34 | tabs=[] | 34 | tabs=[] |
35 | i=0 | 35 | i=0 |
36 | while i <= nb_times : | 36 | while i <= nb_times : |
37 | tabs.append(contents) | 37 | tabs.append(contents) |
38 | i+=1 | 38 | i+=1 |
39 | client.service.get_phon("".join(tabs)) | 39 | client.service.get_phon("".join(tabs)) |
40 | duree= time.time()- debut | 40 | duree= time.time()- debut |
41 | print ("Exper 3 2pow4 time the content in once shot " + str(duree)) | 41 | print ("Exper 3 2pow4 time the content in once shot " + str(duree)) |
42 | # EXP 4 | 42 | # EXP 4 |
43 | #debut = time.time() | 43 | #debut = time.time() |
44 | #contents = file.read().decode("utf8").encode("ascii", errors='ignore').rstrip() | 44 | #contents = file.read().decode("utf8").encode("ascii", errors='ignore').rstrip() |
45 | #def envoie(datas): | 45 | #def envoie(datas): |
46 | # client.service.get_phon(datas) | 46 | # client.service.get_phon(datas) |
47 | #i =0 | 47 | #i =0 |
48 | #threadTab= [] | 48 | #threadTab= [] |
49 | #while i <= nb_times: | 49 | #while i <= nb_times: |
50 | # threadTab.append(threading.Thread(None, envoie, None,contents,None)) | 50 | # threadTab.append(threading.Thread(None, envoie, None,contents,None)) |
51 | #duree = time.time() - debut | 51 | #duree = time.time() - debut |
52 | 52 |