Blame view
processor/Orkis.py
2.78 KB
b65eb4cd1 ajout des port Or... |
1 |
# -*- coding: utf-8 -*- |
b3cdd2e74 Ajout de Orkis pr... |
2 3 |
from BaseProcessor import baseProcessor import nltk |
b65eb4cd1 ajout des port Or... |
4 |
import re |
b3cdd2e74 Ajout de Orkis pr... |
5 6 7 8 |
from LiaTools import *


class Orkis(baseProcessor):
    """Processor for Orkis.

    For every word of the input string, collects the set of its lemmas
    (through ``self.tagger``) and the set of its phonetisations (through
    ``self.phoner``).

    Attributes set by this class:
        string: the text being processed; rewritten by ``clean``.
        dico: maps a word to ``[set_of_lemmas, set_of_phonetisations]``.
        lem: the lemmas found by ``insertLem``, each preceded by one space.
        cleanString, tableLem: set by ``insertLem``.
        tablephon: set by ``insertPhon``.
        table: set by ``getDico``.
    """

    def __init__(self, dirtyString):
        """Store the raw input and create the tagger and phoner helpers.

        Args:
            dirtyString: the raw text to process.
        """
        # NOTE(review): baseProcessor.__init__ is not invoked here; confirm
        # the base class needs no initialisation of its own.
        self.lem = u""
        self.tagger = Tagger()
        self.phoner = Phoner()
        self.dico = {}
        self.string = dirtyString

    def isReady(self):
        """Probe both helpers and report the combined result.

        Both probes are always executed, phoner first, then tagger.

        Returns:
            The phoner's result when it is falsy, otherwise the tagger's.
        """
        phonerReady = self.phoner.isReady()
        taggerReady = self.tagger.isReady()
        # NOTE(review): assumes both probes return a truth value -- confirm
        # against the Phoner / Tagger implementations.
        return phonerReady and taggerReady

    @staticmethod
    def _toUnicode(value):
        """Return *value* as text, decoding UTF-8 only for byte strings."""
        # Decoding a value that is already text would force an implicit
        # ASCII encode first and fail on any accented character.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def __unicode__(self):
        """Render ``dico`` as text.

        Each entry is written as ``word;lemmas;phonetisations`` where every
        lemma and every phonetisation is followed by one space, and the
        entry itself is terminated by one space.

        Returns:
            The concatenation of all rendered entries.
        """
        parts = []
        for word, entry in self.dico.items():
            parts.append(self._toUnicode(word))
            parts.append(u";")
            for lemWord in entry[0]:
                parts.append(self._toUnicode(lemWord))
                parts.append(u" ")
            parts.append(u";")
            for phonWord in entry[1]:
                parts.append(self._toUnicode(phonWord))
                parts.append(u" ")
            parts.append(u" ")
        return u"".join(parts)

    def clean(self):
        """Replace ``self.string`` by the output of ``StopWord.RemoveStopList``."""
        self.string = StopWord().RemoveStopList(self.string)

    def insertLem(self):
        """Tag the current string and record the lemma of each word.

        Sets ``cleanString`` and ``tableLem``, adds the third field of every
        tagged record to ``dico[first_field][0]`` and rebuilds ``lem``.
        """
        self.lem = u""
        self.cleanString = self.tagger.clean(self.string).rstrip()
        taggedString = self.tagger.tagg(self.cleanString).rstrip()
        # Records are taken one per line: splitting on a space at both the
        # record and the field level would leave single-token records and
        # make every field lookup below fail.
        # NOTE(review): assumes Tagger.tagg emits one space-separated record
        # per line -- confirm against its actual output format.
        self.tableLem = taggedString.splitlines()
        lemmas = []
        for line in self.tableLem:
            fields = line.rstrip().split(" ")
            if len(fields) < 3:
                # Incomplete record: nothing to read a lemma from.
                continue
            word = fields[0]
            lemma = fields[2]
            if not lemma.strip():
                # Blank or empty lemma carries no information.
                continue
            if word not in self.dico:
                self.dico[word] = [set(), set()]
            self.dico[word][0].add(lemma)
            lemmas.append(lemma)
        self.lem = u"".join(u" " + lemma for lemma in lemmas)

    def insertPhon(self):
        """Phonetise the cleaned string plus the lemmas and record the results.

        Must run after ``insertLem``, which sets ``cleanString`` and ``lem``.
        Sets ``tablephon``. The second field of each record is added to
        ``dico[first_field][1]``; when the first field is not a key of
        ``dico`` it is added to every entry that lists it as a lemma.
        """
        phonedString = self.phoner.phon(
            self.cleanString.rstrip() + self.lem.rstrip())
        # One record per line, for the same reason as in insertLem.
        self.tablephon = phonedString.rstrip().splitlines()
        for line in self.tablephon:
            # NOTE(review): this only skips lines that start with "s>";
            # confirm which marker lines the phoner really emits.
            if re.match(r's>', line):
                continue
            fields = line.rstrip().split(" ")
            if len(fields) < 2:
                continue
            token = fields[0]
            phon = fields[1]
            if not phon.strip():
                continue
            if token in self.dico:
                self.dico[token][1].add(phon)
            else:
                # The token may be a lemma rather than a surface word.
                for mot, sets in self.dico.items():
                    if token in sets[0]:
                        self.dico[mot][1].add(phon)

    def getDico(self):
        """Run the whole pipeline and return the dictionary as one string.

        Calls ``clean``, ``insertLem`` and ``insertPhon`` in that order, then
        stores in ``self.table`` one entry per word made of the word, its
        lemmas and its phonetisations joined by spaces. Words matching the
        pattern ``.s>`` are left out.

        Returns:
            The entries of ``self.table`` joined by a space.
        """
        self.clean()
        self.insertLem()
        self.insertPhon()
        self.table = []
        for word, entry in self.dico.items():
            if re.match(r".s>", word):
                continue
            fields = [word]
            fields.extend(entry[0])
            fields.extend(entry[1])
            self.table.append(" ".join(fields))
        return " ".join(self.table)