Blame view
processor/Orkis.py
1.73 KB
b65eb4cd1 ajout des port Or... |
1 |
# -*- coding: utf-8 -*- |
b3cdd2e74 Ajout de Orkis pr... |
2 3 |
from BaseProcessor import baseProcessor import nltk |
b65eb4cd1 ajout des port Or... |
4 |
import re |
b3cdd2e74 Ajout de Orkis pr... |
5 6 7 8 |
# NOTE(review): presumably the source of Tagger and StopWord used below - confirm.
from LiaTools import *


class Orkis(baseProcessor):
    """Processor for Orkis.

    Strips stop words from the input string, sends the result through the
    tagger and records, for every first field of a tagger line, the set of
    third fields seen with it (presumably word form -> lemmas; confirm
    against the tagger's output format).

    The class targets Python 2: it relies on ``__unicode__``,
    ``dict.iteritems`` and ``str.decode``.
    """

    def __init__(self, dirtyString):
        """Store the raw input and create the tagger.

        :param dirtyString: text to process, before stop-word removal.
        """
        self.lem = u""
        self.tagger = Tagger()
        # Maps field 0 of a tagger line to a pair of sets; this class only
        # ever fills the first set of the pair.
        self.dico = {}
        self.string = dirtyString
        # Echo of the raw input on stdout, kept because callers may rely on
        # this trace.
        print(dirtyString)

    def isReady(self):
        """Return whatever the tagger reports about its own readiness."""
        # Propagate the tagger's answer so callers can actually test it;
        # dropping it would make this predicate always evaluate to None.
        return self.tagger.isReady()

    def __unicode__(self):
        """Render ``self.dico`` as text.

        Every key is written as ``key;`` followed by each value of its first
        set (each value followed by one space); the entry is then closed by
        one more space.  Returns an empty unicode string for an empty
        dictionary.
        """
        pieces = []
        for word in self.dico:
            pieces.append(word + u";")
            pieces.extend(lemWord + u" " for lemWord in self.dico[word][0])
            pieces.append(u" ")
        return u"".join(pieces)

    def clean(self):
        """Replace ``self.string`` by ``StopWord().RemoveStopList(self.string)``."""
        self.string = StopWord().RemoveStopList(self.string)

    def insertLem(self):
        """Tag ``self.string`` and file field 2 of each tagger line under field 0.

        Side effects:
          * ``self.cleanString`` - right-stripped result of ``tagger.clean``;
          * ``self.tableLem``    - the decoded tagger output, split;
          * ``self.dico``        - gains ``field0 -> [set_of_field2, set()]``;
          * ``self.lem``         - rebuilt as every kept field 2, each one
            preceded by a single space.

        Lines that carry fewer than three fields (for instance when the
        tagger returns nothing at all) are skipped instead of raising
        ``IndexError``.
        """
        self.lem = u""
        self.cleanString = self.tagger.clean(self.string).rstrip()
        taggedString = self.tagger.tagg(self.cleanString).rstrip().decode("utf8")

        # NOTE(review): the three separators below are written exactly as
        # found; they look identical here, so confirm the intended line and
        # field delimiters against the tagger's real output.
        self.tableLem = taggedString.split(u" ")

        kept = []
        for line in taggedString.rstrip().split(u" "):
            table = line.rstrip().split(u" ")
            if len(table) < 3:
                # Incomplete line: there is no third field to record.
                continue
            word, lemma = table[0], table[2]
            if lemma.isspace():
                continue
            if word not in self.dico:
                self.dico[word] = [set(), set()]
            self.dico[word][0].add(lemma)
            kept.append(lemma)

        self.lem = u"".join(u" " + lemma for lemma in kept)

    def getDico(self):
        """Run the whole pipeline and return the dictionary as one string.

        Calls ``clean()`` then ``insertLem()``, rebuilds ``self.table`` with
        one entry per key of ``self.dico`` (the key followed by the values of
        its first set, space-separated) and returns those entries joined by a
        space.  Keys matching the pattern ``.s>`` at their start are left out.
        """
        self.clean()
        self.insertLem()
        # NOTE(review): the pattern presumably targets sentence markers
        # emitted by the tagger - confirm it is the intended expression.
        self.table = [
            " ".join([key] + list(sets[0]))
            for key, sets in self.dico.iteritems()
            if not re.match(r".s>", key)
        ]
        return u" ".join(self.table)