Orkis.py
1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
from BaseProcessor import baseProcessor
import nltk
import re
from LiaTools import *
class Orkis(baseProcessor):
    """Processor for Orkis.

    Removes stop words from a raw string, runs the result through the
    ``Tagger`` provided by LiaTools and records, for every token, the set
    of values found in the third column of the tagger output (called
    "lem" in this class -- presumably lemmas; confirm against LiaTools).
    """

    def __init__(self, dirtyString):
        """Store the raw input and create the tagger.

        dirtyString -- the text to process; it is also echoed to stdout.
        """
        self.lem = u""
        self.tagger = Tagger()
        # token -> [set of third-column values, second set].
        # The second set is never read or written by this class.
        self.dico = {}
        self.string = dirtyString
        print(dirtyString)

    def isReady(self):
        """Return whatever the underlying tagger's isReady() reports."""
        # The result used to be discarded, so this method always returned
        # None regardless of the tagger state.
        return self.tagger.isReady()

    def __unicode__(self):
        """Render self.dico as text, one newline-terminated line per token.

        Each line is the token, a semicolon, then every recorded value
        followed by a single space.
        """
        lines = []
        for word in self.dico:
            lemmas = u"".join(lemWord + u" " for lemWord in self.dico[word][0])
            lines.append(word + u";" + lemmas + u"\n")
        return u"".join(lines)

    def clean(self):
        """Replace self.string with its stop-word-filtered version."""
        stopword = StopWord()
        self.string = stopword.RemoveStopList(self.string)

    def insertLem(self):
        """Tag self.string and fill self.dico and self.lem from the output.

        Side effects: sets self.cleanString, self.tableLem and self.lem,
        and adds entries to self.dico.

        Assumes each tagger output line is made of single-space separated
        fields with the token first and the lemma third -- TODO confirm
        against the LiaTools tagger format.
        """
        self.lem = u""
        self.cleanString = self.tagger.clean(self.string).rstrip()
        taggedString = self.tagger.tagg(self.cleanString).rstrip().decode("utf8")
        self.tableLem = taggedString.split(u"\n")

        lemmas = []
        for line in taggedString.rstrip().split(u"\n"):
            table = line.rstrip().split(u" ")
            # Empty tagger output (or a malformed line) yields fewer than
            # three fields; skip it instead of raising IndexError.
            if len(table) < 3:
                continue
            token = table[0]
            lemma = table[2]
            if lemma.isspace():
                continue
            if token not in self.dico:
                self.dico[token] = [set(), set()]
            self.dico[token][0].add(lemma)
            lemmas.append(lemma)

        # Same text as the former incremental concatenation: every kept
        # lemma preceded by a newline.
        self.lem = u"".join(u"\n" + lemma for lemma in lemmas)

    def getDico(self):
        """Clean and tag the input, then return the dictionary as text.

        Returns a string with one line per token of self.dico: the token
        followed by its recorded values, separated by spaces. Tokens
        matched by the pattern ``.s>`` at their start (e.g. "<s>") are
        left out. The lines are also kept in self.table.
        """
        self.clean()
        self.insertLem()
        self.table = []
        # .items() iterates the same pairs as the Python-2-only
        # .iteritems() and also exists on Python 3.
        for token, sets in self.dico.items():
            # NOTE(review): this pattern does not match "</s>" -- confirm
            # whether closing markers should be filtered as well.
            if re.match(r".s>", token):
                continue
            self.table.append(" ".join([token] + list(sets[0])))
        return u"\n".join(self.table)