Orkis.py
2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
from BaseProcessor import baseProcessor
import nltk
import re
from LiaTools import *
class Orkis(baseProcessor):
    """Processor for Orkis.

    Builds ``self.dico``, a mapping ``word -> [lemmas, phonetizations]``
    (a two-element list of sets), from a raw input string by running it
    through a stop-word filter, a tagger and a phonetizer.

    ``Tagger``, ``Phoner`` and ``StopWord`` are not defined in this file;
    presumably they come from the ``LiaTools`` wildcard import -- confirm.
    """

    def __init__(self, dirtyString):
        """Store the raw input and create the tagger / phonetizer helpers.

        :param dirtyString: raw text to process; kept in ``self.string``.
        """
        # Newline-prefixed lemmas accumulated by insertLem().
        self.lem = u""
        self.tagger = Tagger()
        self.phoner = Phoner()
        # word -> [set of lemmas, set of phonetizations]
        self.dico = {}
        self.string = dirtyString

    def isReady(self):
        """Call ``isReady()`` on the phonetizer, then on the tagger.

        Returns ``None``: the helpers' results are not propagated.
        """
        # NOTE(review): if the helpers return a readiness flag it is
        # discarded here -- confirm whether this method should return it.
        self.phoner.isReady()
        self.tagger.isReady()

    def __unicode__(self):
        """Render ``self.dico`` as one ``word;lemmas ;phonetizations \\n`` line per word.

        Every lemma and phonetization is followed by a single space.
        Keys and set members are decoded from UTF-8, so they are expected
        to be byte strings.
        """
        chunks = []
        for word in self.dico:
            lemmas, phons = self.dico[word][0], self.dico[word][1]
            chunks.append(word.decode("utf-8") + u";")
            chunks.extend(lemma.decode("utf-8") + u" " for lemma in lemmas)
            chunks.append(u";")
            chunks.extend(phon.decode("utf-8") + u" " for phon in phons)
            chunks.append(u"\n")
        return u"".join(chunks)

    def clean(self):
        """Replace ``self.string`` with the result of ``StopWord.RemoveStopList``."""
        stopword = StopWord()
        self.string = stopword.RemoveStopList(self.string)

    def insertLem(self):
        """Tag ``self.string`` and record each word's lemma in ``self.dico``.

        Sets ``self.cleanString`` (tagger-cleaned text), ``self.tableLem``
        (tagger output lines) and ``self.lem`` (every recorded lemma, each
        preceded by a newline). Tagger output lines are split on single
        spaces: field 0 is used as the word and field 2 as its lemma.
        """
        self.lem = u""
        self.cleanString = self.tagger.clean(self.string).rstrip()
        taggedString = self.tagger.tagg(self.cleanString).rstrip()
        taggedLines = taggedString.split("\n")
        self.tableLem = taggedLines
        for line in taggedLines:
            fields = line.rstrip().split(" ")
            # Empty or malformed lines have no lemma field; skip them
            # instead of raising IndexError.
            if len(fields) < 3:
                continue
            word, lemma = fields[0], fields[2]
            if lemma.isspace():
                continue
            if word not in self.dico:
                self.dico[word] = [set(), set()]
            self.dico[word][0].add(lemma)
            self.lem = self.lem + "\n" + lemma

    def insertPhon(self):
        """Phonetize the cleaned text plus lemmas and record the results.

        Must run after ``insertLem()``, which sets ``self.cleanString`` and
        ``self.lem``. Sets ``self.tablephon`` (phonetizer output lines).
        Output lines are split on single spaces: field 0 is the token and
        field 1 its phonetization. A token that is a key of ``self.dico``
        gets the phonetization directly; otherwise the phonetization is
        added to every word that lists the token among its lemmas.
        """
        prephonedString = self.cleanString.rstrip() + self.lem.rstrip()
        phonedString = self.phoner.phon(prephonedString)
        phonedLines = phonedString.rstrip().split("\n")
        self.tablephon = phonedLines
        for line in phonedLines:
            # NOTE(review): re.match anchors at the start, so this only
            # skips lines beginning with "s>"; getDico() uses ".s>".
            # Confirm which marker form the phonetizer emits.
            if re.match(r's>', line):
                continue
            fields = line.rstrip().split(" ")
            # Lines without a phonetization field are skipped instead of
            # raising IndexError.
            if len(fields) < 2 or fields[1].isspace():
                continue
            token, phon = fields[0], fields[1]
            if token in self.dico:
                self.dico[token][1].add(phon)
            else:
                for entry in self.dico.values():
                    if token in entry[0]:
                        entry[1].add(phon)

    def getDico(self):
        """Run the whole pipeline and return the dictionary as text.

        Calls ``clean()``, ``insertLem()`` and ``insertPhon()``, then
        builds ``self.table``: one ``word lemma... phon...`` line
        (space-separated) per entry of ``self.dico`` whose key does not
        match the regex ``.s>`` at its start.

        :return: the lines of ``self.table`` joined with newlines.
        """
        self.clean()
        self.insertLem()
        self.insertPhon()
        self.table = []
        for word, (lemmas, phons) in self.dico.items():
            if re.match(r".s>", word):
                continue
            columns = [word]
            columns.extend(lemmas)
            columns.extend(phons)
            self.table.append(" ".join(columns))
        return "\n".join(self.table)