Orkis.py
2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from BaseProcessor import baseProcessor
import nltk
from LiaTools import *
class Orkis(baseProcessor):
    """Processor for Orkis.

    Builds ``self.dico``, a mapping from each token of the input string to
    a two-element list of sets: index 0 is filled by ``insertLem()`` and
    index 1 by ``insertPhon()``.
    """

    def __init__(self, dirtyString):
        """Create the tagger/phoner helpers and store the raw input.

        dirtyString -- text to process; ``clean()`` later passes it through
                       ``StopWord.RemoveStopList``.
        """
        self.tagger = Tagger()
        self.phoner = Phoner()
        # token -> [set filled by insertLem(), set filled by insertPhon()]
        self.dico = {}
        self.string = dirtyString

    def isReady(self):
        """Call ``isReady()`` on the phoner, then on the tagger.

        Both results are discarded and the method returns None.
        """
        # NOTE(review): the helpers' return values are ignored, so callers
        # cannot learn readiness from this method -- confirm whether the
        # combined result was meant to be returned.
        self.phoner.isReady()
        self.tagger.isReady()

    def __str__(self):
        """Return one line per token: ``token;lem lem ;phon phon ;``.

        Every stored value is followed by a single space, each of the three
        fields is terminated by ';', and each line ends with a newline.
        """
        rows = []
        for word in self.dico:
            lemmas, phons = self.dico[word][0], self.dico[word][1]
            rows.append(
                word + ";"
                + "".join(lem + " " for lem in lemmas) + ";"
                + "".join(phon + " " for phon in phons) + ";"
                + "\n"
            )
        return "".join(rows)

    def clean(self):
        """Replace ``self.string`` with the output of ``RemoveStopList``."""
        self.string = StopWord().RemoveStopList(self.string)

    def insertLem(self):
        """Tag the cleaned string and record field 2 of each output line.

        Sets ``self.cleanString`` (also read by ``insertPhon()``) and
        ``self.tableLem`` (the tagger output split into lines).  Each line is
        split on single spaces; field 0 is the dictionary key and field 2 is
        added to the key's first set (presumably the lemma -- the tagger
        output format is not visible here).
        """
        self.cleanString = self.tagger.clean(self.string)
        tagged = self.tagger.tagg(self.cleanString)
        self.tableLem = tagged.rstrip().split("\n")
        for line in self.tableLem:
            fields = line.rstrip().split(" ")
            # Empty output or a short line has no field 2: skip it instead
            # of raising IndexError and leaving an orphan entry behind.
            if len(fields) < 3:
                continue
            key = fields[0]
            if key not in self.dico:
                self.dico[key] = [set(), set()]
            self.dico[key][0].add(fields[2])

    def insertPhon(self):
        """Phonetize the cleaned string and record field 1 of each line.

        Must be called after ``insertLem()``, which sets ``self.cleanString``
        and creates the dictionary entries; lines whose field 0 is not
        already a key are ignored.  Sets ``self.tablephon`` (the phoner
        output split into lines).
        """
        phoned = self.phoner.phon(self.cleanString)
        self.tablephon = phoned.rstrip().split("\n")
        for line in self.tablephon:
            fields = line.rstrip().split(" ")
            # A line without a second field carries nothing to store.
            if len(fields) < 2:
                continue
            if fields[0] in self.dico:
                self.dico[fields[0]][1].add(fields[1])

    def getDico(self):
        """Run clean(), insertLem() and insertPhon(), then render the result.

        Returns a newline-joined string with one line per token (the
        ``"<s>"`` token excluded): the token, then the values of its first
        set, then the values of its second set, separated by single spaces.
        """
        self.clean()
        self.insertLem()
        self.insertPhon()
        lines = []
        for word in self.dico:
            if word == "<s>":
                continue
            parts = [word]
            parts.extend(self.dico[word][0])
            parts.extend(self.dico[word][1])
            lines.append(" ".join(parts))
        return "\n".join(lines)