# Orkis.py 2.46 KB
# -*- coding: utf-8 -*-
from BaseProcessor import baseProcessor
import nltk
import re
from LiaTools import *
class Orkis(baseProcessor):
    """Processor for Orkis.

    Wraps a raw input string and builds ``self.dico``, a mapping
    ``word -> [set_a, set_b]`` where ``set_a`` collects the third field of
    every tagger output line starting with ``word`` (presumably its lemmas)
    and ``set_b`` collects the second field of every phonetizer output line
    starting with ``word`` (presumably its phonetizations).

    Attributes set over the object's lifetime:
        tagger, phoner: helper objects created in ``__init__``.
        dico: the mapping described above.
        string: the input text (replaced by ``clean()``).
        cleanString: tagger-cleaned text, set by ``insertLem()``.
        tableLem / tablephon: raw output lines of the tagger / phonetizer.
    """

    def __init__(self, dirtyString):
        """Create the helpers and store *dirtyString* for later processing.

        The former debug ``print`` of the input string has been removed so
        that constructing a processor no longer writes to stdout.
        """
        self.tagger = Tagger()
        self.phoner = Phoner()
        self.dico = {}
        self.string = dirtyString

    def isReady(self):
        """Query both helpers and report their combined readiness.

        Both ``isReady()`` calls are always made (as before); their results
        are now combined with ``and`` and returned instead of being
        discarded.
        NOTE(review): assumes the helpers return a truth value -- confirm
        against LiaTools.
        """
        phoner_ready = self.phoner.isReady()
        tagger_ready = self.tagger.isReady()
        return phoner_ready and tagger_ready

    def __unicode__(self):
        """Render ``self.dico`` as unicode text, one entry per line.

        Each line is the word, immediately followed by every element of its
        first set (no separator), then ``;``, then every element of its
        second set, then a newline. Stored byte strings are decoded as UTF-8.
        """
        pieces = []
        for word in self.dico:
            entry = self.dico[word]
            pieces.append(word.decode("utf-8"))
            pieces.extend(lemWord.decode("utf-8") for lemWord in entry[0])
            pieces.append(u";")
            pieces.extend(phonWord.decode("utf-8") for phonWord in entry[1])
            pieces.append(u"\n")
        # Single join instead of repeated += on an ever-growing string.
        return u"".join(pieces)

    def clean(self):
        """Replace ``self.string`` with its stop-list-filtered version."""
        stopword = StopWord()
        self.string = stopword.RemoveStopList(self.string)

    def insertLem(self):
        """Tag the text and record, per word, the third field of each line.

        Sets ``self.cleanString`` and ``self.tableLem`` and fills the first
        set of each ``self.dico`` entry. Lines with fewer than three
        space-separated fields (e.g. the single empty line produced by empty
        tagger output) are skipped instead of raising ``IndexError``.
        """
        self.cleanString = self.tagger.clean(self.string)
        taggedString = self.tagger.tagg(self.cleanString)
        self.tableLem = taggedString.rstrip().split("\n")
        for line in self.tableLem:
            # NOTE(review): re.match anchors at the start of the line, so
            # this only skips lines beginning with "s>"; it never matches
            # "<s>" or "</s>". Kept unchanged to preserve output -- confirm
            # whether re.search / r'</?s>' was intended.
            if re.match(r's>', line):
                continue
            fields = line.rstrip().split(" ")
            if len(fields) < 3:
                continue
            if fields[0] not in self.dico:
                self.dico[fields[0]] = [set(), set()]
            self.dico[fields[0]][0].add(fields[2])

    def insertPhon(self):
        """Phonetize the cleaned text and record the second field per word.

        Requires ``insertLem()`` to have run first (it sets
        ``self.cleanString`` and creates the ``self.dico`` entries). Only
        words already present in ``self.dico`` are updated. Lines with fewer
        than two space-separated fields are skipped.
        """
        phonedString = self.phoner.phon(self.cleanString)
        self.tablephon = phonedString.rstrip().split("\n")
        for line in self.tablephon:
            # NOTE(review): same start-anchored "s>" filter as insertLem;
            # it does not match "<s>" markers -- confirm the intent.
            if re.match(r's>', line):
                continue
            fields = line.rstrip().split(" ")
            if len(fields) < 2:
                continue
            if fields[0] in self.dico:
                self.dico[fields[0]][1].add(fields[1])

    def getDico(self):
        """Run the whole pipeline and return the dictionary as text.

        Calls ``clean()``, ``insertLem()`` and ``insertPhon()`` in that
        order, then returns one line per ``self.dico`` key that does not
        start with ``<s>``: the word followed by the elements of its two
        sets, all separated by single spaces. Lines are joined with
        newlines; an empty dictionary yields an empty string.
        """
        self.clean()
        self.insertLem()
        self.insertPhon()
        rows = []
        for word in self.dico:
            if re.match(r"<s>", word):
                continue
            entry = self.dico[word]
            rows.append(" ".join([word] + list(entry[0]) + list(entry[1])))
        return "\n".join(rows)