# Orkis.py 2.78 KB
# -*- coding: utf-8 -*-
from BaseProcessor import baseProcessor
import nltk
import re
from LiaTools import *
class Orkis(baseProcessor):
    """Processor for Orkis.

    Builds ``self.dico``, a mapping ``word -> [first_set, second_set]``:
    the first set collects the third field of every tagger output line
    (presumably the lemmas of the word) and the second set collects the
    second field of every phoner output line (presumably its phonetic
    transcriptions).

    NOTE(review): this class targets Python 2 (``unicode``, ``str.decode``).
    ``Tagger``, ``Phoner`` and ``StopWord`` are presumably supplied by the
    ``LiaTools`` star import; the line formats assumed for their output
    ("word tag lemma" and "word phon", space separated) are inferred from
    the indexing below -- confirm against that module.
    """

    def __init__(self, dirtyString):
        """Store the raw input string and create the tagger/phoner helpers.

        dirtyString -- the text to process, before any cleaning.
        """
        self.lem = u""
        self.tagger = Tagger()
        self.phoner = Phoner()
        self.dico = {}
        self.string = dirtyString
        # Defined up front so that insertPhon() no longer raises
        # AttributeError when it is called before insertLem().
        self.cleanString = u""
        self.tableLem = []
        self.tablephon = []
        self.table = []

    def isReady(self):
        """Query both helpers and return their combined readiness.

        Both helpers are always queried (no short-circuit), exactly as
        before; the combined result is now returned instead of discarded.
        """
        phoner_ready = self.phoner.isReady()
        tagger_ready = self.tagger.isReady()
        return phoner_ready and tagger_ready

    def __unicode__(self):
        """Render the dictionary as ``word;lemmas ;phons \\n`` lines.

        Every lemma and every phonetic form is followed by a single space;
        keys and set members are decoded from UTF-8, so they are expected
        to be byte strings.
        """
        rows = []
        for word, (lemmas, phons) in self.dico.items():
            lem_part = u"".join(
                lem_word.decode("utf-8") + u" " for lem_word in lemmas)
            phon_part = u"".join(
                phon_word.decode("utf-8") + u" " for phon_word in phons)
            rows.append(word.decode("utf-8") + u";" + lem_part + u";"
                        + phon_part + u"\n")
        return u"".join(rows)

    def clean(self):
        """Strip stop words from ``self.string`` in place."""
        stopword = StopWord()
        self.string = stopword.RemoveStopList(self.string)

    def insertLem(self):
        """Tag ``self.string`` and record the third field of each line.

        Fills ``self.cleanString`` (tagger-cleaned text), ``self.tableLem``
        (raw tagger lines), the first set of each ``self.dico`` entry, and
        ``self.lem`` (every recorded third field, each preceded by a
        newline, in output order and with duplicates kept).
        """
        self.cleanString = self.tagger.clean(self.string).rstrip()
        tagged_string = self.tagger.tagg(self.cleanString).rstrip()
        self.tableLem = tagged_string.split("\n")
        lemmas = []
        for line in self.tableLem:
            fields = line.rstrip().split(" ")
            # Lines with fewer than three fields used to raise IndexError;
            # empty fields used to slip through because "".isspace() is
            # False. Both are skipped now.
            if len(fields) < 3 or not fields[2].strip():
                continue
            word, lemma = fields[0], fields[2]
            if word not in self.dico:
                self.dico[word] = [set(), set()]
            self.dico[word][0].add(lemma)
            lemmas.append(lemma)
        # NOTE(review): the unicode accumulator is kept on purpose so that
        # self.lem has the same type as before; if the tagger returns
        # non-ASCII byte strings this implicit decode can raise
        # UnicodeDecodeError under Python 2 -- confirm the tagger's type.
        self.lem = u"".join("\n" + lemma for lemma in lemmas)

    def insertPhon(self):
        """Phonetise the cleaned text plus lemmas and record the results.

        The phoner receives ``self.cleanString`` followed by ``self.lem``.
        For each output line, the second field is added to the second set
        of the entry whose key equals the first field; when no such key
        exists, it is added to every entry whose first set contains the
        first field.
        """
        phoned_string = self.phoner.phon(
            self.cleanString.rstrip() + self.lem.rstrip())
        self.tablephon = phoned_string.rstrip().split("\n")
        for line in self.tablephon:
            # NOTE(review): this only skips lines that START with "s>";
            # getDico() filters with ".s>" instead. Presumably both target
            # sentence markers -- confirm the phoner's output format.
            if re.match(r's>', line):
                continue
            fields = line.rstrip().split(" ")
            # Guard against one-field lines (previously IndexError) and
            # blank second fields.
            if len(fields) < 2 or not fields[1].strip():
                continue
            word, phon = fields[0], fields[1]
            if word in self.dico:
                self.dico[word][1].add(phon)
            else:
                # The phoned token is not a known word: attach the result
                # to every word that has this token in its first set.
                for sets in self.dico.values():
                    if word in sets[0]:
                        sets[1].add(phon)

    def getDico(self):
        """Run the whole pipeline and return the dictionary as text.

        Calls clean(), insertLem() and insertPhon(), then returns one line
        per ``self.dico`` entry: the word, the members of its first set and
        the members of its second set, separated by single spaces. Entries
        whose key matches ``.s>`` at its start are left out. The lines are
        also kept in ``self.table``.
        """
        self.clean()
        self.insertLem()
        self.insertPhon()
        self.table = []
        for word, (lemmas, phons) in self.dico.items():
            if re.match(r".s>", word):
                continue
            fields = [word]
            fields.extend(lemmas)
            fields.extend(phons)
            self.table.append(" ".join(fields))
        return "\n".join(self.table)