Blame view

processor/Orkis.py 1.73 KB
b65eb4cd1   Killian   ajout des port Or...
1
  # -*- coding: utf-8 -*-
b3cdd2e74   Killian   Ajout de Orkis pr...
2
3
  from BaseProcessor import baseProcessor
  import nltk
b65eb4cd1   Killian   ajout des port Or...
4
  import re
b3cdd2e74   Killian   Ajout de Orkis pr...
5
6
7
8
  from LiaTools import *
  class Orkis(baseProcessor):
      """Processor for Orkis.

      Filters the input string through StopWord, runs the result through
      the tagger and groups the tagger output in ``self.dico``: the first
      field of each output line is the key, the third field (presumably
      the lemma -- confirm against the Tagger output format) is added to
      the first set stored under that key.
      """

      def __init__(self, dirtyString):
          """Store the raw input and create the tagger.

          dirtyString -- the text to process, before any cleaning.
          """
          self.lem = u""
          self.tagger = Tagger()
          # key -> [set(), set()]; only the first set is filled by this class.
          self.dico = {}
          self.string = dirtyString
          # NOTE(review): trace output kept from the original code; it looks
          # like a debugging leftover -- confirm before removing.
          print(dirtyString)

      def isReady(self):
          """Delegate to the tagger's isReady() and return its result.

          The result used to be discarded, so this method always returned
          None whatever the tagger reported.
          """
          return self.tagger.isReady()

      def __unicode__(self):
          """Return the dictionary as text, one entry per line.

          Each entry is the key, a semicolon, then every value of the first
          set followed by a single space; entries end with a newline.
          """
          entries = []
          for word in self.dico:
              lemmas = u"".join(lemWord + u" " for lemWord in self.dico[word][0])
              entries.append(word + u";" + lemmas + u"\n")
          return u"".join(entries)

      def clean(self):
          """Replace ``self.string`` by the output of StopWord.RemoveStopList."""
          self.string = StopWord().RemoveStopList(self.string)

      def insertLem(self):
          """Tag ``self.string`` and record the third field of every line.

          Fills ``self.cleanString``, ``self.tableLem`` and ``self.dico``, and
          rebuilds ``self.lem`` as the recorded fields, each preceded by a
          newline. Lines with fewer than three space-separated fields are
          skipped instead of raising IndexError (this happens, for instance,
          when the tagger returns nothing).
          """
          self.lem = u""
          self.cleanString = self.tagger.clean(self.string).rstrip()
          taggedString = self.tagger.tagg(self.cleanString).rstrip().decode("utf8")
          self.tableLem = taggedString.split(u"\n")

          lemmas = []
          for line in taggedString.rstrip().split(u"\n"):
              table = line.rstrip().split(u" ")
              if len(table) < 3:
                  # Blank or malformed line: there is no third field to read.
                  continue
              word, lemma = table[0], table[2]
              if lemma.isspace():
                  continue
              if word not in self.dico:
                  self.dico[word] = [set(), set()]
              self.dico[word][0].add(lemma)
              lemmas.append(lemma)
          self.lem = u"".join(u"\n" + lemma for lemma in lemmas)

      def getDico(self):
          """Clean and tag the input, then return the dictionary as text.

          Returns the newline-joined content of ``self.table``: one line per
          key of ``self.dico`` that does not match the ``.s>`` pattern, made
          of the key followed by the values of its first set, separated by
          spaces.
          """
          self.clean()
          self.insertLem()
          self.table = [
              u" ".join([word] + list(sets[0]))
              for word, sets in self.dico.items()
              # Keys matching ".s>" are left out (presumably sentence
              # markers such as "<s>" -- confirm against the tagger output).
              if not re.match(r".s>", word)
          ]
          return u"\n".join(self.table)