Blame view

processor/LiaTools.py 2.16 KB
5492de487   Killian   ajout du processor
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
  import subprocess
  import os
  from BaseProcessor import baseProcessor
  import nltk
  import re
  class Tagger(baseProcessor):
      """Wrapper around the external LIA_TAGG command-line scripts.

      The scripts are looked up under the directory named by the ``LIA_TAGG``
      environment variable and are driven through stdin/stdout. Every method
      that launches a script raises ``KeyError`` when ``LIA_TAGG`` is unset.
      """

      def clean(self, dirtyString):
          """Pipe ``dirtyString`` through ``script/lia_clean``.

          The text is encoded to ISO-8859-1 before being sent; characters that
          do not fit in that charset are written as backslash escapes.

          Returns the raw (still ISO-8859-1 encoded) stdout of the script.
          """
          process = subprocess.Popen(
              [os.environ["LIA_TAGG"] + '/script/lia_clean'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # stderr is not piped, so the second element is always None.
          cleanString, _ = process.communicate(
              input=dirtyString.encode('iso8859-1', 'backslashreplace'))
          return cleanString

      def tagg(self, cleanString):
          """Pipe ``cleanString`` through ``script/lia_tagg+lemm -guess``.

          ``cleanString`` is sent to the script unchanged, so it is expected to
          be the ISO-8859-1 output of :meth:`clean`.

          Returns the script output re-encoded as UTF-8.
          """
          process = subprocess.Popen(
              [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          taggedString, _ = process.communicate(input=cleanString)
          # lia_tagg only deals with ISO-8859 text, so its output is converted
          # back to UTF-8 here.
          return taggedString.decode('iso8859').encode("utf8")

      def lemm(self, cleanString):
          """Return the third field of every tagger output line, space-joined.

          The text is tagged with :meth:`tagg`; each output line is split on
          whitespace and its third token is kept (presumably the lemma --
          confirm against the lia_tagg+lemm output format). Every ``'<s> '``
          marker is then removed from the joined result.

          Raises ``IndexError`` if an output line has fewer than three tokens.
          """
          taggedString = self.tagg(cleanString)
          thirdFields = [line.split()[2]
                         for line in taggedString.rstrip().split("\n")]
          return re.sub(r'<s> ', '', " ".join(thirdFields))

      def isReady(self):
          """Return True when the ``LIA_TAGG`` environment variable is set.

          Raises ``KeyError`` when it is missing.
          """
          # The lookup exists only for its KeyError when the variable is absent.
          os.environ["LIA_TAGG"]
          return True
  
  class Phoner(baseProcessor):
      """Wrapper around the external LIA phonetizer command-line scripts.

      The scripts are looked up under the directory named by the
      ``LIA_PHON_REP`` environment variable and are driven through
      stdin/stdout. Every method that launches a script raises ``KeyError``
      when ``LIA_PHON_REP`` is unset.
      """

      def clean(self, dirtyString):
          """Pipe ``dirtyString`` through ``script/lia_nett``.

          The text is encoded to ISO-8859-1 before being sent; characters that
          do not fit in that charset are written as backslash escapes.

          Returns the raw (still ISO-8859-1 encoded) stdout of the script.
          """
          process = subprocess.Popen(
              [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # stderr is not piped, so the second element is always None.
          cleanString, _ = process.communicate(
              input=dirtyString.encode('iso8859-1', 'backslashreplace'))
          return cleanString

      def phon(self, cleanString):
          """Pipe ``cleanString`` through ``script/lia_lex2phon``.

          ``cleanString`` is sent to the script unchanged, so it is expected to
          be the ISO-8859-1 output of :meth:`clean`.

          Returns the script output re-encoded as UTF-8.
          """
          process = subprocess.Popen(
              [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          phonString, _ = process.communicate(input=cleanString)
          # lia_phon only deals with ISO-8859 text, so its output is converted
          # back to UTF-8 here.
          return phonString.decode('iso8859').encode("utf8")

      def isReady(self):
          """Return True when the ``LIA_PHON_REP`` environment variable is set.

          Raises ``KeyError`` when it is missing.
          """
          # The lookup exists only for its KeyError when the variable is absent.
          os.environ["LIA_PHON_REP"]
          return True
  class StopWord(baseProcessor):
      """Processor that strips French stop words using the NLTK stop list."""

      def isReady(self):
          """Return True unconditionally; no check is performed."""
          return True

      def RemoveStopList(self, rowstring):
          """Return the distinct words of ``rowstring`` that are not stop words.

          ``rowstring`` is split on whitespace and turned into a set, the NLTK
          French stop words are subtracted from it, and the remaining words are
          joined with single spaces into a unicode string. Because a set is
          used, duplicate words are collapsed and the order of the words in the
          result is not guaranteed.
          """
          stopWords = set(nltk.corpus.stopwords.words("french"))
          keptWords = set(rowstring.split()) - stopWords
          return u" ".join(unicode(word) for word in keptWords)