Blame view

processor/LiaTools.py 2.32 KB
5492de487   Killian   ajout du processor
1
2
3
4
5
  import subprocess
  import os
  from BaseProcessor import baseProcessor
  import nltk
  import re
673721ec0   Killian   ajout phon il ma...
6

5492de487   Killian   ajout du processor
7
8
9
10
11
12
13
14
15
16
17
18
19
  class Tagger(baseProcessor):
      """Wrapper around the external LIA_TAGG command-line scripts.

      The scripts are located through the ``LIA_TAGG`` environment variable
      and are fed ISO-8859-1 encoded text on their standard input.
      """

      def clean(self, dirtyString):
          """Run ``lia_clean`` on ``dirtyString`` and return its raw output.

          The text is encoded to ISO-8859-1 before being piped to the script;
          characters that cannot be encoded are replaced by backslash escapes.
          The returned value is the script's stdout, left undecoded.

          Raises ``KeyError`` if ``LIA_TAGG`` is not set.
          """
          script = os.environ["LIA_TAGG"] + '/script/lia_clean'
          proc = subprocess.Popen([script],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE)
          payload = dirtyString.encode('iso8859-1', 'backslashreplace')
          # stderr is not piped, so the second item of the pair is always None.
          cleanString, _ = proc.communicate(input=payload)
          return cleanString

      def tagg(self, cleanString):
          """Run ``lia_tagg+lemm -guess`` on ``cleanString``.

          ``cleanString`` is piped to the script unchanged (presumably the
          output of ``clean`` -- confirm against callers).  The script's
          output is decoded from ISO-8859 and returned re-encoded as UTF-8.

          Raises ``KeyError`` if ``LIA_TAGG`` is not set.
          """
          script = os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm'
          proc = subprocess.Popen([script, '-guess'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE)
          taggedString, _ = proc.communicate(input=cleanString)
          # lia_tagg deals with iso8859 only, so convert its output to UTF-8.
          return taggedString.decode('iso8859').encode("utf8")

      def lemm(self, cleanString):
          """Return the third field of each tagged line, space-separated.

          The text is tagged with ``tagg``; the third whitespace-separated
          field of every non-blank output line is kept (presumably the lemma,
          given the method name), and the ``<s> `` / `` </s>`` markers are
          stripped from the joined result.

          Raises ``IndexError`` if a non-blank line has fewer than three
          fields.
          """
          taggedString = self.tagg(cleanString)
          lemmas = []
          for line in taggedString.rstrip().split("\n"):
              fields = line.split()
              if not fields:
                  # Blank or whitespace-only line: nothing to extract.
                  continue
              lemmas.append(fields[2])
          joined = " ".join(lemmas)
          return re.sub(r' </s>', '', re.sub(r'<s> ', '', joined))

      def isReady(self):
          """Return True when the ``LIA_TAGG`` environment variable is set.

          A missing variable is signalled by the ``KeyError`` raised by the
          lookup below, not by a False return value.
          """
          # The lookup is performed only so that it raises when unset.
          os.environ["LIA_TAGG"]
          return True
5492de487   Killian   ajout du processor
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
  
  class Phoner(baseProcessor):
      """Wrapper around the external LIA phonetizer command-line scripts.

      The scripts are located through the ``LIA_PHON_REP`` environment
      variable and are fed ISO-8859-1 encoded text on their standard input.
      """

      def clean(self, dirtyString):
          """Run ``lia_nett`` on ``dirtyString`` and return its raw output.

          The text is encoded to ISO-8859-1 before being piped to the script;
          characters that cannot be encoded are replaced by backslash escapes.
          The returned value is the script's stdout, left undecoded.

          Raises ``KeyError`` if ``LIA_PHON_REP`` is not set.
          """
          script = os.environ["LIA_PHON_REP"] + '/script/lia_nett'
          proc = subprocess.Popen([script],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE)
          payload = dirtyString.encode('iso8859-1', 'backslashreplace')
          # stderr is not piped, so the second item of the pair is always None.
          cleanString, _ = proc.communicate(input=payload)
          return cleanString

      def phon(self, cleanString):
          """Run ``lia_lex2phon`` on ``cleanString``.

          ``cleanString`` is piped to the script unchanged (presumably the
          output of ``clean`` -- confirm against callers).  The script's
          output is decoded from ISO-8859 and returned re-encoded as UTF-8.

          Raises ``KeyError`` if ``LIA_PHON_REP`` is not set.
          """
          script = os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'
          proc = subprocess.Popen([script],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE)
          phonString, _ = proc.communicate(input=cleanString)
          # lia_phon deals with iso8859 only, so the output is converted
          # back to UTF-8 before being returned.
          return phonString.decode('iso8859').encode("utf8")

      def isReady(self):
          """Return True when the ``LIA_PHON_REP`` environment variable is set.

          A missing variable is signalled by the ``KeyError`` raised by the
          lookup below, not by a False return value.
          """
          # The lookup is performed only so that it raises when unset.
          os.environ["LIA_PHON_REP"]
          return True
5492de487   Killian   ajout du processor
46
47
  class StopWord(baseProcessor):
      """Processor that filters the NLTK French stop words out of a string."""

      def isReady(self):
          """Report the processor as ready; no check is performed."""
          return True

      def RemoveStopList(self, rowstring):
          """Return the words of ``rowstring`` that are not French stop words.

          ``rowstring`` is split on whitespace into a set of words, so
          duplicates collapse and the original word order is not kept.  The
          remaining words are joined with single spaces into a unicode string.
          """
          words = set(rowstring.split())
          stop_words = set(nltk.corpus.stopwords.words("french"))
          kept = list(words - stop_words)
          return u" ".join(unicode(value) for value in kept)