Blame view

processor/LiaTools.py 2.75 KB
5492de487   Killian   ajout du processor
1
2
3
4
5
  import subprocess
  import os
  from BaseProcessor import baseProcessor
  import nltk
  import re
673721ec0   Killian   ajout phon il ma...
6

5492de487   Killian   ajout du processor
7
8
9
  class Tagger(baseProcessor):
      """Processor that calls the external lia_tagg scripts.

      The scripts are looked up under the directory named by the
      ``LIA_TAGG`` environment variable.
      """

      def clean(self, dirtyString):
          """Clean a string so that it can be fed to lia_tagg.

          The text is encoded to ISO-8859-1 and piped through
          ``$LIA_TAGG/script/lia_clean``.  According to the original
          author's note, the script puts one word per line and separates
          sentences with <s> </s>.

          dirtyString -- text to clean; presumably a unicode string, since
                         it is encoded directly (verify against callers).

          Returns the script output re-encoded as a UTF-8 byte string.
          Raises KeyError when LIA_TAGG is not set.
          """
          p = subprocess.Popen(
              [os.environ["LIA_TAGG"] + '/script/lia_clean'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # stderr is not piped, so the second item of the pair is always None.
          cleanString, _ = p.communicate(
              input=dirtyString.encode('iso8859-1'))
          return cleanString.decode("iso8859-1").encode("utf8")

      def tagg(self, cleanString):
          """POS-tag and lemmatize a string that comes from clean().

          cleanString -- UTF-8 encoded output of clean().

          Returns the output of ``lia_tagg+lemm -guess`` re-encoded as UTF-8.
          Raises KeyError when LIA_TAGG is not set.
          """
          p2 = subprocess.Popen(
              [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # lia_tagg deals with ISO-8859-1 only: convert on the way in and
          # convert the result back to UTF-8 on the way out.
          taggedString, _ = p2.communicate(
              input=cleanString.decode("utf8").encode('iso8859-1'))
          return taggedString.decode('iso8859-1').encode("utf8")

      def lemm(self, cleanString):
          """Use the POS tagger to lemmatize words and return the lemmas only.

          cleanString -- UTF-8 encoded output of clean().

          Returns the third whitespace-separated field of every non-empty
          tagger output line, joined with single spaces, with the "<s> "
          and " </s>" sentence markers removed.
          Raises IndexError when an output line has fewer than three fields.
          """
          taggedString = self.tagg(cleanString)
          # Keep only the lemma column (drops the markup and the original
          # word).  Could be removed with a better use of lia_tagg.
          lemmas = [x.rstrip().split()[2]
                    for x in taggedString.rstrip().split(u"\n") if x]
          sub = re.sub(r' </s>', u'', re.sub(r'<s> ', '', u" ".join(lemmas)))
          return sub

      def isReady(self):
          """Check if the Tagger can be used (depends on LIA_TAGG).

          Returns True when the LIA_TAGG environment variable is set; the
          lookup raises KeyError when it is missing.
          """
          os.environ["LIA_TAGG"]
          return True
5492de487   Killian   ajout du processor
35
36
37
38
39
  
  class Phoner(baseProcessor):
      """Processor that calls the external LIA phonetizer scripts.

      The scripts are looked up under the directory named by the
      ``LIA_PHON_REP`` environment variable.
      """

      def clean(self, dirtyString):
          """Clean a UTF-8 string with ``$LIA_PHON_REP/script/lia_nett``.

          dirtyString -- UTF-8 encoded text.

          Returns the script output re-encoded as a UTF-8 byte string.
          Raises KeyError when LIA_PHON_REP is not set.
          """
          p = subprocess.Popen(
              [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # stderr is not piped, so the second item of the pair is always None.
          cleanString, _ = p.communicate(
              input=dirtyString.decode("utf8").encode('iso8859-1'))
          return cleanString.decode("iso8859-1").encode("utf8")

      def phon(self, cleanString):
          """Phonetize a string that comes from clean().

          cleanString -- UTF-8 encoded output of clean().

          Returns the output of ``$LIA_PHON_REP/script/lia_lex2phon``
          re-encoded as UTF-8.
          Raises KeyError when LIA_PHON_REP is not set.
          """
          p2 = subprocess.Popen(
              [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # lia_phon deals with ISO-8859-1 only, while clean() hands back
          # UTF-8: convert on the way in (as Tagger.tagg does) and convert
          # the output back to UTF-8 on the way out.
          taggedString, _ = p2.communicate(
              input=cleanString.decode("utf8").encode('iso8859-1'))
          return taggedString.decode('iso8859-1').encode("utf8")

      def isReady(self):
          """Check if the Phoner can be used (depends on LIA_PHON_REP).

          Returns True when the LIA_PHON_REP environment variable is set;
          the lookup raises KeyError when it is missing.
          """
          os.environ["LIA_PHON_REP"]
          return True
5492de487   Killian   ajout du processor
51
52
  class StopWord(baseProcessor):
      """Processor that filters French stop words out of a string."""

      def isReady(self):
          """Always report ready; returns True unconditionally."""
          return True

      def RemoveStopList(self, rowstring):
          """Remove the words of the stop list from a string.

          The string is split on whitespace; every distinct word that is
          not in the NLTK French stop word list is kept once, in order of
          first appearance, and the kept words are joined with single
          spaces.

          rowstring -- whitespace-separated words.

          Returns a unicode string of the remaining distinct words.
          """
          stop_words = set(nltk.corpus.stopwords.words("french"))
          seen = set()
          kept = []
          for word in rowstring.split():
              # Skip stop words and words that were already emitted, so the
              # result holds each remaining word exactly once.
              if word in stop_words or word in seen:
                  continue
              seen.add(word)
              kept.append(unicode(word))
          return u" ".join(kept)