Blame view

processor/LiaTools.py 2.64 KB
5492de487   Killian   ajout du processor
1
2
3
4
5
  import subprocess
  import os
  from BaseProcessor import baseProcessor
  import nltk
  import re
673721ec0   Killian   ajout phon il ma...
6

5492de487   Killian   ajout du processor
7
8
9
  class Tagger(baseProcessor):
      """Wrapper around the external LIA_TAGG scripts.

      The scripts are looked up under the directory named by the
      ``LIA_TAGG`` environment variable; every method that launches a
      script raises ``KeyError`` when that variable is not set.
      """

      def clean(self, dirtyString):
          """Pipe ``dirtyString`` through the ``lia_clean`` script.

          The text is encoded to ISO-8859-1 first; characters that cannot be
          encoded are replaced by backslash escapes. According to the
          original author, the script puts one word per line and wraps
          sentences in ``<s>`` / ``</s>`` markers.

          Returns the raw standard output of the script (not re-encoded).
          """
          isoInput = dirtyString.encode('iso8859-1', 'backslashreplace')
          cleaner = subprocess.Popen(
              [os.environ["LIA_TAGG"] + '/script/lia_clean'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # stderr is not piped, so the second item of the result is always None.
          cleanString = cleaner.communicate(input=isoInput)[0]
          return cleanString

      def tagg(self, cleanString):
          """POS-tag and lemmatize a string that comes from ``clean``.

          ``cleanString`` is sent unchanged to the ``lia_tagg+lemm`` script,
          which is run with the ``-guess`` option.

          Returns the script output re-encoded from ISO-8859 to UTF-8.
          """
          tagger = subprocess.Popen(
              [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          taggedString = tagger.communicate(input=cleanString)[0]
          # lia_tagg deals with ISO-8859 only, so convert its output back to UTF-8.
          return taggedString.decode('iso8859').encode("utf8")

      def lemm(self, cleanString):
          """Use the POS tagger to lemmatize and return the lemmas only.

          Each non-blank line of the tagger output is split on whitespace and
          its third field is kept (presumably the lemma, after the original
          word and its tag -- confirm against the lia_tagg output format).
          The kept fields are joined with single spaces, then the ``<s> ``
          and `` </s>`` sentence markers are removed.

          Raises ``IndexError`` if a non-blank line has fewer than three
          fields.
          """
          taggedString = self.tagg(cleanString)
          lemmas = []
          for line in taggedString.rstrip().split("\n"):
              fields = line.split()
              if not fields:
                  # Blank or whitespace-only line: nothing to extract.
                  continue
              lemmas.append(fields[2])
          # Cut the sentence markup; this could go away with a better use of
          # lia_tagg.
          return " ".join(lemmas).replace('<s> ', '').replace(' </s>', '')

      def isReady(self):
          """Check that the Tagger can be used (depends on ``LIA_TAGG``).

          Returns ``True`` when the ``LIA_TAGG`` environment variable is set.
          Raises ``KeyError`` otherwise.
          """
          # NOTE(review): this raises instead of returning False; kept as-is
          # because callers outside this file may rely on the KeyError.
          os.environ["LIA_TAGG"]
          return True
5492de487   Killian   ajout du processor
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
  
  class Phoner(baseProcessor):
      """Wrapper around the external LIA phoner scripts.

      The scripts are looked up under the directory named by the
      ``LIA_PHON_REP`` environment variable; every method that launches a
      script raises ``KeyError`` when that variable is not set.
      """

      def clean(self, dirtyString):
          """Pipe ``dirtyString`` through the ``lia_nett`` script.

          The text is encoded to ISO-8859-1 first; characters that cannot be
          encoded are replaced by backslash escapes.

          Returns the raw standard output of the script (not re-encoded).
          """
          isoInput = dirtyString.encode('iso8859-1', 'backslashreplace')
          cleaner = subprocess.Popen(
              [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          # stderr is not piped, so the second item of the result is always None.
          cleanString = cleaner.communicate(input=isoInput)[0]
          return cleanString

      def phon(self, cleanString):
          """Pipe ``cleanString`` through the ``lia_lex2phon`` script.

          ``cleanString`` is sent to the script unchanged.

          Returns the script output re-encoded from ISO-8859 to UTF-8.
          """
          phoner = subprocess.Popen(
              [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
              stdin=subprocess.PIPE,
              stdout=subprocess.PIPE)
          phonString = phoner.communicate(input=cleanString)[0]
          # lia_phon deals with ISO-8859 only, so convert its output back to UTF-8.
          return phonString.decode('iso8859').encode("utf8")

      def isReady(self):
          """Check that the Phoner can be used (depends on ``LIA_PHON_REP``).

          Returns ``True`` when the ``LIA_PHON_REP`` environment variable is
          set. Raises ``KeyError`` otherwise.
          """
          # NOTE(review): this raises instead of returning False; kept as-is
          # because callers outside this file may rely on the KeyError.
          os.environ["LIA_PHON_REP"]
          return True
5492de487   Killian   ajout du processor
51
52
  class StopWord(baseProcessor):
      """Processor that removes French stop words using the NLTK stop list."""

      def isReady(self):
          """Report the processor as usable; no check is performed."""
          return True

      def RemoveStopList(self, rowstring):
          """Drop every stop word from ``rowstring`` and re-join the rest.

          ``rowstring`` is split on whitespace into a set of words, the words
          found in NLTK's French stop list are removed, and the remaining
          words are joined with single spaces. Because a set is used,
          repeated words appear once and the original word order is not
          preserved.
          """
          stopList = set(nltk.corpus.stopwords.words("french"))
          keptWords = set(rowstring.split()) - stopList
          return u" ".join([unicode(word) for word in keptWords])