LiaTools.py 2.64 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56


import subprocess
import os
from BaseProcessor import baseProcessor
import nltk
import re

class Tagger(baseProcessor):
    """Front end to the LIA_TAGG command-line scripts.

    Every script is looked up under the directory named by the ``LIA_TAGG``
    environment variable.
    """

    def clean(self, dirtyString):
        """Clean a string so that it can be fed to ``tagg``.

        The text is encoded to ISO-8859-1 (characters that cannot be encoded
        are written as backslash escapes) and piped through
        ``$LIA_TAGG/script/lia_clean``.  According to the original author the
        script puts one word per line and delimits sentences with
        ``<s>`` / ``</s>`` markers.

        Returns the standard output of the script, unmodified.

        Raises KeyError if ``LIA_TAGG`` is not set.
        """
        cleaner = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so the second item returned by communicate()
        # is always None and is simply discarded.
        cleanString, _ = cleaner.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def tagg(self, cleanString):
        """POS-tag and lemmatise a string produced by ``clean``.

        The input is piped through ``$LIA_TAGG/script/lia_tagg+lemm -guess``.

        Returns the script output re-encoded from ISO-8859 to UTF-8.

        Raises KeyError if ``LIA_TAGG`` is not set.
        """
        tagger = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        taggedString, _ = tagger.communicate(input=cleanString)
        # lia_tagg deals with ISO-8859 only, so its output is converted back
        # to UTF-8 before being handed to the caller.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Run the POS tagger and return only the lemmas, space separated.

        The third whitespace-separated field of every non-empty line of the
        tagger output is kept (presumably the lemma column -- the original
        author describes this as cutting the markup and the original word),
        then the ``<s> `` and `` </s>`` sentence markers are stripped.

        Raises IndexError if a non-empty output line has fewer than three
        fields.
        """
        taggedString = self.tagg(cleanString)
        # NOTE: this post-processing could be removed with a better use of
        # lia_tagg itself.
        lemmas = [
            line.rstrip().split()[2]
            for line in taggedString.rstrip().split("\n")
            if line
        ]
        # Both markers are literal text, so plain replace() is equivalent to
        # the regex substitutions used before.
        return " ".join(lemmas).replace('<s> ', '').replace(' </s>', '')

    def isReady(self):
        """Check if the Tagger can be used (depends on ``LIA_TAGG``).

        Returns True when the ``LIA_TAGG`` environment variable is set and
        raises KeyError when it is not; it never returns False.
        """
        # The lookup is done only for the KeyError it raises when the
        # variable is missing.
        os.environ["LIA_TAGG"]
        return True

class Phoner(baseProcessor):
    """Front end to the LIA phonetiser command-line scripts.

    Every script is looked up under the directory named by the
    ``LIA_PHON_REP`` environment variable.
    """

    def clean(self, dirtyString):
        """Pipe a string through ``$LIA_PHON_REP/script/lia_nett``.

        The text is first encoded to ISO-8859-1; characters that cannot be
        encoded are written as backslash escapes.

        Returns the standard output of the script, unmodified.

        Raises KeyError if ``LIA_PHON_REP`` is not set.
        """
        cleaner = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_nett'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # stderr is not piped, so the second item returned by communicate()
        # is always None and is simply discarded.
        cleanString, _ = cleaner.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def phon(self, cleanString):
        """Pipe a cleaned string through ``$LIA_PHON_REP/script/lia_lex2phon``.

        Returns the script output re-encoded from ISO-8859 to UTF-8.

        Raises KeyError if ``LIA_PHON_REP`` is not set.
        """
        phoner = subprocess.Popen(
            [os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        phonString, _ = phoner.communicate(input=cleanString)
        # lia_phon deals with ISO-8859 only, so its output is converted back
        # to UTF-8 before being handed to the caller.
        return phonString.decode('iso8859').encode("utf8")

    def isReady(self):
        """Check if the Phoner can be used (depends on ``LIA_PHON_REP``).

        Returns True when the ``LIA_PHON_REP`` environment variable is set
        and raises KeyError when it is not; it never returns False.
        """
        # The lookup is done only for the KeyError it raises when the
        # variable is missing.
        os.environ["LIA_PHON_REP"]
        return True

class StopWord(baseProcessor):
    """Processor that removes French stop words using the NLTK stop list."""

    def isReady(self):
        """Always report the processor as usable."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the distinct words of ``rowstring`` that are not stop words.

        ``rowstring`` is split on whitespace; every word found in NLTK's
        French stop list is dropped and each remaining word is kept once.
        The result is the surviving words joined with single spaces, as a
        unicode string.

        The words are emitted in order of first appearance.  The previous
        implementation joined the result of a set difference, so the order
        of the output was arbitrary; the set of words returned is unchanged.
        """
        stop_words = set(nltk.corpus.stopwords.words("french"))
        already_kept = set()
        kept = []
        for word in rowstring.split():
            if word in stop_words or word in already_kept:
                continue
            already_kept.add(word)
            kept.append(unicode(word))
        return u" ".join(kept)