LiaTools.py
2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import subprocess
import os
from BaseProcessor import baseProcessor
import nltk
import re
class Tagger(baseProcessor):
    """Wrapper around the external lia_tagg scripts.

    Every script is located through the ``LIA_TAGG`` environment variable,
    which must hold the installation directory of the tool.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through the ``lia_clean`` script.

        The text is encoded as ISO-8859-1 before being sent; characters that
        cannot be represented are written as backslash escapes.  Per the
        original author's note, the script is expected to emit one word per
        line and to delimit sentences with ``<s>`` / ``</s>``.

        Returns the raw standard output of the script.
        Raises ``KeyError`` when ``LIA_TAGG`` is not set.
        """
        script = os.environ["LIA_TAGG"] + '/script/lia_clean'
        cleaner = subprocess.Popen(
            [script],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        latin1_input = dirtyString.encode('iso8859-1', 'backslashreplace')
        output, _ = cleaner.communicate(input=latin1_input)
        return output

    def tagg(self, cleanString):
        """Pipe the output of ``clean`` through ``lia_tagg+lemm -guess``.

        Returns the script output re-encoded from ISO-8859 to UTF-8.
        Raises ``KeyError`` when ``LIA_TAGG`` is not set.
        """
        command = [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess']
        tagger = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        raw_output = tagger.communicate(input=cleanString)[0]
        # lia_tagg only deals with ISO-8859, so convert its output to UTF-8.
        return raw_output.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Run the tagger and return only the lemmas as one string.

        The third whitespace-separated field of every non-empty output line
        is kept (taken to be the lemma), the fields are joined with single
        spaces, and the ``<s> `` / `` </s>`` sentence markers are removed.

        Raises ``IndexError`` if a non-empty line has fewer than three fields.
        """
        tagged = self.tagg(cleanString)
        # Strips the markup and the original word by hand; could be dropped
        # with a better use of lia_tagg.
        lemmas = []
        for line in tagged.rstrip().split("\n"):
            if not line:
                continue
            fields = line.rstrip().split()
            lemmas.append(fields[2])
        joined = " ".join(lemmas)
        without_opening = re.sub(r'<s> ', '', joined)
        return re.sub(r' </s>', '', without_opening)

    def isReady(self):
        """Return True when the ``LIA_TAGG`` environment variable is set.

        Raises ``KeyError`` (rather than returning False) when it is missing.
        """
        # Looked up only for the KeyError it raises when the variable is unset.
        _ = os.environ["LIA_TAGG"]
        return True
class Phoner(baseProcessor):
    """Wrapper around the external lia_phon scripts.

    Every script is located through the ``LIA_PHON_REP`` environment
    variable, which must hold the installation directory of the tool.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through the ``lia_nett`` script.

        The text is encoded as ISO-8859-1 before being sent; characters that
        cannot be represented are written as backslash escapes.

        Returns the raw standard output of the script.
        Raises ``KeyError`` when ``LIA_PHON_REP`` is not set.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_nett'
        cleaner = subprocess.Popen(
            [script],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        latin1_input = dirtyString.encode('iso8859-1', 'backslashreplace')
        output, _ = cleaner.communicate(input=latin1_input)
        return output

    def phon(self, cleanString):
        """Pipe the output of ``clean`` through the ``lia_lex2phon`` script.

        Returns the script output re-encoded from ISO-8859 to UTF-8.
        Raises ``KeyError`` when ``LIA_PHON_REP`` is not set.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'
        phonetizer = subprocess.Popen(
            [script],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        raw_output = phonetizer.communicate(input=cleanString)[0]
        # lia_phon only deals with ISO-8859, so convert its output back
        # to UTF-8 before returning it.
        return raw_output.decode('iso8859').encode("utf8")

    def isReady(self):
        """Return True when the ``LIA_PHON_REP`` environment variable is set.

        Raises ``KeyError`` (rather than returning False) when it is missing.
        """
        # Looked up only for the KeyError it raises when the variable is unset.
        _ = os.environ["LIA_PHON_REP"]
        return True
class StopWord(baseProcessor):
    """Processor that filters French stop words out of a string."""

    def isReady(self):
        """Always report the processor as usable."""
        return True

    def RemoveStopList(self, rowstring):
        """Return the words of ``rowstring`` that are not French stop words.

        The input is split on whitespace and turned into a set, so duplicate
        words collapse and the original word order is not preserved.  The
        NLTK French stop-word list is subtracted from that set and the
        remaining words are joined with single spaces into a unicode string.
        """
        words = set(rowstring.split())
        stoplist = set(nltk.corpus.stopwords.words("french"))
        kept = list(words - stoplist)
        return u" ".join([unicode(word) for word in kept])