Killian / liaWebServices

Blame view

processor/LiaTools.py 2.64 KB

5492de487 Killian ajout du processor	1 2 3 4 5	import subprocess import os from BaseProcessor import baseProcessor import nltk import re
673721ec0 Killian ajout phon il ma...	6
class Tagger(baseProcessor):
    """Wrapper around the external LIA_TAGG command-line scripts.

    The scripts are looked up under the directory named by the ``LIA_TAGG``
    environment variable and are fed through stdin / read through stdout.
    """

    def clean(self, dirtyString):
        """Prepare a string for lia_tagg by piping it through ``lia_clean``.

        The text is converted to ISO-8859-1 before being sent; the script is
        meant to put one word per line and delimit sentences with
        ``<s>`` / ``</s>``.

        :param dirtyString: text to clean (must support ``.encode``).
        :returns: the raw stdout of ``lia_clean``, unmodified.
        :raises KeyError: if ``LIA_TAGG`` is not set in the environment.
        """
        process = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_clean'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        # Characters that have no ISO-8859-1 equivalent are sent as
        # backslash escapes instead of raising an encoding error.
        cleanString, _ = process.communicate(
            input=dirtyString.encode('iso8859-1', 'backslashreplace'))
        return cleanString

    def tagg(self, cleanString):
        """POS-tag and lemmatize a string produced by :meth:`clean`.

        :param cleanString: output of :meth:`clean`, passed as-is to the
            stdin of ``lia_tagg+lemm -guess``.
        :returns: the script output re-encoded from ISO-8859 to UTF-8.
        :raises KeyError: if ``LIA_TAGG`` is not set in the environment.
        """
        process = subprocess.Popen(
            [os.environ["LIA_TAGG"] + '/script/lia_tagg+lemm', '-guess'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE)
        taggedString, _ = process.communicate(input=cleanString)
        # lia_tagg only deals with ISO-8859, so convert its output to UTF-8.
        return taggedString.decode('iso8859').encode("utf8")

    def lemm(self, cleanString):
        """Use the POS tagger to lemmatize and return the lemmas only.

        The tagger output is read line by line and the third
        whitespace-separated field of each line is kept (presumably
        ``word tag lemma`` -- confirm against the lia_tagg output format).
        Lines with fewer than three fields, including blank lines, are
        skipped.

        :param cleanString: output of :meth:`clean`.
        :returns: the lemmas joined with single spaces, with the ``<s> ``
            and `` </s>`` sentence markers removed.
        """
        taggedString = self.tagg(cleanString)
        lemmas = []
        # Split on line boundaries: splitting on spaces would yield single
        # tokens, none of which could ever provide a third field.
        for line in taggedString.splitlines():
            fields = line.split()
            if len(fields) > 2:
                lemmas.append(fields[2])
        # Strip the sentence markup; could be removed with a better use of
        # lia_tagg.
        withoutOpening = re.sub(r'<s> ', '', " ".join(lemmas))
        return re.sub(r' </s>', '', withoutOpening)

    def isReady(self):
        """Check that the Tagger can be used (depends on ``LIA_TAGG``).

        :returns: ``True`` when the ``LIA_TAGG`` variable is set.
        :raises KeyError: when ``LIA_TAGG`` is not set; this method never
            returns ``False``.
        """
        os.environ["LIA_TAGG"]
        return True
class Phoner(baseProcessor):
    """Front end for the LIA phonetizer scripts.

    Script locations are resolved from the ``LIA_PHON_REP`` environment
    variable each time a method is called.
    """

    def clean(self, dirtyString):
        """Pipe ``dirtyString`` through ``lia_nett`` and return its stdout.

        The input is encoded to ISO-8859-1 first; characters that cannot be
        encoded are replaced by backslash escapes. Raises ``KeyError`` when
        ``LIA_PHON_REP`` is not set.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_nett'
        child = subprocess.Popen(
            [script], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        payload = dirtyString.encode('iso8859-1', 'backslashreplace')
        return child.communicate(input=payload)[0]

    def phon(self, cleanString):
        """Pipe ``cleanString`` through ``lia_lex2phon``.

        Returns the script output converted from ISO-8859 to UTF-8. Raises
        ``KeyError`` when ``LIA_PHON_REP`` is not set.
        """
        script = os.environ["LIA_PHON_REP"] + '/script/lia_lex2phon'
        child = subprocess.Popen(
            [script], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        rawOutput = child.communicate(input=cleanString)[0]
        # lia_phon only deals with ISO-8859, so the output is converted
        # back to UTF-8 before being returned.
        decoded = rawOutput.decode('iso8859')
        return decoded.encode("utf8")

    def isReady(self):
        """Return ``True`` if ``LIA_PHON_REP`` is set, else raise ``KeyError``."""
        # The lookup is only here for its KeyError on a missing variable.
        os.environ["LIA_PHON_REP"]
        return True
class StopWord(baseProcessor):
    """Processor that strips French stop words from a string."""

    def isReady(self):
        """Report that the processor is usable; always returns ``True``."""
        return True

    def RemoveStopList(self, rowstring):
        """Remove the words of the NLTK French stop list from a string.

        The string is split on whitespace; every word found in
        ``nltk.corpus.stopwords.words("french")`` is dropped and each
        remaining word is kept once, in order of first appearance.

        :param rowstring: text to filter. A byte string is decoded as UTF-8
            (the encoding the other processors of this module emit) so that
            accented words do not fail the conversion to unicode.
        :returns: a unicode string of the kept words joined by single spaces.
        :raises UnicodeDecodeError: if a byte string is not valid UTF-8.
        """
        if isinstance(rowstring, bytes):
            rowstring = rowstring.decode("utf8")
        stopWords = set(nltk.corpus.stopwords.words("french"))
        kept = []
        seen = set()
        for word in rowstring.split():
            # Duplicates are dropped, as the former set-based version did,
            # but the output order is now deterministic.
            if word in stopWords or word in seen:
                continue
            seen.add(word)
            kept.append(word)
        return u" ".join(kept)