Ajout de phon, il ne manque plus que le SOAP

Killian
1 parent e0e4926982
Showing 3 changed files with 27 additions and 9 deletions Side-by-side Diff
README.md
processor/LiaTools.py
webtagger.py
@@ -19,9 +19,10 @@
 ## Requirements
  
 * [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
+* [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
 * Python > 2.5
 * Flask
-
+* nltk
 ## Advice
  
 * Virtualenv
@@ -3,6 +3,7 @@
 from BaseProcessor import baseProcessor
 import nltk
 import re
+
 class Tagger(baseProcessor):
     """ a calling to lia_tagg class"""
     def clean(self,dirtyString):
  
@@ -16,11 +17,15 @@
     # This is used because lia_tagg deals with iso8859 only
     	return taggedString.decode('iso8859').encode("utf8")
     def lemm(self,cleanString):
- 	taggedString = self.taff(cleanString)       
-        return re.sub(r'<s> ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")]))
+        print " cleannnnn " + cleanString
+ 	taggedString = self.tagg(cleanString)       
+        print "taggs full " + taggedString
+	sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x])))
+        print " subbbbb" + sub 
+        return sub
     def isReady(self):
         os.environ["LIA_TAGG"]
-        return true
+        return True
  
 class Phoner(baseProcessor):
     """ a class which call the lia phoner """
  
  
@@ -36,11 +41,12 @@
     	return taggedString.decode('iso8859').encode("utf8")
     def isReady(self):
 	os.environ["LIA_PHON_REP"]
-        return true
+        return True
+
 class StopWord(baseProcessor):
     def isReady(self):
-        return true
+        return True
     def RemoveStopList(self,rowstring):
         """ Remove from set of word (splited String ) each words in the stoplist and join all of the other in a string """
-        return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french"))))
+        return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french"))))
@@ -13,12 +13,23 @@
 @app.route("/tagger",methods=['POST'])
 def cleaner():
     tagger = Tagger()
+    tagger.isReady()
+    phoner = Phoner()
+    phoner.isReady()
+    stoplist = StopWord()
+    stoplist.isReady()
     # Receive the raw text string from the POST parameter (JSON)
     dirtyString= request.json[u'string']
     # Send the string through the LIA_TAGG script thanks to a pipe
     # lia_clean puts one word per line and marks up the sentences
-    cleanString= tagger.clean(dirtyString)
-    taggedString= tagger.tagg(cleanString)
+    dirtyString = stoplist.RemoveStopList(dirtyString)
+    print " stop list " + dirtyString
+    lemm = tagger.lemm(tagger.clean(dirtyString))
+    print 'les lemm '+ lemm
+    dirtyString = dirtyString+" "+ lemm
+    cleanString= phoner.clean(dirtyString)
+    taggedString= phoner.phon(cleanString)
+    print taggedString
     return taggedString
 if __name__ == '__main__':
     app.debug = True
...	...	@@ -19,9 +19,10 @@
19	19	## Requirements
20	20
21	21	* [LIA\_TAGGER](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
	22	+* [LIA\_PHON](http://pageperso.lif.univ-mrs.fr/~frederic.bechet/download.html)
22	23	* Python > 2.5
23	24	* Flask
24		-
	25	+* nltk
25	26	## Advice
26	27
27	28	* Virtualenv
...	...	@@ -3,6 +3,7 @@
3	3	from BaseProcessor import baseProcessor
4	4	import nltk
5	5	import re
	6	+
6	7	class Tagger(baseProcessor):
7	8	""" a calling to lia_tagg class"""
8	9	def clean(self,dirtyString):
9	10
...	...	@@ -16,11 +17,15 @@
16	17	# This is used because lia_tagg deals with iso8859 only
17	18	return taggedString.decode('iso8859').encode("utf8")
18	19	def lemm(self,cleanString):
19		- taggedString = self.taff(cleanString)
20		- return re.sub(r'<s> ',''," ".join([ x.split().pop(2) for x in taggedString.rstrip().split("\n")]))
	20	+ print " cleannnnn " + cleanString
	21	+ taggedString = self.tagg(cleanString)
	22	+ print "taggs full " + taggedString
	23	+ sub = re.sub(r' </s>','',re.sub(r'<s> ',''," ".join([ x.rstrip().split().pop(2) for x in taggedString.rstrip().split("\n") if x])))
	24	+ print " subbbbb" + sub
	25	+ return sub
21	26	def isReady(self):
22	27	os.environ["LIA_TAGG"]
23		- return true
	28	+ return True
24	29
25	30	class Phoner(baseProcessor):
26	31	""" a class which call the lia phoner """
27	32
28	33
...	...	@@ -36,11 +41,12 @@
36	41	return taggedString.decode('iso8859').encode("utf8")
37	42	def isReady(self):
38	43	os.environ["LIA_PHON_REP"]
39		- return true
	44	+ return True
	45	+
40	46	class StopWord(baseProcessor):
41	47	def isReady(self):
42		- return true
	48	+ return True
43	49	def RemoveStopList(self,rowstring):
44	50	""" Remove from set of word (splited String ) each words in the stoplist and join all of the other in a string """
45		- return u" ".join(unicode(value) for value in list(set(test.split()) - set(nltk.corpus.stopwords.words("french"))))
	51	+ return u" ".join(unicode(value) for value in list(set(rowstring.split()) - set(nltk.corpus.stopwords.words("french"))))
...	...	@@ -13,12 +13,23 @@
13	13	@app.route("/tagger",methods=['POST'])
14	14	def cleaner():
15	15	tagger = Tagger()
	16	+ tagger.isReady()
	17	+ phoner = Phoner()
	18	+ phoner.isReady()
	19	+ stoplist = StopWord()
	20	+ stoplist.isReady()
16	21	# Receive the raw text string from the POST parameter (JSON)
17	22	dirtyString= request.json[u'string']
18	23	# Send the string through the LIA_TAGG script thanks to a pipe
19	24	# lia_clean puts one word per line and marks up the sentences
20		- cleanString= tagger.clean(dirtyString)
21		- taggedString= tagger.tagg(cleanString)
	25	+ dirtyString = stoplist.RemoveStopList(dirtyString)
	26	+ print " stop list " + dirtyString
	27	+ lemm = tagger.lemm(tagger.clean(dirtyString))
	28	+ print 'les lemm '+ lemm
	29	+ dirtyString = dirtyString+" "+ lemm
	30	+ cleanString= phoner.clean(dirtyString)
	31	+ taggedString= phoner.phon(cleanString)
	32	+ print taggedString
22	33	return taggedString
23	34	if __name__ == '__main__':
24	35	app.debug = True