Blame view

W2V/utils.py 1.44 KB
b6d0165d1   Killian   Initial commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
  # -*- coding: utf-8 -*-
  import nltk
  import re
  # NOTE: the ur"..." (unicode + raw) string prefix used below is Python 2 only
  # syntax; these literals do not parse under Python 3.

  # Token pattern handed to the NLTK tokenizer (tok2) below. Alternatives, in
  # the order they are tried:
  #   1. digits with an optional decimal part, then optional whitespace and
  #      an optional "%"
  #   2. one or two word characters followed by an apostrophe (e.g. l', qu')
  #   3. the literal placeholder <unk>
  #   4. a run of word characters and/or the listed accented letters
  #   5. any single character that is neither a word character nor whitespace
  pattern =  ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
  # " r e r b " or " r e r e r b " (letters separated by single spaces);
  # preproc() rewrites a match to " rer b ".
  rer_b = re.compile(ur" r e r(?: e r)? b ")
  # Same for line "c", plus the "r e r c' est " / " rer c' est" forms;
  # preproc() rewrites every match (including the "c' est" forms) to " rer c ".
  # Only the first alternative requires a leading space.
  rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
  # Remaining spelled-out or partially merged variants of "rer"; preproc()
  # rewrites a match to " rer ". Several alternatives overlap, and " r e rer "
  # is listed twice.
  rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
  # " s n c f " spelled letter by letter; rewritten to " sncf ".
  sncf = re.compile(ur" s n c f ")
  # "jusqu" detached from its apostrophe; rewritten to " jusqu' ".
  jusq = re.compile(ur" jusqu ' ")
  # " r a t " or " r a t p " spelled letter by letter; rewritten to " ratp ".
  ratp = re.compile(ur" r a t(?: p)? ")
  # "quelqu" detached from its apostrophe; rewritten to " quelqu' ".
  quel = re.compile(ur" quelqu ' ")
  # One or more consecutive space characters (U+0020 only, not tabs/newlines).
  space = re.compile(ur" +")
  # Tokenizer built from `pattern`, compiled with re.UNICODE.
  tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
  # Presumably an earlier, simpler version of the token pattern, kept for
  # reference:
  # (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]
  
  def preproc(line):
      """Normalise spacing, spelled-out acronyms and detached elisions in *line*.

      The substitutions are applied in a fixed order: runs of spaces are
      collapsed first, then the "rer b" / "rer c" / "rer" / "sncf" / "ratp"
      spellings and the "jusqu '" / "quelqu '" elisions are rewritten, and
      spaces are collapsed once more.  The result is returned lowercased.

      :param line: unicode text of one document.
      :return: the normalised, lowercased text.
      """
      # Order matters: the more specific "rer b" / "rer c" rules must run
      # before the generic "rer" rule, which would otherwise consume them.
      rewrites = (
          (space, u" "),
          (rer_b, u" rer b "),
          (rer_c, u" rer c "),
          (rer, u" rer "),
          (sncf, u" sncf "),
          (ratp, u" ratp "),
          (jusq, u" jusqu' "),
          (quel, u" quelqu' "),
          (space, u" "),
      )
      for regex, replacement in rewrites:
          line = regex.sub(replacement, line)
      # NOTE(review): lowercasing happens after the (case-sensitive, lowercase)
      # patterns have run, so upper-case spellings are not rewritten -- confirm
      # this is intended.
      return line.lower()
  
  def yield_corpus(df_list):
      """Yield one token list per row of every frame in *df_list*.

      For each row, the value at ``doc[2]`` is decoded from UTF-8, normalised
      with ``preproc`` and tokenized with ``tok2``.

      :param df_list: iterable of objects exposing ``iterrows()`` (presumably
          pandas DataFrames -- confirm against the caller); ``doc[2]`` must be
          a UTF-8 encoded byte string.
      :return: generator of token lists, one per row, in iteration order.
      :raises Exception: any error raised while decoding, preprocessing or
          tokenizing a row is re-raised after the offending raw value has
          been printed.
      """
      for corpus in df_list:
          for _index, doc in corpus.iterrows():
              try:
                  tokens = tok2.tokenize(preproc(doc[2].decode("utf-8")))
              except Exception:
                  # Show the raw value that failed before propagating the error.
                  # print(x) with a single argument behaves the same as the
                  # Python 2 print statement.
                  print(doc[2])
                  raise
              # The yield is deliberately outside the try block: closing the
              # generator (GeneratorExit) or an exception thrown in by the
              # consumer must not be reported as a failure of this document.
              yield tokens
  def select(elm):
      """Return the integer that follows the last underscore in *elm*.

      If *elm* contains no underscore, the whole string is converted.

      :param elm: string such as ``"name_12"``.
      :return: the trailing field as an ``int``.
      :raises ValueError: if the trailing field is not a valid integer.
      """
      _head, _sep, suffix = elm.rpartition("_")
      return int(suffix)