Blame view

LDA/utils.py 3.43 KB
b6d0165d1   Killian   Initial commit
1
2
3
  # -*- coding: utf-8 -*-
  import nltk
  import re
2af8e57f4   Killian   change all
4
5
6
  import codecs
  import numpy as np
  import sqlite3
b6d0165d1   Killian   Initial commit
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
  # Tokenizer pattern; alternatives are tried left to right:
  #   1. a number with an optional decimal part and an optional percent sign
  #   2. a one- or two-character word followed by an apostrophe
  #   3. the literal token "<unk>"
  #   4. a run of word characters / the listed accented letters
  #   5. any single character that is neither a word character nor whitespace
  pattern =  ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
  # The regexes below are applied by preproc() to glue letter-by-letter
  # spellings back into single tokens. Most of them require surrounding spaces.
  # " r e r b " (optionally " r e r e r b ") -> " rer b "
  rer_b = re.compile(ur" r e r(?: e r)? b ")
  # " r e r c " and the "... c' est" variants -> " rer c "
  rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
  # Remaining spelled-out / partially glued "r e r" variants -> " rer "
  rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
  # " s n c f " -> " sncf "
  sncf = re.compile(ur" s n c f ")
  # " jusqu ' " -> " jusqu' " (re-attach the apostrophe)
  jusq = re.compile(ur" jusqu ' ")
  # " r a t " or " r a t p " -> " ratp "
  ratp = re.compile(ur" r a t(?: p)? ")
  # " quelqu ' " -> " quelqu' " (re-attach the apostrophe)
  quel = re.compile(ur" quelqu ' ")
  # One or more consecutive spaces, collapsed to a single space by preproc()
  space = re.compile(ur" +")
  # Regexp tokenizer built from `pattern`, with Unicode-aware character classes
  tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
  # NOTE(review): the pattern below is presumably an earlier version of
  # `pattern`, kept for reference - confirm before deleting.
  # (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]
  
  def preproc(line):
      """Normalise one line of text before tokenization.

      Collapses runs of spaces, rewrites the spelled-out letter sequences
      matched by the module-level regexes into single tokens, re-attaches the
      apostrophe in "jusqu '" / "quelqu '", collapses spaces once more and
      returns the result lower-cased.
      """
      # Order matters: the specific "rer b" / "rer c" rewrites must run before
      # the generic "rer" one, and spaces are collapsed first and last.
      rewrites = (
          (space, u" "),
          (rer_b, u" rer b "),
          (rer_c, u" rer c "),
          (rer, u" rer "),
          (sncf, u" sncf "),
          (ratp, u" ratp "),
          (jusq, u" jusqu' "),
          (quel, u" quelqu' "),
          (space, u" "),
      )
      for regex, replacement in rewrites:
          line = regex.sub(replacement, line)
      return line.lower()
  
  def yield_corpus(df_list):
      """Yield the token list of every document in a list of corpora.

      Each element of ``df_list`` must provide ``iterrows()`` yielding
      ``(index, row)`` pairs; the text is read from ``row[2]`` as UTF-8 bytes,
      normalised with ``preproc`` and tokenized with ``tok2``.

      If decoding, preprocessing or tokenization fails, the raw document is
      printed and the exception is re-raised.
      """
      for corpus in df_list:
          for _, doc in corpus.iterrows():
              try:
                  tokens = tok2.tokenize(preproc(doc[2].decode("utf-8")))
              except Exception:
                  # Show the offending document, then let the error propagate.
                  print(doc[2])
                  raise
              # Yield outside the try block: closing the generator early
              # (GeneratorExit) must not be reported as a failing document.
              yield tokens
  def select(elm):
      """Return the integer found after the last underscore of ``elm``.

      When ``elm`` contains no underscore the whole string is converted.
      """
      _, _, suffix = elm.rpartition("_")
      return int(suffix)
7db73861f   Killian   add vae et mmf
45
46
47
48
  
  
  def select_mmf(elm):
      """Return the integer found before the first underscore of ``elm``.

      When ``elm`` contains no underscore the whole string is converted.
      """
      prefix, _, _ = elm.partition("_")
      return int(prefix)
2af8e57f4   Killian   change all
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
  
  def get_score(table):
      """Print and return summary scores from a train/dev/test score table.

      ``table[0]``, ``table[1]`` and ``table[2]`` are the train, dev and test
      score sequences (presumably one value per epoch - confirm with callers).

      Returns the tuple ``(mx_train, mx_dev, mx_test, best_test, argmx_dev)``:
      the maximum train score, the maximum dev score, the maximum test score,
      the test score at the position of the best dev score, and that position.
      """
      mx_train = np.max(table[0])
      # Position of the best dev score; used to pick the matching test score.
      argmx_dev = np.argmax(table[1])
      mx_dev = table[1][argmx_dev]
      best_test = table[2][argmx_dev]
      mx_test = np.max(table[2])
      # Single-argument print(...) behaves the same on Python 2 and Python 3.
      print("""\tmax train : {}
      \tmax dev : {}
      \tmax test : {} - best test : {}
      \t best epochs : {}""".format(mx_train,mx_dev,mx_test,best_test,argmx_dev))
      return mx_train,mx_dev,mx_test,best_test,argmx_dev
  class WeightedWordsList :
      """Word weights loaded from a UTF-8 text file of ``word:weight`` lines."""

      @staticmethod
      def get_key(wtuple):
          """Return the second element of ``wtuple`` (the weight of a ``(word, weight)`` pair)."""
          return wtuple[1]

      @staticmethod
      def get_okey(wtuple):
          """Return the weight of an ``(index, (word, weight))`` item produced by ``enumerate``."""
          return wtuple[1][1]

      def __init__(self,file_path):
          """Load the weights stored in ``file_path``.

          Each non-blank line must be ``word:weight`` with a float weight.
          The line is split on its last colon, so the word may itself contain
          colons. Raises ``ValueError`` for a line without a colon or with a
          weight that is not a float.
          """
          # The context manager closes the file even if reading fails.
          with codecs.open(file_path,"r","utf8") as handle:
              raw_lines = handle.readlines()
          # self.wlist: list of (unicode word, float weight), in file order.
          self.wlist = []
          for raw in raw_lines:
              entry = raw.strip()
              if not entry:
                  # Tolerate blank lines instead of failing on them.
                  continue
              word, weight = entry.rsplit(":", 1)
              self.wlist.append((word, float(weight)))
          # self.wdict: UTF-8 encoded word -> weight, for lookups.
          self.wdict = {}
          for word, weight in self.wlist:
              self.wdict[word.encode("utf8")] = weight

      def select_best(self,word_list,lenght=5):
          """Return the highest-weighted known words of ``word_list``.

          Words are UTF-8 encoded before lookup and unknown words are ignored.
          About ``lenght`` words are kept: a word whose weight ties with the
          current minimum is kept as an extra entry, and such extras are
          dropped again once a strictly better word arrives. The result holds
          the encoded words in the order they were retained. A non-positive
          ``lenght`` yields an empty list.
          """
          if lenght <= 0:
              # No room for any word; also avoids min() on an empty list.
              return []
          scored_word = []
          for w in word_list:
              w = w.encode("utf8")
              if w not in self.wdict :
                  continue

              w_curr = (w, self.wdict[w])
              if len(scored_word) < lenght:
                  scored_word.append(w_curr)
                  continue

              # (index, (word, weight)) of the weakest retained word.
              w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey)
              if w_min[1][1] < w_curr[1]:
                  # Replace the weakest word by the current one ...
                  del scored_word[w_min[0]]
                  scored_word.append(w_curr)
                  w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey)
                  # ... then trim extras left over from earlier ties.
                  while len(scored_word) > lenght and w_min[1][1] < w_curr[1] :
                      del scored_word[w_min[0]]
                      w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey)
              elif w_min[1][1] == w_curr[1]:
                  # Tie with the weakest word: keep both.
                  scored_word.append(w_curr)
          return [ w[0] for w in scored_word ]