Blame view

bin/data.py 2.92 KB
ac78b07ea   Mathias Quillot   All base bin file...
1
2
3
4
5
6
7
8
9
10
11
  '''
  This module aims at loading and writing files.
  Our files follow a specific format that
  is not standard. This is why I hope these
  functions make reading the files easier.
  
  For more information about the data, read
  the README file please.
  '''
  
  import sys
0bc4a3e39   Mathias Quillot   New implementatio...
12

ac78b07ea   Mathias Quillot   All base bin file...
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
  def read_file(filepath):
      '''
      Read the file and return an array of pairs where each pair
      is composed of the metas and the features.

      Each line is split on single spaces: the first token is split
      on "," to obtain the metas, the remaining tokens are the
      features. All values are kept as strings.

      filepath: path of the file to read
      return: list of (metas, features) tuples, one per line
      '''
      data = []
      with open(filepath, "r") as f:
          for line in f:
              # Remove the line terminator first so that the last
              # feature does not end with a newline character.
              splited = line.replace("\n", "").split(" ")
              metas = splited[0].split(",")
              features = splited[1:]
              data.append((metas, features))
      return data
0bc4a3e39   Mathias Quillot   New implementatio...
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
  def read_file_skyrim(filepath):
      '''
      Read the file and return an array of pairs where each pair
      is composed of the metas and the features.

      This is for Skyrim files: the first space-separated token of
      each line is split on "." (instead of ",") to obtain the metas.
      The remaining tokens are the features and are left untouched,
      so a feature such as "0.5" is not split. All values are kept
      as strings.

      filepath: path of the file to read
      return: list of (metas, features) tuples, one per line
      '''
      data = []
      with open(filepath, "r") as f:
          for line in f:
              # Remove the line terminator first so that the last
              # feature does not end with a newline character.
              splited = line.replace("\n", "").split(" ")
              metas = splited[0].split(".")
              features = splited[1:]
              data.append((metas, features))
      return data
ac78b07ea   Mathias Quillot   All base bin file...
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
  def index_by(data, num_col):
      '''
      Group the entries of data by the value of one meta column.

      data: iterable of (metas, features) pairs
      num_col: position, inside metas, of the value used as key
      return: dict mapping each key to the list of (metas, features)
              tuples that share it, in the order they were met
      '''
      groups = {}
      for entry in data:
          meta_values, feature_values = entry[0], entry[1]
          key = meta_values[num_col]
          # setdefault creates the bucket the first time a key is seen.
          groups.setdefault(key, []).append((meta_values, feature_values))
      return groups
  
  
  def index_by_id(data):
      '''
      Build a two-level index of the data: first by language,
      then by sentence id.

      The language is read from metas[0] and the sentence id from
      metas[3]. When several entries share the same language and
      id, the last one met wins.

      data: iterable of (metas, features) pairs
      return: dict such that result[lang][id_sen] is the whole entry
      '''
      by_lang = {}
      for entry in data:
          meta_values = entry[0]
          sentence_id = meta_values[3]
          language = meta_values[0]
          # Create the per-language dict on first sight of the language.
          by_lang.setdefault(language, {})[sentence_id] = entry
      return by_lang
0bc4a3e39   Mathias Quillot   New implementatio...
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  def index_by_id_skyrim(data):
      '''
      Build a two-level index of the data: first by language,
      then by sentence id.

      The language is read from metas[0] and the sentence id from
      metas[2]. When several entries share the same language and
      id, the last one met wins.

      data: iterable of (metas, features) pairs
      return: dict such that result[lang][id_sen] is the whole entry
      '''
      by_lang = {}
      for entry in data:
          meta_values = entry[0]
          sentence_id = meta_values[2]
          language = meta_values[0]
          # Create the per-language dict on first sight of the language.
          by_lang.setdefault(language, {})[sentence_id] = entry
      return by_lang
ac78b07ea   Mathias Quillot   All base bin file...
91
92
93
94
95
96
97
98
99
  def write_line(metas, features, f=None):
      '''
      Just print the line. No need to specify a file.

      The metas are joined with "," and the features with " ";
      both parts are separated by a single space.

      metas: meta information, as a list of strings
      features: feature vector, as a list of strings
      f: file to write to. When omitted (or None), the line goes to
         whatever sys.stdout is at call time, so redirecting
         sys.stdout after this module was imported is honoured.
      '''
      # print() falls back to the current sys.stdout when file is None.
      print(",".join(metas) + " " + " ".join(features), file=f)
0bc4a3e39   Mathias Quillot   New implementatio...
100
101
102
103
104
105
106
107
108
109
110
  
  
  def write_line_skyrim(metas, features, f=None):
      '''
      Just print the line. No need to specify a file.

      This is for Skyrim files: the metas are joined with "."
      and the features with " "; both parts are separated by a
      single space.

      metas: meta information, as a list of strings
      features: feature vector, as a list of strings
      f: file to write to. When omitted (or None), the line goes to
         whatever sys.stdout is at call time, so redirecting
         sys.stdout after this module was imported is honoured.
      '''
      # print() falls back to the current sys.stdout when file is None.
      print(".".join(metas) + " " + " ".join(features), file=f)