Quillot Mathias / Clustering

Browse Code »

Commit 0bc4a3e394eb5c9587896e91379a61bb340f28d2

Authored by Mathias Quillot 2019-09-11 22:18:47 +0200

1 parent a7fec48aaf

Exists in master

New implementation of data functions for skyrim

Showing 1 changed file with 48 additions and 0 deletions Inline Diff

bin/data.py

bin/data.py

Diff comments View file @ 0bc4a3e

 '''
 This module aims at loading and writing files.
 Our files follow a specific format that
 is not standard. This is why I hope these
 functions make reading the files easier.
 For more information about the data, please
 read the README file.
 '''
 import sys
 def read_file(filepath):
     '''
     Read the file and return a list of (metas, features) pairs.
     Each non-blank line holds space-separated fields: the first field
     is the comma-separated metas and the remaining fields are the
     features. All values are kept as strings.
     Blank lines (for instance a trailing empty line) are skipped
     instead of producing a record with empty metas.
     filepath: path of the file to read
     '''
     data = []
     with open(filepath, "r") as f:
         for line in f:
             row = line.rstrip("\n")
             if not row.strip():
                 # Nothing to parse: an empty record would break indexing.
                 continue
             fields = row.split(" ")
             data.append((fields[0].split(","), fields[1:]))
     return data
+def read_file_skyrim(filepath):
+    '''
+    Read a Skyrim file and return a list of (metas, features) pairs.
+    Each line is split on single spaces: the first field is split
+    on "." to give the metas list, and the remaining fields form
+    the features list. All values are kept as strings.
+    filepath: path of the file to read
+    '''
+    data = []
+    with open(filepath, "r") as f:
+        for line in f:
+            # Drop the newline, then split on single spaces.
+            splited = line.replace("\n", "").split(" ")
+            # In Skyrim files the metas are separated by dots.
+            metas = splited[0].split(".")
+            features = splited[1:]
+            data.append((metas, features))
+    return data
 def index_by(data, num_col):
     '''
     Group records by the value found in one column of their metas.
     data: iterable of (metas, features) records
     num_col: position, inside metas, of the value used as key
     Returns a dict mapping each key to the list of (metas, features)
     pairs that carry it, in their order of appearance.
     '''
     groups = {}
     for record in data:
         metas, features = record[0], record[1]
         groups.setdefault(metas[num_col], []).append((metas, features))
     return groups
 def index_by_id(data):
     '''
     Build a two-level index of the records: language first,
     sentence id second.
     Records have two keys, the language (metas[0]) and the id of
     the sentence (metas[3]), hence the nested dictionaries.
     data: iterable of records whose first element is the metas list
     Returns a dict such that result[lang][id_sen] is the whole record;
     a later record replaces an earlier one with the same two keys.
     '''
     by_lang = {}
     for record in data:
         keys = record[0]
         sentence_id, language = keys[3], keys[0]
         by_lang.setdefault(language, {})[sentence_id] = record
     return by_lang
+def index_by_id_skyrim(data):
+    '''
+    Index Skyrim data by language, then by sentence id.
+    The index has two levels because the data have two keys:
+    one is the language (metas[0]) and the other one is the
+    id of the sentence (metas[2]).
+    data: iterable of records whose first element is the metas list
+    Returns a dict of dicts: indexed[lang][id_sen] is the whole record.
+    If several records share the same language and id, the last one wins.
+    '''
+    indexed = {}
+    for line in data:
+        metas = line[0]
+        id_sen = metas[2]
+        lang = metas[0]
+        if lang not in indexed:
+            indexed[lang] = {}
+        indexed[lang][id_sen] = line
+    return indexed
 def write_line(metas, features, f=None):
     '''
     Print one record as a line: metas joined with "," followed by a
     space and the features joined with spaces.
     metas: list of meta information strings
     features: feature vector, as a list of strings
     f: file to write to. When omitted (None), the line goes to
        whatever sys.stdout is at call time, so redirecting stdout
        after this module is imported is honoured.
     '''
     # print() resolves file=None to the current sys.stdout itself.
     print(",".join(metas) + " " + " ".join(features), file=f)
+def write_line_skyrim(metas, features, f=sys.stdout):
+    '''
+    Print one record as a line: metas joined with "." followed by a
+    space and the features joined with spaces.
+    metas: list of meta information strings
+    features: feature vector, as a list of strings
+    f: file to write to; defaults to the sys.stdout object that was
+       current when this function was defined
+    '''
+    print(".".join(metas) + " " + " ".join(features), file=f)