Blame view
bin/data.py
2.92 KB
ac78b07ea
|
1 2 3 4 5 6 7 8 9 10 11 |
''' This module aims at loading and writing files. Our files follow a specific, non-standard format, which is why I hope these functions make reading them easier. For more information about the data, please read the README file. ''' import sys |
0bc4a3e39
|
12 |
|
ac78b07ea
|
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
def read_file(filepath):
    '''
    Read a data file and return its lines as (metas, features) pairs.

    Each line is split on single spaces: the first field holds the
    comma-separated metas, every following field is one feature.

    filepath: path of the file to read
    return: list of (metas, features) tuples, where metas and features
            are both lists of strings
    '''
    data = []
    with open(filepath, "r") as f:
        for line in f:
            # Drop only the line terminator. Removing the spaces before
            # splitting on them would leave every feature list empty.
            splited = line.rstrip("\n").split(" ")
            metas = splited[0].split(",")
            features = splited[1:]
            data.append((metas, features))
    return data
0bc4a3e39
|
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
def read_file_skyrim(filepath):
    '''
    Read a Skyrim data file and return its lines as (metas, features) pairs.

    Each line is split on single spaces: the first field holds the
    dot-separated metas, every following field is one feature.

    filepath: path of the file to read
    return: list of (metas, features) tuples, where metas and features
            are both lists of strings
    '''
    data = []
    with open(filepath, "r") as f:
        for line in f:
            # Drop only the line terminator. Removing the spaces before
            # splitting on them would leave every feature list empty.
            splited = line.rstrip("\n").split(" ")
            metas = splited[0].split(".")
            features = splited[1:]
            data.append((metas, features))
    return data
ac78b07ea
|
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
def index_by(data, num_col):
    '''
    Group the entries of data by the value of one meta column.

    data: iterable of entries whose first two items are the metas and
          the features
    num_col: position, inside the metas, of the value used as key
    return: dict mapping each key to the list of (metas, features)
            pairs sharing it, in their order of appearance
    '''
    groups = {}
    for entry in data:
        meta, feats = entry[0], entry[1]
        groups.setdefault(meta[num_col], []).append((meta, feats))
    return groups


def index_by_id(data):
    '''
    Build a two-level index of the entries: language, then sentence id.

    The language is read from the first meta field and the sentence id
    from the fourth one. When several entries share both keys, the last
    one wins.

    data: iterable of entries whose first item is the metas
    return: dict of dicts such that result[lang][id_sen] is the entry
    '''
    by_lang = {}
    for entry in data:
        meta = entry[0]
        sentence_id, lang = meta[3], meta[0]
        by_lang.setdefault(lang, {})[sentence_id] = entry
    return by_lang
0bc4a3e39
|
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
def index_by_id_skyrim(data):
    '''
    Build a two-level index of Skyrim entries: language, then sentence id.

    The language is read from the first meta field and the sentence id
    from the third one. When several entries share both keys, the last
    one wins.

    data: iterable of entries whose first item is the metas
    return: dict of dicts such that result[lang][id_sen] is the entry
    '''
    by_lang = {}
    for entry in data:
        meta = entry[0]
        sentence_id, lang = meta[2], meta[0]
        by_lang.setdefault(lang, {})[sentence_id] = entry
    return by_lang
ac78b07ea
|
91 92 93 94 95 96 97 98 99 |
def write_line(metas, features, f=None):
    '''
    Write one line made of the metas and the features.

    The metas are joined with commas, the features with spaces, and a
    single space separates the two parts.

    metas: meta information, as a list of strings
    features: feature vector, as a list of strings
    f: file to write to; when omitted (or None) the line goes to the
       standard output
    '''
    # The default is None rather than sys.stdout so that the stream is looked
    # up at call time: print() falls back to the current sys.stdout, which
    # keeps working when the standard output has been redirected.
    print(",".join(metas) + " " + " ".join(features), file=f)
0bc4a3e39
|
100 101 102 103 104 105 106 107 108 109 110 |
def write_line_skyrim(metas, features, f=None):
    '''
    Write one Skyrim line made of the metas and the features.

    The metas are joined with dots, the features with spaces, and a
    single space separates the two parts.

    metas: meta information, as a list of strings
    features: feature vector, as a list of strings
    f: file to write to; when omitted (or None) the line goes to the
       standard output
    '''
    # The default is None rather than sys.stdout so that the stream is looked
    # up at call time: print() falls back to the current sys.stdout, which
    # keeps working when the standard output has been redirected.
    print(".".join(metas) + " " + " ".join(features), file=f)