data.py
2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
'''
Data management input/output
'''
# Import packages and modules
import numpy as np
import sys
# Defining some types
from typing import List, Dict
# Maps a string key to a list of strings.
KeyToList = Dict[str, List[str]]
# Maps an id to its labels kept as strings.
KeyToLabels = Dict[str, List[str]]
# Maps an id to its labels converted to integers.
KeyToIntLabels = Dict[str, List[int]]
# Maps an id to its feature values as floats.
# NOTE(review): read_id_values stores numpy arrays (np.asarray), not plain
# lists, under each key - confirm whether these aliases should say so.
KeyToFeatures = Dict[str, List[float]]
def read_lst(file_path: str) -> List[str]:
    """
    Read a lst file containing one id per line:
        [id_1]
        [id_2]
        ...
        [id_n]

    Args:
        file_path (str): path of the lst file to read.

    Returns:
        The ids in file order, one entry per line of the file, with the
        line terminator removed. Blank lines are kept as empty strings.
    """
    with open(file_path, "r") as f:
        # Lines yielded by a text-mode file only carry "\n" at their end,
        # so dropping it there is all that is needed.
        return [line.replace("\n", "") for line in f]
def read_id_values(file_path: str, value_type=str) -> Dict[str, np.ndarray]:
    """
    Read a file where each line is an id followed by its values:
        [id_1] [value_1_1] [value_1_2] ... [value_1_k]
        [id_2] [value_2_1] [value_2_2] ... [value_2_k]
        ...
        [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    Fields may be separated by any run of whitespace; leading and trailing
    whitespace is ignored and blank lines are skipped.

    Used by the more specific reader functions, each with its own
    value_type.

    Args:
        file_path (str): path of the file to read.
        value_type: dtype handed to numpy to convert the values
            (e.g. str, int, float). Defaults to str.

    Returns:
        A dictionary with the id as key and a numpy array of the converted
        values as value. An id without values maps to an empty array. If
        an id appears several times, the last line wins.

    Raises:
        ValueError: if a value cannot be converted to value_type.
    """
    id_values = {}
    with open(file_path, "r") as f:
        for line in f:
            # split() without argument collapses repeated separators and
            # drops the newline, so no empty token reaches the conversion.
            fields = line.split()
            if not fields:
                # Blank line: there is no id to register.
                continue
            key, *values = fields
            id_values[key] = np.asarray(values, dtype=value_type)
    return id_values
def read_features(file_path: str) -> KeyToFeatures:
    """
    Read a features file, one id per line followed by its feature values:
        [id_1] [value_1_1] [value_1_2] ... [value_1_k]
        [id_2] [value_2_1] [value_2_2] ... [value_2_k]
        ...
        [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    where the values are floats.

    Delegates to read_id_values with float as value type.

    Returns a dictionary with the id as key and the float values (stored
    as a numpy array by read_id_values) as value.
    """
    return read_id_values(file_path, value_type=float)
def read_labels(file_path: str) -> KeyToLabels:
    """
    Read a labels file, one id per line followed by its labels:
        [id_1] [value_1_1] [value_1_2] ... [value_1_k]
        [id_2] [value_2_1] [value_2_2] ... [value_2_k]
        ...
        [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    where the values are kept as strings.

    Delegates to read_id_values with str as value type.

    Returns a dictionary with the id as key and the string labels (stored
    as a numpy array by read_id_values) as value.
    """
    return read_id_values(file_path, value_type=str)
def read_labels_integer(file_path: str) -> KeyToIntLabels:
    """
    Read a labels file, one id per line followed by its labels:
        [id_1] [value_1_1] [value_1_2] ... [value_1_k]
        [id_2] [value_2_1] [value_2_2] ... [value_2_k]
        ...
        [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    where the values are integers.

    Delegates to read_id_values with int as value type.

    Returns a dictionary with the id as key and the integer labels (stored
    as a numpy array by read_id_values) as value.
    """
    return read_id_values(file_path, value_type=int)
def write_line(id_, values=(), out=sys.stdout):
    """
    Write a line in list, labels or features files.

    The line is the id alone when *values* is empty (list files), otherwise
    the id followed by the space-separated values. Values of any type are
    accepted; each one is converted with str() before being written.

    Args:
        id_ (str): id of the line; converted with str() before writing.
        values (sequence, optional): values to write, features or labels.
            Defaults to an empty sequence.
        out (_io.TextIOWrapper, optional): writable text stream receiving
            the line. Defaults to sys.stdout.
    """
    # len() rather than truthiness: *values* may be a numpy array, whose
    # truth value is ambiguous when it holds more than one element.
    if len(values) == 0:
        out.write(str(id_) + "\n")
    else:
        # str.join only accepts strings, so convert numeric values first.
        out.write(str(id_) + " " + " ".join(str(v) for v in values) + "\n")