data.py
3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
'''
Data management input/output
'''
# Import packages and modules
import numpy as np
import sys
# Defining some types
from typing import List, Dict, Tuple
from numpy.lib.shape_base import expand_dims
# Type aliases used in the reader signatures below; every mapping is keyed by a string id.
# Id -> list of strings.
KeyToList = Dict[str, List[str]]
# Id -> list of string labels.
KeyToLabels = Dict[str, List[str]]
# Id -> list of integer labels.
KeyToIntLabels = Dict[str, List[int]]
# Id -> list of float feature values.
KeyToFeatures = Dict[str, List[float]]
def read_lst(file_path: str) -> List[str]:
    '''
    Read lst file with this structure:
    [id_1]
    [id_2]
    ...
    [id_n]

    Args:
        file_path (str): path of the lst file.

    Returns:
        List[str]: the ids in file order, one entry per line of the file
        (a blank line yields an empty string).
    '''
    with open(file_path, "r") as f:
        # Only the line terminator is removed; any other whitespace is kept
        # as part of the id.
        return [line.rstrip("\n") for line in f]
def read_id_values(file_path: str, value_type=str):
    '''
    Read file where each line is an id with its corresponding values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    where values are value_type type.
    Used in many reader functions with specific value_type.

    Fields may be separated by any run of whitespace; blank lines are ignored.

    Args:
        file_path (str): path of the file to read.
        value_type: dtype passed to numpy when converting the values
            (defaults to str).

    Returns:
        dict: id -> numpy array of the values found on that line. If an id
        appears several times, the last line wins.

    Raises:
        ValueError: if a value cannot be converted to value_type.
    '''
    id_values = {}
    with open(file_path, "r") as f:
        for line in f:
            # split() without a separator collapses repeated/trailing
            # whitespace, so no empty tokens reach the dtype conversion.
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            id_values[fields[0]] = np.asarray(fields[1:], dtype=value_type)
    return id_values
def read_features(file_path: str) -> KeyToFeatures:
    '''
    Load a features file, one id per line followed by its values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    Values are parsed as float (np.float64).
    Returns a dictionary mapping each id to the array of its values.
    '''
    features = read_id_values(file_path, value_type=np.float64)
    return features
def read_features_with_matrix(file_path: str) -> Tuple[List[str], np.ndarray]:
    """Read a features file and returns the keys (utterances ids)
    with the corresponding matrix of values.

    Row i of the matrix holds the values of keys[i], so the matrix has
    exactly one row per key.

    Args:
        file_path (str): path of the features file

    Returns:
        [Tuple(List[str], np.ndarray)]: a tuple with a list of keys and the matrix.
            When the file contains no entries the matrix is None.

    Raises:
        ValueError: if the entries do not all have the same number of values.
    """
    data = read_id_values(file_path, np.float64)
    keys = list(data)
    if not keys:
        # Nothing to stack; keep the historical "no matrix" result.
        return (keys, None)
    # Build the matrix in one pass: each entry is added exactly once, in the
    # same order as its key.
    matrix = np.stack([data[key] for key in keys], axis=0)
    return (keys, matrix)
def read_labels(file_path: str) -> KeyToLabels:
    '''
    Load a labels file, one id per line followed by its labels:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    Values are kept as strings.
    Returns a dictionary mapping each id to the array of its labels.
    '''
    labels = read_id_values(file_path, value_type=str)
    return labels
def read_labels_integer(file_path: str) -> KeyToIntLabels:
    '''
    Load a labels file, one id per line followed by its labels:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]
    Values are parsed as int.
    Returns a dictionary mapping each id to the array of its labels.
    '''
    int_labels = read_id_values(file_path, value_type=int)
    return int_labels
def write_line(id_, values=(), out=None):
    """
    Write a line in list, labels or features files.
    If you want to write a list, specify an empty
    array for *values* (or omit it).

    Args:
        id_ (str): id in string.
        values (sequence or scalar, optional): values to write, features or
            labels. Each item is converted with str(), so numeric values are
            accepted. A single string or a scalar is written as one value.
            Defaults to an empty sequence.
        out (_io.TextIOWrapper, optional): stream to write to. Defaults to
            the current sys.stdout.
    """
    if out is None:
        # Resolved at call time so that a redirected sys.stdout is honoured.
        out = sys.stdout
    if isinstance(values, str):
        # A plain string is one value, not a sequence of characters; an empty
        # string means "no values".
        fields = [values] if values else []
    elif hasattr(values, '__len__'):
        # Convert each item so that ints/floats/numpy values can be joined.
        fields = [str(value) for value in values]
    else:
        fields = [str(values)]
    out.write(" ".join([str(id_)] + fields) + "\n")