# data.py
'''
Data management input/output
'''

# Import packages and modules
import numpy as np
import sys

# Defining some types
from typing import List, Dict, Tuple

from numpy.lib.shape_base import expand_dims
# Type aliases for the id -> values mappings handled in this module.
# NOTE(review): read_id_values builds its values with np.asarray, so the
# mappings it returns hold np.ndarray values rather than plain lists; the
# List[...] element types below describe the content, not the container.
KeyToList = Dict[str, List[str]]  # id -> list of strings
KeyToLabels = Dict[str, List[str]]  # id -> labels kept as strings
KeyToIntLabels = Dict[str, List[int]]  # id -> labels as integers
KeyToFeatures = Dict[str, List[float]]  # id -> feature values as floats


def read_lst(file_path: str) -> List[str]:
    '''
    Read a lst file with this structure:
    [id_1]
    [id_2]
    ...
    [id_n]

    Args:
        file_path (str): path of the lst file.

    Returns:
        List[str]: the ids, one per line of the file, in file order.
        Blank lines are kept as empty strings.
    '''
    with open(file_path, "r") as f:
        # Lines yielded by file iteration carry at most one newline, at the
        # end, so stripping it there is all that is needed.
        return [line.rstrip("\n") for line in f]


def read_id_values(file_path: str, value_type=str):
    '''
    Read file where each line is an id with its corresponding values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are value_type type.

    Fields may be separated by any run of whitespace; leading/trailing
    whitespace is ignored and blank lines are skipped.

    Used in many reader functions with specific value_type.

    Args:
        file_path (str): path of the file to read.
        value_type: dtype handed to np.asarray for the values. Defaults to str.

    Returns:
        dict: id (str) -> numpy array holding the values of that line.
        If an id appears several times, the last line wins.

    Raises:
        ValueError: if a value cannot be converted to value_type.
    '''
    id_values = {}
    with open(file_path, "r") as f:
        for line in f:
            # split() with no argument collapses repeated separators and
            # drops the trailing newline, so no empty-string field can
            # reach the dtype conversion below.
            fields = line.split()
            if not fields:
                # Blank line: nothing to record (avoids a bogus "" key).
                continue
            key, *raw_values = fields
            id_values[key] = np.asarray(raw_values, dtype=value_type)
    return id_values


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Load a features file, one id followed by its values per line:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    Every value is parsed as a float (np.float64).

    Returns the mapping built by read_id_values: id as key, the values of
    that line as associated value.
    '''
    features = read_id_values(file_path, value_type=np.float64)
    return features


def read_features_with_matrix(file_path: str) -> Tuple[List[str], np.ndarray]:
    """Read a features file and returns the keys (utterances ids)
    with the corresponding matrix of values.

    Row i of the matrix holds the values of keys[i]; the matrix therefore
    has exactly one row per key.

    Args:
        file_path (str): path of the features file

    Returns:
        [Tuple(List[str], np.ndarray)]: a tuple with a list of keys and the
        matrix. When the file yields no entry, the matrix is None.

    Raises:
        ValueError: if the lines do not all carry the same number of values.
    """
    data = read_id_values(file_path, np.float64)
    keys = list(data)
    if not keys:
        # Nothing to stack; keep the historical "no data -> None" result.
        return (keys, None)
    # Stack every row in one go: one row per key, in the same order as keys.
    matrix = np.stack([data[key] for key in keys], axis=0)
    return (keys, matrix)

def read_labels(file_path: str) -> KeyToLabels:
    '''
    Load a labels file, one id followed by its labels per line:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    The labels are kept as strings.

    Returns the mapping built by read_id_values: id as key, the labels of
    that line as associated value.
    '''
    labels = read_id_values(file_path, value_type=str)
    return labels


def read_labels_integer(file_path: str) -> KeyToIntLabels:
    '''
    Load a labels file, one id followed by its labels per line:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    Every label is parsed as an int.

    Returns the mapping built by read_id_values: id as key, the labels of
    that line as associated value.
    '''
    int_labels = read_id_values(file_path, value_type=int)
    return int_labels


def write_line(id_, values=(), out=sys.stdout):
    """
    Write a line in list, labels or features files.
    If you want to write a list, specify an empty
    sequence for *values* (or leave it out).

    Args:
        id_ (str): id of the line; converted with str().
        values (sequence, optional): values to write after the id, features
            or labels. Each value is converted with str(), so numbers and
            numpy arrays are accepted as well as strings. Defaults to ().
        out (text stream, optional): object with a write() method that
            receives the line. Defaults to sys.stdout.
    """
    # len() rather than truthiness: a numpy array with several elements has
    # no unambiguous truth value.
    if len(values) == 0:
        out.write(str(id_) + "\n")
    else:
        # str.join only accepts strings, so stringify every value first.
        out.write(str(id_) + " " + " ".join(map(str, values)) + "\n")