data.py 2.92 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110


'''
Data management input/output
'''

# Import packages and modules
import numpy as np
import sys

# Defining some types
from typing import List, Dict
# Type aliases used as return annotations by the reader functions below.
# NOTE(review): read_id_values stores each value with np.asarray, so at
# runtime the mapped values are NumPy arrays rather than plain Python lists.

# id -> list of string entries.
KeyToList = Dict[str, List[str]]
# id -> list of string labels.
KeyToLabels = Dict[str, List[str]]
# id -> list of integer labels.
KeyToIntLabels = Dict[str, List[int]]
# id -> list of float feature values.
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> List[str]:
    '''
    Read a lst file with this structure:
    [id_1]
    [id_2]
    ...
    [id_n]

    Each line of the file becomes one entry, in file order, with its
    trailing newline removed. Lines are otherwise kept verbatim (empty
    lines yield empty strings).

    Args:
        file_path (str): path of the lst file to read.

    Returns:
        A list of ids (one string per line of the file).
    '''
    with open(file_path, "r") as f:
        # Text-mode iteration yields lines ending in at most one "\n",
        # so stripping it on the right is all that is needed.
        return [line.rstrip("\n") for line in f]


def read_id_values(file_path: str, value_type=str) -> Dict[str, np.ndarray]:
    '''
    Read file where each line is an id with its corresponding values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are value_type type.

    Fields may be separated by any run of whitespace; leading/trailing
    whitespace is ignored and blank lines are skipped. A line holding
    only an id maps that id to an empty array. If an id appears several
    times, the last occurrence wins.

    Used in many reader functions with specific value_type.

    Args:
        file_path (str): path of the file to read.
        value_type: dtype handed to np.asarray to convert the values
            (e.g. str, int, float). Defaults to str.

    Returns:
        A dictionary with id as key and a NumPy array of the converted
        values as associated value.

    Raises:
        ValueError: if a value cannot be converted to value_type.
    '''
    id_values = {}
    with open(file_path, "r") as f:
        for line in f:
            # split() without argument collapses repeated separators and
            # drops the trailing newline, so no empty-string fields can
            # reach the numeric conversion below.
            fields = line.split()
            if not fields:
                # Blank line: nothing to record (avoids a spurious "" key).
                continue
            id_values[fields[0]] = np.asarray(fields[1:], dtype=value_type)
    return id_values


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Load a features file, one id per line followed by its values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    The values are converted to float.

    Thin wrapper around read_id_values with value_type=float; the result
    of that call (a dictionary keyed by id) is returned unchanged.
    '''
    features = read_id_values(file_path, value_type=float)
    return features


def read_labels(file_path: str) -> KeyToLabels:
    '''
    Load a labels file, one id per line followed by its labels:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    The labels are kept as strings.

    Thin wrapper around read_id_values with value_type=str; the result
    of that call (a dictionary keyed by id) is returned unchanged.
    '''
    labels = read_id_values(file_path, value_type=str)
    return labels


def read_labels_integer(file_path: str) -> KeyToIntLabels:
    '''
    Load a labels file, one id per line followed by its labels:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    The labels are converted to int.

    Thin wrapper around read_id_values with value_type=int; the result
    of that call (a dictionary keyed by id) is returned unchanged.
    '''
    int_labels = read_id_values(file_path, value_type=int)
    return int_labels


def write_line(id_, values=(), out=None):
    """
    Write a line in list, labels or features files.
    If you want to write a list, specify an empty
    sequence for *values* (or leave it out).

    The line is the id followed by the values, all separated by single
    spaces and terminated by a newline. Every value is converted with
    str(), so numeric values (e.g. int or float arrays) are accepted as
    well as strings.

    Args:
        id_ (str): id in string.
        values (sequence, optional): values to write, features or labels.
            Defaults to an empty sequence; None is treated as empty.
        out (file-like, optional): object with a write() method that
            receives the line. Defaults to the current sys.stdout,
            looked up at call time so that redirections are honoured.
    """
    if out is None:
        # Resolve at call time: a `sys.stdout` default in the signature
        # would be frozen when the module is imported.
        out = sys.stdout
    # len() rather than truthiness: NumPy arrays with more than one
    # element raise on bool().
    if values is None or len(values) == 0:
        out.write(str(id_) + "\n")
    else:
        # str() each value so that non-string values do not break join().
        out.write(str(id_) + " " + " ".join(str(v) for v in values) + "\n")