data.py 1.73 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69


'''
This module aims at loading and writing files.
Our files follow a specific format that
is not standard. This is why I hope these
functions make reading the files easier.

For more information about the data, read
the README file please.
'''

import sys

def read_file(filepath):
    '''
    Read the file and return a list of (metas, features) pairs.

    Each non-blank line is expected to look like
    "meta1,meta2,... feat1 feat2 ...": the first space-separated
    token holds the comma-separated metas and the remaining tokens
    are the features. Every value is kept as a string.

    Blank (whitespace-only) lines are skipped, so a trailing empty
    line in the file does not produce a bogus ([''], []) record.

    filepath: path of the file to read
    return: list of (metas, features) tuples, in file order
    '''
    data = []
    with open(filepath, "r") as f:
        for line in f:
            # Ignore blank lines instead of emitting an empty record
            # that would break later lookups on the metas.
            if not line.strip():
                continue
            splited = line.rstrip("\n").split(" ")
            metas = splited[0].split(",")
            features = splited[1:]
            data.append((metas, features))
    return data


def index_by(data, num_col):
    '''
    Group the records by the value of one of their meta columns.

    data: iterable of (metas, features) records
    num_col: position, inside metas, of the column used as key
    return: dict mapping each value found in that column to the
            list of (metas, features) tuples carrying it, in the
            order they appear in data
    '''
    groups = {}
    for record in data:
        metas, features = record[0], record[1]
        # setdefault creates the bucket the first time a key is seen.
        bucket = groups.setdefault(metas[num_col], [])
        bucket.append((metas, features))
    return groups


def index_by_id(data):
    '''
    Build a two-level index of the records.

    Records are identified by two keys: the language (first meta)
    and the sentence id (fourth meta). The result is therefore a
    dict of dicts: result[language][sentence_id] gives the record.
    When several records share both keys, the last one wins.

    data: iterable of records whose first item is the metas list
    return: dict mapping language -> {sentence id -> record}
    '''
    by_lang = {}
    for record in data:
        metas = record[0]
        sentence_id, language = metas[3], metas[0]
        # Create the per-language dict on first use, then store the
        # whole record under its sentence id.
        by_lang.setdefault(language, {})[sentence_id] = record
    return by_lang


def write_line(metas, features, f=None):
    '''
    Write one record as "meta1,meta2,... feat1 feat2 ..." followed
    by a newline. No need to specify a file.

    metas: meta information, as a list of strings
    features: feature vector, as a list of strings
    f: file to write to; when omitted (None), the current
       sys.stdout is used
    '''
    # file=None makes print() look up sys.stdout at call time, so a
    # redirected stdout is honoured. A sys.stdout default would be
    # frozen when the function is defined.
    print(",".join(metas) + " " + " ".join(features), file=f)