From e7d811503f88129fb1d8eb28dd6af09f681a771e Mon Sep 17 00:00:00 2001
From: Quillot Mathias
Date: Wed, 21 Apr 2021 15:06:03 +0200
Subject: [PATCH] New file architecture: scripts now live in the volia
 directory and the library in the core directory.

---
 scripts/data-management/convert-old.py |  23 ---
 scripts/data-management/filter_ids.py  |  23 ---
 scripts/dim-reduction/tsne.py          |  37 -----
 scripts/evaluations/clustering.py      | 254 ---------------------------------
 scripts/plot/plot-character.py         |  62 --------
 volia/convert-old.py                   |  23 +++
 volia/core/data.py                     |  44 ++++++
 volia/core/measures.py                 | 227 +++++++++++++++++++++++++++++
 volia/data_io.py                       |  44 ------
 volia/filter_ids.py                    |  24 ++++
 volia/measures.py                      | 227 -----------------------------
 volia/plot-character.py                |  62 ++++++++
 volia/test.py                          |   2 +
 volia/tsne.py                          |  37 +++++
 14 files changed, 419 insertions(+), 670 deletions(-)
 delete mode 100644 scripts/data-management/convert-old.py
 delete mode 100644 scripts/data-management/filter_ids.py
 delete mode 100644 scripts/dim-reduction/tsne.py
 delete mode 100644 scripts/evaluations/clustering.py
 delete mode 100644 scripts/plot/plot-character.py
 create mode 100644 volia/convert-old.py
 create mode 100644 volia/core/data.py
 create mode 100644 volia/core/measures.py
 delete mode 100644 volia/data_io.py
 create mode 100644 volia/filter_ids.py
 delete mode 100644 volia/measures.py
 create mode 100644 volia/plot-character.py
 create mode 100644 volia/test.py
 create mode 100644 volia/tsne.py

diff --git a/scripts/data-management/convert-old.py b/scripts/data-management/convert-old.py
deleted file mode 100644
index b91a6ce..0000000
--- a/scripts/data-management/convert-old.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import argparse
-from os.path import isfile
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(
-        description="Convert old files with wrong ids to new ones. Masseffect.")
-
-    parser.add_argument("file", type=str, help="feature, x2x, or list file")
-    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
-
-    args = parser.parse_args()
-
-    assert isfile(args.file), "The given file does not exist."
-
-    with open(args.file, "r") as f, open(args.outfile, "w") as of:
-        for line in f:
-            splited = line.replace("\n", "").split(" ")
-            metas = splited[0].split(",")
-            metas.pop(2)
-            splited[0] = ",".join(metas)
-            of.write(" ".join(splited) + "\n")
diff --git a/scripts/data-management/filter_ids.py b/scripts/data-management/filter_ids.py
deleted file mode 100644
index 023895c..0000000
--- a/scripts/data-management/filter_ids.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import argparse
-from os.path import isfile
-from volia.data_io import read_lst
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
-    parser.add_argument("file", type=str, help="")
-    parser.add_argument("--filter", default=None, type=str, help="")
-    parser.add_argument("--outfile", default="out.txt", type=str, help="")
-
-    args = parser.parse_args()
-
-    assert args.filter is not None
-    assert isfile(args.file)
-
-    list_ = read_lst(args.file)
-    filter_ = read_lst(args.filter)
-
-    with open(args.outfile, "w") as of:
-        for key in filter_.keys():
-            of.write(key + " " + " ".join(list_[key]) + "\n")
-
-    print("File filtered and written in: ", args.outfile)
\ No newline at end of file
diff --git a/scripts/dim-reduction/tsne.py b/scripts/dim-reduction/tsne.py
deleted file mode 100644
index e88a10e..0000000
--- a/scripts/dim-reduction/tsne.py
+++ /dev/null
@@ -1,37 +0,0 @@
-'''
-The goal of this script is to display calculate tsne of pvectors.
-'''
-
-import os
-from os.path import isfile
-import argparse
-import numpy as np
-from sklearn.manifold import TSNE
-
-from volia.data_io import read_features
-
-if __name__ == "__main__":
-    # Defining argparse
-    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
-    parser.add_argument('features', type=str,
-                        help='the path of the file you want to calculate tsne')
-    parser.add_argument('-o', '--outfile', type=str,
-                        default='.',
-                        help='the path of the output file.')
-    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
-                        default='2',
-                        help='number of components output of tsne')
-
-    args = parser.parse_args()
-
-    assert isfile(args.features)
-
-    features_list = read_features(args.features)
-    tuples_key_feat = np.vstack([ (key, feats) for key, feats in features_list.items()])
-    keys, features = zip(*tuples_key_feat)
-    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)
-
-    with open(args.outfile, "w") as of:
-        for i in range(len(keys)):
-            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
-    print("TSNE finished. Check if everything has been done well.")
\ No newline at end of file
diff --git a/scripts/evaluations/clustering.py b/scripts/evaluations/clustering.py
deleted file mode 100644
index 7c6e1eb..0000000
--- a/scripts/evaluations/clustering.py
+++ /dev/null
@@ -1,254 +0,0 @@
-'''
-This script allows the user to evaluate a classification system on new labels using clustering methods.
-The algorithms are applied on the given latent space (embedding).
-'''
-import argparse
-import numpy as np
-import pandas as pd
-import os
-import time
-import pickle
-import csv
-import json
-
-from sklearn.preprocessing import LabelEncoder
-from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.cluster import KMeans
-from sklearn.manifold import TSNE
-from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
-import matplotlib.pyplot as plt
-
-from volia.data_io import read_features,read_lst
-from volia.measures import entropy_score, purity_score
-
-'''
-TODO:
-- Add an option allowing the user to choose the number of
-clustering to train in order to compute the average and the
-'''
-
-
-def train_clustering(label_encoder, feats, classes, outdir):
-    num_classes = len(label_encoder.classes_)
-    estimator = None
-    kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl")
-    if args.onlymeasures:
-        print(f"Loading model: {kmeans_filepath}")
-        with open(kmeans_filepath, "rb") as f:
-            estimator = pickle.load(f)
-    else:
-        # Compute KMEANS clustering on data
-        print("Saving parameters")
-        kmeans_parameters = {
-            "n_clusters": num_classes,
-            "n_init": 100,
-            "tol": 10-6,
-            "algorithm": "elkan"
-        }
-        with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
-            json.dump(kmeans_parameters, f)
-
-        # Fit the model and Save parameters
-        print(f"Fit the model: {kmeans_filepath}")
-        estimator = KMeans(
-            **kmeans_parameters
-        )
-        estimator.fit(feats)
-        print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
-
-        with open(kmeans_filepath, "wb") as f:
-            pickle.dump(estimator, f)
-
-    # contains distance to each cluster for each sample
-    dist_space = estimator.transform(feats)
-    predictions = np.argmin(dist_space, axis=1)
-
-    # gives each cluster a name (considering most represented character)
-    dataframe = pd.DataFrame({
-        "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
-        "prediction": pd.Series(predictions)
-    })
-
-    def find_cluster_name_fn(c):
-        mask = dataframe["prediction"] == c
-        return dataframe[mask]["label"].value_counts(sort=False).idxmax()
-
-    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
-    predicted_labels = le.transform(
-        [cluster_names[pred] for pred in predictions])
-
-    # F-measure
-    fscores = f1_score(labels, predicted_labels, average=None)
-    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
-
-    # Entropy
-    _, _, entropy = entropy_score(labels, predicted_labels)
-
-    # Homogenity
-    homogeneity = homogeneity_score(labels, predicted_labels)
-
-    # Completeness
-    completeness = completeness_score(labels, predicted_labels)
-
-    # V-Measure
-    v_measure = v_measure_score(labels, predicted_labels)
-
-    # Purity
-    purity_scores = purity_score(labels, predicted_labels)
-    purity_class_score = purity_scores["purity_class_score"]
-    purity_cluster_score = purity_scores["purity_cluster_score"]
-    K = purity_scores["K"]
-
-    # Write results
-    with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
-        print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
-        print(f"Entropy: {entropy}", file=fd)
-        print(f"Global score : {np.mean(fscores)}", file=fd)
-        print(f"Homogeneity: {homogeneity}", file=fd)
-        print(f"completeness: {completeness}", file=fd)
-        print(f"v-measure: {v_measure}", file=fd)
-        print(f"purity class score: {purity_class_score}", file=fd)
-        print(f"purity cluster score: {purity_cluster_score}", file=fd)
-        print(f"purity overall evaluation criterion (K): {K}", file=fd)
-
-    # Process t-SNE and plot
-    tsne_estimator = TSNE()
-    embeddings = tsne_estimator.fit_transform(feats)
-    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
-        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
-
-    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
-    for c, name in enumerate(le.classes_):
-        c_mask = np.where(labels == c)
-        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
-
-        try:
-            id_cluster = cluster_names.index(name)
-        except ValueError:
-            print("WARNING: no cluster found for {}".format(name))
-            continue
-        c_mask = np.where(predictions == id_cluster)
-        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
-
-    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
-    axe1.set_title("true labels")
-    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
-    axe2.set_title("predicted cluster label")
-
-    plt.suptitle("Kmeans Clustering")
-
-    loc = os.path.join(
-        outdir,
-        args.prefix + "kmeans.pdf"
-    )
-    plt.savefig(loc, bbox_inches="tight")
-    plt.close()
-
-    print("INFO: figure saved at {}".format(loc))
-
-    end = time.time()
-    print("program ended in {0:.2f} seconds".format(end-start))
-    return {
-        "f1": np.mean(fscores),
-        "entropy": entropy,
-        "homogeneity": homogeneity,
-        "completeness": completeness,
-        "v-measure": v_measure,
-        "purity_class_score": purity_class_score,
-        "purity_cluster score": purity_cluster_score,
-        "K": K
-    }
-
-
-if __name__ == "__main__":
-    # Argparse
-    parser = argparse.ArgumentParser("Compute clustering on a latent space")
-    parser.add_argument("features")
-    parser.add_argument("utt2",
-                        type=str,
-                        help="file with [utt] [value]")
-    parser.add_argument("--idsfrom",
-                        type=str,
-                        default="utt2",
-                        choices=[
-                            "features",
-                            "utt2"
-                        ],
-                        help="from features or from utt2?")
-    parser.add_argument("--prefix",
-                        default="",
-                        type=str,
-                        help="prefix of saved files")
-    parser.add_argument("--outdir",
-                        default=None,
-                        type=str,
-                        help="Output directory")
-    parser.add_argument("--nmodels",
-                        type=int,
-                        default=1,
-                        help="specifies the number of models to train")
-    parser.add_argument("--onlymeasures",
-                        action='store_true',
-                        help="Don't compute the clustering, compute only the measures")
-    args = parser.parse_args()
-
-    assert args.outdir
-
-    start = time.time()
-
-    # Load features and utt2
-    features = read_features(args.features)
-    utt2 = read_lst(args.utt2)
-
-    # Take id list
-    if args.idsfrom == "features":
-        ids = list(features.keys())
-    elif args.idsfrom == "utt2":
-        ids = list(utt2.keys())
-    else:
-        print(f"idsfrom is not good: {args.idsfrom}")
-        exit(1)
-
-    feats = np.vstack([ features[id_] for id_ in ids ])
-    classes = [ utt2[id_] for id_ in ids ]
-
-    # Encode labels
-    le = LabelEncoder()
-    labels = le.fit_transform(classes)
-
-    measures = {}
-    for i in range(1, args.nmodels+1):
-        subdir = os.path.join(args.outdir, str(i))
-        if not os.path.exists(subdir):
-            os.mkdir(subdir)
-        print(f"[{i}/{args.nmodels}] => {subdir}")
-        results = train_clustering(le, feats, classes, subdir)
-
-        for key, value in results.items():
-            if key not in measures:
-                measures[key] = []
-            measures[key].append(results[key])
-
-
-    # File with results
-    file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")
-
-    with open(file_results, "w") as f:
-        f.write(f"[nmodels: {args.nmodels}]\n")
-        for key in measures.keys():
-            values = np.asarray(measures[key], dtype=float)
-            mean = np.mean(values)
-            std = np.std(values)
-            f.write(f"[{key} => mean: {mean}, std: {std}] \n")
-
-    # CSV File with all the values
-    file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")
-
-    with open(file_csv_measures, "w", newline="") as f:
-        writer = csv.writer(f, delimiter=",")
-        writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"])
-        for key in measures.keys():
-            values = np.asarray(measures[key], dtype=float)
-            mean = np.mean(values)
-            std = np.std(values)
-            writer.writerow([key] + list(values) + [mean] + [std])
\ No newline at end of file
diff --git a/scripts/plot/plot-character.py b/scripts/plot/plot-character.py
deleted file mode 100644
index bfb98d7..0000000
--- a/scripts/plot/plot-character.py
+++ /dev/null
@@ -1,62 +0,0 @@
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import argparse
-from os.path import isfile
-from volia.data_io import read_features, read_lst
-
-
-if __name__ == "__main__":
-    # Argparse
-    parser = argparse.ArgumentParser(description="Plot points with color for each character")
-    parser.add_argument("--features", type=str, help="features file path")
-    parser.add_argument("--utt2char", type=str, help="char2utt file path")
-    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
-    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
-    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
-    args = parser.parse_args()
-
-    # List of assertions
-    assert args.features, "Need to specify features option"
-    assert args.utt2char, "Need to specify char2utt option file"
-    assert isfile(args.features), "Features path should point to a file"
-    assert isfile(args.utt2char), "char2utt path should point to a file"
-    if args.sublist is not None:
-        assert isfile(args.sublist), "sublist path should point to a file"
-
-
-    id_to_features = read_features(args.features)
-
-    ids = []
-    if args.sublist is not None:
-        print("Using sublist")
-        list_ids = read_lst(args.sublist)
-        ids = [ key for key in list_ids.keys() ]
-    else:
-        ids = [ key for key in id_to_features.keys() ]
-
-    utt2char = read_lst(args.utt2char)
-
-    features = [ id_to_features[id_] for id_ in ids ]
-    features = np.vstack(features)
-
-    characters_list = [ utt2char[id_][0] for id_ in ids ]
-
-    features_T = features.transpose()
-    print("Number of characters: ", len(np.unique(characters_list)))
-    df = pd.DataFrame(dict(
-        x=features_T[0],
-        y=features_T[1],
-        character=characters_list))
-
-    groups = df.groupby('character')
-
-    # Plot
-    fig, ax = plt.subplots()
-
-    for character, group in groups:
-        p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
-    ax.legend()
-    plt.savefig(args.outfile)
-    print("Your plot is saved well (no check of this affirmation)")
diff --git a/volia/convert-old.py b/volia/convert-old.py
new file mode 100644
index 0000000..b91a6ce
--- /dev/null
+++ b/volia/convert-old.py
@@ -0,0 +1,23 @@
+import argparse
+from os.path import isfile
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="Convert old files with wrong ids to new ones. Masseffect.")
+
+    parser.add_argument("file", type=str, help="feature, x2x, or list file")
+    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
+
+    args = parser.parse_args()
+
+    assert isfile(args.file), "The given file does not exist."
+
+    with open(args.file, "r") as f, open(args.outfile, "w") as of:
+        for line in f:
+            splited = line.replace("\n", "").split(" ")
+            metas = splited[0].split(",")
+            metas.pop(2)
+            splited[0] = ",".join(metas)
+            of.write(" ".join(splited) + "\n")
diff --git a/volia/core/data.py b/volia/core/data.py
new file mode 100644
index 0000000..4c1bdae
--- /dev/null
+++ b/volia/core/data.py
@@ -0,0 +1,44 @@
+'''
+Data management input/output
+'''
+
+# Import packages and modules
+import numpy as np
+
+# Defining some types
+from typing import List, Dict
+KeyToList = Dict[str, List[str]]
+KeyToFeatures = Dict[str, List[float]]
+
+
+def read_lst(file_path: str) -> KeyToList:
+    '''
+    Read an lst file with this structure:
+    [id] [value1] [value2] ... [value n]
+
+    This is a basic function reused by others like read_features.
+    Returns a dictionary with the id as key and the list of values as the corresponding value.
+    '''
+    # KeyToList type variable
+    key_to_list = dict()
+    with open(file_path, "r") as f:
+        for line in f:
+            splited = line.replace("\n", "").split(" ")
+            id = splited[0]
+            values = splited[1:]
+            key_to_list[id] = values
+    return key_to_list
+
+
+def read_features(file_path: str) -> KeyToFeatures:
+    '''
+    Read a features file (same structure as read_lst) and convert each
+    list of values to a numpy array of floats.
+    '''
+    # KeyToFeatures type variable
+    key_to_features = dict()
+    # and the KeyToList
+    key_to_list = read_lst(file_path)
+
+    for key_, list_ in key_to_list.items():
+        key_to_features[key_] = np.asarray(list_, dtype=float)
+
+    return key_to_features
\ No newline at end of file
diff --git a/volia/core/measures.py b/volia/core/measures.py
new file mode 100644
index 0000000..0ef8967
--- /dev/null
+++ b/volia/core/measures.py
@@ -0,0 +1,227 @@
+'''
+This module is a part of my library.
+It aims to compute some measures for clustering.
+'''
+
+import numpy as np
+
+def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
+    '''
+    Compute the disequilibrium for all the clusters.
+    The disequilibrium is computed from the difference
+    between two clustering sets.
+    isGlobal lets the user choose the denominator of the function:
+    - True: divide the value by the total number of elements
+    - False: divide the value by the number of elements of the cluster
+
+    mod lets the user choose to apply a square ("power"), a percentage
+    scaling ("human"), an absolute value ("abs"), or any space-separated
+    combination of these.
+    '''
+
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing a matrix by a vector, line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+
+    dividers1 = 0
+    dividers2 = 0
+
+    if isGlobal:
+        dividers1 = matrix1.sum()
+        dividers2 = matrix2.sum()
+    else:
+        dividers1 = matrix1.sum(axis=1)
+        dividers2 = matrix2.sum(axis=1)
+
+    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)
+
+    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)
+
+    diff = matrix1_divided - matrix2_divided
+
+    mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
+
+    result = diff
+
+    if mod is not None and mod != "":
+        for word in mod.split(" "):
+            if word == "power":
+                result = np.power(result, 2)
+            elif word == "human":
+                result = result * 100
+            elif word == "abs":
+                result = np.absolute(result)
+            else:
+                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
+    return (mask, result)
+
+
+
+def disequilibrium_mean_by_cluster(mask, matrix):
+    '''
+    Mean of the disequilibrium.
+    matrix is the disequilibrium calculated
+    from the number of occurrences belonging to a class,
+    for each cluster.
+    '''
+    nb_k = len(matrix)
+    results = np.zeros((nb_k))
+
+    for i in range(nb_k):
+        results[i] = matrix[i].sum() / mask[i].sum()
+    return results
+
+
+def disequilibrium(matrix1, matrix2, isGlobal=False):
+    '''
+    Return the disequilibrium mask, the disequilibrium matrix as
+    percentages, and the global disequilibrium value.
+    '''
+    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
+    result_human = result * 100
+    result_power = np.power(result, 2)
+
+    return (
+        mask,
+        result_human,
+        disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
+    )
+
+
+def compute_count_matrix(y_truth, y_hat):
+    '''
+    Build the (cluster x class) count matrix from two label vectors.
+    '''
+    # Check size of the lists
+    assert len(y_hat) == len(y_truth), f"Label vectors should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
+
+    # Build count matrix
+    count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
+    for i in range(len(y_hat)):
+        count_matrix[y_hat[i]][y_truth[i]] += 1
+    return count_matrix
+
+
+def entropy_score(y_truth, y_hat):
+    '''
+    y_truth and y_hat need to be label encoded before calling this function.
+    Don't use one-hot labels.
+
+    Return a tuple with:
+    - result_matrix : the matrix of the weighted log probabilities (P(x) * log(P(x)))
+    - result_vector : the vector with the summed entropy of each cluster
+    - result : the final entropy measure of the clustering
+    '''
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing a matrix by a vector, line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+
+    # Build count matrix
+    count_matrix = compute_count_matrix(y_truth, y_hat)
+
+    # Build dividers vector
+    dividers = count_matrix.sum(axis=1)
+
+    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)
+
+    log_matrix = np.zeros(matrix_divided.shape)
+    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
+    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
+    result_vector = result_matrix.sum(axis=1)
+
+    if np.isnan(np.sum(result_vector)):
+        print("COUNT MATRIX")
+        print(count_matrix)
+        print("MATRIX DIVIDED")
+        print(matrix_divided)
+        print("RESULT MATRIX")
+        print(result_matrix)
+        print("RESULT VECTOR")
+        print(result_vector)
+        print("An error occurred due to a nan value; the intermediate values are printed above")
+        exit(1)
+
+    result = result_vector * dividers / dividers.sum()
+    result = result.sum()
+    return (result_matrix, result_vector, result)
+
+
+def purity_score(y_truth, y_hat):
+    '''
+    Return three values in a dictionary:
+    - purity_class_score: the purity score of the classes (asp)
+    - purity_cluster_score: the purity score of the clusters (acp)
+    - K: the overall evaluation criterion (sqrt(asp * acp))
+
+    This function is based on the following article:
+    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
+    '''
+
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing a matrix by a vector, line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+
+    def compute_purity_score(count_matrix, axis=0):
+        if axis==0:
+            other_axis = 1
+        else:
+            other_axis = 0
+        count_per_row = count_matrix.sum(axis=axis)
+        dividers = np.square(count_per_row)
+
+        count_matrix_squared = np.square(count_matrix)
+        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
+        vector_purity = np.sum(matrix_divided, axis=axis)
+
+        scalar_purity = np.average(vector_purity, weights=count_per_row)
+        return (vector_purity, scalar_purity)
+
+
+    count_matrix = compute_count_matrix(y_truth, y_hat)
+    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
+    _, purity_class_score = compute_purity_score(count_matrix, 0)
+
+    K = np.sqrt(purity_cluster_score * purity_class_score)
+
+    return {
+        "purity_class_score": purity_class_score,
+        "purity_cluster_score": purity_cluster_score,
+        "K": K
+    }
+
+
+if __name__ == "__main__":
+    print("Purity test #1")
+    # Hypothesis
+    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
+    # Truth
+    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
+
+    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
+    print(purity_score(y, y_hat))
+
+    print("Purity test #2")
+    # Hypothesis
+    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
+    # Truth
+    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
+
+    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
+    print("Result matrix: ")
+    print(result_matrix)
+    print("Result vector: ")
+    print(result_vector)
+    print("Result: ", result)
\ No newline at end of file
diff --git a/volia/data_io.py b/volia/data_io.py
deleted file mode 100644
index 4c1bdae..0000000
--- a/volia/data_io.py
+++ /dev/null
@@ -1,44 +0,0 @@
-'''
-Data management input/output
-'''
-
-# Import packages and modules
-import numpy as np
-
-# Defining some types
-from typing import List, Dict
-KeyToList = Dict[str, List[str]]
-KeyToFeatures = Dict[str, List[float]]
-
-
-def read_lst(file_path: str) -> KeyToList:
-    '''
-    Read lst file with this structure:
-    [id] [value1] [value2] ... [value n]
-
-    This is a basic function reused by others like read_features.
-    returns a dictionary with id as key and a list of value as corresponding values
-    '''
-    # KeyToList type variable
-    key_to_list = dict()
-    with open(file_path, "r") as f:
-        for line in f:
-            splited = line.replace("\n", "").split(" ")
-            id = splited[0]
-            values = splited[1:]
-            key_to_list[id] = values
-    return key_to_list
-
-
-def read_features(file_path: str) -> KeyToFeatures:
-    '''
-    '''
-    # KeyToFeatures type variable
-    key_to_features = dict()
-    # and the KeyToList
-    key_to_list = read_lst(file_path)
-
-    for key_, list_ in key_to_list.items():
-        key_to_features[key_] = np.asarray(list_, dtype=float)
-
-    return key_to_features
\ No newline at end of file
diff --git a/volia/filter_ids.py b/volia/filter_ids.py
new file mode 100644
index 0000000..17732ea
--- /dev/null
+++ b/volia/filter_ids.py
@@ -0,0 +1,24 @@
+import argparse
+from os.path import isfile
+from volia.core.data import read_lst
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
+    parser.add_argument("file", type=str, help="lst file to filter")
+    parser.add_argument("--filter", default=None, type=str, help="lst file whose ids define the subset to keep")
+    parser.add_argument("--outfile", default="out.txt", type=str, help="output file")
+
+    args = parser.parse_args()
+
+    assert args.filter is not None
+    assert isfile(args.file)
+
+    list_ = read_lst(args.file)
+    filter_ = read_lst(args.filter)
+
+    with open(args.outfile, "w") as of:
+        for key in filter_.keys():
+            of.write(key + " " + " ".join(list_[key]) + "\n")
+
+    print("File filtered and written in: ", args.outfile)
\ No newline at end of file
diff --git a/volia/measures.py b/volia/measures.py
deleted file mode 100644
index 0ef8967..0000000
--- a/volia/measures.py
+++ /dev/null
@@ -1,227 +0,0 @@
-'''
-This module is a part of my library.
-It aims to compute some measures for clustering.
-'''
-
-import numpy as np
-
-def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
-    '''
-    Compute disequilibrium for all the clusters.
-    The disequilibrium is computed from the difference
-    between two clustering sets.
-    isGlobal lets the user choose the denominator of the function:
-    - True: divide the value by the total number of elements
-    - False: divide the value by the number of elements of the cluster
-
-    mod lets the user choose to apply a square ("power"), a percentage
-    scaling ("human"), or an absolute value ("abs").
-    '''
-
-    def divide_line(a, divider):
-        '''
-        Sub function used for dividing matrix by a vector line by line.
-        '''
-        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
-
-    dividers1 = 0
-    dividers2 = 0
-
-    if isGlobal:
-        dividers1 = matrix1.sum()
-        dividers2 = matrix2.sum()
-    else:
-        dividers1 = matrix1.sum(axis=1)
-        dividers2 = matrix2.sum(axis=1)
-
-    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
-
-    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
-
-    diff = matrix1_divided - matrix2_divided
-
-    mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
-
-    result = diff
-
-    if mod != None or mod == "":
-        for word in mod.split(" "):
-            if word == "power":
-                result = np.power(result,2)
-            elif word == "human":
-                result = result * 100
-            elif word == "abs":
-                result = np.absolute(result)
-            else:
-                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
-    return (mask, result)
-
-
-
-def disequilibrium_mean_by_cluster(mask, matrix):
-    '''
-    Mean of disequilibrium
-    matrix is the disequilibrium calculated
-    from number of occurences belonging to a class,
-    for each cluster.
-    '''
-    nb_k = len(matrix)
-    results = np.zeros((nb_k))
-
-    for i in range(nb_k):
-        results[i] = matrix[i].sum() / mask[i].sum()
-    return results
-
-
-def disequilibrium(matrix1, matrix2, isGlobal=False):
-    '''
-    Disequilibrium matrix
-    And Disequilibrium value
-    '''
-    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
-    result_human = result * 100
-    result_power = np.power(result, 2)
-
-    return (
-        mask,
-        result_human,
-        disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
-    )
-
-
-def compute_count_matrix(y_truth, y_hat):
-    '''
-    Check the size of the lists with assertion
-    '''
-    # Check size of the lists
-    assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
-
-    # Build count matrix
-    count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
-    for i in range(len(y_hat)):
-        count_matrix[y_hat[i]][y_truth[i]] += 1
-    return count_matrix
-
-
-def entropy_score(y_truth, y_hat):
-    '''
-    Need to use label encoder before givin y_hat and y_truth
-    Don't use one hot labels
-
-    Return a tuple with:
-    - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))
-    - result_vector : the vector with summing entropy of each class. Each value corresponds to a cluster.
-    - result : the final entropy measure of the clustering
-    '''
-    def divide_line(a, divider):
-        '''
-        Sub function used for dividing matrix by a vector line by line.
-        '''
-        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
-
-    # Build count matrix
-    count_matrix = compute_count_matrix(y_truth, y_hat)
-
-    # Build dividers vector
-    dividers = count_matrix.sum(axis=1)
-
-    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)
-
-    log_matrix = np.zeros(matrix_divided.shape)
-    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
-    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
-    result_vector = result_matrix.sum(axis=1)
-    result_vector.sum()
-
-    if np.isnan(np.sum(result_vector)):
-        print("COUNT MATRIX")
-        print(count_matrix)
-        print("MATRIX DIVIDED")
-        print(matrix_divided)
-        print("RESULT MATRIX")
-        print(result_matrix)
-        print("VECTOR MATRIX")
-        print(result_vector)
-        print("An error occured due to nan value, some values are printed before")
-        exit(1)
-
-    result = result_vector * dividers / dividers.sum()
-    result = result.sum()
-    return (result_matrix, result_vector, result)
-
-
-def purity_score(y_truth, y_hat):
-    '''
-    Return three values in a dictionary:
-    - purity_class_score: the purity score of the class (asp)
-    - purity_cluster_score: the purity score of the cluster (acp)
-    - K: the overall evaluation criterion (sqrt(asp * acp))
-
-    This function is based on the following article:
-    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
-    '''
-
-    def divide_line(a, divider):
-        '''
-        Sub function used for dividing matrix by a vector line by line.
-        '''
-        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
-
-    def compute_purity_score(count_matrix, axis=0):
-        if axis==0:
-            other_axis = 1
-        else:
-            other_axis = 0
-        count_per_row = count_matrix.sum(axis=axis)
-        dividers = np.square(count_per_row)
-
-        count_matrix_squared = np.square(count_matrix)
-        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)
-        vector_purity = np.sum(matrix_divided, axis=axis)
-
-        scalar_purity = np.average(vector_purity, weights=count_per_row)
-        return (vector_purity, scalar_purity)
-
-
-    count_matrix = compute_count_matrix(y_truth, y_hat)
-    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
-    _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
-
-    K = np.sqrt(purity_cluster_score * purity_class_score)
-
-    for i in range(count_matrix.shape[0]):
-        for j in range(count_matrix.shape[1]):
-            count_matrix[i][j]
-        count_matrix[i]
-    return {
-        "purity_class_score": purity_class_score,
-        "purity_cluster_score": purity_cluster_score,
-        "K": K
-    }
-
-
-if __name__ == "__main__":
-    print("Purity test #1")
-    # Hypothesis
-    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
-    # Truth
-    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
-
-    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
-    print(purity_score(y, y_hat))
-
-    exit(1)
-    print("Purity test #2")
-    # Hypothesis
-    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
-    # Truth
-    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
-
-    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
-    exit(1)
-    print("Result matrix: ")
-    print(result_matrix)
-    print("Result vector: ")
-    print(result_vector)
-    print("Result: ", result)
\ No newline at end of file
diff --git a/volia/plot-character.py b/volia/plot-character.py
new file mode 100644
index 0000000..bfb98d7
--- /dev/null
+++ b/volia/plot-character.py
@@ -0,0 +1,62 @@
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import argparse
+from os.path import isfile
+from volia.core.data import read_features, read_lst
+
+
+if __name__ == "__main__":
+    # Argparse
+    parser = argparse.ArgumentParser(description="Plot points with color for each character")
+    parser.add_argument("--features", type=str, help="features file path")
+    parser.add_argument("--utt2char", type=str, help="utt2char file path")
+    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
+    parser.add_argument("--outfile", default="out.pdf", type=str, help="output file path")
+    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
+    args = parser.parse_args()
+
+    # List of assertions
+    assert args.features, "Need to specify the features option"
+    assert args.utt2char, "Need to specify the utt2char option"
+    assert isfile(args.features), "Features path should point to a file"
+    assert isfile(args.utt2char), "utt2char path should point to a file"
+    if args.sublist is not None:
+        assert isfile(args.sublist), "sublist path should point to a file"
+
+
+    id_to_features = read_features(args.features)
+
+    ids = []
+    if args.sublist is not None:
+        print("Using sublist")
+        list_ids = read_lst(args.sublist)
+        ids = [ key for key in list_ids.keys() ]
+    else:
+        ids = [ key for key in id_to_features.keys() ]
+
+    utt2char = read_lst(args.utt2char)
+
+    features = [ id_to_features[id_] for id_ in ids ]
+    features = np.vstack(features)
+
+    characters_list = [ utt2char[id_][0] for id_ in ids ]
+
+    features_T = features.transpose()
+    print("Number of characters: ", len(np.unique(characters_list)))
+    df = pd.DataFrame(dict(
+        x=features_T[0],
+        y=features_T[1],
+        character=characters_list))
+
+    groups = df.groupby('character')
+
+    # Plot
+    fig, ax = plt.subplots()
+
+    for character, group in groups:
+        p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
+    ax.legend()
+    ax.set_title(args.title)
+    plt.savefig(args.outfile)
+    print("Plot saved to:", args.outfile)
diff --git a/volia/test.py b/volia/test.py
new file mode 100644
index 0000000..108dad2
--- /dev/null
+++ b/volia/test.py
@@ -0,0 +1,2 @@
+if __name__ == "__main__":
+    print("volia is correctly installed.")
\ No newline at end of file
diff --git a/volia/tsne.py b/volia/tsne.py
new file mode 100644
index 0000000..f276c22
--- /dev/null
+++ b/volia/tsne.py
@@ -0,0 +1,37 @@
+'''
+The goal of this script is to compute the t-SNE representation of p-vectors.
+'''
+
+import os
+from os.path import isfile
+import argparse
+import numpy as np
+from sklearn.manifold import TSNE
+
+from volia.core.data import read_features
+
+if __name__ == "__main__":
+    # Defining argparse
+    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvectors in 2 or 3d')
+    parser.add_argument('features', type=str,
+                        help='the path of the file on which you want to calculate tsne')
+    parser.add_argument('-o', '--outfile', type=str,
+                        default='out.txt',
+                        help='the path of the output file')
+    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
+                        default=2,
+                        help='number of output components of tsne')
+
+    args = parser.parse_args()
+
+    assert isfile(args.features)
+
+    features_list = read_features(args.features)
+    # Keep the keys and the feature matrix aligned (stacking (key, feats)
+    # tuples would coerce the features to strings).
+    keys = list(features_list.keys())
+    features = np.vstack([ features_list[key] for key in keys ])
+    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)
+
+    with open(args.outfile, "w") as of:
+        for i in range(len(keys)):
+            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
+    print("t-SNE finished. Output written to:", args.outfile)
\ No newline at end of file
-- 
1.8.2.3