Quillot Mathias · Quillot Mathias · Quillot Mathias
Showing 18 changed files Side-by-side Diff
Makefile
README.md
scripts/data-management/convert-old.py
scripts/data-management/filter_ids.py
scripts/dim-reduction/tsne.py
scripts/evaluations/clustering.py
scripts/plot/plot-character.py
setup.py
setup_volia.py
volia/convert-old.py
volia/core/data.py
volia/core/measures.py
volia/data_io.py
volia/filter_ids.py
volia/measures.py
volia/plot-character.py
volia/test.py
volia/tsne.py
@@ -0,0 +1,7 @@
# Development helpers for the volia package.
# Both targets are commands, not files: declare them phony so that a file
# named "init" or "uninstall" in the directory cannot disable them.
.PHONY: init uninstall

# Install the dependencies and the package, then run the installation check.
init:
	pip install -r requirements.txt
	pip install .
	python -m volia.test

# Remove the installed volia package (pip asks for confirmation).
uninstall:
	pip uninstall volia
@@ -0,0 +1,29 @@
+# Introduction
+Volia est une librairie python pour le machine learning dédié au *speech*. Elle est accompagnée par un ensemble de scripts qui permettent de gérer des données et entraîner des modèles, sortir des rapports d'analyses, des benchmark etc.
+
+# Quick start
+Utiliser volia est très simple.
+
+## Installer volia
+Volia n'étant pas un *repository* officiel PyPi, vous devez l'installer en mode *développement*. Pour ce faire, vous pouvez utiliser le *Makefile* en exécutant la commande suivante :
+
+```
+make
+```
+
+## Exécuter les scripts de n'importe où
+Vous pouvez lancer des scripts depuis n'importe quel chemin de votre ordinateur. Pour cela, il suffit de lancer la commande suivante en remplaçant *run-script* par le nom du module à lancer : 
+```
+python -m volia.run-script
+```
+
+Exemple :
+```
+python -m volia.test
+```
+
+Si *Volia* est bien installé sur votre machine, cette commande devrait retourner "Volia is well installed." dans votre terminal.
+
+
+# Evolution
+Un jour, si ce *repository* devient trop volumineux, je créerai un deuxième *repository* seulement composé des scripts. Ce qui m'embête avec cette approche est le lien entre la version du *repository* volia et celui des scripts. Je suis donc en train de réfléchir à une solution pour organiser mon code proprement et bien séparer la partie *script* de la partie *bibliothèque* de Volia.
 \ No newline at end of file
@@ -0,0 +1,23 @@
"""Packaging script for the volia library."""
import setuptools

# The README contains accented characters: decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="volia",
    version="0.0.1",
    author="Mathias Quillot",
    author_email="mathias.quillot@alumni.univ-avignon.fr",
    description="A package dedicated to my experiments on actor voices",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    include_package_data=True,
    python_requires='>=3.6',
)
 \ No newline at end of file
@@ -0,0 +1,23 @@
"""Convert old files whose ids carry an obsolete metadata field.

Each line has the form ``<id> <value> ...`` where ``<id>`` is a
comma-separated list of metadata fields. The third field of the id is
dropped and the rest of the line is copied unchanged.
"""
import argparse
import sys
from os.path import isfile

# Zero-based position, inside the id, of the metadata field to drop.
OBSOLETE_FIELD_INDEX = 2


def convert_line(line):
    """Return ``line`` with the third comma-separated field of its id removed.

    Args:
        line: one line of the input file, with or without its trailing
            newline. The id is everything before the first space.

    Returns:
        The converted line, always terminated by a newline.

    Raises:
        ValueError: if the id has fewer than three comma-separated fields.
    """
    fields = line.rstrip("\n").split(" ")
    metas = fields[0].split(",")
    if len(metas) <= OBSOLETE_FIELD_INDEX:
        raise ValueError(
            f"id {fields[0]!r} has fewer than {OBSOLETE_FIELD_INDEX + 1} metadata fields")
    del metas[OBSOLETE_FIELD_INDEX]
    fields[0] = ",".join(metas)
    return " ".join(fields) + "\n"


def main():
    """Parse the command line and convert the input file line by line."""
    parser = argparse.ArgumentParser(
        description="Convert old files with wrong id to new one. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    # Explicit check instead of an assert, which is stripped under "python -O".
    if not isfile(args.file):
        parser.error("The given file does not exist.")

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line_number, line in enumerate(f, start=1):
            try:
                of.write(convert_line(line))
            except ValueError as err:
                # Point at the offending line rather than dumping a traceback.
                sys.exit(f"{args.file}:{line_number}: {err}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,44 @@
'''
Data management input/output
'''

# Import packages and modules
import numpy as np

# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
# Feature vectors are stored as 1-D float arrays (see read_features).
KeyToFeatures = Dict[str, np.ndarray]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read lst file with this structure:
    [id] [value1] [value2] ... [value n]

    This is a basic function reused by others like read_features.
    Fields are separated by any run of whitespace and blank lines are
    skipped. When an id appears several times, the last line wins.

    returns a dictionary with id as key and a list of value as corresponding values
    '''
    key_to_list = {}
    with open(file_path, "r") as f:
        for line in f:
            # split() without argument never yields empty tokens, so trailing
            # or repeated blanks cannot inject '' values that would later
            # break the float conversion in read_features.
            fields = line.split()
            if not fields:
                continue
            key, *values = fields
            key_to_list[key] = values
    return key_to_list


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a feature file with this structure:
    [id] [float1] [float2] ... [float n]

    returns a dictionary with id as key and the features as a 1-D numpy
    float array. Raises ValueError if a value cannot be parsed as a float.
    '''
    return {
        key: np.asarray(values, dtype=float)
        for key, values in read_lst(file_path).items()
    }
 \ No newline at end of file
@@ -0,0 +1,227 @@
+'''
+This module is a part of my library. 
+It aims to compute some measures for clustering.
+'''
+
+import numpy as np
+
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium between two count matrices.

    Both matrices are normalised, then subtracted element-wise
    (matrix1 - matrix2). Cells whose divider is zero are set to 0.

    matrix1, matrix2: 2-D count matrices of the same shape.
    isGlobal selects the denominator of the normalisation:
        - True : divide every cell by the total sum of its matrix
        - False : divide every cell by the sum of its own row
    mod: optional space-separated list of transformations applied, in
    order, to the difference:
        - "power" : square the values
        - "abs" : take the absolute value
        - "human" : multiply by 100
      None or an empty string leaves the difference untouched.

    Returns a tuple (mask, result) where mask is True for every cell that
    is non-zero in at least one of the two input matrices.

    Raises ValueError when mod contains an unknown word.
    '''
    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)

    def normalize(matrix):
        '''
        Divide the matrix by its total or by its row sums, writing 0
        wherever the divider is 0.
        '''
        # The builtin float replaces the np.float alias removed in NumPy 1.24.
        values = matrix.astype(float)
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims gives a column vector, so each row is divided by its own sum.
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(values, dividers, out=np.zeros_like(values), where=dividers != 0)

    result = normalize(matrix1) - normalize(matrix2)

    mask = np.logical_or(matrix1 != 0, matrix2 != 0)

    # None and "" both mean "no transformation".
    if mod:
        for word in mod.split():
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
+
+
+
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium for each cluster (row).

    mask: boolean matrix, True for the cells that count in the mean.
    matrix: disequilibrium values, same shape as mask.

    Returns a 1-D array with, for each row, the sum of the row of matrix
    divided by the number of True cells in the same row of mask. A row
    with no True cell gets 0 instead of a 0/0 NaN, like the other
    divisions of this module.
    '''
    matrix = np.asarray(matrix, dtype=float)
    mask = np.asarray(mask)
    if matrix.size == 0:
        return np.zeros(len(matrix))

    totals = matrix.sum(axis=1)
    counts = mask.sum(axis=1)
    return np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
+
+
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value.

    Returns a tuple with:
        - the mask computed by disequilibrium_
        - the raw disequilibrium matrix multiplied by 100
        - a scalar: the per-cluster means of the squared disequilibrium,
          summed and divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared = np.power(raw, 2)
    mean_by_cluster = disequilibrium_mean_by_cluster(mask, squared)
    value = mean_by_cluster.sum()/matrix1.shape[0]

    percent = raw * 100
    return (mask, percent, value)
+
+
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix of a clustering.

    y_truth: true labels, encoded as non-negative integers.
    y_hat: predicted cluster labels, encoded as non-negative integers.
    Both can be lists or numpy arrays and must have the same length.

    Returns a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
    where cell [k][c] is the number of elements put in cluster k whose true
    label is c. Empty inputs give an empty (0, 0) matrix.

    Raises ValueError when the lengths differ or when a label is negative.
    '''
    y_truth = np.asarray(y_truth, dtype=int)
    y_hat = np.asarray(y_hat, dtype=int)

    # Check size of the lists
    if len(y_hat) != len(y_truth):
        raise ValueError(f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}")

    if len(y_hat) == 0:
        return np.zeros((0, 0))

    # A negative label would silently wrap around to the last row/column.
    if y_hat.min() < 0 or y_truth.min() < 0:
        raise ValueError("Labels should be non-negative integers")

    # Build count matrix
    count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
    # np.add.at accumulates correctly when the same cell occurs several times.
    np.add.at(count_matrix, (y_hat, y_truth), 1)
    return count_matrix
+
+
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering.

    Need to use label encoder before giving y_hat and y_truth
    Don't use one hot labels

    Return a tuple with:
        - result_matrix : the matrix of -P(x) * log2(P(x)), one row per
          cluster, where P is the count matrix divided by its row sums
        - result_vector : the sum of each row of result_matrix. Each value corresponds to a cluster.
        - result : the final entropy measure of the clustering, i.e. the
          values of result_vector averaged with the cluster sizes as weights

    Raises ValueError if a NaN appears during the computation.
    '''
    # Build count matrix (the builtin float replaces the removed np.float alias)
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)

    # Build dividers vector: number of elements in each cluster
    dividers = count_matrix.sum(axis=1)
    dividers_column = dividers[:, np.newaxis]

    # Row-wise probabilities; rows of empty clusters stay at 0.
    matrix_divided = np.divide(
        count_matrix, dividers_column,
        out=np.zeros_like(count_matrix), where=dividers_column != 0)

    # log2 only where there is at least one occurrence, to avoid log2(0).
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise with the diagnostic values rather than printing them and
        # killing the interpreter: the caller decides what to do.
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}")

    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)
+
+
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
        - purity_class_score: the purity score of the class (asp)
        - purity_cluster_score: the purity score of the cluster (acp)
        - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Purity along one axis of the count matrix.

        The counts are summed along axis to get one total per group
        (axis=1: one per row, axis=0: one per column). The purity of a
        group is the sum of its squared counts divided by its squared
        total, or 0 when the total is 0.

        Returns (vector_purity, scalar_purity) where scalar_purity is the
        average of the purities weighted by the group totals.
        '''
        # The builtin float replaces the np.float alias removed in NumPy 1.24.
        counts = np.asarray(count_matrix, dtype=float)
        # keepdims keeps the totals broadcastable against the matrix.
        totals = counts.sum(axis=axis, keepdims=True)
        dividers = np.square(totals)

        matrix_divided = np.divide(
            np.square(counts), dividers,
            out=np.zeros_like(counts), where=dividers != 0)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=totals.ravel())
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    # Rows of the count matrix are clusters, columns are classes.
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
+
+
if __name__ == "__main__":
    # Manual check of the scores on two small hand-written labellings.
    # The leftover exit(1) calls were removed: they made everything after the
    # first test unreachable and reported a failure status to the shell.
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
 \ No newline at end of file
@@ -0,0 +1,24 @@
"""Filter the entries of a list file to only keep a given subset of ids."""
import argparse
import sys
from os.path import isfile

import volia
# read_lst was used below without being imported (NameError at runtime).
from volia.data_io import read_lst


def main():
    """Parse the command line and write the filtered file."""
    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="")
    # The filter is mandatory: let argparse report it when it is missing.
    parser.add_argument("--filter", required=True, type=str, help="")
    parser.add_argument("--outfile", default="out.txt", type=str, help="")

    args = parser.parse_args()

    # Explicit checks instead of asserts, which are stripped under "python -O".
    for path in (args.file, args.filter):
        if not isfile(path):
            parser.error(f"{path} is not a file")

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    # Fail before writing anything rather than leaving a truncated output
    # file behind when an id of the filter is unknown.
    missing = [key for key in filter_ if key not in list_]
    if missing:
        sys.exit(f"{len(missing)} id(s) of {args.filter} are not in {args.file} (first one: {missing[0]})")

    with open(args.outfile, "w") as of:
        for key in filter_:
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written in: ", args.outfile)


if __name__ == "__main__":
    main()
 \ No newline at end of file
@@ -0,0 +1,62 @@
+
"""Plot the first two dimensions of the features, one colour per character."""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse
from os.path import isfile
from volia.data_io import read_features, read_lst


def main():
    """Parse the command line, build the scatter plot and save it."""
    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, required=True, help="features file path")
    parser.add_argument("--utt2char", type=str, required=True, help="char2utt file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # Explicit checks instead of asserts, which are stripped under "python -O".
    if not isfile(args.features):
        parser.error("Features path should point to a file")
    if not isfile(args.utt2char):
        parser.error("char2utt path should point to a file")
    if args.sublist is not None and not isfile(args.sublist):
        parser.error("sublist path should point to a file")

    id_to_features = read_features(args.features)

    # Ids to plot: the white list when given, every known id otherwise.
    if args.sublist is not None:
        print("Using sublist")
        ids = list(read_lst(args.sublist))
    else:
        ids = list(id_to_features)

    utt2char = read_lst(args.utt2char)

    features = np.vstack([id_to_features[id_] for id_ in ids])

    # The first value associated with an utterance is its character.
    characters_list = [utt2char[id_][0] for id_ in ids]

    print("Number of characters: ", len(np.unique(characters_list)))
    df = pd.DataFrame(dict(
        x=features[:, 0],
        y=features[:, 1],
        character=characters_list))

    # Plot
    fig, ax = plt.subplots()

    for character, group in df.groupby('character'):
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    # --title was parsed but never applied to the figure.
    ax.set_title(args.title)
    ax.legend()
    plt.savefig(args.outfile)
    print("Your plot is saved well (no check of this affirmation)")


if __name__ == "__main__":
    main()
@@ -0,0 +1,2 @@
def main():
    """Print the message confirming that the volia package can be run."""
    print("Volia is well installed.")


if __name__ == "__main__":
    main()
 \ No newline at end of file
@@ -0,0 +1,37 @@
'''
The goal of this script is to calculate the tsne of pvectors.
'''

import os
from os.path import isfile
import argparse
import numpy as np
from sklearn.manifold import TSNE

from volia.data_io import read_features


def main():
    '''
    Parse the command line, project the features with t-SNE and write one
    "[id] [component 1] ... [component n]" line per id in the output file.
    '''
    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
    parser.add_argument('features', type=str,
                        help='the path of the file you want to calculate tsne')
    # The previous default, '.', is a directory and cannot be opened for writing.
    parser.add_argument('-o', '--outfile', type=str,
                        default='out.txt',
                        help='the path of the output file.')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of components output of tsne')

    args = parser.parse_args()

    # Explicit check instead of an assert, which is stripped under "python -O".
    if not isfile(args.features):
        parser.error(f"{args.features} is not a file")

    features_by_key = read_features(args.features)
    if not features_by_key:
        parser.error(f"{args.features} does not contain any feature")

    # Keep the ids and the feature rows in two parallel sequences: stacking
    # (id, vector) tuples mixes strings and arrays in one array.
    keys = list(features_by_key)
    features = np.vstack([features_by_key[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for key, projected in zip(keys, feat_tsne):
            of.write(key + " " + " ".join(str(feat) for feat in projected) + "\n")
    print("TSNE finished. Check if everything has been done well.")


if __name__ == "__main__":
    main()
 \ No newline at end of file
...	...	@@ -0,0 +1,7 @@
	1	+init:
	2	+ pip install -r requirements.txt
	3	+ pip install .
	4	+ python -m volia.test
	5	+
	6	+uninstall:
	7	+ pip uninstall volia
...	...	@@ -0,0 +1,29 @@
	1	+# Introduction
	2	+Volia est une librairie python pour le machine learning dédié au speech. Elle est accompagnée par un ensemble de scripts qui permettent de gérer des données et entraîner des modèles, sortir des rapports d'analyses, des benchmark etc.
	3	+
	4	+# Quick start
	5	+Utiliser volia est très simple.
	6	+
	7	+## Installer volia
	8	+Volia n'étant pas un repository officiel PyPi, vous devez l'installer en mode développement. Pour ce faire, vous pouvez utiliser le Makefile en exécutant la commande suivante :
	9	+
	10	+```
	11	+make
	12	+```
	13	+
	14	+## Exécuter les scripts de n'importe où
	15	+Vous pouvez lancer des scripts depuis n'importe quel chemin de votre ordinateur. Pour cela, il suffit de lancer la commande suivante en remplaçant run-script par le nom du module à lancer :
	16	+```
	17	+python -m volia.run-script
	18	+```
	19	+
	20	+Exemple :
	21	+```
	22	+python -m volia.test
	23	+```
	24	+
	25	+Si Volia est bien installé sur votre machine, cette commande devrait pour retourner "Volia is well installed." dans votre terminal.
	26	+
	27	+
	28	+# Evolution
	29	+Un jour, si ce repository devient trop volumineux, je créerai un deuxième repository seulement composé des scripts. Ce qui m'embête avec cette approche est le lien entre la version du repository volia et celui des scripts. Je suis donc en train de réfléchir à une solution pour organiser mon code proprement et bien séparer la partie script de la partie bibliothèque de Volia.
0	30	\ No newline at end of file
...	...	@@ -0,0 +1,23 @@
	1	+import setuptools
	2	+
	3	+with open("README.md", "r") as fh:
	4	+ long_description = fh.read()
	5	+
	6	+setuptools.setup(
	7	+ name="volia",
	8	+ version="0.0.1",
	9	+ author="Mathias Quillot",
	10	+ author_email="mathias.quillot@alumni.univ-avignon.fr",
	11	+ description="A package dedicated to my experiments on actor voices",
	12	+ long_description=long_description,
	13	+ long_description_content_type="text/markdown",
	14	+ url="",
	15	+ packages=setuptools.find_packages(),
	16	+ classifiers=[
	17	+ "Programming Language :: Python :: 3",
	18	+ "License :: OSI Approved :: MIT License",
	19	+ "Operating System :: OS Independent",
	20	+ ],
	21	+ include_package_data=True,
	22	+ python_requires='>=3.6',
	23	+)
0	24	\ No newline at end of file
...	...	@@ -0,0 +1,23 @@
	1	+import argparse
	2	+from os.path import isfile
	3	+
	4	+
	5	+if __name__ == "__main__":
	6	+
	7	+ parser = argparse.ArgumentParser(
	8	+ description="Convert old files with wrong id to new one. Masseffect.")
	9	+
	10	+ parser.add_argument("file", type=str, help="feature, x2x, or list file")
	11	+ parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
	12	+
	13	+ args = parser.parse_args()
	14	+
	15	+ assert isfile(args.file), "The given file does not exist."
	16	+
	17	+ with open(args.file, "r") as f, open(args.outfile, "w") as of:
	18	+ for line in f:
	19	+ splited = line.replace("\n", "").split(" ")
	20	+ metas = splited[0].split(",")
	21	+ metas.pop(2)
	22	+ splited[0] = ",".join(metas)
	23	+ of.write(" ".join(splited) + "\n")
...	...	@@ -0,0 +1,44 @@
	1	+'''
	2	+Data management input/output
	3	+'''
	4	+
	5	+# Import packages and modules
	6	+import numpy as np
	7	+
	8	+# Defining some types
	9	+from typing import List, Dict
	10	+KeyToList = Dict[str, List[str]]
	11	+KeyToFeatures = Dict[str, List[float]]
	12	+
	13	+
	14	+def read_lst(file_path: str) -> KeyToList:
	15	+ '''
	16	+ Read lst file with this structure:
	17	+ [id] [value1] [value2] ... [value n]
	18	+
	19	+ This is a basic function reused by others like read_features.
	20	+ returns a dictionary with id as key and a list of value as corresponding values
	21	+ '''
	22	+ # KeyToList type variable
	23	+ key_to_list = dict()
	24	+ with open(file_path, "r") as f:
	25	+ for line in f:
	26	+ splited = line.replace("\n", "").split(" ")
	27	+ id = splited[0]
	28	+ values = splited[1:]
	29	+ key_to_list[id] = values
	30	+ return key_to_list
	31	+
	32	+
	33	+def read_features(file_path: str) -> KeyToFeatures:
	34	+ '''
	35	+ '''
	36	+ # KeyToFeatures type variable
	37	+ key_to_features = dict()
	38	+ # and the KeyToList
	39	+ key_to_list = read_lst(file_path)
	40	+
	41	+ for key_, list_ in key_to_list.items():
	42	+ key_to_features[key_] = np.asarray(list_, dtype=float)
	43	+
	44	+ return key_to_features
0	45	\ No newline at end of file
...	...	@@ -0,0 +1,227 @@
	1	+'''
	2	+This module is a part of my library.
	3	+It aims to compute some measures for clustering.
	4	+'''
	5	+
	6	+import numpy as np
	7	+
	8	+def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
	9	+ '''
	10	+ Compute disequilibrium for all the clusters.
	11	+ The disequilibrium is compute from the difference
	12	+ between two clustering sets.
	13	+ isGlobal permet à l'utilisateur de choisir le dénominateur de
	14	+ la fonction :
	15	+ - True : divise la valeur par le nombre d'élément du cluster
	16	+ - False : divise la valeur par le nombre d'élément total
	17	+
	18	+ withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou
	19	+ une valeur absolue.
	20	+ '''
	21	+
	22	+ def divide_line(a, divider):
	23	+ '''
	24	+ Sub function used for dividing matrix by a vector line by line.
	25	+ '''
	26	+ return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
	27	+
	28	+ dividers1 = 0
	29	+ dividers2 = 0
	30	+
	31	+ if isGlobal:
	32	+ dividers1 = matrix1.sum()
	33	+ dividers2 = matrix2.sum()
	34	+ else:
	35	+ dividers1 = matrix1.sum(axis=1)
	36	+ dividers2 = matrix2.sum(axis=1)
	37	+
	38	+ matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
	39	+
	40	+ matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
	41	+
	42	+ diff = matrix1_divided - matrix2_divided
	43	+
	44	+ mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
	45	+
	46	+ result = diff
	47	+
	48	+ if mod != None or mod == "":
	49	+ for word in mod.split(" "):
	50	+ if word == "power":
	51	+ result = np.power(result,2)
	52	+ elif word == "human":
	53	+ result = result * 100
	54	+ elif word == "abs":
	55	+ result = np.absolute(result)
	56	+ else:
	57	+ raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
	58	+ return (mask, result)
	59	+
	60	+
	61	+
	62	+def disequilibrium_mean_by_cluster(mask, matrix):
	63	+ '''
	64	+ Mean of disequilibrium
	65	+ matrix is the disequilibrium calculated
	66	+ from number of occurences belonging to a class,
	67	+ for each cluster.
	68	+ '''
	69	+ nb_k = len(matrix)
	70	+ results = np.zeros((nb_k))
	71	+
	72	+ for i in range(nb_k):
	73	+ results[i] = matrix[i].sum() / mask[i].sum()
	74	+ return results
	75	+
	76	+
	77	+def disequilibrium(matrix1, matrix2, isGlobal=False):
	78	+ '''
	79	+ Disequilibrium matrix
	80	+ And Disequilibrium value
	81	+ '''
	82	+ mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
	83	+ result_human = result * 100
	84	+ result_power = np.power(result, 2)
	85	+
	86	+ return (
	87	+ mask,
	88	+ result_human,
	89	+ disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
	90	+ )
	91	+
	92	+
	93	+def compute_count_matrix(y_truth, y_hat):
	94	+ '''
	95	+ Check the size of the lists with assertion
	96	+ '''
	97	+ # Check size of the lists
	98	+ assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
	99	+
	100	+ # Build count matrix
	101	+ count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
	102	+ for i in range(len(y_hat)):
	103	+ count_matrix[y_hat[i]][y_truth[i]] += 1
	104	+ return count_matrix
	105	+
	106	+
	107	+def entropy_score(y_truth, y_hat):
	108	+ '''
	109	+ Need to use label encoder before givin y_hat and y_truth
	110	+ Don't use one hot labels
	111	+
	112	+ Return a tuple with:
	113	+ - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))
	114	+ - result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.
	115	+ - result : the final entropy measure of the clustering
	116	+ '''
	117	+ def divide_line(a, divider):
	118	+ '''
	119	+ Sub function used for dividing matrix by a vector line by line.
	120	+ '''
	121	+ return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
	122	+
	123	+ # Build count matrix
	124	+ count_matrix = compute_count_matrix(y_truth, y_hat)
	125	+
	126	+ # Build dividers vector
	127	+ dividers = count_matrix.sum(axis=1)
	128	+
	129	+ matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)
	130	+
	131	+ log_matrix = np.zeros(matrix_divided.shape)
	132	+ np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
	133	+ result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
	134	+ result_vector = result_matrix.sum(axis=1)
	135	+ result_vector.sum()
	136	+
	137	+ if np.isnan(np.sum(result_vector)):
	138	+ print("COUNT MATRIX")
	139	+ print(count_matrix)
	140	+ print("MATRIX DIVIDED")
	141	+ print(matrix_divided)
	142	+ print("RESULT MATRIX")
	143	+ print(result_matrix)
	144	+ print("VECTOR MATRIX")
	145	+ print(result_vector)
	146	+ print("An error occured due to nan value, some values are printed before")
	147	+ exit(1)
	148	+
	149	+ result = result_vector * dividers / dividers.sum()
	150	+ result = result.sum()
	151	+ return (result_matrix, result_vector, result)
	152	+
	153	+
	154	+def purity_score(y_truth, y_hat):
	155	+ '''
	156	+ Return three values in a dictionary:
	157	+ - purity_class_score: the purity score of the class (asp)
	158	+ - purity_cluster_score: the purity score of the cluster (acp)
	159	+ - K: the overall evaluation criterion (sqrt(asp * acp))
	160	+
	161	+ This function is based on the following article:
	162	+ Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
	163	+ '''
	164	+
	165	+ def divide_line(a, divider):
	166	+ '''
	167	+ Sub function used for dividing matrix by a vector line by line.
	168	+ '''
	169	+ return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
	170	+
	171	+ def compute_purity_score(count_matrix, axis=0):
	172	+ if axis==0:
	173	+ other_axis = 1
	174	+ else:
	175	+ other_axis = 0
	176	+ count_per_row = count_matrix.sum(axis=axis)
	177	+ dividers = np.square(count_per_row)
	178	+
	179	+ count_matrix_squared = np.square(count_matrix)
	180	+ matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)
	181	+ vector_purity = np.sum(matrix_divided, axis=axis)
	182	+
	183	+ scalar_purity = np.average(vector_purity, weights=count_per_row)
	184	+ return (vector_purity, scalar_purity)
	185	+
	186	+
	187	+ count_matrix = compute_count_matrix(y_truth, y_hat)
	188	+ _, purity_cluster_score = compute_purity_score(count_matrix, 1)
	189	+ _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
	190	+
	191	+ K = np.sqrt(purity_cluster_score * purity_class_score)
	192	+
	193	+ for i in range(count_matrix.shape[0]):
	194	+ for j in range(count_matrix.shape[1]):
	195	+ count_matrix[i][j]
	196	+ count_matrix[i]
	197	+ return {
	198	+ "purity_class_score": purity_class_score,
	199	+ "purity_cluster_score": purity_cluster_score,
	200	+ "K": K
	201	+ }
	202	+
	203	+
	204	+if __name__ == "__main__":
	205	+ print("Purity test #1")
	206	+ # Hypothesis
	207	+ y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
	208	+ # Truth
	209	+ y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
	210	+
	211	+ (result_matrix, result_vector, result) = entropy_score(y, y_hat)
	212	+ print(purity_score(y, y_hat))
	213	+
	214	+ exit(1)
	215	+ print("Purity test #2")
	216	+ # Hypothesis
	217	+ y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
	218	+ # Truth
	219	+ y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
	220	+
	221	+ (result_matrix, result_vector, result) = entropy_score(y, y_hat)
	222	+ exit(1)
	223	+ print("Result matrix: ")
	224	+ print(result_matrix)
	225	+ print("Result vector: ")
	226	+ print(result_vector)
	227	+ print("Result: ", result)
0	228	\ No newline at end of file
...	...	@@ -0,0 +1,24 @@
	1	+import argparse
	2	+from os.path import isfile
	3	+#from volia.data_io import read_lst
	4	+
	5	+import volia
	6	+if __name__ == "__main__":
	7	+ parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
	8	+ parser.add_argument("file", type=str, help="")
	9	+ parser.add_argument("--filter", default=None, type=str, help="")
	10	+ parser.add_argument("--outfile", default="out.txt", type=str, help="")
	11	+
	12	+ args = parser.parse_args()
	13	+
	14	+ assert args.filter is not None
	15	+ assert isfile(args.file)
	16	+
	17	+ list_ = read_lst(args.file)
	18	+ filter_ = read_lst(args.filter)
	19	+
	20	+ with open(args.outfile, "w") as of:
	21	+ for key in filter_.keys():
	22	+ of.write(key + " " + " ".join(list_[key]) + "\n")
	23	+
	24	+ print("File filtered and written in: ", args.outfile)
0	25	\ No newline at end of file
...	...	@@ -0,0 +1,62 @@
	1	+
	2	+import matplotlib.pyplot as plt
	3	+import numpy as np
	4	+import pandas as pd
	5	+import argparse
	6	+from os.path import isfile
	7	+from volia.data_io import read_features, read_lst
	8	+
	9	+
	10	+if __name__ == "__main__":
	11	+ # Argparse
	12	+ parser = argparse.ArgumentParser(description="Plot points with color for each character")
	13	+ parser.add_argument("--features", type=str, help="features file path")
	14	+ parser.add_argument("--utt2char", type=str, help="char2utt file path")
	15	+ parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
	16	+ parser.add_argument("--outfile", default="out.pdf", type=str, help="")
	17	+ parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
	18	+ args = parser.parse_args()
	19	+
	20	+ # List of assertions
	21	+ assert args.features, "Need to specify features option"
	22	+ assert args.utt2char, "Need to specify char2utt option file"
	23	+ assert isfile(args.features), "Features path should point to a file"
	24	+ assert isfile(args.utt2char), "char2utt path should point to a file"
	25	+ if args.sublist is not None:
	26	+ assert isfile(args.sublist), "sublist path should point to a file"
	27	+
	28	+
	29	+ id_to_features = read_features(args.features)
	30	+
	31	+ ids = []
	32	+ if args.sublist is not None:
	33	+ print("Using sublist")
	34	+ list_ids = read_lst(args.sublist)
	35	+ ids = [ key for key in list_ids.keys() ]
	36	+ else:
	37	+ ids = [ key for key in id_to_features.keys() ]
	38	+
	39	+ utt2char = read_lst(args.utt2char)
	40	+
	41	+ features = [ id_to_features[id_] for id_ in ids ]
	42	+ features = np.vstack(features)
	43	+
	44	+ characters_list = [ utt2char[id_][0] for id_ in ids ]
	45	+
	46	+ features_T = features.transpose()
	47	+ print("Number of characters: ", len(np.unique(characters_list)))
	48	+ df = pd.DataFrame(dict(
	49	+ x=features_T[0],
	50	+ y=features_T[1],
	51	+ character=characters_list))
	52	+
	53	+ groups = df.groupby('character')
	54	+
	55	+ # Plot
	56	+ fig, ax = plt.subplots()
	57	+
	58	+ for character, group in groups:
	59	+ p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
	60	+ ax.legend()
	61	+ plt.savefig(args.outfile)
	62	+ print("Your plot is saved well (no check of this affirmation)")
...	...	@@ -0,0 +1,2 @@
	1	+if __name__ == "__main__":
	2	+ print("Volia is well installed.")
0	3	\ No newline at end of file
...	...	@@ -0,0 +1,37 @@
	1	+'''
	2	+The goal of this script is to display calculate tsne of pvectors.
	3	+'''
	4	+
	5	+import os
	6	+from os.path import isfile
	7	+import argparse
	8	+import numpy as np
	9	+from sklearn.manifold import TSNE
	10	+
	11	+from volia.data_io import read_features
	12	+
	13	+if __name__ == "__main__":
	14	+ # Defining argparse
	15	+ parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
	16	+ parser.add_argument('features', type=str,
	17	+ help='the path of the file you want to calculate tsne')
	18	+ parser.add_argument('-o', '--outfile', type=str,
	19	+ default='.',
	20	+ help='the path of the output file.')
	21	+ parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
	22	+ default='2',
	23	+ help='number of components output of tsne')
	24	+
	25	+ args = parser.parse_args()
	26	+
	27	+ assert isfile(args.features)
	28	+
	29	+ features_list = read_features(args.features)
	30	+ tuples_key_feat = np.vstack([ (key, feats) for key, feats in features_list.items()])
	31	+ keys, features = zip(*tuples_key_feat)
	32	+ feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)
	33	+
	34	+ with open(args.outfile, "w") as of:
	35	+ for i in range(len(keys)):
	36	+ of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
	37	+ print("TSNE finished. Check if everything has been done well.")
0	38	\ No newline at end of file