Compare View

Commits (3)

Changes

Showing 18 changed files

Makefile
... ... @@ -0,0 +1,7 @@
init:
	pip install -r requirements.txt
	pip install .
	python -m volia.test

uninstall:
	pip uninstall volia
README.md
... ... @@ -0,0 +1,29 @@
# Introduction
Volia is a Python library for machine learning dedicated to *speech*. It comes with a set of scripts to manage data, train models, produce analysis reports, benchmarks, etc.

# Quick start
Using volia is very simple.

## Installing volia
As volia is not in the official PyPI *repository*, you have to install it in *development* mode. To do so, you can use the *Makefile* by running the following command:

```
make
```
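
This runs the `init` target of the *Makefile* shown above, which installs the requirements, installs the package with pip, and runs `python -m volia.test` to check the installation.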

## Running the scripts from anywhere
You can launch the scripts from any path on your machine. To do so, just run the following command, replacing *run-script* with the name of the module to launch:
```
python -m volia.run-script
```

Example:
```
python -m volia.test
```

If *Volia* is correctly installed on your machine, this command should print "Volia is well installed." in your terminal.
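
For example, a successful check looks like this:
```
$ python -m volia.test
Volia is well installed.
```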


# Evolution
One day, if this *repository* becomes too large, I will create a second *repository* made up only of the scripts. What bothers me about this approach is the link between the version of the volia *repository* and that of the scripts. I am therefore thinking about a solution to organize my code cleanly and properly separate the *script* part from the *library* part of Volia.
setup.py
... ... @@ -0,0 +1,23 @@
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="volia",
    version="0.0.1",
    author="Mathias Quillot",
    author_email="mathias.quillot@alumni.univ-avignon.fr",
    description="A package dedicated to my experiments on actor voices",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    include_package_data=True,
    python_requires='>=3.6',
)
volia/convert-old.py
... ... @@ -0,0 +1,23 @@
import argparse
from os.path import isfile


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="Convert old files with wrong ids to new ones. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    assert isfile(args.file), "The given file does not exist."

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line in f:
            # Drop the third comma-separated meta field from the id column,
            # e.g. "a,b,c,d 0.1 0.2" becomes "a,b,d 0.1 0.2".
            fields = line.rstrip("\n").split(" ")
            metas = fields[0].split(",")
            metas.pop(2)
            fields[0] = ",".join(metas)
            of.write(" ".join(fields) + "\n")
volia/data_io.py
... ... @@ -0,0 +1,44 @@
'''
Data management input/output
'''

# Import packages and modules
import numpy as np

# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read an lst file with this structure:
    [id] [value1] [value2] ... [value n]

    This is a basic function reused by others, like read_features.
    Returns a dictionary with the id as key and a list of values as the
    corresponding value.
    '''
    # KeyToList type variable
    key_to_list = dict()
    with open(file_path, "r") as f:
        for line in f:
            fields = line.rstrip("\n").split(" ")
            id_ = fields[0]
            values = fields[1:]
            key_to_list[id_] = values
    return key_to_list


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file with the same structure as an lst file and
    convert the values of each id into a numpy array of floats.
    '''
    # KeyToFeatures type variable
    key_to_features = dict()
    # and the KeyToList
    key_to_list = read_lst(file_path)

    for key_, list_ in key_to_list.items():
        key_to_features[key_] = np.asarray(list_, dtype=float)

    return key_to_features
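
A minimal usage sketch of this module (the file name and its contents are hypothetical):
```
from volia.data_io import read_lst, read_features

# Suppose "feats.lst" contains lines such as:
# utt1 0.12 0.34
# utt2 0.56 0.78
lists = read_lst("feats.lst")       # {'utt1': ['0.12', '0.34'], ...}
feats = read_features("feats.lst")  # same keys, numpy float arrays as values
```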
volia/core/measures.py
... ... @@ -0,0 +1,227 @@
  1 +'''
  2 +This module is a part of my library.
  3 +It aims to compute some measures for clustering.
  4 +'''
  5 +
  6 +import numpy as np
  7 +
  8 +def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
  9 + '''
  10 + Compute disequilibrium for all the clusters.
  11 + The disequilibrium is compute from the difference
  12 + between two clustering sets.
  13 + isGlobal permet à l'utilisateur de choisir le dénominateur de
  14 + la fonction :
  15 + - True : divise la valeur par le nombre d'élément du cluster
  16 + - False : divise la valeur par le nombre d'élément total
  17 +
  18 + withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou
  19 + une valeur absolue.
  20 + '''

    def divide_line(a, divider):
        '''
        Sub function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    dividers1 = 0
    dividers2 = 0

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    # Mask out cells that are empty in both clusterings
    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    result = diff

    if mod is not None and mod != "":
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
  60 +
  61 +
  62 +def disequilibrium_mean_by_cluster(mask, matrix):
  63 + '''
  64 + Mean of disequilibrium
  65 + matrix is the disequilibrium calculated
  66 + from number of occurences belonging to a class,
  67 + for each cluster.
  68 + '''
  69 + nb_k = len(matrix)
  70 + results = np.zeros((nb_k))
  71 +
  72 + for i in range(nb_k):
  73 + results[i] = matrix[i].sum() / mask[i].sum()
  74 + return results
  75 +
  76 +
  77 +def disequilibrium(matrix1, matrix2, isGlobal=False):
  78 + '''
  79 + Disequilibrium matrix
  80 + And Disequilibrium value
  81 + '''
  82 + mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
  83 + result_human = result * 100
  84 + result_power = np.power(result, 2)
  85 +
  86 + return (
  87 + mask,
  88 + result_human,
  89 + disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
  90 + )


def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix of two label lists: cell [k][c] counts the
    elements assigned to cluster k (y_hat) that belong to class c (y_truth).
    '''
    # Check the size of the lists
    assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Build count matrix
    count_matrix = np.zeros((max(y_hat) + 1, max(y_truth) + 1))
    for i in range(len(y_hat)):
        count_matrix[y_hat[i]][y_truth[i]] += 1
    return count_matrix


def entropy_score(y_truth, y_hat):
    '''
    You need to use a label encoder before giving y_hat and y_truth.
    Don't use one-hot labels.

    Return a tuple with:
    - result_matrix: the matrix with the log-multiplied probabilities (P(x) * log(P(x)))
    - result_vector: the vector with the summed entropy of each cluster. Each value corresponds to a cluster.
    - result: the final entropy measure of the clustering
    '''
    def divide_line(a, divider):
        '''
        Sub function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Build dividers vector
    dividers = count_matrix.sum(axis=1)

    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occurred due to a nan value; the values are printed above")
        exit(1)

    # Weight each cluster entropy by the cluster size, then sum
    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)
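
# For reference (notation assumed, derived from the code above): with n_kc the
# number of elements of class c in cluster k, n_k = sum_c n_kc and N = sum_k n_k,
#   H_k = -sum_c (n_kc / n_k) * log2(n_kc / n_k)   (entropy of cluster k)
#   H   = sum_k (n_k / N) * H_k                    (returned entropy measure)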


def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the classes (asp)
    - purity_cluster_score: the purity score of the clusters (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def divide_line(a, divider):
        '''
        Sub function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    def compute_purity_score(count_matrix, axis=0):
        if axis == 0:
            other_axis = 1
        else:
            other_axis = 0
        count_per_row = count_matrix.sum(axis=axis)
        dividers = np.square(count_per_row)

        count_matrix_squared = np.square(count_matrix)
        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_row)
        return (vector_purity, scalar_purity)


    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
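
# For reference (notation assumed, derived from the code above): with the
# count matrix n_kc, n_k its row sums and n_c its column sums,
#   acp = (1/N) * sum_k (1/n_k) * sum_c n_kc**2   (purity_cluster_score)
#   asp = (1/N) * sum_c (1/n_c) * sum_k n_kc**2   (purity_class_score)
#   K   = sqrt(acp * asp)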


if __name__ == "__main__":
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
... ... @@ -0,0 +1,24 @@
import argparse
from os.path import isfile
from volia.data_io import read_lst


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="lst file to filter")
    parser.add_argument("--filter", default=None, type=str, help="lst file with the ids to keep")
    parser.add_argument("--outfile", default="out.txt", type=str, help="output file")

    args = parser.parse_args()

    assert args.filter is not None
    assert isfile(args.file)

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    with open(args.outfile, "w") as of:
        for key in filter_.keys():
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written to: ", args.outfile)
volia/plot-character.py
... ... @@ -0,0 +1,62 @@
  1 +
  2 +import matplotlib.pyplot as plt
  3 +import numpy as np
  4 +import pandas as pd
  5 +import argparse
  6 +from os.path import isfile
  7 +from volia.data_io import read_features, read_lst
  8 +
  9 +
  10 +if __name__ == "__main__":
  11 + # Argparse
  12 + parser = argparse.ArgumentParser(description="Plot points with color for each character")
  13 + parser.add_argument("--features", type=str, help="features file path")
  14 + parser.add_argument("--utt2char", type=str, help="char2utt file path")
  15 + parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
  16 + parser.add_argument("--outfile", default="out.pdf", type=str, help="")
  17 + parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
  18 + args = parser.parse_args()
  19 +
  20 + # List of assertions
  21 + assert args.features, "Need to specify features option"
  22 + assert args.utt2char, "Need to specify char2utt option file"
  23 + assert isfile(args.features), "Features path should point to a file"
  24 + assert isfile(args.utt2char), "char2utt path should point to a file"
  25 + if args.sublist is not None:
  26 + assert isfile(args.sublist), "sublist path should point to a file"
  27 +
  28 +
  29 + id_to_features = read_features(args.features)
  30 +
  31 + ids = []
  32 + if args.sublist is not None:
  33 + print("Using sublist")
  34 + list_ids = read_lst(args.sublist)
  35 + ids = [ key for key in list_ids.keys() ]
  36 + else:
  37 + ids = [ key for key in id_to_features.keys() ]
  38 +
  39 + utt2char = read_lst(args.utt2char)
  40 +
  41 + features = [ id_to_features[id_] for id_ in ids ]
  42 + features = np.vstack(features)
  43 +
  44 + characters_list = [ utt2char[id_][0] for id_ in ids ]
  45 +
  46 + features_T = features.transpose()
  47 + print("Number of characters: ", len(np.unique(characters_list)))
  48 + df = pd.DataFrame(dict(
  49 + x=features_T[0],
  50 + y=features_T[1],
  51 + character=characters_list))
  52 +
  53 + groups = df.groupby('character')
  54 +
  55 + # Plot
  56 + fig, ax = plt.subplots()
  57 +
  58 + for character, group in groups:
  59 + p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
  60 + ax.legend()
  61 + plt.savefig(args.outfile)
  62 + print("Your plot is saved well (no check of this affirmation)")
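
A hypothetical invocation of this script (the file names are examples, not from the repository):
```
python volia/plot-character.py --features features.txt --utt2char utt2char.lst --outfile characters.pdf --title "Character plot"
```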
volia/test.py
... ... @@ -0,0 +1,2 @@
if __name__ == "__main__":
    print("Volia is well installed.")
... ... @@ -0,0 +1,37 @@
'''
The goal of this script is to compute the t-SNE representation of p-vectors.
'''

from os.path import isfile
import argparse
import numpy as np
from sklearn.manifold import TSNE

from volia.data_io import read_features

if __name__ == "__main__":
    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Compute the t-SNE representation of p-vectors in 2 or 3 dimensions')
    parser.add_argument('features', type=str,
                        help='the path of the features file on which you want to compute t-SNE')
    parser.add_argument('-o', '--outfile', type=str,
                        default='out.txt',
                        help='the path of the output file')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of output components of t-SNE')

    args = parser.parse_args()

    assert isfile(args.features)

    # Read the features and stack them, keeping keys and vectors aligned
    id_to_features = read_features(args.features)
    keys = list(id_to_features.keys())
    features = np.vstack([id_to_features[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for i in range(len(keys)):
            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
    print("t-SNE finished. Output written to: ", args.outfile)
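
A hypothetical invocation, assuming this script is saved as *volia/tsne.py* (its file name is not shown in the diff):
```
python -m volia.tsne features.txt -o tsne.txt -n 2
```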