From e7d811503f88129fb1d8eb28dd6af09f681a771e Mon Sep 17 00:00:00 2001
From: Quillot Mathias
Date: Wed, 21 Apr 2021 15:06:03 +0200
Subject: [PATCH] New file architecture: scripts now live in the volia
 directory and the library in the core directory.

---
 scripts/data-management/convert-old.py |  23 ---
 scripts/data-management/filter_ids.py  |  23 ---
 scripts/dim-reduction/tsne.py          |  37 -----
 scripts/evaluations/clustering.py      | 254 ---------------------------------
 scripts/plot/plot-character.py         |  62 --------
 volia/convert-old.py                   |  23 +++
 volia/core/data.py                     |  44 ++++++
 volia/core/measures.py                 | 227 +++++++++++++++++++++++++++++
 volia/data_io.py                       |  44 ------
 volia/filter_ids.py                    |  24 ++++
 volia/measures.py                      | 227 -----------------------------
 volia/plot-character.py                |  62 ++++++++
 volia/test.py                          |   2 +
 volia/tsne.py                          |  37 +++++
 14 files changed, 419 insertions(+), 670 deletions(-)
 delete mode 100644 scripts/data-management/convert-old.py
 delete mode 100644 scripts/data-management/filter_ids.py
 delete mode 100644 scripts/dim-reduction/tsne.py
 delete mode 100644 scripts/evaluations/clustering.py
 delete mode 100644 scripts/plot/plot-character.py
 create mode 100644 volia/convert-old.py
 create mode 100644 volia/core/data.py
 create mode 100644 volia/core/measures.py
 delete mode 100644 volia/data_io.py
 create mode 100644 volia/filter_ids.py
 delete mode 100644 volia/measures.py
 create mode 100644 volia/plot-character.py
 create mode 100644 volia/test.py
 create mode 100644 volia/tsne.py

diff --git a/scripts/data-management/convert-old.py b/scripts/data-management/convert-old.py
deleted file mode 100644
index b91a6ce..0000000
--- a/scripts/data-management/convert-old.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import argparse
-from os.path import isfile
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(
-        description="Convert old files with wrong ids to new ones. Masseffect.")
-
-    parser.add_argument("file", type=str, help="feature, x2x, or list file")
-    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
-
-    args = parser.parse_args()
-
-    assert isfile(args.file), "The given file does not exist."
-
-    with open(args.file, "r") as f, open(args.outfile, "w") as of:
-        for line in f:
-            splited = line.replace("\n", "").split(" ")
-            metas = splited[0].split(",")
-            metas.pop(2)
-            splited[0] = ",".join(metas)
-            of.write(" ".join(splited) + "\n")
diff --git a/scripts/data-management/filter_ids.py b/scripts/data-management/filter_ids.py
deleted file mode 100644
index 023895c..0000000
--- a/scripts/data-management/filter_ids.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import argparse
-from os.path import isfile
-from volia.data_io import read_lst
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
-    parser.add_argument("file", type=str, help="")
-    parser.add_argument("--filter", default=None, type=str, help="")
-    parser.add_argument("--outfile", default="out.txt", type=str, help="")
-
-    args = parser.parse_args()
-
-    assert args.filter is not None
-    assert isfile(args.file)
-
-    list_ = read_lst(args.file)
-    filter_ = read_lst(args.filter)
-
-    with open(args.outfile, "w") as of:
-        for key in filter_.keys():
-            of.write(key + " " + " ".join(list_[key]) + "\n")
-
-    print("File filtered and written in: ", args.outfile)
\ No newline at end of file
diff --git a/scripts/dim-reduction/tsne.py b/scripts/dim-reduction/tsne.py
deleted file mode 100644
index e88a10e..0000000
--- a/scripts/dim-reduction/tsne.py
+++ /dev/null
@@ -1,37 +0,0 @@
-'''
-The goal of this script is to display calculate tsne of pvectors.
-'''
-
-import os
-from os.path import isfile
-import argparse
-import numpy as np
-from sklearn.manifold import TSNE
-
-from volia.data_io import read_features
-
-if __name__ == "__main__":
-    # Defining argparse
-    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
-    parser.add_argument('features', type=str,
-                        help='the path of the file you want to calculate tsne')
-    parser.add_argument('-o', '--outfile', type=str,
-                        default='.',
-                        help='the path of the output file.')
-    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
-                        default='2',
-                        help='number of components output of tsne')
-
-    args = parser.parse_args()
-
-    assert isfile(args.features)
-
-    features_list = read_features(args.features)
-    tuples_key_feat = np.vstack([ (key, feats) for key, feats in features_list.items()])
-    keys, features = zip(*tuples_key_feat)
-    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)
-
-    with open(args.outfile, "w") as of:
-        for i in range(len(keys)):
-            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
-    print("TSNE finished. Check if everything has been done well.")
\ No newline at end of file
diff --git a/scripts/evaluations/clustering.py b/scripts/evaluations/clustering.py
deleted file mode 100644
index 7c6e1eb..0000000
--- a/scripts/evaluations/clustering.py
+++ /dev/null
@@ -1,254 +0,0 @@
-'''
-This script allows the user to evaluate a classification system on new labels using clustering methods.
-The algorithms are applied on the given latent space (embedding).
-'''
-import argparse
-import numpy as np
-import pandas as pd
-import os
-import time
-import pickle
-import csv
-import json
-
-from sklearn.preprocessing import LabelEncoder
-from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.cluster import KMeans
-from sklearn.manifold import TSNE
-from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
-import matplotlib.pyplot as plt
-
-from volia.data_io import read_features,read_lst
-from volia.measures import entropy_score, purity_score
-
-'''
-TODO:
-- Add an option allowing the user to choose the number of
-clustering to train in order to compute the average and the
-'''
-
-
-def train_clustering(label_encoder, feats, classes, outdir):
-    num_classes = len(label_encoder.classes_)
-    estimator = None
-    kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl")
-    if args.onlymeasures:
-        print(f"Loading model: {kmeans_filepath}")
-        with open(kmeans_filepath, "rb") as f:
-            estimator = pickle.load(f)
-    else:
-        # Compute KMEANS clustering on data
-        print("Saving parameters")
-        kmeans_parameters = {
-            "n_clusters": num_classes,
-            "n_init": 100,
-            "tol": 10-6,
-            "algorithm": "elkan"
-        }
-        with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
-            json.dump(kmeans_parameters, f)
-
-        # Fit the model and Save parameters
-        print(f"Fit the model: {kmeans_filepath}")
-        estimator = KMeans(
-            **kmeans_parameters
-        )
-        estimator.fit(feats)
-        print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
-
-        with open(kmeans_filepath, "wb") as f:
-            pickle.dump(estimator, f)
-
-    # contains distance to each cluster for each sample
-    dist_space = estimator.transform(feats)
-    predictions = np.argmin(dist_space, axis=1)
-
-    # gives each cluster a name (considering most represented character)
-    dataframe = pd.DataFrame({
-        "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
-        "prediction": pd.Series(predictions)
-    })
-
-    def find_cluster_name_fn(c):
-        mask = dataframe["prediction"] == c
-        return dataframe[mask]["label"].value_counts(sort=False).idxmax()
-
-    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
-    predicted_labels = le.transform(
-        [cluster_names[pred] for pred in predictions])
-
-    # F-measure
-    fscores = f1_score(labels, predicted_labels, average=None)
-    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
-
-    # Entropy
-    _, _, entropy = entropy_score(labels, predicted_labels)
-
-    # Homogenity
-    homogeneity = homogeneity_score(labels, predicted_labels)
-
-    # Completeness
-    completeness = completeness_score(labels, predicted_labels)
-
-    # V-Measure
-    v_measure = v_measure_score(labels, predicted_labels)
-
-    # Purity
-    purity_scores = purity_score(labels, predicted_labels)
-    purity_class_score = purity_scores["purity_class_score"]
-    purity_cluster_score = purity_scores["purity_cluster_score"]
-    K = purity_scores["K"]
-
-    # Write results
-    with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
-        print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
-        print(f"Entropy: {entropy}", file=fd)
-        print(f"Global score : {np.mean(fscores)}", file=fd)
-        print(f"Homogeneity: {homogeneity}", file=fd)
-        print(f"completeness: {completeness}", file=fd)
-        print(f"v-measure: {v_measure}", file=fd)
-        print(f"purity class score: {purity_class_score}", file=fd)
-        print(f"purity cluster score: {purity_cluster_score}", file=fd)
-        print(f"purity overall evaluation criterion (K): {K}", file=fd)
-
-    # Process t-SNE and plot
-    tsne_estimator = TSNE()
-    embeddings = tsne_estimator.fit_transform(feats)
-    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
-        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
-
-    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
-    for c, name in enumerate(le.classes_):
-        c_mask = np.where(labels == c)
-        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
-
-        try:
-            id_cluster = cluster_names.index(name)
-        except ValueError:
-            print("WARNING: no cluster found for {}".format(name))
-            continue
-        c_mask = np.where(predictions == id_cluster)
-        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
-
-    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
-    axe1.set_title("true labels")
-    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
-    axe2.set_title("predicted cluster label")
-
-    plt.suptitle("Kmeans Clustering")
-
-    loc = os.path.join(
-        outdir,
-        args.prefix + "kmeans.pdf"
-    )
-    plt.savefig(loc, bbox_inches="tight")
-    plt.close()
-
-    print("INFO: figure saved at {}".format(loc))
-
-    end = time.time()
-    print("program ended in {0:.2f} seconds".format(end-start))
-    return {
-        "f1": np.mean(fscores),
-        "entropy": entropy,
-        "homogeneity": homogeneity,
-        "completeness": completeness,
-        "v-measure": v_measure,
-        "purity_class_score": purity_class_score,
-        "purity_cluster score": purity_cluster_score,
-        "K": K
-    }
-
-
-if __name__ == "__main__":
-    # Argparse
-    parser = argparse.ArgumentParser("Compute clustering on a latent space")
-    parser.add_argument("features")
-    parser.add_argument("utt2",
-                        type=str,
-                        help="file with [utt] [value]")
-    parser.add_argument("--idsfrom",
-                        type=str,
-                        default="utt2",
-                        choices=[
-                            "features",
-                            "utt2"
-                        ],
-                        help="from features or from utt2?")
-    parser.add_argument("--prefix",
-                        default="",
-                        type=str,
-                        help="prefix of saved files")
-    parser.add_argument("--outdir",
-                        default=None,
-                        type=str,
-                        help="Output directory")
-    parser.add_argument("--nmodels",
-                        type=int,
-                        default=1,
-                        help="specifies the number of models to train")
-    parser.add_argument("--onlymeasures",
-                        action='store_true',
-                        help="Don't compute the clustering, compute only the measures")
-    args = parser.parse_args()
-
-    assert args.outdir
-
-    start = time.time()
-
-    # Load features and utt2
-    features = read_features(args.features)
-    utt2 = read_lst(args.utt2)
-
-    # Take id list
-    if args.idsfrom == "features":
-        ids = list(features.keys())
-    elif args.idsfrom == "utt2":
-        ids = list(utt2.keys())
-    else:
-        print(f"idsfrom is not good: {args.idsfrom}")
-        exit(1)
-
-    feats = np.vstack([ features[id_] for id_ in ids ])
-    classes = [ utt2[id_] for id_ in ids ]
-
-    # Encode labels
-    le = LabelEncoder()
-    labels = le.fit_transform(classes)
-
-    measures = {}
-    for i in range(1, args.nmodels+1):
-        subdir = os.path.join(args.outdir, str(i))
-        if not os.path.exists(subdir):
-            os.mkdir(subdir)
-        print(f"[{i}/{args.nmodels}] => {subdir}")
-        results = train_clustering(le, feats, classes, subdir)
-
-        for key, value in results.items():
-            if key not in measures:
-                measures[key] = []
-            measures[key].append(results[key])
-
-
-    # File with results
-    file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")
-
-    with open(file_results, "w") as f:
-        f.write(f"[nmodels: {args.nmodels}]\n")
-        for key in measures.keys():
-            values = np.asarray(measures[key], dtype=float)
-            mean = np.mean(values)
-            std = np.std(values)
-            f.write(f"[{key} => mean: {mean}, std: {std}] \n")
-
-    # CSV File with all the values
-    file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")
-
-    with open(file_csv_measures, "w", newline="") as f:
-        writer = csv.writer(f, delimiter=",")
-        writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"])
-        for key in measures.keys():
-            values = np.asarray(measures[key], dtype=float)
-            mean = np.mean(values)
-            std = np.std(values)
-            writer.writerow([key] + list(values) + [mean] + [std])
\ No newline at end of file
diff --git a/scripts/plot/plot-character.py b/scripts/plot/plot-character.py
deleted file mode 100644
index bfb98d7..0000000
--- a/scripts/plot/plot-character.py
+++ /dev/null
@@ -1,62 +0,0 @@
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import argparse
-from os.path import isfile
-from volia.data_io import read_features, read_lst
-
-
-if __name__ == "__main__":
-    # Argparse
-    parser = argparse.ArgumentParser(description="Plot points with color for each character")
-    parser.add_argument("--features", type=str, help="features file path")
-    parser.add_argument("--utt2char", type=str, help="char2utt file path")
-    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
-    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
-    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
-    args = parser.parse_args()
-
-    # List of assertions
-    assert args.features, "Need to specify features option"
-    assert args.utt2char, "Need to specify char2utt option file"
-    assert isfile(args.features), "Features path should point to a file"
-    assert isfile(args.utt2char), "char2utt path should point to a file"
-    if args.sublist is not None:
-        assert isfile(args.sublist), "sublist path should point to a file"
-
-
-    id_to_features = read_features(args.features)
-
-    ids = []
-    if args.sublist is not None:
-        print("Using sublist")
-        list_ids = read_lst(args.sublist)
-        ids = [ key for key in list_ids.keys() ]
-    else:
-        ids = [ key for key in id_to_features.keys() ]
-
-    utt2char = read_lst(args.utt2char)
-
-    features = [ id_to_features[id_] for id_ in ids ]
-    features = np.vstack(features)
-
-    characters_list = [ utt2char[id_][0] for id_ in ids ]
-
-    features_T = features.transpose()
-    print("Number of characters: ", len(np.unique(characters_list)))
-    df = pd.DataFrame(dict(
-        x=features_T[0],
-        y=features_T[1],
-        character=characters_list))
-
-    groups = df.groupby('character')
-
-    # Plot
-    fig, ax = plt.subplots()
-
-    for character, group in groups:
-        p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
-    ax.legend()
-    plt.savefig(args.outfile)
-    print("Your plot is saved well (no check of this affirmation)")
diff --git a/volia/convert-old.py b/volia/convert-old.py
new file mode 100644
index 0000000..b91a6ce
--- /dev/null
+++ b/volia/convert-old.py
@@ -0,0 +1,23 @@
+import argparse
+from os.path import isfile
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="Convert old files with wrong ids to new ones. Masseffect.")
+
+    parser.add_argument("file", type=str, help="feature, x2x, or list file")
+    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
+
+    args = parser.parse_args()
+
+    assert isfile(args.file), "The given file does not exist."
+
+    with open(args.file, "r") as f, open(args.outfile, "w") as of:
+        for line in f:
+            splited = line.replace("\n", "").split(" ")
+            metas = splited[0].split(",")
+            metas.pop(2)
+            splited[0] = ",".join(metas)
+            of.write(" ".join(splited) + "\n")
diff --git a/volia/core/data.py b/volia/core/data.py
new file mode 100644
index 0000000..4c1bdae
--- /dev/null
+++ b/volia/core/data.py
@@ -0,0 +1,44 @@
+'''
+Data management input/output
+'''
+
+# Import packages and modules
+import numpy as np
+
+# Defining some types
+from typing import List, Dict
+KeyToList = Dict[str, List[str]]
+KeyToFeatures = Dict[str, List[float]]
+
+
+def read_lst(file_path: str) -> KeyToList:
+    '''
+    Read an lst file with this structure:
+    [id] [value1] [value2] ... [value n]
+
+    This is a basic function reused by others like read_features.
+    Returns a dictionary with the id as key and the list of values as the corresponding value.
+    '''
+    # KeyToList type variable
+    key_to_list = dict()
+    with open(file_path, "r") as f:
+        for line in f:
+            splited = line.replace("\n", "").split(" ")
+            id = splited[0]
+            values = splited[1:]
+            key_to_list[id] = values
+    return key_to_list
+
+
+def read_features(file_path: str) -> KeyToFeatures:
+    '''
+    Read a features file (same structure as read_lst) and convert each
+    list of values to a numpy array of floats.
+    '''
+    # KeyToFeatures type variable
+    key_to_features = dict()
+    # and the KeyToList
+    key_to_list = read_lst(file_path)
+
+    for key_, list_ in key_to_list.items():
+        key_to_features[key_] = np.asarray(list_, dtype=float)
+
+    return key_to_features
\ No newline at end of file
diff --git a/volia/core/measures.py b/volia/core/measures.py
new file mode 100644
index 0000000..0ef8967
--- /dev/null
+++ b/volia/core/measures.py
@@ -0,0 +1,227 @@
+'''
+This module is a part of my library.
+It aims to compute some measures for clustering.
+'''
+
+import numpy as np
+
+def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
+    '''
+    Compute the disequilibrium for all the clusters.
+    The disequilibrium is computed from the difference
+    between two clustering sets.
+    isGlobal lets the user choose the denominator of the function:
+    - True: divide the value by the total number of elements
+    - False: divide the value by the number of elements of the cluster
+
+    mod lets the user choose to apply a square ("power"), a percentage
+    scaling ("human"), an absolute value ("abs"), or any space-separated
+    combination of these.
+    '''
+
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing a matrix by a vector, line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+
+    dividers1 = 0
+    dividers2 = 0
+
+    if isGlobal:
+        dividers1 = matrix1.sum()
+        dividers2 = matrix2.sum()
+    else:
+        dividers1 = matrix1.sum(axis=1)
+        dividers2 = matrix2.sum(axis=1)
+
+    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)
+
+    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)
+
+    diff = matrix1_divided - matrix2_divided
+
+    mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
+
+    result = diff
+
+    if mod is not None and mod != "":
+        for word in mod.split(" "):
+            if word == "power":
+                result = np.power(result, 2)
+            elif word == "human":
+                result = result * 100
+            elif word == "abs":
+                result = np.absolute(result)
+            else:
+                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
+    return (mask, result)
+
+
+
+def disequilibrium_mean_by_cluster(mask, matrix):
+    '''
+    Mean of the disequilibrium.
+    matrix is the disequilibrium calculated
+    from the number of occurrences belonging to a class,
+    for each cluster.
+    '''
+    nb_k = len(matrix)
+    results = np.zeros((nb_k))
+
+    for i in range(nb_k):
+        results[i] = matrix[i].sum() / mask[i].sum()
+    return results
+
+
+def disequilibrium(matrix1, matrix2, isGlobal=False):
+    '''
+    Return the disequilibrium mask, the disequilibrium matrix as
+    percentages, and the global disequilibrium value.
+    '''
+    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
+    result_human = result * 100
+    result_power = np.power(result, 2)
+
+    return (
+        mask,
+        result_human,
+        disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
+    )
+
+
+def compute_count_matrix(y_truth, y_hat):
+    '''
+    Build the (cluster x class) count matrix from two label vectors.
+    '''
+    # Check size of the lists
+    assert len(y_hat) == len(y_truth), f"Label vectors should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
+
+    # Build count matrix
+    count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
+    for i in range(len(y_hat)):
+        count_matrix[y_hat[i]][y_truth[i]] += 1
+    return count_matrix
+
+
+def entropy_score(y_truth, y_hat):
+    '''
+    y_truth and y_hat need to be label encoded before calling this function.
+    Don't use one-hot labels.
+
+    Return a tuple with:
+    - result_matrix : the matrix of the weighted log probabilities (P(x) * log(P(x)))
+    - result_vector : the vector with the summed entropy of each cluster
+    - result : the final entropy measure of the clustering
+    '''
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing a matrix by a vector, line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+
+    # Build count matrix
+    count_matrix = compute_count_matrix(y_truth, y_hat)
+
+    # Build dividers vector
+    dividers = count_matrix.sum(axis=1)
+
+    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)
+
+    log_matrix = np.zeros(matrix_divided.shape)
+    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
+    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
+    result_vector = result_matrix.sum(axis=1)
+
+    if np.isnan(np.sum(result_vector)):
+        print("COUNT MATRIX")
+        print(count_matrix)
+        print("MATRIX DIVIDED")
+        print(matrix_divided)
+        print("RESULT MATRIX")
+        print(result_matrix)
+        print("RESULT VECTOR")
+        print(result_vector)
+        print("An error occurred due to a nan value; the intermediate values are printed above")
+        exit(1)
+
+    result = result_vector * dividers / dividers.sum()
+    result = result.sum()
+    return (result_matrix, result_vector, result)
+
+
+def purity_score(y_truth, y_hat):
+    '''
+    Return three values in a dictionary:
+    - purity_class_score: the purity score of the classes (asp)
+    - purity_cluster_score: the purity score of the clusters (acp)
+    - K: the overall evaluation criterion (sqrt(asp * acp))
+
+    This function is based on the following article:
+    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
+    '''
+
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing a matrix by a vector, line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+
+    def compute_purity_score(count_matrix, axis=0):
+        if axis==0:
+            other_axis = 1
+        else:
+            other_axis = 0
+        count_per_row = count_matrix.sum(axis=axis)
+        dividers = np.square(count_per_row)
+
+        count_matrix_squared = np.square(count_matrix)
+        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
+        vector_purity = np.sum(matrix_divided, axis=axis)
+
+        scalar_purity = np.average(vector_purity, weights=count_per_row)
+        return (vector_purity, scalar_purity)
+
+
+    count_matrix = compute_count_matrix(y_truth, y_hat)
+    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
+    _, purity_class_score = compute_purity_score(count_matrix, 0)
+
+    K = np.sqrt(purity_cluster_score * purity_class_score)
+
+    return {
+        "purity_class_score": purity_class_score,
+        "purity_cluster_score": purity_cluster_score,
+        "K": K
+    }
+
+
+if __name__ == "__main__":
+    print("Purity test #1")
+    # Hypothesis
+    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
+    # Truth
+    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
+
+    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
+    print(purity_score(y, y_hat))
+
+    print("Purity test #2")
+    # Hypothesis
+    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
+    # Truth
+    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
+
+    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
+    print("Result matrix: ")
+    print(result_matrix)
+    print("Result vector: ")
+    print(result_vector)
+    print("Result: ", result)
\ No newline at end of file
diff --git a/volia/data_io.py b/volia/data_io.py
deleted file mode 100644
index 4c1bdae..0000000
--- a/volia/data_io.py
+++ /dev/null
@@ -1,44 +0,0 @@
-'''
-Data management input/output
-'''
-
-# Import packages and modules
-import numpy as np
-
-# Defining some types
-from typing import List, Dict
-KeyToList = Dict[str, List[str]]
-KeyToFeatures = Dict[str, List[float]]
-
-
-def read_lst(file_path: str) -> KeyToList:
-    '''
-    Read lst file with this structure:
-    [id] [value1] [value2] ... [value n]
-
-    This is a basic function reused by others like read_features.
-    returns a dictionary with id as key and a list of value as corresponding values
-    '''
-    # KeyToList type variable
-    key_to_list = dict()
-    with open(file_path, "r") as f:
-        for line in f:
-            splited = line.replace("\n", "").split(" ")
-            id = splited[0]
-            values = splited[1:]
-            key_to_list[id] = values
-    return key_to_list
-
-
-def read_features(file_path: str) -> KeyToFeatures:
-    '''
-    '''
-    # KeyToFeatures type variable
-    key_to_features = dict()
-    # and the KeyToList
-    key_to_list = read_lst(file_path)
-
-    for key_, list_ in key_to_list.items():
-        key_to_features[key_] = np.asarray(list_, dtype=float)
-
-    return key_to_features
\ No newline at end of file
diff --git a/volia/filter_ids.py b/volia/filter_ids.py
new file mode 100644
index 0000000..17732ea
--- /dev/null
+++ b/volia/filter_ids.py
@@ -0,0 +1,24 @@
+import argparse
+from os.path import isfile
+from volia.core.data import read_lst
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
+    parser.add_argument("file", type=str, help="lst file to filter")
+    parser.add_argument("--filter", default=None, type=str, help="lst file whose ids define the subset to keep")
+    parser.add_argument("--outfile", default="out.txt", type=str, help="output file")
+
+    args = parser.parse_args()
+
+    assert args.filter is not None
+    assert isfile(args.file)
+
+    list_ = read_lst(args.file)
+    filter_ = read_lst(args.filter)
+
+    with open(args.outfile, "w") as of:
+        for key in filter_.keys():
+            of.write(key + " " + " ".join(list_[key]) + "\n")
+
+    print("File filtered and written in: ", args.outfile)
\ No newline at end of file
diff --git a/volia/measures.py b/volia/measures.py
deleted file mode 100644
index 0ef8967..0000000
--- a/volia/measures.py
+++ /dev/null
@@ -1,227 +0,0 @@
-'''
-This module is a part of my library.
-It aims to compute some measures for clustering.
-'''
-
-import numpy as np
-
-def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
-    '''
-    Compute disequilibrium for all the clusters.
-    The disequilibrium is computed from the difference
-    between two clustering sets.
-    isGlobal lets the user choose the denominator of the function:
-    - True: divide the value by the total number of elements
-    - False: divide the value by the number of elements of the cluster
-
-    mod lets the user choose to apply a square ("power"), a percentage
-    scaling ("human"), or an absolute value ("abs").
-    '''
-
-    def divide_line(a, divider):
-        '''
-        Sub function used for dividing matrix by a vector line by line.
-        '''
-        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
-
-    dividers1 = 0
-    dividers2 = 0
-
-    if isGlobal:
-        dividers1 = matrix1.sum()
-        dividers2 = matrix2.sum()
-    else:
-        dividers1 = matrix1.sum(axis=1)
-        dividers2 = matrix2.sum(axis=1)
-
-    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
-
-    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
-
-    diff = matrix1_divided - matrix2_divided
-
-    mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
-
-    result = diff
-
-    if mod != None or mod == "":
-        for word in mod.split(" "):
-            if word == "power":
-                result = np.power(result,2)
-            elif word == "human":
-                result = result * 100
-            elif word == "abs":
-                result = np.absolute(result)
-            else:
-                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
-    return (mask, result)
-
-
-
-def disequilibrium_mean_by_cluster(mask, matrix):
-    '''
-    Mean of disequilibrium
-    matrix is the disequilibrium calculated
-    from number of occurences belonging to a class,
-    for each cluster.
-    '''
-    nb_k = len(matrix)
-    results = np.zeros((nb_k))
-
-    for i in range(nb_k):
-        results[i] = matrix[i].sum() / mask[i].sum()
-    return results
-
-
-def disequilibrium(matrix1, matrix2, isGlobal=False):
-    '''
-    Disequilibrium matrix
-    And Disequilibrium value
-    '''
-    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
-    result_human = result * 100
-    result_power = np.power(result, 2)
-
-    return (
-        mask,
-        result_human,
-        disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
-    )
-
-
-def compute_count_matrix(y_truth, y_hat):
-    '''
-    Check the size of the lists with assertion
-    '''
-    # Check size of the lists
-    assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
-
-    # Build count matrix
-    count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
-    for i in range(len(y_hat)):
-        count_matrix[y_hat[i]][y_truth[i]] += 1
-    return count_matrix
-
-
-def entropy_score(y_truth, y_hat):
-    '''
-    Need to use label encoder before givin y_hat and y_truth
-    Don't use one hot labels
-
-    Return a tuple with:
-    - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))
-    - result_vector : the vector with summing entropy of each class. Each value corresponds to a cluster.
-    - result : the final entropy measure of the clustering
-    '''
-    def divide_line(a, divider):
-        '''
-        Sub function used for dividing matrix by a vector line by line.
-        '''
-        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
-
-    # Build count matrix
-    count_matrix = compute_count_matrix(y_truth, y_hat)
-
-    # Build dividers vector
-    dividers = count_matrix.sum(axis=1)
-
-    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)
-
-    log_matrix = np.zeros(matrix_divided.shape)
-    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
-    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
-    result_vector = result_matrix.sum(axis=1)
-    result_vector.sum()
-
-    if np.isnan(np.sum(result_vector)):
-        print("COUNT MATRIX")
-        print(count_matrix)
-        print("MATRIX DIVIDED")
-        print(matrix_divided)
-        print("RESULT MATRIX")
-        print(result_matrix)
-        print("VECTOR MATRIX")
-        print(result_vector)
-        print("An error occured due to nan value, some values are printed before")
-        exit(1)
-
-    result = result_vector * dividers / dividers.sum()
-    result = result.sum()
-    return (result_matrix, result_vector, result)
-
-
-def purity_score(y_truth, y_hat):
-    '''
-    Return three values in a dictionary:
-    - purity_class_score: the purity score of the class (asp)
-    - purity_cluster_score: the purity score of the cluster (acp)
-    - K: the overall evaluation criterion (sqrt(asp * acp))
-
-    This function is based on the following article:
-    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
-    '''
-
-    def divide_line(a, divider):
-        '''
-        Sub function used for dividing matrix by a vector line by line.
-        '''
-        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
-
-    def compute_purity_score(count_matrix, axis=0):
-        if axis==0:
-            other_axis = 1
-        else:
-            other_axis = 0
-        count_per_row = count_matrix.sum(axis=axis)
-        dividers = np.square(count_per_row)
-
-        count_matrix_squared = np.square(count_matrix)
-        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)
-        vector_purity = np.sum(matrix_divided, axis=axis)
-
-        scalar_purity = np.average(vector_purity, weights=count_per_row)
-        return (vector_purity, scalar_purity)
-
-
-    count_matrix = compute_count_matrix(y_truth, y_hat)
-    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
-    _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
-
-    K = np.sqrt(purity_cluster_score * purity_class_score)
-
-    for i in range(count_matrix.shape[0]):
-        for j in range(count_matrix.shape[1]):
-            count_matrix[i][j]
-        count_matrix[i]
-    return {
-        "purity_class_score": purity_class_score,
-        "purity_cluster_score": purity_cluster_score,
-        "K": K
-    }
-
-
-if __name__ == "__main__":
-    print("Purity test #1")
-    # Hypothesis
-    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
-    # Truth
-    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
-
-    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
-    print(purity_score(y, y_hat))
-
-    exit(1)
-    print("Purity test #2")
-    # Hypothesis
-    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
-    # Truth
-    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
-
-    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
-    exit(1)
-    print("Result matrix: ")
-    print(result_matrix)
-    print("Result vector: ")
-    print(result_vector)
-    print("Result: ", result)
\ No newline at end of file
diff --git a/volia/plot-character.py b/volia/plot-character.py
new file mode 100644
index 0000000..bfb98d7
--- /dev/null
+++ b/volia/plot-character.py
@@ -0,0 +1,62 @@
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import argparse
+from os.path import isfile
+from volia.core.data import read_features, read_lst
+
+
+if __name__ == "__main__":
+    # Argparse
+    parser = argparse.ArgumentParser(description="Plot points with color for each character")
+    parser.add_argument("--features", type=str, help="features file path")
+    parser.add_argument("--utt2char", type=str, help="utt2char file path")
+    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
+    parser.add_argument("--outfile", default="out.pdf", type=str, help="output file path")
+    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
+    args = parser.parse_args()
+
+    # List of assertions
+    assert args.features, "Need to specify the features option"
+    assert args.utt2char, "Need to specify the utt2char option"
+    assert isfile(args.features), "Features path should point to a file"
+    assert isfile(args.utt2char), "utt2char path should point to a file"
+    if args.sublist is not None:
+        assert isfile(args.sublist), "sublist path should point to a file"
+
+
+    id_to_features = read_features(args.features)
+
+    ids = []
+    if args.sublist is not None:
+        print("Using sublist")
+        list_ids = read_lst(args.sublist)
+        ids = [ key for key in list_ids.keys() ]
+    else:
+        ids = [ key for key in id_to_features.keys() ]
+
+    utt2char = read_lst(args.utt2char)
+
+    features = [ id_to_features[id_] for id_ in ids ]
+    features = np.vstack(features)
+
+    characters_list = [ utt2char[id_][0] for id_ in ids ]
+
+    features_T = features.transpose()
+    print("Number of characters: ", len(np.unique(characters_list)))
+    df = pd.DataFrame(dict(
+        x=features_T[0],
+        y=features_T[1],
+        character=characters_list))
+
+    groups = df.groupby('character')
+
+    # Plot
+    fig, ax = plt.subplots()
+
+    for character, group in groups:
+        p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
+    ax.legend()
+    ax.set_title(args.title)
+    plt.savefig(args.outfile)
+    print("Plot saved to:", args.outfile)
diff --git a/volia/test.py b/volia/test.py
new file mode 100644
index 0000000..108dad2
--- /dev/null
+++ b/volia/test.py
@@ -0,0 +1,2 @@
+if __name__ == "__main__":
+    print("volia is correctly installed.")
\ No newline at end of file
diff --git a/volia/tsne.py b/volia/tsne.py
new file mode 100644
index 0000000..f276c22
--- /dev/null
+++ b/volia/tsne.py
@@ -0,0 +1,37 @@
+'''
+The goal of this script is to compute the t-SNE representation of p-vectors.
+'''
+
+import os
+from os.path import isfile
+import argparse
+import numpy as np
+from sklearn.manifold import TSNE
+
+from volia.core.data import read_features
+
+if __name__ == "__main__":
+    # Defining argparse
+    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvectors in 2 or 3d')
+    parser.add_argument('features', type=str,
+                        help='the path of the file on which you want to calculate tsne')
+    parser.add_argument('-o', '--outfile', type=str,
+                        default='out.txt',
+                        help='the path of the output file')
+    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
+                        default=2,
+                        help='number of output components of tsne')
+
+    args = parser.parse_args()
+
+    assert isfile(args.features)
+
+    features_list = read_features(args.features)
+    # Keep the keys and the feature matrix aligned (stacking (key, feats)
+    # tuples would coerce the features to strings).
+    keys = list(features_list.keys())
+    features = np.vstack([ features_list[key] for key in keys ])
+    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)
+
+    with open(args.outfile, "w") as of:
+        for i in range(len(keys)):
+            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
+    print("t-SNE finished. Output written to:", args.outfile)
\ No newline at end of file
-- 
1.8.2.3