Commit e7d811503f88129fb1d8eb28dd6af09f681a771e

Authored by Quillot Mathias
1 parent 15e16ec460
Exists in master

New file architecture. Now scripts are on volia's directory and the library is on the core directory.

Showing 14 changed files with 419 additions and 670 deletions (side-by-side diff view).

scripts/data-management/convert-old.py
1   -import argparse
2   -from os.path import isfile
3   -
4   -
5   -if __name__ == "__main__":
6   -
7   - parser = argparse.ArgumentParser(
8   - description="Convert old files with wrong id to new one. Masseffect.")
9   -
10   - parser.add_argument("file", type=str, help="feature, x2x, or list file")
11   - parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
12   -
13   - args = parser.parse_args()
14   -
15   - assert isfile(args.file), "The given file does not exist."
16   -
17   - with open(args.file, "r") as f, open(args.outfile, "w") as of:
18   - for line in f:
19   - splited = line.replace("\n", "").split(" ")
20   - metas = splited[0].split(",")
21   - metas.pop(2)
22   - splited[0] = ",".join(metas)
23   - of.write(" ".join(splited) + "\n")
scripts/data-management/filter_ids.py
1   -import argparse
2   -from os.path import isfile
3   -from volia.data_io import read_lst
4   -
5   -if __name__ == "__main__":
6   - parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
7   - parser.add_argument("file", type=str, help="")
8   - parser.add_argument("--filter", default=None, type=str, help="")
9   - parser.add_argument("--outfile", default="out.txt", type=str, help="")
10   -
11   - args = parser.parse_args()
12   -
13   - assert args.filter is not None
14   - assert isfile(args.file)
15   -
16   - list_ = read_lst(args.file)
17   - filter_ = read_lst(args.filter)
18   -
19   - with open(args.outfile, "w") as of:
20   - for key in filter_.keys():
21   - of.write(key + " " + " ".join(list_[key]) + "\n")
22   -
23   - print("File filtered and written in: ", args.outfile)
scripts/dim-reduction/tsne.py
1   -'''
2   -The goal of this script is to display calculate tsne of pvectors.
3   -'''
4   -
5   -import os
6   -from os.path import isfile
7   -import argparse
8   -import numpy as np
9   -from sklearn.manifold import TSNE
10   -
11   -from volia.data_io import read_features
12   -
13   -if __name__ == "__main__":
14   - # Defining argparse
15   - parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
16   - parser.add_argument('features', type=str,
17   - help='the path of the file you want to calculate tsne')
18   - parser.add_argument('-o', '--outfile', type=str,
19   - default='.',
20   - help='the path of the output file.')
21   - parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
22   - default='2',
23   - help='number of components output of tsne')
24   -
25   - args = parser.parse_args()
26   -
27   - assert isfile(args.features)
28   -
29   - features_list = read_features(args.features)
30   - tuples_key_feat = np.vstack([ (key, feats) for key, feats in features_list.items()])
31   - keys, features = zip(*tuples_key_feat)
32   - feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)
33   -
34   - with open(args.outfile, "w") as of:
35   - for i in range(len(keys)):
36   - of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
37   - print("TSNE finished. Check if everything has been done well.")
scripts/evaluations/clustering.py
1   -'''
2   -This script allows the user to evaluate a classification system on new labels using clustering methods.
3   -The algorithms are applied on the given latent space (embedding).
4   -'''
5   -import argparse
6   -import numpy as np
7   -import pandas as pd
8   -import os
9   -import time
10   -import pickle
11   -import csv
12   -import json
13   -
14   -from sklearn.preprocessing import LabelEncoder
15   -from sklearn.metrics.pairwise import pairwise_distances
16   -from sklearn.cluster import KMeans
17   -from sklearn.manifold import TSNE
18   -from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
19   -import matplotlib.pyplot as plt
20   -
21   -from volia.data_io import read_features,read_lst
22   -from volia.measures import entropy_score, purity_score
23   -
24   -'''
25   -TODO:
26   -- Add an option allowing the user to choose the number of
27   -clustering to train in order to compute the average and the
28   -'''
29   -
30   -
31   -def train_clustering(label_encoder, feats, classes, outdir):
32   - num_classes = len(label_encoder.classes_)
33   - estimator = None
34   - kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl")
35   - if args.onlymeasures:
36   - print(f"Loading model: {kmeans_filepath}")
37   - with open(kmeans_filepath, "rb") as f:
38   - estimator = pickle.load(f)
39   - else:
40   - # Compute KMEANS clustering on data
41   - print("Saving parameters")
42   - kmeans_parameters = {
43   - "n_clusters": num_classes,
44   - "n_init": 100,
45   - "tol": 10-6,
46   - "algorithm": "elkan"
47   - }
48   - with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
49   - json.dump(kmeans_parameters, f)
50   -
51   - # Fit the model and Save parameters
52   - print(f"Fit the model: {kmeans_filepath}")
53   - estimator = KMeans(
54   - **kmeans_parameters
55   - )
56   - estimator.fit(feats)
57   - print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
58   -
59   - with open(kmeans_filepath, "wb") as f:
60   - pickle.dump(estimator, f)
61   -
62   - # contains distance to each cluster for each sample
63   - dist_space = estimator.transform(feats)
64   - predictions = np.argmin(dist_space, axis=1)
65   -
66   - # gives each cluster a name (considering most represented character)
67   - dataframe = pd.DataFrame({
68   - "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
69   - "prediction": pd.Series(predictions)
70   - })
71   -
72   - def find_cluster_name_fn(c):
73   - mask = dataframe["prediction"] == c
74   - return dataframe[mask]["label"].value_counts(sort=False).idxmax()
75   -
76   - cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
77   - predicted_labels = le.transform(
78   - [cluster_names[pred] for pred in predictions])
79   -
80   - # F-measure
81   - fscores = f1_score(labels, predicted_labels, average=None)
82   - fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
83   -
84   - # Entropy
85   - _, _, entropy = entropy_score(labels, predicted_labels)
86   -
87   - # Homogenity
88   - homogeneity = homogeneity_score(labels, predicted_labels)
89   -
90   - # Completeness
91   - completeness = completeness_score(labels, predicted_labels)
92   -
93   - # V-Measure
94   - v_measure = v_measure_score(labels, predicted_labels)
95   -
96   - # Purity
97   - purity_scores = purity_score(labels, predicted_labels)
98   - purity_class_score = purity_scores["purity_class_score"]
99   - purity_cluster_score = purity_scores["purity_cluster_score"]
100   - K = purity_scores["K"]
101   -
102   - # Write results
103   - with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
104   - print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
105   - print(f"Entropy: {entropy}", file=fd)
106   - print(f"Global score : {np.mean(fscores)}", file=fd)
107   - print(f"Homogeneity: {homogeneity}", file=fd)
108   - print(f"completeness: {completeness}", file=fd)
109   - print(f"v-measure: {v_measure}", file=fd)
110   - print(f"purity class score: {purity_class_score}", file=fd)
111   - print(f"purity cluster score: {purity_cluster_score}", file=fd)
112   - print(f"purity overall evaluation criterion (K): {K}", file=fd)
113   -
114   - # Process t-SNE and plot
115   - tsne_estimator = TSNE()
116   - embeddings = tsne_estimator.fit_transform(feats)
117   - print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
118   - tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
119   -
120   - fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
121   - for c, name in enumerate(le.classes_):
122   - c_mask = np.where(labels == c)
123   - axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
124   -
125   - try:
126   - id_cluster = cluster_names.index(name)
127   - except ValueError:
128   - print("WARNING: no cluster found for {}".format(name))
129   - continue
130   - c_mask = np.where(predictions == id_cluster)
131   - axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
132   -
133   - axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
134   - axe1.set_title("true labels")
135   - axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
136   - axe2.set_title("predicted cluster label")
137   -
138   - plt.suptitle("Kmeans Clustering")
139   -
140   - loc = os.path.join(
141   - outdir,
142   - args.prefix + "kmeans.pdf"
143   - )
144   - plt.savefig(loc, bbox_inches="tight")
145   - plt.close()
146   -
147   - print("INFO: figure saved at {}".format(loc))
148   -
149   - end = time.time()
150   - print("program ended in {0:.2f} seconds".format(end-start))
151   - return {
152   - "f1": np.mean(fscores),
153   - "entropy": entropy,
154   - "homogeneity": homogeneity,
155   - "completeness": completeness,
156   - "v-measure": v_measure,
157   - "purity_class_score": purity_class_score,
158   - "purity_cluster score": purity_cluster_score,
159   - "K": K
160   - }
161   -
162   -
163   -if __name__ == "__main__":
164   - # Argparse
165   - parser = argparse.ArgumentParser("Compute clustering on a latent space")
166   - parser.add_argument("features")
167   - parser.add_argument("utt2",
168   - type=str,
169   - help="file with [utt] [value]")
170   - parser.add_argument("--idsfrom",
171   - type=str,
172   - default="utt2",
173   - choices=[
174   - "features",
175   - "utt2"
176   - ],
177   - help="from features or from utt2?")
178   - parser.add_argument("--prefix",
179   - default="",
180   - type=str,
181   - help="prefix of saved files")
182   - parser.add_argument("--outdir",
183   - default=None,
184   - type=str,
185   - help="Output directory")
186   - parser.add_argument("--nmodels",
187   - type=int,
188   - default=1,
189   - help="specifies the number of models to train")
190   - parser.add_argument("--onlymeasures",
191   - action='store_true',
192   - help="Don't compute the clustering, compute only the measures")
193   - args = parser.parse_args()
194   -
195   - assert args.outdir
196   -
197   - start = time.time()
198   -
199   - # Load features and utt2
200   - features = read_features(args.features)
201   - utt2 = read_lst(args.utt2)
202   -
203   - # Take id list
204   - if args.idsfrom == "features":
205   - ids = list(features.keys())
206   - elif args.idsfrom == "utt2":
207   - ids = list(utt2.keys())
208   - else:
209   - print(f"idsfrom is not good: {args.idsfrom}")
210   - exit(1)
211   -
212   - feats = np.vstack([ features[id_] for id_ in ids ])
213   - classes = [ utt2[id_] for id_ in ids ]
214   -
215   - # Encode labels
216   - le = LabelEncoder()
217   - labels = le.fit_transform(classes)
218   -
219   - measures = {}
220   - for i in range(1, args.nmodels+1):
221   - subdir = os.path.join(args.outdir, str(i))
222   - if not os.path.exists(subdir):
223   - os.mkdir(subdir)
224   - print(f"[{i}/{args.nmodels}] => {subdir}")
225   - results = train_clustering(le, feats, classes, subdir)
226   -
227   - for key, value in results.items():
228   - if key not in measures:
229   - measures[key] = []
230   - measures[key].append(results[key])
231   -
232   -
233   - # File with results
234   - file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")
235   -
236   - with open(file_results, "w") as f:
237   - f.write(f"[nmodels: {args.nmodels}]\n")
238   - for key in measures.keys():
239   - values = np.asarray(measures[key], dtype=float)
240   - mean = np.mean(values)
241   - std = np.std(values)
242   - f.write(f"[{key} => mean: {mean}, std: {std}] \n")
243   -
244   - # CSV File with all the values
245   - file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")
246   -
247   - with open(file_csv_measures, "w", newline="") as f:
248   - writer = csv.writer(f, delimiter=",")
249   - writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"])
250   - for key in measures.keys():
251   - values = np.asarray(measures[key], dtype=float)
252   - mean = np.mean(values)
253   - std = np.std(values)
254   - writer.writerow([key] + list(values) + [mean] + [std])
scripts/plot/plot-character.py
1   -
2   -import matplotlib.pyplot as plt
3   -import numpy as np
4   -import pandas as pd
5   -import argparse
6   -from os.path import isfile
7   -from volia.data_io import read_features, read_lst
8   -
9   -
10   -if __name__ == "__main__":
11   - # Argparse
12   - parser = argparse.ArgumentParser(description="Plot points with color for each character")
13   - parser.add_argument("--features", type=str, help="features file path")
14   - parser.add_argument("--utt2char", type=str, help="char2utt file path")
15   - parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
16   - parser.add_argument("--outfile", default="out.pdf", type=str, help="")
17   - parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
18   - args = parser.parse_args()
19   -
20   - # List of assertions
21   - assert args.features, "Need to specify features option"
22   - assert args.utt2char, "Need to specify char2utt option file"
23   - assert isfile(args.features), "Features path should point to a file"
24   - assert isfile(args.utt2char), "char2utt path should point to a file"
25   - if args.sublist is not None:
26   - assert isfile(args.sublist), "sublist path should point to a file"
27   -
28   -
29   - id_to_features = read_features(args.features)
30   -
31   - ids = []
32   - if args.sublist is not None:
33   - print("Using sublist")
34   - list_ids = read_lst(args.sublist)
35   - ids = [ key for key in list_ids.keys() ]
36   - else:
37   - ids = [ key for key in id_to_features.keys() ]
38   -
39   - utt2char = read_lst(args.utt2char)
40   -
41   - features = [ id_to_features[id_] for id_ in ids ]
42   - features = np.vstack(features)
43   -
44   - characters_list = [ utt2char[id_][0] for id_ in ids ]
45   -
46   - features_T = features.transpose()
47   - print("Number of characters: ", len(np.unique(characters_list)))
48   - df = pd.DataFrame(dict(
49   - x=features_T[0],
50   - y=features_T[1],
51   - character=characters_list))
52   -
53   - groups = df.groupby('character')
54   -
55   - # Plot
56   - fig, ax = plt.subplots()
57   -
58   - for character, group in groups:
59   - p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
60   - ax.legend()
61   - plt.savefig(args.outfile)
62   - print("Your plot is saved well (no check of this affirmation)")
volia/convert-old.py
  1 +import argparse
  2 +from os.path import isfile
  3 +
  4 +
  5 +if __name__ == "__main__":
  6 +
  7 + parser = argparse.ArgumentParser(
  8 + description="Convert old files with wrong id to new one. Masseffect.")
  9 +
  10 + parser.add_argument("file", type=str, help="feature, x2x, or list file")
  11 + parser.add_argument("--outfile", type=str, default="out.txt", help="output file")
  12 +
  13 + args = parser.parse_args()
  14 +
  15 + assert isfile(args.file), "The given file does not exist."
  16 +
  17 + with open(args.file, "r") as f, open(args.outfile, "w") as of:
  18 + for line in f:
  19 + splited = line.replace("\n", "").split(" ")
  20 + metas = splited[0].split(",")
  21 + metas.pop(2)
  22 + splited[0] = ",".join(metas)
  23 + of.write(" ".join(splited) + "\n")
  1 +'''
  2 +Data management input/output
  3 +'''
  4 +
  5 +# Import packages and modules
  6 +import numpy as np
  7 +
  8 +# Defining some types
  9 +from typing import List, Dict
  10 +KeyToList = Dict[str, List[str]]
  11 +KeyToFeatures = Dict[str, List[float]]
  12 +
  13 +
  14 +def read_lst(file_path: str) -> KeyToList:
  15 + '''
  16 + Read lst file with this structure:
  17 + [id] [value1] [value2] ... [value n]
  18 +
  19 + This is a basic function reused by others like read_features.
  20 + returns a dictionary with id as key and a list of value as corresponding values
  21 + '''
  22 + # KeyToList type variable
  23 + key_to_list = dict()
  24 + with open(file_path, "r") as f:
  25 + for line in f:
  26 + splited = line.replace("\n", "").split(" ")
  27 + id = splited[0]
  28 + values = splited[1:]
  29 + key_to_list[id] = values
  30 + return key_to_list
  31 +
  32 +
  33 +def read_features(file_path: str) -> KeyToFeatures:
  34 + '''
  35 + '''
  36 + # KeyToFeatures type variable
  37 + key_to_features = dict()
  38 + # and the KeyToList
  39 + key_to_list = read_lst(file_path)
  40 +
  41 + for key_, list_ in key_to_list.items():
  42 + key_to_features[key_] = np.asarray(list_, dtype=float)
  43 +
  44 + return key_to_features
volia/core/measures.py
  1 +'''
  2 +This module is a part of my library.
  3 +It aims to compute some measures for clustering.
  4 +'''
  5 +
  6 +import numpy as np
  7 +
  8 +def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
  9 + '''
  10 + Compute disequilibrium for all the clusters.
  11 + The disequilibrium is compute from the difference
  12 + between two clustering sets.
  13 + isGlobal permet à l'utilisateur de choisir le dénominateur de
  14 + la fonction :
  15 + - True : divise la valeur par le nombre d'élément du cluster
  16 + - False : divise la valeur par le nombre d'élément total
  17 +
  18 + withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou
  19 + une valeur absolue.
  20 + '''
  21 +
  22 + def divide_line(a, divider):
  23 + '''
  24 + Sub function used for dividing matrix by a vector line by line.
  25 + '''
  26 + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
  27 +
  28 + dividers1 = 0
  29 + dividers2 = 0
  30 +
  31 + if isGlobal:
  32 + dividers1 = matrix1.sum()
  33 + dividers2 = matrix2.sum()
  34 + else:
  35 + dividers1 = matrix1.sum(axis=1)
  36 + dividers2 = matrix2.sum(axis=1)
  37 +
  38 + matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
  39 +
  40 + matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
  41 +
  42 + diff = matrix1_divided - matrix2_divided
  43 +
  44 + mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
  45 +
  46 + result = diff
  47 +
  48 + if mod != None or mod == "":
  49 + for word in mod.split(" "):
  50 + if word == "power":
  51 + result = np.power(result,2)
  52 + elif word == "human":
  53 + result = result * 100
  54 + elif word == "abs":
  55 + result = np.absolute(result)
  56 + else:
  57 + raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
  58 + return (mask, result)
  59 +
  60 +
  61 +
  62 +def disequilibrium_mean_by_cluster(mask, matrix):
  63 + '''
  64 + Mean of disequilibrium
  65 + matrix is the disequilibrium calculated
  66 + from number of occurences belonging to a class,
  67 + for each cluster.
  68 + '''
  69 + nb_k = len(matrix)
  70 + results = np.zeros((nb_k))
  71 +
  72 + for i in range(nb_k):
  73 + results[i] = matrix[i].sum() / mask[i].sum()
  74 + return results
  75 +
  76 +
  77 +def disequilibrium(matrix1, matrix2, isGlobal=False):
  78 + '''
  79 + Disequilibrium matrix
  80 + And Disequilibrium value
  81 + '''
  82 + mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
  83 + result_human = result * 100
  84 + result_power = np.power(result, 2)
  85 +
  86 + return (
  87 + mask,
  88 + result_human,
  89 + disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
  90 + )
  91 +
  92 +
  93 +def compute_count_matrix(y_truth, y_hat):
  94 + '''
  95 + Check the size of the lists with assertion
  96 + '''
  97 + # Check size of the lists
  98 + assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
  99 +
  100 + # Build count matrix
  101 + count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
  102 + for i in range(len(y_hat)):
  103 + count_matrix[y_hat[i]][y_truth[i]] += 1
  104 + return count_matrix
  105 +
  106 +
  107 +def entropy_score(y_truth, y_hat):
  108 + '''
  109 + Need to use label encoder before givin y_hat and y_truth
  110 + Don't use one hot labels
  111 +
  112 + Return a tuple with:
  113 + - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))
  114 + - result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.
  115 + - result : the final entropy measure of the clustering
  116 + '''
  117 + def divide_line(a, divider):
  118 + '''
  119 + Sub function used for dividing matrix by a vector line by line.
  120 + '''
  121 + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
  122 +
  123 + # Build count matrix
  124 + count_matrix = compute_count_matrix(y_truth, y_hat)
  125 +
  126 + # Build dividers vector
  127 + dividers = count_matrix.sum(axis=1)
  128 +
  129 + matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)
  130 +
  131 + log_matrix = np.zeros(matrix_divided.shape)
  132 + np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
  133 + result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
  134 + result_vector = result_matrix.sum(axis=1)
  135 + result_vector.sum()
  136 +
  137 + if np.isnan(np.sum(result_vector)):
  138 + print("COUNT MATRIX")
  139 + print(count_matrix)
  140 + print("MATRIX DIVIDED")
  141 + print(matrix_divided)
  142 + print("RESULT MATRIX")
  143 + print(result_matrix)
  144 + print("VECTOR MATRIX")
  145 + print(result_vector)
  146 + print("An error occured due to nan value, some values are printed before")
  147 + exit(1)
  148 +
  149 + result = result_vector * dividers / dividers.sum()
  150 + result = result.sum()
  151 + return (result_matrix, result_vector, result)
  152 +
  153 +
  154 +def purity_score(y_truth, y_hat):
  155 + '''
  156 + Return three values in a dictionary:
  157 + - purity_class_score: the purity score of the class (asp)
  158 + - purity_cluster_score: the purity score of the cluster (acp)
  159 + - K: the overall evaluation criterion (sqrt(asp * acp))
  160 +
  161 + This function is based on the following article:
  162 + Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
  163 + '''
  164 +
  165 + def divide_line(a, divider):
  166 + '''
  167 + Sub function used for dividing matrix by a vector line by line.
  168 + '''
  169 + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
  170 +
  171 + def compute_purity_score(count_matrix, axis=0):
  172 + if axis==0:
  173 + other_axis = 1
  174 + else:
  175 + other_axis = 0
  176 + count_per_row = count_matrix.sum(axis=axis)
  177 + dividers = np.square(count_per_row)
  178 +
  179 + count_matrix_squared = np.square(count_matrix)
  180 + matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)
  181 + vector_purity = np.sum(matrix_divided, axis=axis)
  182 +
  183 + scalar_purity = np.average(vector_purity, weights=count_per_row)
  184 + return (vector_purity, scalar_purity)
  185 +
  186 +
  187 + count_matrix = compute_count_matrix(y_truth, y_hat)
  188 + _, purity_cluster_score = compute_purity_score(count_matrix, 1)
  189 + _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
  190 +
  191 + K = np.sqrt(purity_cluster_score * purity_class_score)
  192 +
  193 + for i in range(count_matrix.shape[0]):
  194 + for j in range(count_matrix.shape[1]):
  195 + count_matrix[i][j]
  196 + count_matrix[i]
  197 + return {
  198 + "purity_class_score": purity_class_score,
  199 + "purity_cluster_score": purity_cluster_score,
  200 + "K": K
  201 + }
  202 +
  203 +
  204 +if __name__ == "__main__":
  205 + print("Purity test #1")
  206 + # Hypothesis
  207 + y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
  208 + # Truth
  209 + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
  210 +
  211 + (result_matrix, result_vector, result) = entropy_score(y, y_hat)
  212 + print(purity_score(y, y_hat))
  213 +
  214 + exit(1)
  215 + print("Purity test #2")
  216 + # Hypothesis
  217 + y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
  218 + # Truth
  219 + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
  220 +
  221 + (result_matrix, result_vector, result) = entropy_score(y, y_hat)
  222 + exit(1)
  223 + print("Result matrix: ")
  224 + print(result_matrix)
  225 + print("Result vector: ")
  226 + print(result_vector)
  227 + print("Result: ", result)
volia/data_io.py
1   -'''
2   -Data management input/output
3   -'''
4   -
5   -# Import packages and modules
6   -import numpy as np
7   -
8   -# Defining some types
9   -from typing import List, Dict
10   -KeyToList = Dict[str, List[str]]
11   -KeyToFeatures = Dict[str, List[float]]
12   -
13   -
14   -def read_lst(file_path: str) -> KeyToList:
15   - '''
16   - Read lst file with this structure:
17   - [id] [value1] [value2] ... [value n]
18   -
19   - This is a basic function reused by others like read_features.
20   - returns a dictionary with id as key and a list of value as corresponding values
21   - '''
22   - # KeyToList type variable
23   - key_to_list = dict()
24   - with open(file_path, "r") as f:
25   - for line in f:
26   - splited = line.replace("\n", "").split(" ")
27   - id = splited[0]
28   - values = splited[1:]
29   - key_to_list[id] = values
30   - return key_to_list
31   -
32   -
33   -def read_features(file_path: str) -> KeyToFeatures:
34   - '''
35   - '''
36   - # KeyToFeatures type variable
37   - key_to_features = dict()
38   - # and the KeyToList
39   - key_to_list = read_lst(file_path)
40   -
41   - for key_, list_ in key_to_list.items():
42   - key_to_features[key_] = np.asarray(list_, dtype=float)
43   -
44   - return key_to_features
  1 +import argparse
  2 +from os.path import isfile
  3 +#from volia.data_io import read_lst
  4 +
  5 +import volia
  6 +if __name__ == "__main__":
  7 + parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
  8 + parser.add_argument("file", type=str, help="")
  9 + parser.add_argument("--filter", default=None, type=str, help="")
  10 + parser.add_argument("--outfile", default="out.txt", type=str, help="")
  11 +
  12 + args = parser.parse_args()
  13 +
  14 + assert args.filter is not None
  15 + assert isfile(args.file)
  16 +
  17 + list_ = read_lst(args.file)
  18 + filter_ = read_lst(args.filter)
  19 +
  20 + with open(args.outfile, "w") as of:
  21 + for key in filter_.keys():
  22 + of.write(key + " " + " ".join(list_[key]) + "\n")
  23 +
  24 + print("File filtered and written in: ", args.outfile)
volia/measures.py
1   -'''
2   -This module is a part of my library.
3   -It aims to compute some measures for clustering.
4   -'''
5   -
6   -import numpy as np
7   -
8   -def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
9   - '''
10   - Compute disequilibrium for all the clusters.
11   - The disequilibrium is compute from the difference
12   - between two clustering sets.
13   - isGlobal permet à l'utilisateur de choisir le dénominateur de
14   - la fonction :
15   - - True : divise la valeur par le nombre d'élément du cluster
16   - - False : divise la valeur par le nombre d'élément total
17   -
18   - withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou
19   - une valeur absolue.
20   - '''
21   -
22   - def divide_line(a, divider):
23   - '''
24   - Sub function used for dividing matrix by a vector line by line.
25   - '''
26   - return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
27   -
28   - dividers1 = 0
29   - dividers2 = 0
30   -
31   - if isGlobal:
32   - dividers1 = matrix1.sum()
33   - dividers2 = matrix2.sum()
34   - else:
35   - dividers1 = matrix1.sum(axis=1)
36   - dividers2 = matrix2.sum(axis=1)
37   -
38   - matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
39   -
40   - matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
41   -
42   - diff = matrix1_divided - matrix2_divided
43   -
44   - mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
45   -
46   - result = diff
47   -
48   - if mod != None or mod == "":
49   - for word in mod.split(" "):
50   - if word == "power":
51   - result = np.power(result,2)
52   - elif word == "human":
53   - result = result * 100
54   - elif word == "abs":
55   - result = np.absolute(result)
56   - else:
57   - raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
58   - return (mask, result)
59   -
60   -
61   -
62   -def disequilibrium_mean_by_cluster(mask, matrix):
63   - '''
64   - Mean of disequilibrium
65   - matrix is the disequilibrium calculated
66   - from number of occurences belonging to a class,
67   - for each cluster.
68   - '''
69   - nb_k = len(matrix)
70   - results = np.zeros((nb_k))
71   -
72   - for i in range(nb_k):
73   - results[i] = matrix[i].sum() / mask[i].sum()
74   - return results
75   -
76   -
77   -def disequilibrium(matrix1, matrix2, isGlobal=False):
78   - '''
79   - Disequilibrium matrix
80   - And Disequilibrium value
81   - '''
82   - mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
83   - result_human = result * 100
84   - result_power = np.power(result, 2)
85   -
86   - return (
87   - mask,
88   - result_human,
89   - disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
90   - )
91   -
92   -
93   -def compute_count_matrix(y_truth, y_hat):
94   - '''
95   - Check the size of the lists with assertion
96   - '''
97   - # Check size of the lists
98   - assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
99   -
100   - # Build count matrix
101   - count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
102   - for i in range(len(y_hat)):
103   - count_matrix[y_hat[i]][y_truth[i]] += 1
104   - return count_matrix
105   -
106   -
def entropy_score(y_truth, y_hat):
    '''
    Compute the entropy of a clustering with respect to the true classes.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
        - result_matrix : the matrix of the -P(x) * log2(P(x)) terms, one
          row per cluster (y_hat) and one column per class (y_truth)
        - result_vector : the vector obtained by summing result_matrix over
          the classes. Each value corresponds to a cluster.
        - result : the final entropy measure of the clustering, i.e. the
          cluster entropies averaged with the cluster sizes as weights

    Raise a ValueError when a nan value appears in the cluster entropies.
    '''
    # Build count matrix (np.float was removed from NumPy, use builtin float)
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)

    # Build dividers vector: number of samples in each cluster
    dividers = count_matrix.sum(axis=1)

    # Divide every row by its own total; rows of empty clusters stay at zero
    row_totals = dividers[:, np.newaxis]
    matrix_divided = np.divide(
        count_matrix,
        row_totals,
        out=np.zeros_like(count_matrix),
        where=row_totals != 0)

    # log2 is only evaluated on non-empty cells, the others keep a zero
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise instead of killing the interpreter so callers can react
        raise ValueError(
            "An error occured due to nan value. "
            f"Count matrix: {count_matrix}, result matrix: {result_matrix}")

    # Weight each cluster entropy by the share of samples it contains
    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)
152   -
153   -
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
        - purity_class_score: the purity score of the class (asp)
        - purity_cluster_score: the purity score of the cluster (acp)
        - K: the overall evaluation criterion (sqrt(asp * acp))

    y_truth and y_hat are passed as they are to compute_count_matrix.

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Sub function computing the purity along one axis of the count matrix.

        axis=0 groups the counts by column, axis=1 groups them by row.
        Return the purity of every group and their average weighted by the
        group sizes.
        '''
        count_per_group = count_matrix.sum(axis=axis)
        # Put the reduced axis back so the totals broadcast over the matrix
        totals = np.expand_dims(count_per_group, axis)

        # Squared share of each cell inside its group; empty groups stay at zero
        matrix_divided = np.divide(
            np.square(count_matrix),
            np.square(totals),
            out=np.zeros_like(count_matrix),
            where=totals != 0)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_group)
        return (vector_purity, scalar_purity)

    # np.float was removed from NumPy, use the builtin float
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
202   -
203   -
if __name__ == "__main__":
    # Small manual checks of entropy_score and purity_score.
    # The early exit(1) calls were dropped: they made the second test and
    # all the result printing unreachable, and reported a failure status
    # even when everything went fine.
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
volia/plot-character.py
  1 +
  2 +import matplotlib.pyplot as plt
  3 +import numpy as np
  4 +import pandas as pd
  5 +import argparse
  6 +from os.path import isfile
  7 +from volia.data_io import read_features, read_lst
  8 +
  9 +
if __name__ == "__main__":
    # Plot the first two feature dimensions of every utterance, with one
    # color (one legend entry) per character.

    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, help="features file path")
    parser.add_argument("--utt2char", type=str, help="char2utt file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # List of checks. parser.error is used instead of assert because
    # assertions are stripped when python runs with -O.
    if not args.features:
        parser.error("Need to specify features option")
    if not args.utt2char:
        parser.error("Need to specify char2utt option file")
    if not isfile(args.features):
        parser.error("Features path should point to a file")
    if not isfile(args.utt2char):
        parser.error("char2utt path should point to a file")
    if args.sublist is not None and not isfile(args.sublist):
        parser.error("sublist path should point to a file")

    id_to_features = read_features(args.features)

    # Ids to plot: the white list when given, every known id otherwise
    if args.sublist is not None:
        print("Using sublist")
        list_ids = read_lst(args.sublist)
        ids = list(list_ids.keys())
    else:
        ids = list(id_to_features.keys())

    utt2char = read_lst(args.utt2char)

    # One row per id, in the same order as ids
    features = np.vstack([id_to_features[id_] for id_ in ids])

    # The first value listed for an id in utt2char is used as its character
    characters_list = [utt2char[id_][0] for id_ in ids]

    features_T = features.transpose()
    print("Number of characters: ", len(np.unique(characters_list)))
    # Only the first two feature dimensions are plotted
    df = pd.DataFrame(dict(
        x=features_T[0],
        y=features_T[1],
        character=characters_list))

    groups = df.groupby('character')

    # Plot
    fig, ax = plt.subplots()

    for character, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    # The --title option was parsed but never applied to the figure
    ax.set_title(args.title)
    ax.legend()
    plt.savefig(args.outfile)
    print("Your plot is saved well (no check of this affirmation)")
if __name__ == "__main__":
    # Running the package as a script only confirms that it can be executed.
    install_message = "Volia is well installed."
    print(install_message)
  1 +'''
  2 +The goal of this script is to display calculate tsne of pvectors.
  3 +'''
  4 +
  5 +import os
  6 +from os.path import isfile
  7 +import argparse
  8 +import numpy as np