Commit e7d811503f88129fb1d8eb28dd6af09f681a771e

Authored by Quillot Mathias
1 parent 15e16ec460
Exists in master

New file architecture. Scripts now live in the volia directory and the library lives in the core directory.

Showing 14 changed files with 419 additions and 670 deletions
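A minimal sketch of how imports change with this commit. The volia.core.measures path is visible in this diff; volia.core.data_io is assumed from the commit message (the library moved to the core directory):

# before this commit:
#   from volia.data_io import read_lst, read_features
#   from volia.measures import entropy_score, purity_score

# after this commit, library code lives under volia/core:
from volia.core.data_io import read_lst, read_features
from volia.core.measures import entropy_score, purity_score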

scripts/data-management/convert-old.py (file was deleted)

import argparse
from os.path import isfile


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="Convert old files with wrong id to new one. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    assert isfile(args.file), "The given file does not exist."

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            metas = splited[0].split(",")
            metas.pop(2)
            splited[0] = ",".join(metas)
            of.write(" ".join(splited) + "\n")
scripts/data-management/filter_ids.py (file was deleted)

import argparse
from os.path import isfile
from volia.data_io import read_lst

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="")
    parser.add_argument("--filter", default=None, type=str, help="")
    parser.add_argument("--outfile", default="out.txt", type=str, help="")

    args = parser.parse_args()

    assert args.filter is not None
    assert isfile(args.file)

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    with open(args.outfile, "w") as of:
        for key in filter_.keys():
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written in: ", args.outfile)
scripts/dim-reduction/tsne.py (file was deleted)

'''
The goal of this script is to calculate the t-SNE of p-vectors.
'''

import os
from os.path import isfile
import argparse
import numpy as np
from sklearn.manifold import TSNE

from volia.data_io import read_features

if __name__ == "__main__":
    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
    parser.add_argument('features', type=str,
                        help='the path of the file you want to calculate tsne')
    parser.add_argument('-o', '--outfile', type=str,
                        default='out.txt',
                        help='the path of the output file.')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of components output of tsne')

    args = parser.parse_args()

    assert isfile(args.features)

    features_list = read_features(args.features)
    keys = list(features_list.keys())
    features = np.vstack([features_list[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for i in range(len(keys)):
            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
    print("TSNE finished. Check if everything has been done well.")
scripts/evaluations/clustering.py (file was deleted)

'''
This script allows the user to evaluate a classification system on new labels using clustering methods.
The algorithms are applied on the given latent space (embedding).
'''
import argparse
import numpy as np
import pandas as pd
import os
import time
import pickle
import csv
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
import matplotlib.pyplot as plt

from volia.data_io import read_features, read_lst
from volia.measures import entropy_score, purity_score

'''
TODO:
- Add an option allowing the user to choose the number of
clustering models to train in order to compute the average and the
standard deviation of the measures.
'''


def train_clustering(label_encoder, feats, classes, outdir):
    num_classes = len(label_encoder.classes_)
    labels = label_encoder.transform(classes)
    estimator = None
    kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl")
    if args.onlymeasures:
        print(f"Loading model: {kmeans_filepath}")
        with open(kmeans_filepath, "rb") as f:
            estimator = pickle.load(f)
    else:
        # Compute KMEANS clustering on data
        print("Saving parameters")
        kmeans_parameters = {
            "n_clusters": num_classes,
            "n_init": 100,
            "tol": 1e-6,
            "algorithm": "elkan"
        }
        with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
            json.dump(kmeans_parameters, f)

        # Fit the model and save it
        print(f"Fit the model: {kmeans_filepath}")
        estimator = KMeans(
            **kmeans_parameters
        )
        estimator.fit(feats)
        print(f"Kmeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}")

        with open(kmeans_filepath, "wb") as f:
            pickle.dump(estimator, f)

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    dataframe = pd.DataFrame({
        "label": pd.Series(list(map(lambda x: label_encoder.classes_[x], labels))),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
    predicted_labels = label_encoder.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(label_encoder.classes_[i], fscores[i]), range(len(fscores))))

    # Entropy
    _, _, entropy = entropy_score(labels, predicted_labels)

    # Homogeneity
    homogeneity = homogeneity_score(labels, predicted_labels)

    # Completeness
    completeness = completeness_score(labels, predicted_labels)

    # V-Measure
    v_measure = v_measure_score(labels, predicted_labels)

    # Purity
    purity_scores = purity_score(labels, predicted_labels)
    purity_class_score = purity_scores["purity_class_score"]
    purity_cluster_score = purity_scores["purity_cluster_score"]
    K = purity_scores["K"]

    # Write results
    with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
        print(f"F1-scores for each class:\n{fscores_str}", file=fd)
        print(f"Entropy: {entropy}", file=fd)
        print(f"Global score: {np.mean(fscores)}", file=fd)
        print(f"Homogeneity: {homogeneity}", file=fd)
        print(f"Completeness: {completeness}", file=fd)
        print(f"V-measure: {v_measure}", file=fd)
        print(f"Purity class score: {purity_class_score}", file=fd)
        print(f"Purity cluster score: {purity_cluster_score}", file=fd)
        print(f"Purity overall evaluation criterion (K): {K}", file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(label_encoder.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

        try:
            id_cluster = cluster_names.index(name)
        except ValueError:
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        outdir,
        args.prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end - start))
    return {
        "f1": np.mean(fscores),
        "entropy": entropy,
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v-measure": v_measure,
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser("Compute clustering on a latent space")
    parser.add_argument("features")
    parser.add_argument("utt2",
                        type=str,
                        help="file with [utt] [value]")
    parser.add_argument("--idsfrom",
                        type=str,
                        default="utt2",
                        choices=[
                            "features",
                            "utt2"
                        ],
                        help="from features or from utt2?")
    parser.add_argument("--prefix",
                        default="",
                        type=str,
                        help="prefix of saved files")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="Output directory")
    parser.add_argument("--nmodels",
                        type=int,
                        default=1,
                        help="specifies the number of models to train")
    parser.add_argument("--onlymeasures",
                        action='store_true',
                        help="Don't compute the clustering, compute only the measures")
    args = parser.parse_args()

    assert args.outdir

    start = time.time()

    # Load features and utt2
    features = read_features(args.features)
    utt2 = read_lst(args.utt2)

    # Take id list
    if args.idsfrom == "features":
        ids = list(features.keys())
    elif args.idsfrom == "utt2":
        ids = list(utt2.keys())
    else:
        print(f"Unknown value for --idsfrom: {args.idsfrom}")
        exit(1)

    feats = np.vstack([features[id_] for id_ in ids])
    classes = [utt2[id_] for id_ in ids]

    # Encode labels
    le = LabelEncoder()
    labels = le.fit_transform(classes)

    measures = {}
    for i in range(1, args.nmodels + 1):
        subdir = os.path.join(args.outdir, str(i))
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        print(f"[{i}/{args.nmodels}] => {subdir}")
        results = train_clustering(le, feats, classes, subdir)

        for key, value in results.items():
            if key not in measures:
                measures[key] = []
            measures[key].append(value)

    # File with results
    file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")

    with open(file_results, "w") as f:
        f.write(f"[nmodels: {args.nmodels}]\n")
        for key in measures.keys():
            values = np.asarray(measures[key], dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            f.write(f"[{key} => mean: {mean}, std: {std}]\n")

    # CSV file with all the values
    file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")

    with open(file_csv_measures, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(["measure"] + list(range(1, args.nmodels + 1)) + ["mean"] + ["std"])
        for key in measures.keys():
            values = np.asarray(measures[key], dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            writer.writerow([key] + list(values) + [mean] + [std])
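For reference, a sketch of the two inputs the script expects, with hypothetical utterance ids and labels (the format is the one parsed by read_lst/read_features):

# A tiny, hypothetical input pair for clustering.py: "feats.txt" holds one
# embedding per utterance, "utt2char.txt" maps each utterance to a label.
with open("feats.txt", "w") as f:
    f.write("utt001 0.12 -0.48 1.03\n")
    f.write("utt002 0.77 0.05 -0.31\n")
with open("utt2char.txt", "w") as f:
    f.write("utt001 shepard\n")
    f.write("utt002 garrus\n")
# A run could then look like (5 KMeans models, results under results/):
#   python scripts/evaluations/clustering.py feats.txt utt2char.txt --outdir results --nmodels 5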
scripts/plot/plot-character.py (file was deleted)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse
from os.path import isfile
from volia.data_io import read_features, read_lst


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, help="features file path")
    parser.add_argument("--utt2char", type=str, help="utt2char file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # List of assertions
    assert args.features, "Need to specify the features option"
    assert args.utt2char, "Need to specify the utt2char option"
    assert isfile(args.features), "Features path should point to a file"
    assert isfile(args.utt2char), "utt2char path should point to a file"
    if args.sublist is not None:
        assert isfile(args.sublist), "sublist path should point to a file"

    id_to_features = read_features(args.features)

    ids = []
    if args.sublist is not None:
        print("Using sublist")
        list_ids = read_lst(args.sublist)
        ids = [key for key in list_ids.keys()]
    else:
        ids = [key for key in id_to_features.keys()]

    utt2char = read_lst(args.utt2char)

    features = [id_to_features[id_] for id_ in ids]
    features = np.vstack(features)

    characters_list = [utt2char[id_][0] for id_ in ids]

    features_T = features.transpose()
    print("Number of characters: ", len(np.unique(characters_list)))
    df = pd.DataFrame(dict(
        x=features_T[0],
        y=features_T[1],
        character=characters_list))

    groups = df.groupby('character')

    # Plot
    fig, ax = plt.subplots()

    for character, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    ax.legend()
    ax.set_title(args.title)
    plt.savefig(args.outfile)
    print("Plot saved to:", args.outfile)
volia/convert-old.py (file was created)

import argparse
from os.path import isfile


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="Convert old files with wrong id to new one. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    assert isfile(args.file), "The given file does not exist."

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            metas = splited[0].split(",")
            metas.pop(2)
            splited[0] = ",".join(metas)
            of.write(" ".join(splited) + "\n")
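A worked example with a made-up line (the real id scheme is not shown in this commit). The script removes the third comma-separated field of the leading token and leaves the rest of the line untouched:

# Hypothetical input line; 'OLD' stands for the wrong id field.
line = "spk1,ep2,OLD,utt42 0.1 0.2 0.3"
splited = line.split(" ")
metas = splited[0].split(",")   # ['spk1', 'ep2', 'OLD', 'utt42']
metas.pop(2)                    # drops 'OLD'
splited[0] = ",".join(metas)
print(" ".join(splited))        # spk1,ep2,utt42 0.1 0.2 0.3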
volia/core/data_io.py (file was created)

'''
Data management input/output
'''

# Import packages and modules
import numpy as np

# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read lst file with this structure:
    [id] [value1] [value2] ... [value n]

    This is a basic function reused by others like read_features.
    Returns a dictionary with the id as key and a list of values as corresponding value.
    '''
    # KeyToList type variable
    key_to_list = dict()
    with open(file_path, "r") as f:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            id = splited[0]
            values = splited[1:]
            key_to_list[id] = values
    return key_to_list


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file in the same [id] [value1] ... [value n] format
    and return a dictionary mapping each id to a numpy array of floats.
    '''
    # KeyToFeatures type variable
    key_to_features = dict()
    # and the KeyToList
    key_to_list = read_lst(file_path)

    for key_, list_ in key_to_list.items():
        key_to_features[key_] = np.asarray(list_, dtype=float)

    return key_to_features
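A minimal round trip, assuming the package layout of this commit (volia/core/data_io.py); the file format is the one described in read_lst:

from volia.core.data_io import read_lst, read_features

# Write a tiny hypothetical features file, then read it back both ways.
with open("feats.txt", "w") as f:
    f.write("utt001 0.5 1.5\n")
    f.write("utt002 -1.0 2.0\n")

print(read_lst("feats.txt"))       # {'utt001': ['0.5', '1.5'], 'utt002': ['-1.0', '2.0']}
print(read_features("feats.txt"))  # {'utt001': array([0.5, 1.5]), 'utt002': array([-1., 2.])}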
volia/core/measures.py (file was created)

'''
This module is a part of my library.
It aims to compute some measures for clustering.
'''

import numpy as np


def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium for all the clusters.
    The disequilibrium is computed from the difference
    between two clustering sets.
    isGlobal lets the user choose the denominator of the function:
    - True: divide the values by the total number of elements
    - False: divide the values by the number of elements of each cluster

    mod lets the user choose which transformations to apply to the
    difference: "power" (square), "human" (multiply by 100) or
    "abs" (absolute value). Several can be combined, separated by spaces.
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    dividers1 = 0
    dividers2 = 0

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    result = diff

    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)


def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium.
    matrix is the disequilibrium calculated
    from the number of occurrences belonging to a class,
    for each cluster.
    '''
    nb_k = len(matrix)
    results = np.zeros((nb_k))

    for i in range(nb_k):
        results[i] = matrix[i].sum() / mask[i].sum()
    return results


def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Return the disequilibrium mask, the disequilibrium matrix
    (as percentages) and the global disequilibrium value.
    '''
    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
    result_human = result * 100
    result_power = np.power(result, 2)

    return (
        mask,
        result_human,
        disequilibrium_mean_by_cluster(mask, result_power).sum() / matrix1.shape[0]
    )


def compute_count_matrix(y_truth, y_hat):
    '''
    Build the (cluster x class) count matrix from the two label vectors.
    Both vectors must have the same length (checked with an assertion).
    '''
    # Check the size of the lists
    assert len(y_hat) == len(y_truth), f"Label vectors should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Build count matrix
    count_matrix = np.zeros((max(y_hat + 1), max(y_truth + 1)))
    for i in range(len(y_hat)):
        count_matrix[y_hat[i]][y_truth[i]] += 1
    return count_matrix


def entropy_score(y_truth, y_hat):
    '''
    y_hat and y_truth need to be label encoded before calling this function.
    Don't use one-hot labels.

    Return a tuple with:
    - result_matrix: the matrix with the log-multiplied probabilities (P(x) * log(P(x)))
    - result_vector: the vector with the entropy summed over each cluster. Each value corresponds to a cluster.
    - result: the final entropy measure of the clustering
    '''
    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Build dividers vector
    dividers = count_matrix.sum(axis=1)

    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occurred due to a nan value, some values are printed above")
        exit(1)

    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)


def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the classes (asp)
    - purity_cluster_score: the purity score of the clusters (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    def compute_purity_score(count_matrix, axis=0):
        if axis == 0:
            other_axis = 1
        else:
            other_axis = 0
        count_per_row = count_matrix.sum(axis=axis)
        dividers = np.square(count_per_row)

        count_matrix_squared = np.square(count_matrix)
        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_row)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
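For reference, writing n_kc for the number of samples placed in cluster k whose true class is c, n_k and n_c for the corresponding row and column sums, and N for the total number of samples, the entropy and purity measures implemented above amount to:

% Entropy: weighted average of the per-cluster entropies
H = \sum_{k} \frac{n_k}{N} \Big( -\sum_{c} \frac{n_{kc}}{n_k} \log_2 \frac{n_{kc}}{n_k} \Big)

% Average cluster purity (acp), average speaker purity (asp),
% and the overall criterion K
acp = \frac{1}{N} \sum_{k} \frac{1}{n_k} \sum_{c} n_{kc}^2
\qquad
asp = \frac{1}{N} \sum_{c} \frac{1}{n_c} \sum_{k} n_{kc}^2
\qquad
K = \sqrt{acp \cdot asp}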
volia/data_io.py (file was deleted)

'''
Data management input/output
'''

# Import packages and modules
import numpy as np

# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read lst file with this structure:
    [id] [value1] [value2] ... [value n]

    This is a basic function reused by others like read_features.
    Returns a dictionary with the id as key and a list of values as corresponding value.
    '''
    # KeyToList type variable
    key_to_list = dict()
    with open(file_path, "r") as f:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            id = splited[0]
            values = splited[1:]
            key_to_list[id] = values
    return key_to_list


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file in the same [id] [value1] ... [value n] format
    and return a dictionary mapping each id to a numpy array of floats.
    '''
    # KeyToFeatures type variable
    key_to_features = dict()
    # and the KeyToList
    key_to_list = read_lst(file_path)

    for key_, list_ in key_to_list.items():
        key_to_features[key_] = np.asarray(list_, dtype=float)

    return key_to_features
volia/filter_ids.py (file was created)

import argparse
from os.path import isfile
# the library now lives under volia/core
from volia.core.data_io import read_lst

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="")
    parser.add_argument("--filter", default=None, type=str, help="")
    parser.add_argument("--outfile", default="out.txt", type=str, help="")

    args = parser.parse_args()

    assert args.filter is not None
    assert isfile(args.file)

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    with open(args.outfile, "w") as of:
        for key in filter_.keys():
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written in: ", args.outfile)
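A hypothetical example of keeping only the ids listed in the filter file:

# Build two small input files, then run the script on them.
with open("full.lst", "w") as f:
    f.write("utt001 a b\nutt002 c d\nutt003 e f\n")
with open("subset.lst", "w") as f:
    f.write("utt001 x\nutt003 y\n")
# python volia/filter_ids.py full.lst --filter subset.lst --outfile out.txt
# out.txt then contains the "utt001" and "utt003" lines of full.lst.
# Note: every id in the filter file must also exist in the main file,
# otherwise list_[key] raises a KeyError.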
volia/measures.py (file was deleted)

'''
This module is a part of my library.
It aims to compute some measures for clustering.
'''

import numpy as np


def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium for all the clusters.
    The disequilibrium is computed from the difference
    between two clustering sets.
    isGlobal lets the user choose the denominator of the function:
    - True: divide the values by the total number of elements
    - False: divide the values by the number of elements of each cluster

    mod lets the user choose which transformations to apply to the
    difference: "power" (square), "human" (multiply by 100) or
    "abs" (absolute value). Several can be combined, separated by spaces.
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    dividers1 = 0
    dividers2 = 0

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    result = diff

    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)


def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium.
    matrix is the disequilibrium calculated
    from the number of occurrences belonging to a class,
    for each cluster.
    '''
    nb_k = len(matrix)
    results = np.zeros((nb_k))

    for i in range(nb_k):
        results[i] = matrix[i].sum() / mask[i].sum()
    return results


def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Return the disequilibrium mask, the disequilibrium matrix
    (as percentages) and the global disequilibrium value.
    '''
    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
    result_human = result * 100
    result_power = np.power(result, 2)

    return (
        mask,
        result_human,
        disequilibrium_mean_by_cluster(mask, result_power).sum() / matrix1.shape[0]
    )


def compute_count_matrix(y_truth, y_hat):
    '''
    Build the (cluster x class) count matrix from the two label vectors.
    Both vectors must have the same length (checked with an assertion).
    '''
    # Check the size of the lists
    assert len(y_hat) == len(y_truth), f"Label vectors should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Build count matrix
    count_matrix = np.zeros((max(y_hat + 1), max(y_truth + 1)))
    for i in range(len(y_hat)):
        count_matrix[y_hat[i]][y_truth[i]] += 1
    return count_matrix


def entropy_score(y_truth, y_hat):
    '''
    y_hat and y_truth need to be label encoded before calling this function.
    Don't use one-hot labels.

    Return a tuple with:
    - result_matrix: the matrix with the log-multiplied probabilities (P(x) * log(P(x)))
    - result_vector: the vector with the entropy summed over each cluster. Each value corresponds to a cluster.
    - result: the final entropy measure of the clustering
    '''
    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Build dividers vector
    dividers = count_matrix.sum(axis=1)

    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occurred due to a nan value, some values are printed above")
        exit(1)

    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)


def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the classes (asp)
    - purity_cluster_score: the purity score of the clusters (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    def compute_purity_score(count_matrix, axis=0):
        if axis == 0:
            other_axis = 1
        else:
            other_axis = 0
        count_per_row = count_matrix.sum(axis=axis)
        dividers = np.square(count_per_row)

        count_matrix_squared = np.square(count_matrix)
        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_row)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
volia/plot-character.py (file was created)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse
from os.path import isfile
# the library now lives under volia/core
from volia.core.data_io import read_features, read_lst


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, help="features file path")
    parser.add_argument("--utt2char", type=str, help="utt2char file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # List of assertions
    assert args.features, "Need to specify the features option"
    assert args.utt2char, "Need to specify the utt2char option"
    assert isfile(args.features), "Features path should point to a file"
    assert isfile(args.utt2char), "utt2char path should point to a file"
    if args.sublist is not None:
        assert isfile(args.sublist), "sublist path should point to a file"

    id_to_features = read_features(args.features)

    ids = []
    if args.sublist is not None:
        print("Using sublist")
        list_ids = read_lst(args.sublist)
        ids = [key for key in list_ids.keys()]
    else:
        ids = [key for key in id_to_features.keys()]

    utt2char = read_lst(args.utt2char)

    features = [id_to_features[id_] for id_ in ids]
    features = np.vstack(features)

    characters_list = [utt2char[id_][0] for id_ in ids]

    features_T = features.transpose()
    print("Number of characters: ", len(np.unique(characters_list)))
    df = pd.DataFrame(dict(
        x=features_T[0],
        y=features_T[1],
        character=characters_list))

    groups = df.groupby('character')

    # Plot
    fig, ax = plt.subplots()

    for character, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    ax.legend()
    ax.set_title(args.title)
    plt.savefig(args.outfile)
    print("Plot saved to:", args.outfile)
File was created

if __name__ == "__main__":
    print("Volia is installed correctly.")
volia/tsne.py (file was created)

'''
The goal of this script is to calculate the t-SNE of p-vectors.
'''

import os
from os.path import isfile
import argparse
import numpy as np
from sklearn.manifold import TSNE

# the library now lives under volia/core
from volia.core.data_io import read_features

if __name__ == "__main__":
    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
    parser.add_argument('features', type=str,
                        help='the path of the file you want to calculate tsne')
    parser.add_argument('-o', '--outfile', type=str,
                        default='out.txt',
                        help='the path of the output file.')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of components output of tsne')

    args = parser.parse_args()

    assert isfile(args.features)

    features_list = read_features(args.features)
    keys = list(features_list.keys())
    features = np.vstack([features_list[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for i in range(len(keys)):
            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
    print("TSNE finished. Check if everything has been done well.")
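A sketch of the expected output for "-n 2", with hypothetical ids and values: one line per utterance, the id followed by its t-SNE coordinates, so the file can be read back with read_features or plotted with volia/plot-character.py.

# Hypothetical output file after:
#   python volia/tsne.py feats.txt -o tsne.txt -n 2
#
#   utt001 -3.412 8.051
#   utt002 12.870 -0.395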