Compare View
Commits (3)
-
…n the core directory.
Changes
Showing 18 changed files Side-by-side Diff
- Makefile
- README.md
- scripts/data-management/convert-old.py
- scripts/data-management/filter_ids.py
- scripts/dim-reduction/tsne.py
- scripts/evaluations/clustering.py
- scripts/plot/plot-character.py
- setup.py
- setup_volia.py
- volia/convert-old.py
- volia/core/data.py
- volia/core/measures.py
- volia/data_io.py
- volia/filter_ids.py
- volia/measures.py
- volia/plot-character.py
- volia/test.py
- volia/tsne.py
Makefile
README.md
... | ... | @@ -0,0 +1,29 @@ |
1 | +# Introduction | |
2 | +Volia est une librairie python pour le machine learning dédiée au *speech*. Elle est accompagnée par un ensemble de scripts qui permettent de gérer des données et entraîner des modèles, sortir des rapports d'analyses, des benchmarks, etc. | |
3 | + | |
4 | +# Quick start | |
5 | +Utiliser volia est très simple. | |
6 | + | |
7 | +## Installer volia | |
8 | +Volia n'étant pas un *repository* officiel PyPi, vous devez l'installer en mode *développement*. Pour ce faire, vous pouvez utiliser le *Makefile* en exécutant la commande suivante : | |
9 | + | |
10 | +``` | |
11 | +make | |
12 | +``` | |
13 | + | |
14 | +## Exécuter les scripts de n'importe où | |
15 | +Vous pouvez lancer des scripts depuis n'importe quel chemin de votre ordinateur. Pour cela, il suffit de lancer la commande suivante en remplaçant *run-script* par le nom du module à lancer : | |
16 | +``` | |
17 | +python -m volia.run-script | |
18 | +``` | |
19 | + | |
20 | +Exemple : | |
21 | +``` | |
22 | +python -m volia.test | |
23 | +``` | |
24 | + | |
25 | +Si *Volia* est bien installé sur votre machine, cette commande devrait retourner "Volia is well installed." dans votre terminal. | |
26 | + | |
27 | + | |
28 | +# Evolution | |
29 | +Un jour, si ce *repository* devient trop volumineux, je créerai un deuxième *repository* seulement composé des scripts. Ce qui m'embête avec cette approche est le lien entre la version du *repository* volia et celui des scripts. Je suis donc en train de réfléchir à une solution pour organiser mon code proprement et bien séparer la partie *script* de la partie *bibliothèque* de Volia. | |
0 | 30 | \ No newline at end of file |
setup.py
... | ... | @@ -0,0 +1,23 @@ |
import setuptools

# The README contains accented characters, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Package metadata; the README content is reused as the long description.
setuptools.setup(
    name="volia",
    version="0.0.1",
    author="Mathias Quillot",
    author_email="mathias.quillot@alumni.univ-avignon.fr",
    description="A package dedicated to my experiments on actor voices",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    include_package_data=True,
    python_requires='>=3.6',
)
0 | 24 | \ No newline at end of file |
volia/convert-old.py
... | ... | @@ -0,0 +1,23 @@ |
1 | +import argparse | |
2 | +from os.path import isfile | |
3 | + | |
4 | + | |
if __name__ == "__main__":
    # Command-line tool: for every line of the input file, remove the third
    # comma-separated field of the id (first space-separated token) and write
    # the rewritten line to the output file.
    parser = argparse.ArgumentParser(
        description="Convert old files with wrong id to new one. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    # Validate through argparse: an assert would be stripped under `python -O`.
    if not isfile(args.file):
        parser.error("The given file does not exist.")

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line_number, line in enumerate(f, start=1):
            fields = line.rstrip("\n").split(" ")
            metas = fields[0].split(",")
            # The id must hold at least three fields for the third one to be
            # removable; say which line is malformed instead of a bare IndexError.
            if len(metas) < 3:
                raise ValueError(
                    f"{args.file}:{line_number}: id {fields[0]!r} has fewer "
                    "than 3 comma-separated fields")
            del metas[2]
            fields[0] = ",".join(metas)
            of.write(" ".join(fields) + "\n")
volia/core/data.py
... | ... | @@ -0,0 +1,44 @@ |
1 | +''' | |
2 | +Data management input/output | |
3 | +''' | |
4 | + | |
5 | +# Import packages and modules | |
6 | +import numpy as np | |
7 | + | |
# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read a lst file in which every line has this structure:
    [id] [value1] [value2] ... [value n]

    Fields are separated by a single space. Blank lines are ignored so that
    a trailing empty line does not create an entry with an empty id. When an
    id appears several times, the last occurrence wins.

    This is a basic function reused by others like read_features.

    file_path: path of the file to read.
    Returns a dictionary mapping each id to the list of its values, kept as
    strings (an id without values maps to an empty list).
    '''
    key_to_list = dict()
    with open(file_path, "r") as f:
        for line in f:
            # Skip empty / whitespace-only lines.
            if not line.strip():
                continue
            key, *values = line.rstrip("\n").split(" ")
            key_to_list[key] = values
    return key_to_list
31 | + | |
32 | + | |
def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file and convert the values of every id to floats.

    The file is parsed with read_lst; the list of values attached to each id
    is then turned into a numpy array of floats.

    file_path: path of the features file.
    Returns a dictionary mapping each id to its numpy float array.
    '''
    return {
        feature_id: np.asarray(raw_values, dtype=float)
        for feature_id, raw_values in read_lst(file_path).items()
    }
0 | 45 | \ No newline at end of file |
volia/core/measures.py
... | ... | @@ -0,0 +1,227 @@ |
1 | +''' | |
2 | +This module is a part of my library. | |
3 | +It aims to compute some measures for clustering. | |
4 | +''' | |
5 | + | |
6 | +import numpy as np | |
7 | + | |
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the cell-by-cell disequilibrium between two count matrices.

    Both matrices are normalised, then the normalised matrix2 is subtracted
    from the normalised matrix1.

    matrix1, matrix2: 2-D count matrices of the same shape.
    isGlobal: selects the denominator of the normalisation:
        - True: every cell is divided by the total of the whole matrix
        - False: every cell is divided by the total of its own row
      A zero denominator yields 0 for the corresponding cells.
    mod: optional space-separated list of transformations applied in order
      to the difference: "power" (square), "human" (multiply by 100) and
      "abs" (absolute value). None or "" applies no transformation.

    Returns a tuple (mask, result) where mask is False for the cells that
    are 0 in both input matrices and result is the (transformed) difference.
    Raises ValueError when mod contains an unknown word.
    '''

    def normalise(matrix, divider):
        '''
        Divide matrix by divider (a scalar or one value per row), writing 0
        wherever the divider is 0.
        '''
        values = np.asarray(matrix, dtype=float)
        divider = np.asarray(divider, dtype=float)
        if divider.ndim == 1:
            # One divider per row: make it a column so it broadcasts row-wise.
            divider = divider[:, np.newaxis]
        return np.divide(values, divider, out=np.zeros_like(values), where=divider != 0)

    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    result = normalise(matrix1, dividers1) - normalise(matrix2, dividers2)

    # Cells that are empty in both matrices carry no information.
    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    # An empty mod string means "no transformation", exactly like None.
    if mod is not None and mod != "":
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
59 | + | |
60 | + | |
61 | + | |
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean disequilibrium of every row of matrix.

    mask: boolean matrix with the same shape as matrix; the number of True
      cells of a row is the denominator of that row's mean.
    matrix: disequilibrium values, one row per cluster.

    Returns a 1-D float array holding, for each row, the sum of the row
    divided by the number of True cells of the matching mask row. A row
    whose mask is entirely False gets 0 instead of NaN, so that it cannot
    invalidate a sum computed over the returned vector.
    '''
    nb_k = len(matrix)
    results = np.zeros(nb_k)

    for i, (values, valid) in enumerate(zip(matrix, mask)):
        nb_valid = valid.sum()
        # Leave the preset 0 when there is nothing to average over.
        if nb_valid != 0:
            results[i] = values.sum() / nb_valid
    return results
75 | + | |
76 | + | |
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and overall disequilibrium value.

    Returns a tuple with:
    - the mask returned by disequilibrium_
    - the raw disequilibrium matrix multiplied by 100
    - the sum of the per-cluster means of the squared disequilibrium,
      divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared = np.power(raw, 2)
    cluster_means = disequilibrium_mean_by_cluster(mask, squared)
    overall = cluster_means.sum() / matrix1.shape[0]

    return (mask, raw * 100, overall)
91 | + | |
92 | + | |
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the co-occurrence count matrix of two integer label sequences.

    y_truth: reference labels (non-negative integers), list or numpy array.
    y_hat: predicted labels (non-negative integers), same length as y_truth.

    Returns a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1) whose
    cell [h][t] is the number of positions labelled h in y_hat and t in
    y_truth. Two empty sequences give an empty (0, 0) matrix.
    Raises AssertionError when the two sequences differ in length.
    '''
    # Check size of the lists
    assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Converting first lets plain Python lists be used as well as arrays.
    predicted = np.asarray(y_hat)
    expected = np.asarray(y_truth)

    if predicted.size == 0:
        return np.zeros((0, 0))

    # Build count matrix; np.add.at accumulates repeated (row, column) pairs.
    count_matrix = np.zeros((predicted.max() + 1, expected.max() + 1))
    np.add.at(count_matrix, (predicted, expected), 1)
    return count_matrix
105 | + | |
106 | + | |
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering with respect to reference labels.

    Labels must be integer encoded (use a label encoder beforehand);
    one hot labels are not supported.

    y_truth: reference labels.
    y_hat: cluster labels, same length as y_truth.

    Return a tuple with:
    - result_matrix : the matrix of -P(x) * log2(P(x)), where P(x) is the
      count of a cell divided by the total of its row
    - result_vector : the sum of each row of result_matrix. Each value corresponds to a cluster.
    - result : the final entropy measure of the clustering, i.e. the mean of
      result_vector weighted by the number of elements of each cluster

    If a NaN shows up in result_vector, the intermediate matrices are printed
    and SystemExit(1) is raised.
    '''
    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Number of elements per cluster (row totals)
    dividers = count_matrix.sum(axis=1)

    # Divide every row by its total; rows with a zero total stay at 0.
    row_totals = dividers[:, np.newaxis]
    counts = np.asarray(count_matrix, dtype=float)
    matrix_divided = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

    # The log is only evaluated on non-empty cells; the other cells stay at 0.
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occured due to nan value, some values are printed before")
        # Same effect as exit(1), without depending on the `exit` helper that
        # only exists when the site module is loaded.
        raise SystemExit(1)

    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)
152 | + | |
153 | + | |
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the class (asp)
    - purity_cluster_score: the purity score of the cluster (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    y_truth: reference labels, integer encoded.
    y_hat: cluster labels, integer encoded, same length as y_truth.

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Purity of the groups obtained by summing count_matrix along axis.

        Returns (vector_purity, scalar_purity): the purity of every group
        and their average weighted by the size of each group.
        '''
        # Group totals, kept 2-D so that they broadcast against the matrix.
        totals = count_matrix.sum(axis=axis, keepdims=True)
        dividers = np.square(totals)

        count_matrix_squared = np.square(np.asarray(count_matrix, dtype=float))
        # Groups without any element keep a purity of 0.
        matrix_divided = np.divide(
            count_matrix_squared,
            dividers,
            out=np.zeros_like(count_matrix_squared),
            where=dividers != 0,
        )
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=totals.ravel())
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    # Rows of the count matrix are clusters, columns are classes.
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
202 | + | |
203 | + | |
if __name__ == "__main__":
    # Manual smoke test of the measures on two small hand-written examples.
    # Both examples now run: the previous exit(1) calls stopped the script
    # after the first one and reported a failure status.
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
0 | 228 | \ No newline at end of file |
volia/filter_ids.py
... | ... | @@ -0,0 +1,24 @@ |
1 | +import argparse | |
2 | +from os.path import isfile | |
3 | +#from volia.data_io import read_lst | |
4 | + | |
5 | +import volia | |
if __name__ == "__main__":
    # Command-line tool: keep only the entries of `file` whose id is listed
    # in the `--filter` file and write them to `--outfile`.

    # read_lst was used without being imported. Import it here, preferring
    # the core package and falling back to the former module location.
    try:
        from volia.core.data import read_lst
    except ImportError:
        from volia.data_io import read_lst

    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="")
    parser.add_argument("--filter", default=None, type=str, help="")
    parser.add_argument("--outfile", default="out.txt", type=str, help="")

    args = parser.parse_args()

    # Validate through argparse: asserts would be stripped under `python -O`.
    if args.filter is None:
        parser.error("the --filter option is required")
    if not isfile(args.file):
        parser.error("the given file does not exist: " + args.file)
    if not isfile(args.filter):
        parser.error("the given filter file does not exist: " + args.filter)

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    with open(args.outfile, "w") as of:
        # Only the ids of the filter file matter; their values are ignored.
        for key in filter_:
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written in: ", args.outfile)
0 | 25 | \ No newline at end of file |
volia/plot-character.py
... | ... | @@ -0,0 +1,62 @@ |
1 | + | |
2 | +import matplotlib.pyplot as plt | |
3 | +import numpy as np | |
4 | +import pandas as pd | |
5 | +import argparse | |
6 | +from os.path import isfile | |
7 | +from volia.data_io import read_features, read_lst | |
8 | + | |
9 | + | |
if __name__ == "__main__":
    # Command-line tool: scatter-plot the first two feature dimensions of
    # every utterance, with one colour per character.

    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, help="features file path")
    parser.add_argument("--utt2char", type=str, help="char2utt file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # Validate through argparse: asserts would be stripped under `python -O`.
    if not args.features:
        parser.error("Need to specify features option")
    if not args.utt2char:
        parser.error("Need to specify char2utt option file")
    if not isfile(args.features):
        parser.error("Features path should point to a file")
    if not isfile(args.utt2char):
        parser.error("char2utt path should point to a file")
    if args.sublist is not None and not isfile(args.sublist):
        parser.error("sublist path should point to a file")

    id_to_features = read_features(args.features)

    # Ids to plot: the white list when given, otherwise every id with features.
    if args.sublist is not None:
        print("Using sublist")
        ids = list(read_lst(args.sublist))
    else:
        ids = list(id_to_features)

    utt2char = read_lst(args.utt2char)

    features = np.vstack([id_to_features[id_] for id_ in ids])

    # The first value attached to an utterance id is its character.
    characters_list = [utt2char[id_][0] for id_ in ids]

    features_T = features.transpose()
    print("Number of characters: ", len(np.unique(characters_list)))
    df = pd.DataFrame(dict(
        x=features_T[0],
        y=features_T[1],
        character=characters_list))

    groups = df.groupby('character')

    # Plot
    fig, ax = plt.subplots()

    for character, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    ax.legend()
    plt.savefig(args.outfile)
    print("Your plot is saved well (no check of this affirmation)")
volia/test.py
volia/tsne.py
... | ... | @@ -0,0 +1,37 @@ |
1 | +''' | |
2 | +The goal of this script is to display calculate tsne of pvectors. | |
3 | +''' | |
4 | + | |
5 | +import os | |
6 | +from os.path import isfile | |
7 | +import argparse | |
8 | +import numpy as np | |
9 | +from sklearn.manifold import TSNE | |
10 | + | |
11 | +from volia.data_io import read_features | |
12 | + | |
if __name__ == "__main__":
    # Command-line tool: project the feature vectors of a features file with
    # t-SNE and write one "[id] [component 1] ... [component n]" line per id.

    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
    parser.add_argument('features', type=str,
                        help='the path of the file you want to calculate tsne')
    parser.add_argument('-o', '--outfile', type=str,
                        default='.',
                        help='the path of the output file.')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of components output of tsne')

    args = parser.parse_args()

    # Validate through argparse: an assert would be stripped under `python -O`.
    if not isfile(args.features):
        parser.error("the features path should point to a file")
    # Fail before the costly t-SNE computation rather than after it: the
    # default value '.' is a directory and cannot be opened for writing.
    if os.path.isdir(args.outfile):
        parser.error("the output path is a directory, give a file path with --outfile")

    features_list = read_features(args.features)

    # Keep ids and vectors aligned: one row of `features` per entry of `keys`.
    keys = list(features_list)
    features = np.vstack([features_list[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for key, embedded in zip(keys, feat_tsne):
            of.write(key + " " + " ".join([str(feat) for feat in embedded]) + "\n")
    print("TSNE finished. Check if everything has been done well.")
0 | 38 | \ No newline at end of file |