Commit e7d811503f88129fb1d8eb28dd6af09f681a771e
1 parent: 15e16ec460
Exists in: master
New file architecture. Now scripts are in the volia directory and the library is in the core directory.
Showing 14 changed files with 419 additions and 670 deletions
- scripts/data-management/convert-old.py
- scripts/data-management/filter_ids.py
- scripts/dim-reduction/tsne.py
- scripts/evaluations/clustering.py
- scripts/plot/plot-character.py
- volia/convert-old.py
- volia/core/data.py
- volia/core/measures.py
- volia/data_io.py
- volia/filter_ids.py
- volia/measures.py
- volia/plot-character.py
- volia/test.py
- volia/tsne.py
scripts/data-management/convert-old.py
1 | -import argparse | |
2 | -from os.path import isfile | |
3 | - | |
4 | - | |
5 | -if __name__ == "__main__": | |
6 | - | |
7 | - parser = argparse.ArgumentParser( | |
8 | -        description="Convert old files with wrong ids to new ones. Masseffect.") | |
9 | - | |
10 | - parser.add_argument("file", type=str, help="feature, x2x, or list file") | |
11 | - parser.add_argument("--outfile", type=str, default="out.txt", help="output file") | |
12 | - | |
13 | - args = parser.parse_args() | |
14 | - | |
15 | - assert isfile(args.file), "The given file does not exist." | |
16 | - | |
17 | - with open(args.file, "r") as f, open(args.outfile, "w") as of: | |
18 | - for line in f: | |
19 | - splited = line.replace("\n", "").split(" ") | |
20 | - metas = splited[0].split(",") | |
21 | - metas.pop(2) | |
22 | - splited[0] = ",".join(metas) | |
23 | - of.write(" ".join(splited) + "\n") |
scripts/data-management/filter_ids.py
1 | -import argparse | |
2 | -from os.path import isfile | |
3 | -from volia.data_io import read_lst | |
4 | - | |
5 | -if __name__ == "__main__": | |
6 | - parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset") | |
7 | -    parser.add_argument("file", type=str, help="lst file to filter") | |
8 | -    parser.add_argument("--filter", default=None, type=str, help="file with the ids to keep") | |
9 | -    parser.add_argument("--outfile", default="out.txt", type=str, help="output file") | |
10 | - | |
11 | - args = parser.parse_args() | |
12 | - | |
13 | - assert args.filter is not None | |
14 | - assert isfile(args.file) | |
15 | - | |
16 | - list_ = read_lst(args.file) | |
17 | - filter_ = read_lst(args.filter) | |
18 | - | |
19 | - with open(args.outfile, "w") as of: | |
20 | - for key in filter_.keys(): | |
21 | - of.write(key + " " + " ".join(list_[key]) + "\n") | |
22 | - | |
23 | -    print("File filtered and written to:", args.outfile) |
scripts/dim-reduction/tsne.py
1 | -''' | |
2 | -Compute the t-SNE representation of p-vectors and write it to a file. | |
3 | -''' | |
4 | - | |
5 | -import os | |
6 | -from os.path import isfile | |
7 | -import argparse | |
8 | -import numpy as np | |
9 | -from sklearn.manifold import TSNE | |
10 | - | |
11 | -from volia.data_io import read_features | |
12 | - | |
13 | -if __name__ == "__main__": | |
14 | - # Defining argparse | |
15 | - parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d') | |
16 | - parser.add_argument('features', type=str, | |
17 | - help='the path of the file you want to calculate tsne') | |
18 | - parser.add_argument('-o', '--outfile', type=str, | |
19 | -                        default='out.txt', | |
20 | - help='the path of the output file.') | |
21 | - parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3], | |
22 | -                        default=2, | |
23 | - help='number of components output of tsne') | |
24 | - | |
25 | - args = parser.parse_args() | |
26 | - | |
27 | - assert isfile(args.features) | |
28 | - | |
29 | -    features_list = read_features(args.features) | |
30 | -    keys = list(features_list.keys()) | |
31 | -    features = np.vstack([ features_list[key] for key in keys ]) | |
32 | -    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features) | |
33 | - | |
34 | - with open(args.outfile, "w") as of: | |
35 | - for i in range(len(keys)): | |
36 | - of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n") | |
37 | -    print("t-SNE finished. Results written to", args.outfile) |
scripts/evaluations/clustering.py
1 | -''' | |
2 | -This script allows the user to evaluate a classification system on new labels using clustering methods. | |
3 | -The algorithms are applied on the given latent space (embedding). | |
4 | -''' | |
5 | -import argparse | |
6 | -import numpy as np | |
7 | -import pandas as pd | |
8 | -import os | |
9 | -import time | |
10 | -import pickle | |
11 | -import csv | |
12 | -import json | |
13 | - | |
14 | -from sklearn.preprocessing import LabelEncoder | |
15 | -from sklearn.metrics.pairwise import pairwise_distances | |
16 | -from sklearn.cluster import KMeans | |
17 | -from sklearn.manifold import TSNE | |
18 | -from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score | |
19 | -import matplotlib.pyplot as plt | |
20 | - | |
21 | -from volia.data_io import read_features,read_lst | |
22 | -from volia.measures import entropy_score, purity_score | |
23 | - | |
24 | -''' | |
25 | -TODO: | |
26 | -- Add an option allowing the user to choose the number of | |
27 | -clusterings to train in order to compute the average and the standard deviation of the measures. | |
28 | -''' | |
29 | - | |
30 | - | |
31 | -def train_clustering(label_encoder, feats, classes, outdir): | |
32 | - num_classes = len(label_encoder.classes_) | |
33 | - estimator = None | |
34 | - kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl") | |
35 | - if args.onlymeasures: | |
36 | - print(f"Loading model: {kmeans_filepath}") | |
37 | - with open(kmeans_filepath, "rb") as f: | |
38 | - estimator = pickle.load(f) | |
39 | - else: | |
40 | - # Compute KMEANS clustering on data | |
41 | - print("Saving parameters") | |
42 | - kmeans_parameters = { | |
43 | - "n_clusters": num_classes, | |
44 | - "n_init": 100, | |
45 | -            "tol": 1e-6, | |
46 | - "algorithm": "elkan" | |
47 | - } | |
48 | - with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f: | |
49 | - json.dump(kmeans_parameters, f) | |
50 | - | |
51 | - # Fit the model and Save parameters | |
52 | - print(f"Fit the model: {kmeans_filepath}") | |
53 | - estimator = KMeans( | |
54 | - **kmeans_parameters | |
55 | - ) | |
56 | - estimator.fit(feats) | |
57 | -        print(f"Kmeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}") | |
58 | - | |
59 | - with open(kmeans_filepath, "wb") as f: | |
60 | - pickle.dump(estimator, f) | |
61 | - | |
62 | - # contains distance to each cluster for each sample | |
63 | - dist_space = estimator.transform(feats) | |
64 | - predictions = np.argmin(dist_space, axis=1) | |
65 | - | |
66 | - # gives each cluster a name (considering most represented character) | |
67 | - dataframe = pd.DataFrame({ | |
68 | - "label": pd.Series(list(map(lambda x: le.classes_[x], labels))), | |
69 | - "prediction": pd.Series(predictions) | |
70 | - }) | |
71 | - | |
72 | - def find_cluster_name_fn(c): | |
73 | - mask = dataframe["prediction"] == c | |
74 | - return dataframe[mask]["label"].value_counts(sort=False).idxmax() | |
75 | - | |
76 | - cluster_names = list(map(find_cluster_name_fn, range(num_classes))) | |
77 | - predicted_labels = le.transform( | |
78 | - [cluster_names[pred] for pred in predictions]) | |
79 | - | |
80 | - # F-measure | |
81 | - fscores = f1_score(labels, predicted_labels, average=None) | |
82 | - fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) | |
83 | - | |
84 | - # Entropy | |
85 | - _, _, entropy = entropy_score(labels, predicted_labels) | |
86 | - | |
87 | -    # Homogeneity | |
88 | - homogeneity = homogeneity_score(labels, predicted_labels) | |
89 | - | |
90 | - # Completeness | |
91 | - completeness = completeness_score(labels, predicted_labels) | |
92 | - | |
93 | - # V-Measure | |
94 | - v_measure = v_measure_score(labels, predicted_labels) | |
95 | - | |
96 | - # Purity | |
97 | - purity_scores = purity_score(labels, predicted_labels) | |
98 | - purity_class_score = purity_scores["purity_class_score"] | |
99 | - purity_cluster_score = purity_scores["purity_cluster_score"] | |
100 | - K = purity_scores["K"] | |
101 | - | |
102 | - # Write results | |
103 | - with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd: | |
104 | -        print(f"F1-scores for each class:\n{fscores_str}", file=fd) | |
105 | - print(f"Entropy: {entropy}", file=fd) | |
106 | -        print(f"Mean F1-score: {np.mean(fscores)}", file=fd) | |
107 | -        print(f"Homogeneity: {homogeneity}", file=fd) | |
108 | -        print(f"Completeness: {completeness}", file=fd) | |
109 | -        print(f"V-measure: {v_measure}", file=fd) | |
110 | - print(f"purity class score: {purity_class_score}", file=fd) | |
111 | - print(f"purity cluster score: {purity_cluster_score}", file=fd) | |
112 | - print(f"purity overall evaluation criterion (K): {K}", file=fd) | |
113 | - | |
114 | - # Process t-SNE and plot | |
115 | - tsne_estimator = TSNE() | |
116 | - embeddings = tsne_estimator.fit_transform(feats) | |
117 | - print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( | |
118 | - tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) | |
119 | - | |
120 | - fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) | |
121 | - for c, name in enumerate(le.classes_): | |
122 | - c_mask = np.where(labels == c) | |
123 | - axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | |
124 | - | |
125 | - try: | |
126 | - id_cluster = cluster_names.index(name) | |
127 | - except ValueError: | |
128 | - print("WARNING: no cluster found for {}".format(name)) | |
129 | - continue | |
130 | - c_mask = np.where(predictions == id_cluster) | |
131 | - axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | |
132 | - | |
133 | - axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | |
134 | - axe1.set_title("true labels") | |
135 | - axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | |
136 | - axe2.set_title("predicted cluster label") | |
137 | - | |
138 | - plt.suptitle("Kmeans Clustering") | |
139 | - | |
140 | - loc = os.path.join( | |
141 | - outdir, | |
142 | - args.prefix + "kmeans.pdf" | |
143 | - ) | |
144 | - plt.savefig(loc, bbox_inches="tight") | |
145 | - plt.close() | |
146 | - | |
147 | - print("INFO: figure saved at {}".format(loc)) | |
148 | - | |
149 | - end = time.time() | |
150 | - print("program ended in {0:.2f} seconds".format(end-start)) | |
151 | - return { | |
152 | - "f1": np.mean(fscores), | |
153 | - "entropy": entropy, | |
154 | - "homogeneity": homogeneity, | |
155 | - "completeness": completeness, | |
156 | - "v-measure": v_measure, | |
157 | - "purity_class_score": purity_class_score, | |
158 | -        "purity_cluster_score": purity_cluster_score, | |
159 | - "K": K | |
160 | - } | |
161 | - | |
162 | - | |
163 | -if __name__ == "__main__": | |
164 | - # Argparse | |
165 | - parser = argparse.ArgumentParser("Compute clustering on a latent space") | |
166 | - parser.add_argument("features") | |
167 | - parser.add_argument("utt2", | |
168 | - type=str, | |
169 | - help="file with [utt] [value]") | |
170 | - parser.add_argument("--idsfrom", | |
171 | - type=str, | |
172 | - default="utt2", | |
173 | - choices=[ | |
174 | - "features", | |
175 | - "utt2" | |
176 | - ], | |
177 | - help="from features or from utt2?") | |
178 | - parser.add_argument("--prefix", | |
179 | - default="", | |
180 | - type=str, | |
181 | - help="prefix of saved files") | |
182 | - parser.add_argument("--outdir", | |
183 | - default=None, | |
184 | - type=str, | |
185 | - help="Output directory") | |
186 | - parser.add_argument("--nmodels", | |
187 | - type=int, | |
188 | - default=1, | |
189 | - help="specifies the number of models to train") | |
190 | - parser.add_argument("--onlymeasures", | |
191 | - action='store_true', | |
192 | - help="Don't compute the clustering, compute only the measures") | |
193 | - args = parser.parse_args() | |
194 | - | |
195 | - assert args.outdir | |
196 | - | |
197 | - start = time.time() | |
198 | - | |
199 | - # Load features and utt2 | |
200 | - features = read_features(args.features) | |
201 | - utt2 = read_lst(args.utt2) | |
202 | - | |
203 | - # Take id list | |
204 | - if args.idsfrom == "features": | |
205 | - ids = list(features.keys()) | |
206 | - elif args.idsfrom == "utt2": | |
207 | - ids = list(utt2.keys()) | |
208 | - else: | |
209 | -        print(f"Unexpected value for --idsfrom: {args.idsfrom}") | |
210 | - exit(1) | |
211 | - | |
212 | - feats = np.vstack([ features[id_] for id_ in ids ]) | |
213 | - classes = [ utt2[id_] for id_ in ids ] | |
214 | - | |
215 | - # Encode labels | |
216 | - le = LabelEncoder() | |
217 | - labels = le.fit_transform(classes) | |
218 | - | |
219 | - measures = {} | |
220 | - for i in range(1, args.nmodels+1): | |
221 | - subdir = os.path.join(args.outdir, str(i)) | |
222 | - if not os.path.exists(subdir): | |
223 | - os.mkdir(subdir) | |
224 | - print(f"[{i}/{args.nmodels}] => {subdir}") | |
225 | - results = train_clustering(le, feats, classes, subdir) | |
226 | - | |
227 | - for key, value in results.items(): | |
228 | - if key not in measures: | |
229 | - measures[key] = [] | |
230 | -            measures[key].append(value) | |
231 | - | |
232 | - | |
233 | - # File with results | |
234 | - file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt") | |
235 | - | |
236 | - with open(file_results, "w") as f: | |
237 | - f.write(f"[nmodels: {args.nmodels}]\n") | |
238 | - for key in measures.keys(): | |
239 | - values = np.asarray(measures[key], dtype=float) | |
240 | - mean = np.mean(values) | |
241 | - std = np.std(values) | |
242 | - f.write(f"[{key} => mean: {mean}, std: {std}] \n") | |
243 | - | |
244 | - # CSV File with all the values | |
245 | - file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv") | |
246 | - | |
247 | - with open(file_csv_measures, "w", newline="") as f: | |
248 | - writer = csv.writer(f, delimiter=",") | |
249 | - writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"]) | |
250 | - for key in measures.keys(): | |
251 | - values = np.asarray(measures[key], dtype=float) | |
252 | - mean = np.mean(values) | |
253 | - std = np.std(values) | |
254 | - writer.writerow([key] + list(values) + [mean] + [std]) |
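The cluster-naming step above (find_cluster_name_fn) is the pivot of the evaluation: each k-means cluster is renamed after its most represented true class, so that classification metrics such as F1 can be applied to clustering output. A minimal standalone sketch of that idea, with made-up labels:

    import numpy as np
    import pandas as pd

    labels = np.array([0, 0, 1, 1, 1])        # hypothetical encoded true classes
    predictions = np.array([2, 2, 0, 0, 2])   # hypothetical kmeans cluster ids
    df = pd.DataFrame({"label": labels, "prediction": predictions})

    # Name each cluster after its most represented true class.
    names = {int(c): int(df[df["prediction"] == c]["label"].value_counts().idxmax())
             for c in np.unique(predictions)}
    print(names)  # {0: 1, 2: 0}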
scripts/plot/plot-character.py
1 | - | |
2 | -import matplotlib.pyplot as plt | |
3 | -import numpy as np | |
4 | -import pandas as pd | |
5 | -import argparse | |
6 | -from os.path import isfile | |
7 | -from volia.data_io import read_features, read_lst | |
8 | - | |
9 | - | |
10 | -if __name__ == "__main__": | |
11 | - # Argparse | |
12 | - parser = argparse.ArgumentParser(description="Plot points with color for each character") | |
13 | - parser.add_argument("--features", type=str, help="features file path") | |
14 | -    parser.add_argument("--utt2char", type=str, help="utt2char file path") | |
15 | -    parser.add_argument("--sublist", type=str, default=None, help="whitelist of ids to take into account") | |
16 | -    parser.add_argument("--outfile", default="out.pdf", type=str, help="output file path") | |
17 | - parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title") | |
18 | - args = parser.parse_args() | |
19 | - | |
20 | - # List of assertions | |
21 | - assert args.features, "Need to specify features option" | |
22 | -    assert args.utt2char, "Need to specify utt2char option" | |
23 | -    assert isfile(args.features), "Features path should point to a file" | |
24 | -    assert isfile(args.utt2char), "utt2char path should point to a file" | |
25 | - if args.sublist is not None: | |
26 | - assert isfile(args.sublist), "sublist path should point to a file" | |
27 | - | |
28 | - | |
29 | - id_to_features = read_features(args.features) | |
30 | - | |
31 | - ids = [] | |
32 | - if args.sublist is not None: | |
33 | - print("Using sublist") | |
34 | - list_ids = read_lst(args.sublist) | |
35 | - ids = [ key for key in list_ids.keys() ] | |
36 | - else: | |
37 | - ids = [ key for key in id_to_features.keys() ] | |
38 | - | |
39 | - utt2char = read_lst(args.utt2char) | |
40 | - | |
41 | - features = [ id_to_features[id_] for id_ in ids ] | |
42 | - features = np.vstack(features) | |
43 | - | |
44 | - characters_list = [ utt2char[id_][0] for id_ in ids ] | |
45 | - | |
46 | - features_T = features.transpose() | |
47 | - print("Number of characters: ", len(np.unique(characters_list))) | |
48 | - df = pd.DataFrame(dict( | |
49 | - x=features_T[0], | |
50 | - y=features_T[1], | |
51 | - character=characters_list)) | |
52 | - | |
53 | - groups = df.groupby('character') | |
54 | - | |
55 | - # Plot | |
56 | - fig, ax = plt.subplots() | |
57 | - | |
58 | - for character, group in groups: | |
59 | - p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character) | |
60 | - ax.legend() | |
61 | - plt.savefig(args.outfile) | |
62 | -    print("Plot saved to:", args.outfile) |
volia/convert-old.py
1 | +import argparse | |
2 | +from os.path import isfile | |
3 | + | |
4 | + | |
5 | +if __name__ == "__main__": | |
6 | + | |
7 | + parser = argparse.ArgumentParser( | |
8 | +        description="Convert old files with wrong ids to new ones. Masseffect.") | |
9 | + | |
10 | + parser.add_argument("file", type=str, help="feature, x2x, or list file") | |
11 | + parser.add_argument("--outfile", type=str, default="out.txt", help="output file") | |
12 | + | |
13 | + args = parser.parse_args() | |
14 | + | |
15 | + assert isfile(args.file), "The given file does not exist." | |
16 | + | |
17 | + with open(args.file, "r") as f, open(args.outfile, "w") as of: | |
18 | + for line in f: | |
19 | + splited = line.replace("\n", "").split(" ") | |
20 | + metas = splited[0].split(",") | |
21 | + metas.pop(2) | |
22 | + splited[0] = ",".join(metas) | |
23 | + of.write(" ".join(splited) + "\n") |
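For reference, the per-line rewrite this script performs, shown with a hypothetical id (the real Masseffect id layout is not part of this commit):

    line = "spk1,game1,obsolete,utt42 0.12 0.34"
    splited = line.split(" ")
    metas = splited[0].split(",")   # ['spk1', 'game1', 'obsolete', 'utt42']
    metas.pop(2)                    # drop the third, obsolete component
    splited[0] = ",".join(metas)
    print(" ".join(splited))        # spk1,game1,utt42 0.12 0.34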
volia/core/data.py
1 | +''' | |
2 | +Data management input/output | |
3 | +''' | |
4 | + | |
5 | +# Import packages and modules | |
6 | +import numpy as np | |
7 | + | |
8 | +# Defining some types | |
9 | +from typing import List, Dict | |
10 | +KeyToList = Dict[str, List[str]] | |
11 | +KeyToFeatures = Dict[str, List[float]] | |
12 | + | |
13 | + | |
14 | +def read_lst(file_path: str) -> KeyToList: | |
15 | + ''' | |
16 | + Read lst file with this structure: | |
17 | + [id] [value1] [value2] ... [value n] | |
18 | + | |
19 | + This is a basic function reused by others like read_features. | |
20 | +    Returns a dictionary with the id as key and the list of values as the corresponding value. | |
21 | + ''' | |
22 | + # KeyToList type variable | |
23 | + key_to_list = dict() | |
24 | + with open(file_path, "r") as f: | |
25 | + for line in f: | |
26 | + splited = line.replace("\n", "").split(" ") | |
27 | + id = splited[0] | |
28 | + values = splited[1:] | |
29 | + key_to_list[id] = values | |
30 | + return key_to_list | |
31 | + | |
32 | + | |
33 | +def read_features(file_path: str) -> KeyToFeatures: | |
34 | + ''' | |
35 | +    Read a features file: like read_lst, but each list of values is converted to a numpy array of floats. ''' | |
36 | + # KeyToFeatures type variable | |
37 | + key_to_features = dict() | |
38 | + # and the KeyToList | |
39 | + key_to_list = read_lst(file_path) | |
40 | + | |
41 | + for key_, list_ in key_to_list.items(): | |
42 | + key_to_features[key_] = np.asarray(list_, dtype=float) | |
43 | + | |
44 | + return key_to_features |
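A short usage sketch for the two readers (file name and ids are illustrative):

    # features.txt, one id followed by its values per line:
    #   utt1 0.1 0.2
    #   utt2 0.3 0.4
    from volia.core.data import read_lst, read_features

    lists = read_lst("features.txt")       # {"utt1": ["0.1", "0.2"], ...}
    feats = read_features("features.txt")  # {"utt1": array([0.1, 0.2]), ...}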
volia/core/measures.py
1 | +''' | |
2 | +This module is a part of my library. | |
3 | +It aims to compute some measures for clustering. | |
4 | +''' | |
5 | + | |
6 | +import numpy as np | |
7 | + | |
8 | +def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None): | |
9 | + ''' | |
10 | +    Compute the disequilibrium for all the clusters. | |
11 | +    The disequilibrium is computed from the difference | |
12 | +    between two clustering sets. | |
13 | +    isGlobal lets the user choose the denominator of | |
14 | +    the function: | |
15 | +    - True: divide the values by the total number of elements | |
16 | +    - False: divide the values by the number of elements in each cluster | |
17 | + | |
18 | +    mod lets the user choose whether to square the values ("power"), | |
19 | +    scale them to percentages ("human"), or take the absolute value ("abs"). | |
20 | + ''' | |
21 | + | |
22 | + def divide_line(a, divider): | |
23 | + ''' | |
24 | + Sub function used for dividing matrix by a vector line by line. | |
25 | + ''' | |
26 | + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |
27 | + | |
28 | + dividers1 = 0 | |
29 | + dividers2 = 0 | |
30 | + | |
31 | + if isGlobal: | |
32 | + dividers1 = matrix1.sum() | |
33 | + dividers2 = matrix2.sum() | |
34 | + else: | |
35 | + dividers1 = matrix1.sum(axis=1) | |
36 | + dividers2 = matrix2.sum(axis=1) | |
37 | + | |
38 | +    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1) | |
39 | + | |
40 | +    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2) | |
41 | + | |
42 | + diff = matrix1_divided - matrix2_divided | |
43 | + | |
44 | + mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0)) | |
45 | + | |
46 | + result = diff | |
47 | + | |
48 | +    if mod is not None and mod != "": | |
49 | + for word in mod.split(" "): | |
50 | + if word == "power": | |
51 | + result = np.power(result,2) | |
52 | + elif word == "human": | |
53 | + result = result * 100 | |
54 | + elif word == "abs": | |
55 | + result = np.absolute(result) | |
56 | + else: | |
57 | +                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")") | |
58 | + return (mask, result) | |
59 | + | |
60 | + | |
61 | + | |
62 | +def disequilibrium_mean_by_cluster(mask, matrix): | |
63 | + ''' | |
64 | + Mean of disequilibrium | |
65 | + matrix is the disequilibrium calculated | |
66 | + from number of occurences belonging to a class, | |
67 | + for each cluster. | |
68 | + ''' | |
69 | + nb_k = len(matrix) | |
70 | + results = np.zeros((nb_k)) | |
71 | + | |
72 | + for i in range(nb_k): | |
73 | + results[i] = matrix[i].sum() / mask[i].sum() | |
74 | + return results | |
75 | + | |
76 | + | |
77 | +def disequilibrium(matrix1, matrix2, isGlobal=False): | |
78 | + ''' | |
79 | +    Return the mask, the disequilibrium matrix (in percent), | |
80 | +    and the mean disequilibrium value. | |
81 | + ''' | |
82 | + mask, result = disequilibrium_(matrix1, matrix2, isGlobal) | |
83 | + result_human = result * 100 | |
84 | + result_power = np.power(result, 2) | |
85 | + | |
86 | + return ( | |
87 | + mask, | |
88 | + result_human, | |
89 | + disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0] | |
90 | + ) | |
91 | + | |
92 | + | |
93 | +def compute_count_matrix(y_truth, y_hat): | |
94 | + ''' | |
95 | +    Build the count matrix (clusters x classes) from the two label vectors. | |
96 | + ''' | |
97 | + # Check size of the lists | |
98 | + assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}" | |
99 | + | |
100 | + # Build count matrix | |
101 | +    count_matrix = np.zeros((np.max(y_hat) + 1, np.max(y_truth) + 1)) | |
102 | + for i in range(len(y_hat)): | |
103 | + count_matrix[y_hat[i]][y_truth[i]] += 1 | |
104 | + return count_matrix | |
105 | + | |
106 | + | |
107 | +def entropy_score(y_truth, y_hat): | |
108 | + ''' | |
109 | +    Use a label encoder on y_truth and y_hat before calling this function. | |
110 | +    Don't use one-hot labels. | |
111 | + | |
112 | +    Return a tuple with: | |
113 | +    - result_matrix : the matrix of -P(x) * log2(P(x)) terms | |
114 | +    - result_vector : the entropy of each cluster (one value per cluster) | |
115 | + - result : the final entropy measure of the clustering | |
116 | + ''' | |
117 | + def divide_line(a, divider): | |
118 | + ''' | |
119 | + Sub function used for dividing matrix by a vector line by line. | |
120 | + ''' | |
121 | + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |
122 | + | |
123 | + # Build count matrix | |
124 | + count_matrix = compute_count_matrix(y_truth, y_hat) | |
125 | + | |
126 | + # Build dividers vector | |
127 | + dividers = count_matrix.sum(axis=1) | |
128 | + | |
129 | +    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers) | |
130 | + | |
131 | + log_matrix = np.zeros(matrix_divided.shape) | |
132 | + np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0) | |
133 | + result_matrix = -1 * np.multiply(matrix_divided, log_matrix) | |
134 | + result_vector = result_matrix.sum(axis=1) | |
135 | + | |
136 | + | |
137 | + if np.isnan(np.sum(result_vector)): | |
138 | + print("COUNT MATRIX") | |
139 | + print(count_matrix) | |
140 | + print("MATRIX DIVIDED") | |
141 | + print(matrix_divided) | |
142 | + print("RESULT MATRIX") | |
143 | + print(result_matrix) | |
144 | + print("VECTOR MATRIX") | |
145 | + print(result_vector) | |
146 | +        print("An error occurred due to a NaN value; the matrices above are printed for debugging") | |
147 | + exit(1) | |
148 | + | |
149 | + result = result_vector * dividers / dividers.sum() | |
150 | + result = result.sum() | |
151 | + return (result_matrix, result_vector, result) | |
152 | + | |
153 | + | |
154 | +def purity_score(y_truth, y_hat): | |
155 | + ''' | |
156 | + Return three values in a dictionary: | |
157 | + - purity_class_score: the purity score of the class (asp) | |
158 | + - purity_cluster_score: the purity score of the cluster (acp) | |
159 | + - K: the overall evaluation criterion (sqrt(asp * acp)) | |
160 | + | |
161 | + This function is based on the following article: | |
162 | + Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan | |
163 | + ''' | |
164 | + | |
165 | + def divide_line(a, divider): | |
166 | + ''' | |
167 | + Sub function used for dividing matrix by a vector line by line. | |
168 | + ''' | |
169 | + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |
170 | + | |
171 | + def compute_purity_score(count_matrix, axis=0): | |
172 | + if axis==0: | |
173 | + other_axis = 1 | |
174 | + else: | |
175 | + other_axis = 0 | |
176 | + count_per_row = count_matrix.sum(axis=axis) | |
177 | + dividers = np.square(count_per_row) | |
178 | + | |
179 | + count_matrix_squared = np.square(count_matrix) | |
180 | +        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers) | |
181 | + vector_purity = np.sum(matrix_divided, axis=axis) | |
182 | + | |
183 | + scalar_purity = np.average(vector_purity, weights=count_per_row) | |
184 | + return (vector_purity, scalar_purity) | |
185 | + | |
186 | + | |
187 | + count_matrix = compute_count_matrix(y_truth, y_hat) | |
188 | + _, purity_cluster_score = compute_purity_score(count_matrix, 1) | |
189 | +    _, purity_class_score = compute_purity_score(count_matrix, 0) | |
190 | + | |
191 | + K = np.sqrt(purity_cluster_score * purity_class_score) | |
192 | + | |
193 | + | |
194 | + | |
195 | + | |
196 | + | |
197 | + return { | |
198 | + "purity_class_score": purity_class_score, | |
199 | + "purity_cluster_score": purity_cluster_score, | |
200 | + "K": K | |
201 | + } | |
202 | + | |
203 | + | |
204 | +if __name__ == "__main__": | |
205 | + print("Purity test #1") | |
206 | + # Hypothesis | |
207 | + y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0]) | |
208 | + # Truth | |
209 | + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) | |
210 | + | |
211 | + (result_matrix, result_vector, result) = entropy_score(y, y_hat) | |
212 | + print(purity_score(y, y_hat)) | |
213 | + | |
214 | + exit(1) | |
215 | + print("Purity test #2") | |
216 | + # Hypothesis | |
217 | + y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4]) | |
218 | + # Truth | |
219 | + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3]) | |
220 | + | |
221 | + (result_matrix, result_vector, result) = entropy_score(y, y_hat) | |
222 | + exit(1) | |
223 | + print("Result matrix: ") | |
224 | + print(result_matrix) | |
225 | + print("Result vector: ") | |
226 | + print(result_vector) | |
227 | + print("Result: ", result) |
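The count matrix that entropy_score and purity_score build can be checked by hand for purity test #1 above; rows are clusters (y_hat), columns are true classes (y_truth). purity_score then squares each cell, normalizes by the squared row or column totals, and returns K = sqrt(asp * acp):

    import numpy as np

    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    y     = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
    count = np.zeros((np.max(y_hat) + 1, np.max(y) + 1))
    for cluster, klass in zip(y_hat, y):
        count[int(cluster)][int(klass)] += 1
    print(count)
    # [[1. 2. 0. 1.]
    #  [1. 1. 0. 0.]
    #  [1. 0. 2. 0.]
    #  [0. 0. 1. 2.]]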
volia/data_io.py
1 | -''' | |
2 | -Data management input/output | |
3 | -''' | |
4 | - | |
5 | -# Import packages and modules | |
6 | -import numpy as np | |
7 | - | |
8 | -# Defining some types | |
9 | -from typing import List, Dict | |
10 | -KeyToList = Dict[str, List[str]] | |
11 | -KeyToFeatures = Dict[str, List[float]] | |
12 | - | |
13 | - | |
14 | -def read_lst(file_path: str) -> KeyToList: | |
15 | - ''' | |
16 | - Read lst file with this structure: | |
17 | - [id] [value1] [value2] ... [value n] | |
18 | - | |
19 | - This is a basic function reused by others like read_features. | |
20 | -    Returns a dictionary with the id as key and the list of values as the corresponding value. | |
21 | - ''' | |
22 | - # KeyToList type variable | |
23 | - key_to_list = dict() | |
24 | - with open(file_path, "r") as f: | |
25 | - for line in f: | |
26 | - splited = line.replace("\n", "").split(" ") | |
27 | - id = splited[0] | |
28 | - values = splited[1:] | |
29 | - key_to_list[id] = values | |
30 | - return key_to_list | |
31 | - | |
32 | - | |
33 | -def read_features(file_path: str) -> KeyToFeatures: | |
34 | - ''' | |
35 | -    Read a features file: like read_lst, but each list of values is converted to a numpy array of floats. ''' | |
36 | - # KeyToFeatures type variable | |
37 | - key_to_features = dict() | |
38 | - # and the KeyToList | |
39 | - key_to_list = read_lst(file_path) | |
40 | - | |
41 | - for key_, list_ in key_to_list.items(): | |
42 | - key_to_features[key_] = np.asarray(list_, dtype=float) | |
43 | - | |
44 | - return key_to_features |
volia/filter_ids.py
1 | +import argparse | |
2 | +from os.path import isfile | |
3 | +from volia.core.data import read_lst | |
4 | + | |
5 | + | |
6 | +if __name__ == "__main__": | |
7 | + parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset") | |
8 | +    parser.add_argument("file", type=str, help="lst file to filter") | |
9 | +    parser.add_argument("--filter", default=None, type=str, help="file with the ids to keep") | |
10 | +    parser.add_argument("--outfile", default="out.txt", type=str, help="output file") | |
11 | + | |
12 | + args = parser.parse_args() | |
13 | + | |
14 | + assert args.filter is not None | |
15 | + assert isfile(args.file) | |
16 | + | |
17 | + list_ = read_lst(args.file) | |
18 | + filter_ = read_lst(args.filter) | |
19 | + | |
20 | + with open(args.outfile, "w") as of: | |
21 | + for key in filter_.keys(): | |
22 | + of.write(key + " " + " ".join(list_[key]) + "\n") | |
23 | + | |
24 | +    print("File filtered and written to:", args.outfile) |
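A hypothetical invocation (file names illustrative), assuming the volia package is importable: given data.lst containing 'utt1 a b' and 'utt2 c d', and keep.lst containing the single id 'utt2',

    python volia/filter_ids.py data.lst --filter keep.lst --outfile out.txt

writes 'utt2 c d' to out.txt. An id present in the filter but missing from the input file would raise a KeyError.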
volia/measures.py
1 | -''' | |
2 | -This module is a part of my library. | |
3 | -It aims to compute some measures for clustering. | |
4 | -''' | |
5 | - | |
6 | -import numpy as np | |
7 | - | |
8 | -def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None): | |
9 | - ''' | |
10 | -    Compute the disequilibrium for all the clusters. | |
11 | -    The disequilibrium is computed from the difference | |
12 | -    between two clustering sets. | |
13 | -    isGlobal lets the user choose the denominator of | |
14 | -    the function: | |
15 | -    - True: divide the values by the total number of elements | |
16 | -    - False: divide the values by the number of elements in each cluster | |
17 | - | |
18 | -    mod lets the user choose whether to square the values ("power"), | |
19 | -    scale them to percentages ("human"), or take the absolute value ("abs"). | |
20 | - ''' | |
21 | - | |
22 | - def divide_line(a, divider): | |
23 | - ''' | |
24 | - Sub function used for dividing matrix by a vector line by line. | |
25 | - ''' | |
26 | - return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |
27 | - | |
28 | - dividers1 = 0 | |
29 | - dividers2 = 0 | |
30 | - | |
31 | - if isGlobal: | |
32 | - dividers1 = matrix1.sum() | |
33 | - dividers2 = matrix2.sum() | |
34 | - else: | |
35 | - dividers1 = matrix1.sum(axis=1) | |
36 | - dividers2 = matrix2.sum(axis=1) | |
37 | - | |
38 | -    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1) | |
39 | - | |
40 | -    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2) | |
41 | - | |
42 | - diff = matrix1_divided - matrix2_divided | |
43 | - | |
44 | - mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0)) | |
45 | - | |
46 | - result = diff | |
47 | - | |
48 | -    if mod is not None and mod != "": | |
49 | - for word in mod.split(" "): | |
50 | - if word == "power": | |
51 | - result = np.power(result,2) | |
52 | - elif word == "human": | |
53 | - result = result * 100 | |
54 | - elif word == "abs": | |
55 | - result = np.absolute(result) | |
56 | - else: | |
57 | -                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")") | |
58 | - return (mask, result) | |
59 | - | |
60 | - | |
61 | - | |
62 | -def disequilibrium_mean_by_cluster(mask, matrix): | |
63 | - ''' | |
64 | - Mean of disequilibrium | |
65 | - matrix is the disequilibrium calculated | |
66 | - from number of occurences belonging to a class, | |
67 | - for each cluster. | |
68 | - ''' | |
69 | - nb_k = len(matrix) | |
70 | - results = np.zeros((nb_k)) | |
71 | - | |
72 | - for i in range(nb_k): | |
73 | - results[i] = matrix[i].sum() / mask[i].sum() | |
74 | - return results | |
75 | - | |
76 | - | |
77 | -def disequilibrium(matrix1, matrix2, isGlobal=False): | |
78 | - ''' | |
79 | -    Return the mask, the disequilibrium matrix (in percent), | |
80 | -    and the mean disequilibrium value. | |
81 | - ''' | |
82 | - mask, result = disequilibrium_(matrix1, matrix2, isGlobal) | |
83 | - result_human = result * 100 | |
84 | - result_power = np.power(result, 2) | |
85 | - | |
86 | - return ( | |
87 | - mask, | |
88 | - result_human, | |
89 | - disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0] | |
90 | - ) | |
91 | - | |
92 | - | |
93 | -def compute_count_matrix(y_truth, y_hat): | |
94 | - ''' | |
95 | -    Build the count matrix (clusters x classes) from the two label vectors. | |
96 | - ''' | |
97 | - # Check size of the lists | |
98 | - assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}" | |
99 | - | |
100 | - # Build count matrix | |
101 | -    count_matrix = np.zeros((np.max(y_hat) + 1, np.max(y_truth) + 1)) | |
102 | - for i in range(len(y_hat)): | |
103 | - count_matrix[y_hat[i]][y_truth[i]] += 1 | |
104 | - return count_matrix | |
105 | - | |
106 | - | |
107 | -def entropy_score(y_truth, y_hat): | |
108 | - ''' | |
109 | -    Use a label encoder on y_truth and y_hat before calling this function. | |
110 | -    Don't use one-hot labels. | |
111 | - | |
112 | -    Return a tuple with: | |
113 | -    - result_matrix : the matrix of -P(x) * log2(P(x)) terms | |
114 | -    - result_vector : the entropy of each cluster (one value per cluster) | |
115 | - - result : the final entropy measure of the clustering | |
116 | - ''' | |
117 | - def divide_line(a, divider): | |
118 | - ''' | |
119 | - Sub function used for dividing matrix by a vector line by line. | |
120 | - ''' | |
121 | - return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |
122 | - | |
123 | - # Build count matrix | |
124 | - count_matrix = compute_count_matrix(y_truth, y_hat) | |
125 | - | |
126 | - # Build dividers vector | |
127 | - dividers = count_matrix.sum(axis=1) | |
128 | - | |
129 | -    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers) | |
130 | - | |
131 | - log_matrix = np.zeros(matrix_divided.shape) | |
132 | - np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0) | |
133 | - result_matrix = -1 * np.multiply(matrix_divided, log_matrix) | |
134 | - result_vector = result_matrix.sum(axis=1) | |
135 | - | |
136 | - | |
137 | - if np.isnan(np.sum(result_vector)): | |
138 | - print("COUNT MATRIX") | |
139 | - print(count_matrix) | |
140 | - print("MATRIX DIVIDED") | |
141 | - print(matrix_divided) | |
142 | - print("RESULT MATRIX") | |
143 | - print(result_matrix) | |
144 | - print("VECTOR MATRIX") | |
145 | - print(result_vector) | |
146 | -        print("An error occurred due to a NaN value; the matrices above are printed for debugging") | |
147 | - exit(1) | |
148 | - | |
149 | - result = result_vector * dividers / dividers.sum() | |
150 | - result = result.sum() | |
151 | - return (result_matrix, result_vector, result) | |
152 | - | |
153 | - | |
154 | -def purity_score(y_truth, y_hat): | |
155 | - ''' | |
156 | - Return three values in a dictionary: | |
157 | - - purity_class_score: the purity score of the class (asp) | |
158 | - - purity_cluster_score: the purity score of the cluster (acp) | |
159 | - - K: the overall evaluation criterion (sqrt(asp * acp)) | |
160 | - | |
161 | - This function is based on the following article: | |
162 | - Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan | |
163 | - ''' | |
164 | - | |
165 | - def divide_line(a, divider): | |
166 | - ''' | |
167 | - Sub function used for dividing matrix by a vector line by line. | |
168 | - ''' | |
169 | - return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |
170 | - | |
171 | - def compute_purity_score(count_matrix, axis=0): | |
172 | - if axis==0: | |
173 | - other_axis = 1 | |
174 | - else: | |
175 | - other_axis = 0 | |
176 | - count_per_row = count_matrix.sum(axis=axis) | |
177 | - dividers = np.square(count_per_row) | |
178 | - | |
179 | - count_matrix_squared = np.square(count_matrix) | |
180 | -        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers) | |
181 | - vector_purity = np.sum(matrix_divided, axis=axis) | |
182 | - | |
183 | - scalar_purity = np.average(vector_purity, weights=count_per_row) | |
184 | - return (vector_purity, scalar_purity) | |
185 | - | |
186 | - | |
187 | - count_matrix = compute_count_matrix(y_truth, y_hat) | |
188 | - _, purity_cluster_score = compute_purity_score(count_matrix, 1) | |
189 | -    _, purity_class_score = compute_purity_score(count_matrix, 0) | |
190 | - | |
191 | - K = np.sqrt(purity_cluster_score * purity_class_score) | |
192 | - | |
193 | - | |
194 | - | |
195 | - | |
196 | - | |
197 | - return { | |
198 | - "purity_class_score": purity_class_score, | |
199 | - "purity_cluster_score": purity_cluster_score, | |
200 | - "K": K | |
201 | - } | |
202 | - | |
203 | - | |
204 | -if __name__ == "__main__": | |
205 | - print("Purity test #1") | |
206 | - # Hypothesis | |
207 | - y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0]) | |
208 | - # Truth | |
209 | - y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) | |
210 | - | |
211 | - (result_matrix, result_vector, result) = entropy_score(y, y_hat) | |
212 | - print(purity_score(y, y_hat)) | |
213 | - | |
214 | - exit(1) | |
215 | - print("Purity test #2") | |
216 | - # Hypothesis | |
217 | - y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4]) | |
218 | - # Truth | |
219 | - y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3]) | |
220 | - | |
221 | - (result_matrix, result_vector, result) = entropy_score(y, y_hat) | |
222 | - exit(1) | |
223 | - print("Result matrix: ") | |
224 | - print(result_matrix) | |
225 | - print("Result vector: ") | |
226 | - print(result_vector) | |
227 | - print("Result: ", result) |
volia/plot-character.py
1 | + | |
2 | +import matplotlib.pyplot as plt | |
3 | +import numpy as np | |
4 | +import pandas as pd | |
5 | +import argparse | |
6 | +from os.path import isfile | |
7 | +from volia.data_io import read_features, read_lst | |
8 | + | |
9 | + | |
10 | +if __name__ == "__main__": | |
11 | + # Argparse | |
12 | + parser = argparse.ArgumentParser(description="Plot points with color for each character") | |
13 | + parser.add_argument("--features", type=str, help="features file path") | |
14 | +    parser.add_argument("--utt2char", type=str, help="utt2char file path") | |
15 | +    parser.add_argument("--sublist", type=str, default=None, help="whitelist of ids to take into account") | |
16 | +    parser.add_argument("--outfile", default="out.pdf", type=str, help="output file path") | |
17 | + parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title") | |
18 | + args = parser.parse_args() | |
19 | + | |
20 | + # List of assertions | |
21 | + assert args.features, "Need to specify features option" | |
22 | +    assert args.utt2char, "Need to specify utt2char option" | |
23 | +    assert isfile(args.features), "Features path should point to a file" | |
24 | +    assert isfile(args.utt2char), "utt2char path should point to a file" | |
25 | + if args.sublist is not None: | |
26 | + assert isfile(args.sublist), "sublist path should point to a file" | |
27 | + | |
28 | + | |
29 | + id_to_features = read_features(args.features) | |
30 | + | |
31 | + ids = [] | |
32 | + if args.sublist is not None: | |
33 | + print("Using sublist") | |
34 | + list_ids = read_lst(args.sublist) | |
35 | + ids = [ key for key in list_ids.keys() ] | |
36 | + else: | |
37 | + ids = [ key for key in id_to_features.keys() ] | |
38 | + | |
39 | + utt2char = read_lst(args.utt2char) | |
40 | + | |
41 | + features = [ id_to_features[id_] for id_ in ids ] | |
42 | + features = np.vstack(features) | |
43 | + | |
44 | + characters_list = [ utt2char[id_][0] for id_ in ids ] | |
45 | + | |
46 | + features_T = features.transpose() | |
47 | + print("Number of characters: ", len(np.unique(characters_list))) | |
48 | + df = pd.DataFrame(dict( | |
49 | + x=features_T[0], | |
50 | + y=features_T[1], | |
51 | + character=characters_list)) | |
52 | + | |
53 | + groups = df.groupby('character') | |
54 | + | |
55 | + # Plot | |
56 | + fig, ax = plt.subplots() | |
57 | + | |
58 | + for character, group in groups: | |
59 | + p = ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character) | |
60 | + ax.legend() | |
61 | + plt.savefig(args.outfile) | |
62 | +    print("Plot saved to:", args.outfile) |
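A hypothetical invocation (paths illustrative), assuming a 2-d features file and an utt2char file mapping each utterance id to a character name:

    python volia/plot-character.py --features feats_tsne.txt --utt2char utt2char.lst --outfile characters.pdf

Only the first two feature dimensions are plotted, and --title is parsed but never applied to the figure.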
volia/test.py
volia/tsne.py
1 | +''' | |
2 | +Compute the t-SNE representation of p-vectors and write it to a file. | |
3 | +''' | |
4 | + | |
5 | +import os | |
6 | +from os.path import isfile | |
7 | +import argparse | |
8 | +import numpy as np | |
9 | +from sklearn.manifold import TSNE | |
10 | + | |
11 | +from volia.data_io import read_features | |
12 | + | |
13 | +if __name__ == "__main__": | |
14 | + # Defining argparse | |
15 | + parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d') | |
16 | + parser.add_argument('features', type=str, | |
17 | + help='the path of the file you want to calculate tsne') | |
18 | + parser.add_argument('-o', '--outfile', type=str, | |
19 | +                        default='out.txt', | |
20 | + help='the path of the output file.') | |
21 | + parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3], | |
22 | +                        default=2, | |
23 | + help='number of components output of tsne') | |
24 | + | |
25 | + args = parser.parse_args() | |
26 | + | |
27 | + assert isfile(args.features) | |
28 | + | |
29 | +    features_list = read_features(args.features) | |
30 | +    keys = list(features_list.keys()) | |
31 | +    features = np.vstack([ features_list[key] for key in keys ]) | |
32 | +    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features) | |
33 | + | |
34 | + with open(args.outfile, "w") as of: | |
35 | + for i in range(len(keys)): | |
36 | + of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n") | |
37 | +    print("t-SNE finished. Results written to", args.outfile) |
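A hypothetical invocation (paths illustrative); the output keeps the lst layout, one id followed by its 2 (or 3) t-SNE coordinates per line:

    python volia/tsne.py pvectors.txt -o pvectors_tsne.txt -n 2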