Commit 60d1f63cd51b15456f730cfe81e439c297c21995
1 parent
b3371498cf
Exists in
master
Script and library that help with measuring clustering quality
Showing 2 changed files with 252 additions and 0 deletions Side-by-side Diff
bin/measure_clustering.py
1 | +''' | |
2 | +Compute some measures from clustering like | |
3 | +disequilibrium and gini. | |
4 | +''' | |
# TODO: Just per-character disequilibrium to start with.
6 | + | |
7 | +import argparse | |
8 | +from data import read_file, index_by_id | |
9 | +import numpy as np | |
10 | +from sklearn import preprocessing | |
11 | +from measures import disequilibrium, entropy | |
12 | +from sklearn import metrics | |
13 | +import json | |
14 | + | |
15 | +# -- ARGPARSE | |
16 | +parser = argparse.ArgumentParser(description="Compute metrics from clustering") | |
17 | +parser.add_argument("clustering", type=str, | |
18 | + help="clustering file") | |
19 | +parser.add_argument("classlst", type=str, | |
20 | + help="List used for its classes.") | |
21 | +parser.add_argument("trainlst", type=str, | |
22 | + help="train list") | |
23 | +parser.add_argument("vallst", type=str, | |
24 | + help="val lst") | |
25 | +parser.add_argument("--outfile", type=str, default="out.out", | |
26 | + help="output file path") | |
27 | + | |
28 | +args = parser.parse_args() | |
29 | +CLUSTERING = args.clustering | |
30 | +CLASS_LST = args.classlst | |
31 | +TRAIN_LST = args.trainlst | |
32 | +VAL_LST = args.vallst | |
33 | +OUTFILE = args.outfile | |
34 | + | |
35 | + | |
# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# Each list entry is addressed as x[0][0] then x[0][3] into the indexed
# lists, which expose the label (resp. cluster id) at [0][1].
# NOTE(review): exact schema of read_file/index_by_id output is not visible
# here — structure assumed from the indexing pattern; confirm against data.py.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
# Fixed: np.int was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin int is the equivalent dtype argument.
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Cluster id -> number of occurrences, for each split.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Generate one count matrix per (classes, clusters) pairing.

    Lines are clusters and columns are classes, both taken as the sorted
    union of the values seen in either input set. Cell [i, j] contains
    the number of occurrences of class j inside cluster i.

    Fix over the previous version: cluster ids are mapped to row
    positions instead of being used directly as row indices, so
    non-contiguous cluster ids (e.g. [0, 5]) no longer raise IndexError
    or misplace rows. Class columns are resolved with np.searchsorted on
    the sorted unique labels, which yields exactly the same encoding a
    fitted LabelEncoder would.

    :param classes1: array-like of class labels of set 1
    :param clusters1: array-like of cluster ids of set 1 (same length)
    :param classes2: array-like of class labels of set 2
    :param clusters2: array-like of cluster ids of set 2 (same length)
    :return: (counts_matrix1, counts_matrix2), each of shape
             (n_clusters, n_classes)
    '''
    classes1 = np.asarray(classes1)
    clusters1 = np.asarray(clusters1)
    classes2 = np.asarray(classes2)
    clusters2 = np.asarray(clusters2)

    # Sorted union of labels: column j corresponds to all_classes[j].
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))
    all_clusters = np.unique(np.concatenate((np.unique(clusters1),
                                             np.unique(clusters2))))

    # Create matrix lin(clust) col(class)
    counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
    counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))

    for row, cluster in enumerate(all_clusters):
        # First extract the classes present in this cluster, per set.
        cc1 = np.extract(clusters1 == cluster, classes1)
        cc2 = np.extract(clusters2 == cluster, classes2)

        for cc, matrix in ((cc1, counts_matrix1), (cc2, counts_matrix2)):
            uniq, counts = np.unique(cc, return_counts=True)
            # all_classes is sorted, so searchsorted gives column indices.
            cols = np.searchsorted(all_classes, uniq)
            matrix[row, cols] = counts
    return (counts_matrix1, counts_matrix2)
115 | + | |
116 | + | |
# -- SCORES
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
# Dropped the redundant chained assignment (val_homogeneity = homogeneity_val = ...).
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)

(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- GATHER RESULTS
results = {}
results["train"] = {}
results["train"]["vscore"] = train_vscore
results["train"]["homogeneity"] = train_homogeneity
# Fixed: this key previously stored val_completeness.
results["train"]["completeness"] = train_completeness
# Previously computed but never written out.
results["train"]["entropy"] = train_entropy

results["val"] = {}
results["val"]["vscore"] = val_vscore
results["val"]["homogeneity"] = val_homogeneity
results["val"]["completeness"] = val_completeness
results["val"]["entropy"] = val_entropy

results["disequilibrium"] = dis_measures

with open(OUTFILE, "w") as f:
    f.write(json.dumps(results))
bin/measures.py
1 | +''' | |
2 | +This module is a part of my library. | |
3 | +It aims to compute some measures for clustering. | |
4 | +''' | |
5 | + | |
6 | +import numpy as np | |
7 | + | |
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the per-cell disequilibrium between two clustering count
    matrices (lines = clusters, columns = classes).

    The disequilibrium is computed from the difference between the two
    matrices once each is normalised into a distribution.

    :param matrix1: first count matrix
    :param matrix2: second count matrix, same shape
    :param isGlobal: selects the normalisation denominator:
        - True: divide every cell by the total number of elements
        - False: divide every cell by the number of elements of its cluster
    :param mod: optional space-separated modifiers, applied in order:
        "power" squares the values, "human" multiplies them by 100,
        "abs" takes the absolute value. None or "" applies nothing.
    :raises Exception: if mod contains an unknown word.
    :return: (mask, result) where mask flags the cells that are zero in
        both inputs and result is the (modified) difference matrix.
    '''

    def divide_line(a, divider):
        '''
        Sub function used for dividing matrix by a vector line by line.
        '''
        # out=zeros + where keeps 0 where the divider is 0 (no nan/inf).
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        # One divider per cluster line: the cluster size.
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    # Fixed: np.float was removed from NumPy; builtin float is equivalent.
    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)
    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    result = matrix1_divided - matrix2_divided

    # Cells absent from both matrices; callers use this to ignore them.
    mask = (matrix2 == 0) & (matrix1 == 0)

    # Fixed condition: the old `mod != None or mod == ""` entered the
    # loop for mod == "" and raised on the resulting empty word.
    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
58 | + | |
59 | + | |
60 | + | |
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of disequilibrium, one value per cluster line.

    matrix is the disequilibrium computed from the per-class occurrence
    counts of each cluster; mask flags, for each cluster line, the cells
    that were empty in both count matrices.

    :return: array with one averaged value per cluster line.
    '''
    per_cluster = [line.sum() / line_mask.sum()
                   for line, line_mask in zip(matrix, mask)]
    return np.asarray(per_cluster)
73 | + | |
74 | + | |
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Convenience wrapper producing both the disequilibrium matrix and a
    single disequilibrium value.

    :return: tuple of (mask of cells empty in both inputs,
        disequilibrium matrix scaled to percents for human reading,
        scalar value from the per-cluster mean of squared differences).
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)
    human_readable = 100 * raw
    squared = raw ** 2
    global_value = disequilibrium_mean_by_cluster(mask, squared).sum()
    return (mask, human_readable, global_value)
89 | + | |
90 | + | |
def entropy(count_matrix):
    '''
    Shannon entropy of the class distribution of each cluster.

    Lines of count_matrix are clusters, columns are classes.

    :param count_matrix: matrix of occurrence counts (clusters x classes)
    :return: (result_matrix, result) where result_matrix holds the
        per-cell -p*log2(p) terms and result is the per-cluster entropy
        weighted by cluster size, summed into a single scalar.
    '''
    def divide_line(a, divider):
        '''
        Sub function used for dividing matrix by a vector line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # One divider per cluster line: the cluster size.
    dividers = count_matrix.sum(axis=1)

    # Fixed: np.float was removed from NumPy; builtin float is equivalent.
    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    # Fixed: np.log2 with `where` but no `out` leaves the skipped cells
    # uninitialised, so 0 * garbage could produce NaN. A zeroed `out`
    # makes the zero-count cells contribute exactly 0, as intended.
    log_matrix = np.log2(matrix_divided, out=np.zeros_like(matrix_divided), where=count_matrix != 0)
    result_matrix = -1 * matrix_divided * log_matrix

    # Weight each cluster's entropy by its share of all samples.
    result = (result_matrix.sum(axis=1) * dividers / dividers.sum()).sum()
    return (result_matrix, result)