Commit 60d1f63cd51b15456f730cfe81e439c297c21995
1 parent
b3371498cf
Exists in
master
Script and lib that help measuring clustering
Showing 2 changed files with 252 additions and 0 deletions Inline Diff
bin/measure_clustering.py
File was created | 1 | ''' | |
2 | Compute some measures from clustering like | ||
3 | disequilibrium and gini. | ||
4 | ''' | ||
# TODO: Just the per-character (personnage) disequilibrium to begin with.
6 | |||
7 | import argparse | ||
8 | from data import read_file, index_by_id | ||
9 | import numpy as np | ||
10 | from sklearn import preprocessing | ||
11 | from measures import disequilibrium, entropy | ||
12 | from sklearn import metrics | ||
13 | import json | ||
14 | |||
# -- ARGPARSE
# CLI: three required list files plus the clustering file to evaluate,
# and an optional output path for the JSON results.
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str,
                    help="clustering file")
parser.add_argument("classlst", type=str,
                    help="List used for its classes.")
parser.add_argument("trainlst", type=str,
                    help="train list")
parser.add_argument("vallst", type=str,
                    help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
# Bind parsed CLI values to the module-level constants used below.
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile
35 | |||
# -- READ FILES
# Parse every list with the project-local helpers; `index_by_id` builds a
# nested lookup addressed below as ind[x[0][0]][x[0][3]].
# NOTE(review): presumably the keys are (show id, segment id) -> label pairs;
# confirm against the `data` module.
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# One class label and one cluster id per sample, for each split.
# BUG FIX: `np.int` was removed in NumPy 1.24 — the builtin `int` is the
# supported, equivalent dtype spelling.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Per-cluster occurrence counts for each split (cluster id -> count).
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Build one count matrix per clustering set.

    Rows are clusters and columns are classes: cell (k, c) holds the number
    of occurrences of class c inside cluster k.  Both returned matrices share
    the same row/column indexing so they can be compared cell by cell.

    :param classes1: class label per sample, first set
    :param clusters1: cluster id per sample, first set
    :param classes2: class label per sample, second set
    :param clusters2: cluster id per sample, second set
    :return: (counts_matrix1, counts_matrix2) as float ndarrays of shape
        (n_all_clusters, n_all_classes)
    '''
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # Shared, sorted label spaces over both sets.  np.unique returns sorted
    # values, so np.searchsorted below maps a class label to its column index
    # — the exact encoding a sklearn LabelEncoder fit on all_classes gives,
    # without re-running transform() for every (cluster, class) pair.
    all_classes = np.unique(np.concatenate((np.unique(classes1), np.unique(classes2))))
    all_clusters = np.unique(np.concatenate((np.unique(clusters1), np.unique(clusters2))))

    # Matrices: line (cluster) x column (class).
    counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
    counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))

    # BUG FIX: rows are indexed by the *position* of the cluster id in
    # all_clusters.  The previous `int(cluster)` indexing crashes (or writes
    # the wrong row) whenever cluster ids are not contiguous ints from 0.
    for row, cluster in enumerate(all_clusters):
        # First extract the classes present in this cluster, for each set.
        cc1 = np.extract(clusters1 == cluster, classes1)
        cc2 = np.extract(clusters2 == cluster, classes2)

        for cc, matrix in ((cc1, counts_matrix1), (cc2, counts_matrix2)):
            values, counts = np.unique(cc, return_counts=True)
            if len(values):
                matrix[row, np.searchsorted(all_classes, values)] = counts

    return (counts_matrix1, counts_matrix2)
115 | |||
116 | |||
# -- SCORES
# Standard clustering agreement scores for each split.
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

# Shared cluster/class count matrices, then the custom measures.
counts_matrix1, counts_matrix2 = generate_count_matrix(
    train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)

(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

results = {
    "train": {
        "vscore": train_vscore,
        "homogeneity": train_homogeneity,
        # BUG FIX: this cell previously stored val_completeness.
        "completeness": train_completeness,
    },
    "val": {
        "vscore": val_vscore,
        "homogeneity": val_homogeneity,
        "completeness": val_completeness,
    },
    "disequilibrium": dis_measures,
}

# Dump every measure as a single JSON document.
with open(OUTFILE, "w") as f:
    f.write(json.dumps(results))
bin/measures.py
File was created | 1 | ''' | |
2 | This module is a part of my library. | ||
3 | It aims to compute some measures for clustering. | ||
4 | ''' | ||
5 | |||
6 | import numpy as np | ||
7 | |||
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the per-cell disequilibrium between two clustering count
    matrices (rows = clusters, columns = classes).

    :param matrix1: first count matrix (ndarray)
    :param matrix2: second count matrix (ndarray), same shape
    :param isGlobal: selects the normalisation denominator:
        - True: divide each cell by the total number of elements
        - False: divide each cell by the number of elements of its cluster
    :param mod: optional space-separated post-processing keywords applied in
        order: "power" (square), "human" (x100), "abs" (absolute value)
    :return: (mask, result) — mask flags cells that are zero in BOTH input
        matrices; result is the (optionally transformed) difference of the
        normalised matrices.
    :raises Exception: if mod contains an unknown keyword.
    '''

    def divide_line(a, divider):
        '''Divide `a` by `divider` element-wise, leaving 0 where divider is 0.'''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    if isGlobal:
        # One scalar denominator per matrix: the total element count.
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        # One denominator per cluster (row sums).
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    # Applied column-wise (axis 0), so row i is divided by dividers[i].
    # BUG FIX: `np.float` was removed in NumPy 1.24; builtin `float` is the
    # supported equivalent.
    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    # Cells empty in both clusterings carry no information.
    mask = (matrix2 == 0) & (matrix1 == 0)
    result = diff

    # BUG FIX: the original condition `mod != None or mod == ""` entered the
    # loop for mod == "" and then raised on the empty keyword; an empty or
    # None mod now simply skips post-processing.
    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
    return (mask, result)
58 | |||
59 | |||
60 | |||
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Average the disequilibrium of each cluster.

    `matrix` holds the per-cell disequilibrium (rows = clusters, columns =
    classes); each row is summed and divided by the number of cells selected
    by the corresponding row of `mask`.
    NOTE(review): the denominator counts the masked (both-zero) cells, not
    the informative ones — confirm this is the intended weighting.
    '''
    n_clusters = len(matrix)
    return np.asarray(
        [matrix[k].sum() / mask[k].sum() for k in range(n_clusters)]
    )
73 | |||
74 | |||
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Compute the disequilibrium between two count matrices.

    Returns the both-empty-cell mask, a human-readable (x100) version of the
    raw differences, and one scalar: the per-cluster mean of the squared
    disequilibrium, summed over all clusters.
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)
    squared = np.power(raw, 2)
    per_cluster = disequilibrium_mean_by_cluster(mask, squared)
    return (mask, raw * 100, per_cluster.sum())
89 | |||
90 | |||
def entropy(count_matrix):
    '''
    Shannon entropy of the class distribution inside each cluster.

    :param count_matrix: ndarray, rows = clusters, columns = classes,
        cells = non-negative occurrence counts.
    :return: (result_matrix, result) — result_matrix holds the per-cell
        -p*log2(p) terms; result is each cluster's entropy weighted by its
        share of the total population, summed over clusters.
    '''
    def divide_line(a, divider):
        '''Divide `a` by `divider` element-wise, leaving 0 where divider is 0.'''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Per-cluster totals: used both to normalise rows and to weight clusters.
    dividers = count_matrix.sum(axis=1)

    # Applied column-wise (axis 0), so row i is divided by dividers[i].
    # BUG FIX: `np.float` was removed in NumPy 1.24; builtin `float` is the
    # supported equivalent.
    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    # BUG FIX: np.log2(..., where=...) without `out=` leaves the masked cells
    # uninitialised (possibly NaN, which then poisons 0 * NaN sums); zero-fill
    # them explicitly so empty cells contribute nothing to the entropy.
    log_terms = np.log2(matrix_divided, out=np.zeros_like(matrix_divided), where=count_matrix != 0)
    result_matrix = -1 * matrix_divided * log_terms
    result = result_matrix.sum(axis=1) * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result)