Commit 60d1f63cd51b15456f730cfe81e439c297c21995

Authored by Mathias Quillot
1 parent b3371498cf
Exists in master

Script and library that help measure clustering

Showing 2 changed files with 252 additions and 0 deletions Inline Diff

bin/measure_clustering.py
'''
Compute some measures from clustering like
disequilibrium and gini.
'''
# TODO: Just the per-character disequilibrium to start with.

import argparse
import json

import numpy as np
from sklearn import metrics, preprocessing

from data import read_file, index_by_id
from measures import disequilibrium, entropy

# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str,
                    help="clustering file")
parser.add_argument("classlst", type=str,
                    help="List used for its classes.")
parser.add_argument("trainlst", type=str,
                    help="train list")
parser.add_argument("vallst", type=str,
                    help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile


# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# Each entry x is looked up as x[0][0] then x[0][3] in the indexed
# structures, yielding a (?, label) pair whose second field is the
# class/cluster label — presumably (file id, segment id) keys; TODO
# confirm against data.index_by_id.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
# BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int is the documented replacement.
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Occurrence count of every cluster id, per set.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Generate one count matrix per (classes, clusters) set.

    Lines are clusters and columns are classes; cell [k][c] contains the
    number of occurrences of class c inside cluster k.  Both returned
    matrices share the same (sorted) row and column indexing so they can
    be compared cell by cell.

    Returns (counts_matrix1, counts_matrix2).
    '''
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # Joint label spaces (np.unique returns them sorted), so both
    # matrices are aligned on the same rows/columns.
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))
    all_clusters = np.unique(np.concatenate((np.unique(clusters1),
                                             np.unique(clusters2))))

    # Sorted-order label -> column index mapping; this is exactly what
    # sklearn's LabelEncoder produced, without the extra dependency.
    class_index = {label: i for i, label in enumerate(all_classes)}

    # Matrices: line(cluster) x column(class)
    counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
    counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))

    # BUGFIX: rows were indexed by the raw cluster *value*
    # (matrix[int(cluster)]), which breaks when cluster ids are not the
    # contiguous range 0..K-1; indexing by position is identical for
    # contiguous ids and correct otherwise.
    for row, cluster in enumerate(all_clusters):
        for matrix, classes, clusters in (
                (counts_matrix1, classes1, clusters1),
                (counts_matrix2, classes2, clusters2)):
            # Classes present in this cluster, with their counts.
            members = np.extract(clusters == cluster, classes)
            labels, counts = np.unique(members, return_counts=True)
            for label, count in zip(labels, counts):
                matrix[row][class_index[label]] = count
    return (counts_matrix1, counts_matrix2)
# -- COMPUTE CLUSTERING METRICS (train and validation sets)
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)

train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(
    train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(
    counts_matrix1, counts_matrix2, isGlobal=False)

(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- AGGREGATE RESULTS
results = {
    "train": {
        "vscore": train_vscore,
        "homogeneity": train_homogeneity,
        # BUGFIX: this slot previously stored the *validation*
        # completeness instead of the train one.
        "completeness": train_completeness,
    },
    "val": {
        "vscore": val_vscore,
        "homogeneity": val_homogeneity,
        "completeness": val_completeness,
    },
    "disequilibrium": dis_measures,
}

# -- WRITE OUTPUT as JSON
with open(OUTFILE, "w") as f:
    f.write(json.dumps(results))
File was created 1 '''
2 This module is a part of my library.
3 It aims to compute some measures for clustering.
4 '''
5
6 import numpy as np
7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the per-cell disequilibrium between two clustering count
    matrices (lines = clusters, columns = classes).

    The disequilibrium is computed from the difference between the two
    normalised matrices.

    isGlobal selects the denominator used to normalise each matrix:
      - True: divide each cell by the total number of elements
      - False: divide each cell by the number of elements of its cluster
        (row total)

    mod is a space-separated list of post-processing steps applied to the
    difference: "power" squares it, "human" multiplies it by 100, "abs"
    takes the absolute value.

    Returns (mask, result) where mask flags cells that are zero in both
    input matrices.
    '''

    def divide_line(a, divider):
        '''Divide a matrix column by a vector, mapping x/0 to 0.'''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    # axis=0 hands divide_line one *column* at a time; dividing each
    # column element-wise by the vector of row totals normalises every
    # row by its own total.
    # BUGFIX: np.float was removed from NumPy (1.24); builtin float is
    # the documented replacement.
    matrix1_divided = np.apply_along_axis(
        divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)
    matrix2_divided = np.apply_along_axis(
        divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    # Cells empty in both sets carry no information.
    mask = (matrix2 == 0) & (matrix1 == 0)

    result = diff
    # BUGFIX: was `if mod != None or mod == ""`, which is False for None
    # but True for "" — and "".split(" ") then raised the "accepted mod"
    # exception.  Truthiness handles both None and "".
    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
58
59
60
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Average the disequilibrium cluster by cluster.

    matrix holds the disequilibrium computed from the per-class
    occurrence counts, one row per cluster; mask flags, row by row, the
    cells taken into account by the normalisation.  Returns one mean
    value per cluster.
    '''
    return np.asarray(
        [row.sum() / flags.sum() for row, flags in zip(matrix, mask)]
    )
73
74
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Compute the disequilibrium between two count matrices.

    Returns a tuple of:
      - the mask of cells that are zero in both matrices,
      - the human-readable (x100) disequilibrium matrix,
      - the aggregated disequilibrium value (sum of the per-cluster
        means of the squared disequilibrium).
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    # Percentage version for human inspection; squared version feeds
    # the aggregated measure.
    human_readable = 100 * raw
    squared = raw ** 2

    per_cluster_means = disequilibrium_mean_by_cluster(mask, squared)
    return (mask, human_readable, per_cluster_means.sum())
89
90
def entropy(count_matrix):
    '''
    Shannon entropy of the class distribution of each cluster.

    count_matrix: lines are clusters, columns are classes; a cell holds
    the number of occurrences of a class inside a cluster.

    Returns (result_matrix, result) where result_matrix holds the
    per-cell entropy contributions and result is the mean of the
    per-cluster entropies weighted by cluster size.
    '''
    def divide_line(a, divider):
        '''Divide a matrix column by a vector, mapping x/0 to 0.'''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Cluster sizes (row totals).
    dividers = count_matrix.sum(axis=1)

    # axis=0 hands divide_line one column at a time, so each row ends up
    # divided by its own total: rows become probability distributions.
    # BUGFIX: np.float was removed from NumPy (1.24) -> builtin float.
    matrix_divided = np.apply_along_axis(
        divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    # BUGFIX: np.log2 with `where=` but no `out=` leaves the skipped
    # positions uninitialised (documented NumPy behaviour), so zero-count
    # cells could inject garbage; with out=zeros they contribute exactly 0.
    log_matrix = np.log2(matrix_divided,
                         out=np.zeros_like(matrix_divided),
                         where=count_matrix != 0)
    result_matrix = -1 * matrix_divided * log_matrix

    # Weight each cluster's entropy by its share of the population.
    result = (result_matrix.sum(axis=1) * dividers / dividers.sum()).sum()
    return (result_matrix, result)