Commit 933b2505a171366ecf2ae150b3a8ba49413d3695

Authored by Mathias Quillot
1 parent e36dbbc98b
Exists in master

Add entropy

Showing 1 changed file with 12 additions and 4 deletions (inline diff view)

bin/measure_clustering.py
1 ''' 1 '''
2 Compute some measures from clustering like 2 Compute some measures from clustering like
3 disequilibrium and gini. 3 disequilibrium and gini.
4 ''' 4 '''
5 # TODO: Juste disequilibrium par personnage pour commencer. 5 # TODO: Juste disequilibrium par personnage pour commencer.
6 6
7 import argparse 7 import argparse
8 from data import read_file, index_by_id 8 from data import read_file, index_by_id
9 import numpy as np 9 import numpy as np
10 from sklearn import preprocessing 10 from sklearn import preprocessing
11 from measures import disequilibrium, entropy 11 from measures import disequilibrium, entropy
12 from sklearn import metrics 12 from sklearn import metrics
13 import json 13 import json
14 14
# -- ARGPARSE
# Command-line interface: four positional list files plus an optional
# output path.  Positional order matters, so the table below preserves it.
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
for arg_name, arg_help in (
        ("clustering", "clustering file"),
        ("classlst", "List used for its classes."),
        ("trainlst", "train list"),
        ("vallst", "val lst")):
    parser.add_argument(arg_name, type=str, help=arg_help)
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile
34 34
35 35
# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# Each record x is looked up by x[0][0] and x[0][3] in the indexed files;
# the entry's [0][1] field holds the class label (resp. the cluster id).
# Assumes this layout matches data.index_by_id — TODO confirm.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
# BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int is the documented replacement (np.int was an alias for it).
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Per-split mapping: cluster id -> number of samples assigned to it.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Generate one count matrix per set.

    Rows are clusters and columns are classes.  A cell contains the
    number of occurrences of a class (character) in a specific cluster.

    Parameters
    ----------
    classes1, classes2 : array-like
        Class label of each sample, for set 1 and set 2.
    clusters1, clusters2 : array-like
        Non-negative integer cluster id of each sample; the id is used
        directly as the row index of the matrices.

    Returns
    -------
    (counts_matrix1, counts_matrix2)
        Two float ndarrays of shape (max cluster id + 1, number of
        distinct classes across both sets).
    '''
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # All distinct classes over both sets.  np.unique returns a sorted
    # array, so a class's column index is simply its position here (the
    # same mapping a LabelEncoder fitted on these labels would produce).
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))

    all_clusters = np.unique(np.concatenate((np.unique(clusters1),
                                             np.unique(clusters2))))

    # Warn when cluster ids are not contiguous: some matrix rows stay empty.
    # BUG FIX: the check previously compared against len(cluster1_unique),
    # but the matrices below are sized from all clusters of both sets.
    if np.max(all_clusters) != len(all_clusters) - 1:
        print("WARNING: Some clusters are empty. Value max : "
              + str(np.max(all_clusters))
              + " Nb values : " + str(len(all_clusters)))

    # Create matrix lin(clust) col(class); rows go up to the largest
    # cluster id so the matrices can be indexed directly by cluster id
    # even when some ids are unused.
    n_rows = int(np.max(all_clusters)) + 1
    counts_matrix1 = np.zeros((n_rows, len(all_classes)))
    counts_matrix2 = np.zeros((n_rows, len(all_classes)))

    for cluster in all_clusters:
        row = int(cluster)

        # Classes of the samples assigned to this cluster, per set.
        cc1 = classes1[clusters1 == cluster]
        cc2 = classes2[clusters2 == cluster]

        cc1_unique, cc1_counts = np.unique(cc1, return_counts=True)
        cc2_unique, cc2_counts = np.unique(cc2, return_counts=True)

        # Column indices via binary search in the sorted class array —
        # one vectorized call per cluster instead of one LabelEncoder
        # transform() per (cluster, class) pair.
        counts_matrix1[row, np.searchsorted(all_classes, cc1_unique)] = cc1_counts
        counts_matrix2[row, np.searchsorted(all_classes, cc2_unique)] = cc2_counts

    return (counts_matrix1, counts_matrix2)
115 119
116 120
# -- CLUSTERING QUALITY SCORES (per split)
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

# BUG FIX: dropped the redundant chained assignment
# "val_homogeneity = homogeneity_val = ..." — homogeneity_val was never used.
train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)

mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)

(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- GATHER RESULTS
results = {}
results["train"] = {}
results["train"]["entropy"] = train_entropy
results["train"]["vscore"] = train_vscore
results["train"]["homogeneity"] = train_homogeneity
# BUG FIX: this previously stored val_completeness under the train key.
results["train"]["completeness"] = train_completeness

results["val"] = {}
results["val"]["entropy"] = val_entropy
results["val"]["vscore"] = val_vscore
results["val"]["homogeneity"] = val_homogeneity
results["val"]["completeness"] = val_completeness

results["disequilibrium"] = dis_measures

# -- WRITE RESULTS as JSON to the requested output file.
with open(OUTFILE, "w") as f:
    json_content = json.dumps(results)
    f.write(json_content)