Commit 933b2505a171366ecf2ae150b3a8ba49413d3695
1 parent
e36dbbc98b
Exists in
master
Add entropy
Showing 1 changed file with 12 additions and 4 deletions Inline Diff
bin/measure_clustering.py
1 | ''' | 1 | ''' |
2 | Compute some measures from clustering like | 2 | Compute some measures from clustering like |
3 | disequilibrium and gini. | 3 | disequilibrium and gini. |
4 | ''' | 4 | ''' |
5 | # TODO: Juste disequilibrium par personnage pour commencer. | 5 | # TODO: Juste disequilibrium par personnage pour commencer. |
6 | 6 | ||
7 | import argparse | 7 | import argparse |
8 | from data import read_file, index_by_id | 8 | from data import read_file, index_by_id |
9 | import numpy as np | 9 | import numpy as np |
10 | from sklearn import preprocessing | 10 | from sklearn import preprocessing |
11 | from measures import disequilibrium, entropy | 11 | from measures import disequilibrium, entropy |
12 | from sklearn import metrics | 12 | from sklearn import metrics |
13 | import json | 13 | import json |
14 | 14 | ||
# -- ARGPARSE
# Command-line interface: four positional list/clustering files plus an
# optional output path. The UPPER_CASE names below are the module-level
# constants the rest of the script reads.
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str, help="clustering file")
parser.add_argument("classlst", type=str, help="List used for its classes.")
parser.add_argument("trainlst", type=str, help="train list")
parser.add_argument("vallst", type=str, help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile
35 | 35 | ||
# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# Each list entry x is looked up by its two id fields (x[0][0], x[0][3]);
# the class / cluster label sits at [0][1] of the indexed record.
# (Record layout assumed from usage -- TODO confirm against data.py.)
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
# FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int is the documented replacement and is what np.int aliased.
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Per-set cluster occurrence counts: {cluster_id: number of items}.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Build one count matrix per set.

    Rows are clusters and columns are classes: cell [i, j] contains the
    number of occurrences of class j inside cluster i for that set.

    Parameters
    ----------
    classes1, clusters1 : array-like
        Class labels and integer cluster ids of the first set (train).
    classes2, clusters2 : array-like
        Class labels and integer cluster ids of the second set (val).

    Returns
    -------
    (counts_matrix1, counts_matrix2) : tuple of np.ndarray
        One matrix per set, both shaped
        (max cluster id + 1, number of distinct classes across both sets).
    '''
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # Classes seen in either set. np.unique returns them sorted, so a
    # label's position in all_classes is exactly the encoding
    # sklearn's LabelEncoder would produce -- computed once here instead
    # of calling le.transform per class per cluster.
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))
    class_to_col = {class_: col for col, class_ in enumerate(all_classes)}

    # Clusters seen in either set; rows are indexed directly by cluster id.
    all_clusters = np.unique(np.concatenate((np.unique(clusters1),
                                             np.unique(clusters2))))
    n_rows = int(np.max(all_clusters)) + 1

    # Warning -- FIX: compare against the combined cluster count, not
    # len(cluster1_unique): the old test misfired whenever the val set
    # contained clusters absent from train. Non-contiguous ids mean some
    # clusters are empty and the matrices contain all-zero rows.
    if n_rows != len(all_clusters):
        print("WARNING: Some clusters are empty. Value max : "
              + str(np.max(all_clusters))
              + " Nb values : " + str(len(all_clusters)))

    # Create matrix lin(clust) col(class); rows sized by max id so a
    # cluster id can be used directly as a row index.
    counts_matrix1 = np.zeros((n_rows, len(all_classes)))
    counts_matrix2 = np.zeros((n_rows, len(all_classes)))

    for cluster in all_clusters:
        row = int(cluster)
        # Count the classes present in this cluster, for each set.
        for matrix, classes, clusters in (
                (counts_matrix1, classes1, clusters1),
                (counts_matrix2, classes2, clusters2)):
            present, counts = np.unique(
                np.extract(clusters == cluster, classes),
                return_counts=True)
            for class_, count in zip(present, counts):
                matrix[row][class_to_col[class_]] = count

    return (counts_matrix1, counts_matrix2)
115 | 119 | ||
116 | 120 | ||
# -- CLUSTERING QUALITY SCORES (sklearn)
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
# FIX: dropped the leftover chained alias `homogeneity_val`; it was
# assigned here but never read anywhere in the file.
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)
# -- COUNT MATRICES AND DERIVED MEASURES
counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)

mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)

(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- COLLECT RESULTS
results = {}
results["train"] = {}
results["train"]["entropy"] = train_entropy
results["train"]["vscore"] = train_vscore
results["train"]["homogeneity"] = train_homogeneity
# FIX: previously stored val_completeness under the "train" key,
# so the train completeness score was silently lost.
results["train"]["completeness"] = train_completeness

results["val"] = {}
results["val"]["entropy"] = val_entropy
results["val"]["vscore"] = val_vscore
results["val"]["homogeneity"] = val_homogeneity
results["val"]["completeness"] = val_completeness

results["disequilibrium"] = dis_measures

# -- WRITE RESULTS AS JSON
with open(OUTFILE, "w") as f:
    f.write(json.dumps(results))