Blame view
bin/measure_clustering.py
5.37 KB
60d1f63cd Script and lib th... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
''' Compute some measures from clustering like disequilibrium and gini. '''
# TODO: Start with per-character disequilibrium only.

import argparse
import json

import numpy as np
from sklearn import metrics
from sklearn import preprocessing

from data import read_file, index_by_id
from measures import disequilibrium, entropy

# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str, help="clustering file")
parser.add_argument("classlst", type=str, help="List used for its classes.")
parser.add_argument("trainlst", type=str, help="train list")
parser.add_argument("vallst", type=str, help="val lst")
parser.add_argument("--outfile", type=str, default="out.out", help="output file path")
args = parser.parse_args()

CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile

# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# x[0][0] / x[0][3] index into the id-keyed structures built by index_by_id;
# [0][1] is then the label value — assumed from usage, confirm against data.py.
train_classes = np.asarray(
    [class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
# BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin int is the documented replacement.
train_clusters = np.asarray(
    [clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)
933b2505a Add entropy |
49 |
|
60d1f63cd Script and lib th... |
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
val_classes = np.asarray(
    [class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
# BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin int is the documented replacement.
val_clusters = np.asarray(
    [clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Per-cluster occurrence counts for each split (kept for inspection).
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))


def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''Generate one count matrix per set.

    Rows are clusters and columns are label-encoded classes; the cell
    [cluster, class] contains the number of occurrences of that class
    (character) in that cluster.

    :param classes1: class label of each sample in the first set
    :param clusters1: integer cluster id of each sample in the first set
    :param classes2: class label of each sample in the second set
    :param clusters2: integer cluster id of each sample in the second set
    :returns: tuple (counts_matrix1, counts_matrix2) of 2-D float arrays,
        both shaped (max cluster id + 1, number of distinct classes)
    '''
    # Columns are the classes seen in either set.
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))

    # Label encoder maps each class label to its column index.
    le = preprocessing.LabelEncoder()
    le.fit(all_classes)
    # Hoisted out of the loops: one vectorized transform instead of one
    # le.transform call per (cluster, class) pair.
    class_ids = dict(zip(all_classes, le.transform(all_classes)))

    # Rows are the clusters seen in either set.
    cluster1_unique = np.unique(clusters1)
    cluster2_unique = np.unique(clusters2)
    all_clusters = np.unique(np.concatenate((cluster1_unique,
                                             cluster2_unique)))

    # Warn about gaps in the cluster id range (empty clusters).
    # NOTE(review): comparing against len(cluster1_unique) looks like it was
    # meant to be len(all_clusters) — confirm intent before changing.
    if np.max(all_clusters) != len(cluster1_unique) - 1:
        print("WARNING: Some clusters are empty. Value max : "
              + str(np.max(all_clusters))
              + " Nb values : " + str(len(cluster1_unique)))

    # Matrices are dense up to the maximum cluster id, one column per class.
    counts_matrix1 = np.zeros((np.max(all_clusters) + 1, len(all_classes)))
    counts_matrix2 = np.zeros((np.max(all_clusters) + 1, len(all_classes)))

    for cluster in all_clusters:
        # First extract the classes present in this cluster.
        cc1 = np.extract(np.asarray(clusters1) == cluster,
                         np.asarray(classes1))
        cc2 = np.extract(np.asarray(clusters2) == cluster,
                         np.asarray(classes2))

        cc1_unique, cc1_counts = np.unique(cc1, return_counts=True)
        cc1_ind = dict(zip(cc1_unique, cc1_counts))

        cc2_unique, cc2_counts = np.unique(cc2, return_counts=True)
        cc2_ind = dict(zip(cc2_unique, cc2_counts))

        for class_ in all_classes:
            class_id = class_ids[class_]
            if class_ in cc1_ind:
                counts_matrix1[int(cluster)][int(class_id)] = cc1_ind[class_]
            if class_ in cc2_ind:
                counts_matrix2[int(cluster)][int(class_id)] = cc2_ind[class_]

    return (counts_matrix1, counts_matrix2)


# -- CLUSTERING QUALITY METRICS, computed on both splits.
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
# Removed the redundant chained alias (val_homogeneity = homogeneity_val = ...);
# homogeneity_val was never used.
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)

train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(
    train_classes, train_clusters, val_classes, val_clusters)
933b2505a Add entropy |
128 |
|
60d1f63cd Script and lib th... |
129 |
# Disequilibrium between the train and val count matrices.
# NOTE(review): the (mask, dis_human, dis_measures) semantics come from
# measures.disequilibrium, not visible here; isGlobal=False presumably
# requests per-cluster (non-global) values — confirm against measures.py.
mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)
933b2505a Add entropy |
130 |
|
60d1f63cd Script and lib th... |
131 132 133 134 135 |
# Entropy of each split; entropy() returns a per-cluster matrix and an
# aggregate scalar — semantics defined in measures.entropy.
(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# BUGFIX: the train split previously stored val_completeness under
# results["train"]["completeness"]; train_completeness was computed but unused.
results = {
    "train": {
        "entropy": train_entropy,
        "vscore": train_vscore,
        "homogeneity": train_homogeneity,
        "completeness": train_completeness,
    },
    "val": {
        "entropy": val_entropy,
        "vscore": val_vscore,
        "homogeneity": val_homogeneity,
        "completeness": val_completeness,
    },
    "disequilibrium": dis_measures,
}

# Dump every measure as a single JSON object.
with open(OUTFILE, "w") as f:
    json_content = json.dumps(results)
    f.write(json_content)