Blame view
bin/measure_clustering.py
5.06 KB
60d1f63cd Script and lib th... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
''' Compute some measures from clustering like disequilibrium and gini. '''
# TODO: start with per-character disequilibrium only.

import argparse
import json

import numpy as np
from sklearn import metrics, preprocessing

from data import read_file, index_by_id
from measures import disequilibrium, entropy

# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str, help="clustering file")
parser.add_argument("classlst", type=str, help="List used for its classes.")
parser.add_argument("trainlst", type=str, help="train list")
parser.add_argument("vallst", type=str, help="val lst")
parser.add_argument("--outfile", type=str, default="out.out", help="output file path")
args = parser.parse_args()

CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile

# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# Each entry x addresses one segment: x[0][0] and x[0][3] are the keys used by
# index_by_id, and [0][1] is the class (resp. cluster) label — schema taken
# from the original lookups, confirm against data.index_by_id.
# NOTE: dtype was np.int, which NumPy deprecated in 1.20 and removed in 1.24;
# the documented replacement is the builtin int.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)


def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''Build one occurrence-count matrix per set.

    Rows are clusters and columns are classes, both taken over the UNION of
    the two sets so the matrices share the same shape and axis meaning.
    Cell [i][j] contains the number of occurrences of class j in cluster i.

    :param classes1: class labels of the first set (e.g. train)
    :param clusters1: cluster labels of the first set
    :param classes2: class labels of the second set (e.g. val)
    :param clusters2: cluster labels of the second set
    :return: tuple (counts_matrix1, counts_matrix2)
    '''
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # Shared class axis, label-encoded once (hoisted out of the loop: a single
    # transform call instead of one per (cluster, class) pair).
    all_classes = np.unique(np.concatenate((np.unique(classes1), np.unique(classes2))))
    le = preprocessing.LabelEncoder()
    le.fit(all_classes)
    class_col = dict(zip(all_classes, le.transform(all_classes)))

    # Shared cluster axis.
    all_clusters = np.unique(np.concatenate((np.unique(clusters1), np.unique(clusters2))))

    # Create matrices: lines are clusters, columns are classes.
    counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
    counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))

    # BUGFIX: the row index is the position of the cluster in all_clusters,
    # not the raw label (the old int(cluster) indexing raised IndexError for
    # non-contiguous cluster ids; for contiguous 0..K-1 labels both agree).
    for row, cluster in enumerate(all_clusters):
        # First extract the classes present in this cluster, for each set.
        cc1_unique, cc1_counts = np.unique(classes1[clusters1 == cluster], return_counts=True)
        cc2_unique, cc2_counts = np.unique(classes2[clusters2 == cluster], return_counts=True)
        for class_, count in zip(cc1_unique, cc1_counts):
            counts_matrix1[row][class_col[class_]] = count
        for class_, count in zip(cc2_unique, cc2_counts):
            counts_matrix2[row][class_col[class_]] = count

    return (counts_matrix1, counts_matrix2)


# -- CLUSTERING QUALITY SCORES
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)

train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters,
                                                       val_classes, val_clusters)

mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)
# NOTE: entropies are computed but were never exported by the original script;
# kept that way to preserve the output format.
(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- EXPORT
results = {
    "train": {
        "vscore": train_vscore,
        "homogeneity": train_homogeneity,
        # BUGFIX: this key used to be assigned val_completeness.
        "completeness": train_completeness,
    },
    "val": {
        "vscore": val_vscore,
        "homogeneity": val_homogeneity,
        "completeness": val_completeness,
    },
    "disequilibrium": dis_measures,
}

with open(OUTFILE, "w") as f:
    json.dump(results, f)