Commit 60d1f63cd51b15456f730cfe81e439c297c21995
1 parent
b3371498cf
Exists in
master
Script and library that help measure clustering
Showing 2 changed files with 252 additions and 0 deletions Side-by-side Diff
bin/measure_clustering.py
'''
Compute some measures from clustering like
disequilibrium and gini.
'''
# TODO: Just the per-character disequilibrium to start with.

import argparse
from data import read_file, index_by_id
import numpy as np
from sklearn import preprocessing
from measures import disequilibrium, entropy
from sklearn import metrics
import json

# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str,
                    help="clustering file")
parser.add_argument("classlst", type=str,
                    help="List used for its classes.")
parser.add_argument("trainlst", type=str,
                    help="train list")
parser.add_argument("vallst", type=str,
                    help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile


# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# NOTE(review): each record x is indexed as x[0][0] / x[0][3] and the looked-up
# entry as [0][1] — presumably keys produced by index_by_id; confirm against
# data.py since the schema is not visible here.
# BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int is the equivalent, portable dtype.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Number of occurrences per cluster, for each set.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Generate one count matrix per set.
    Lines are clusters and columns are classes; a cell contains the number
    of occurrences of a class (character) inside a cluster.

    Both matrices share the same shape, built from the union of the
    clusters and of the classes seen in either set, so they can be
    compared cell by cell.

    Returns (counts_matrix1, counts_matrix2).
    '''
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # Union of classes / clusters across the two sets (np.unique sorts).
    all_classes = np.unique(np.concatenate((np.unique(classes1), np.unique(classes2))))
    all_clusters = np.unique(np.concatenate((np.unique(clusters1), np.unique(clusters2))))

    # Column index for each class label.  np.unique returns sorted labels,
    # so this reproduces sklearn LabelEncoder's mapping while avoiding the
    # repeated per-cell transform() calls of the original implementation.
    class_col = {label: col for col, label in enumerate(all_classes)}

    # Create matrices lin(cluster) col(class).
    counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
    counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))

    # BUG FIX: rows are addressed by the POSITION of the cluster label in
    # all_clusters instead of the label value itself, so non-contiguous
    # cluster ids no longer index out of bounds (behaviour is unchanged for
    # the usual 0..K-1 labels).
    for row, cluster in enumerate(all_clusters):
        for classes, clusters, matrix in (
                (classes1, clusters1, counts_matrix1),
                (classes2, clusters2, counts_matrix2)):
            # Classes present in this cluster for this set, with counts.
            members = np.extract(clusters == cluster, classes)
            labels, counts = np.unique(members, return_counts=True)
            for label, count in zip(labels, counts):
                matrix[row][class_col[label]] = count
    return (counts_matrix1, counts_matrix2)
| 115 | + | |
| 116 | + | |
# -- SCORES
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
# BUG FIX: removed the stray chained assignment (val_homogeneity =
# homogeneity_val = ...); homogeneity_val was never used.
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)

# NOTE(review): the entropies are computed but never written to the results
# file — confirm whether they should be added to the JSON output.
(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# BUG FIX: the "completeness" entry of the train section previously stored
# val_completeness instead of train_completeness.
results = {
    "train": {
        "vscore": train_vscore,
        "homogeneity": train_homogeneity,
        "completeness": train_completeness,
    },
    "val": {
        "vscore": val_vscore,
        "homogeneity": val_homogeneity,
        "completeness": val_completeness,
    },
    "disequilibrium": dis_measures,
}

with open(OUTFILE, "w") as f:
    f.write(json.dumps(results))
bin/measures.py
| 1 | +''' | |
| 2 | +This module is a part of my library. | |
| 3 | +It aims to compute some measures for clustering. | |
| 4 | +''' | |
| 5 | + | |
| 6 | +import numpy as np | |
| 7 | + | |
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the per-cell disequilibrium between two clustering count
    matrices (lines = clusters, columns = classes).

    The disequilibrium is the difference between the two matrices after
    each is normalised.  isGlobal selects the denominator:
      - True  : divide each matrix by its grand total
      - False : divide each row by the row (cluster) total

    mod is an optional space-separated list of post-processing steps,
    applied in order:
      - "power" : square the differences
      - "human" : multiply by 100 (percentages)
      - "abs"   : absolute value

    Returns (mask, result) where mask flags cells that are zero in BOTH
    input matrices and result is the (post-processed) difference matrix.
    Raises Exception on an unrecognised mod word.
    '''

    def divide_line(a, divider):
        '''
        Sub function used for dividing matrix by a vector line by line,
        leaving 0 where the divider is 0.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    # BUG FIX: np.float was deprecated and removed from NumPy; the builtin
    # float is the equivalent dtype.
    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    mask = (matrix2 == 0) & (matrix1 == 0)
    result = diff

    # BUG FIX: the original condition `mod != None or mod == ""` entered the
    # loop for mod == "" and then raised on the empty word; a truthiness test
    # skips both None and "".
    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                # BUG FIX: the closing parenthesis was missing in the message.
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
| 58 | + | |
| 59 | + | |
| 60 | + | |
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Per-cluster mean of the disequilibrium values.

    matrix holds the disequilibrium computed from the class-occurrence
    counts of each cluster; one value is produced per row (cluster).
    '''
    # NOTE(review): the denominator counts cells where BOTH count matrices
    # were zero (mask is True there) — confirm this is intended rather than
    # the number of cells actually contributing to the row sum.
    return np.asarray([row.sum() / row_mask.sum()
                       for row, row_mask in zip(matrix, mask)])
| 73 | + | |
| 74 | + | |
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium between two count matrices.

    Returns a tuple of:
      - the mask of cells that are zero in both input matrices,
      - the disequilibrium matrix scaled to percentages ("human"),
      - the scalar disequilibrium: sum over clusters of the per-cluster
        mean of the squared differences.
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)
    scalar = disequilibrium_mean_by_cluster(mask, np.power(raw, 2)).sum()
    return (mask, raw * 100, scalar)
| 89 | + | |
| 90 | + | |
def entropy(count_matrix):
    '''
    Shannon entropy of the class distribution of each cluster.

    count_matrix: lines are clusters, columns are classes, cells are
    occurrence counts.

    Returns (result_matrix, result) where result_matrix holds the per-cell
    -p*log2(p) terms (rows normalised by the cluster totals) and result is
    the average row entropy weighted by the cluster sizes.
    '''
    def divide_line(a, divider):
        '''
        Sub function used for dividing matrix by a vector line by line,
        leaving 0 where the divider is 0.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Total number of elements per cluster (row).
    dividers = count_matrix.sum(axis=1)

    # BUG FIX: np.float was deprecated and removed from NumPy; the builtin
    # float is the equivalent dtype.
    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    # BUG FIX: np.log2 with `where=` but no `out=` leaves the masked cells
    # UNINITIALIZED, so 0 * garbage could inject NaN into the sums; supplying
    # a zeroed output buffer makes those cells exactly 0.
    log_matrix = np.log2(matrix_divided,
                         out=np.zeros_like(matrix_divided),
                         where=matrix_divided > 0)
    result_matrix = -1 * matrix_divided * log_matrix
    # Weight each row's entropy by the cluster's share of all elements.
    result = result_matrix.sum(axis=1) * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result)