# ============================================================================
# File: bin/measure_clustering.py
# ============================================================================
'''
Compute clustering quality measures (V-measure, homogeneity, completeness,
disequilibrium, entropy) from a clustering file and a reference class list,
and dump them as JSON.
'''
# TODO: start with per-character disequilibrium only.

import argparse
import json

import numpy as np
from sklearn import metrics
from sklearn import preprocessing

from data import read_file, index_by_id
from measures import disequilibrium, entropy

# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str,
                    help="clustering file")
parser.add_argument("classlst", type=str,
                    help="List used for its classes.")
parser.add_argument("trainlst", type=str,
                    help="train list")
parser.add_argument("vallst", type=str,
                    help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile


# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# NOTE(review): x[0][0] / x[0][3] index into the nested structure produced by
# index_by_id — presumably (file-id, segment-id); confirm against data.py.
# dtype=int replaces dtype=np.int, which was removed in NumPy 1.24+.
train_classes = np.asarray(
    [class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
train_clusters = np.asarray(
    [clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

val_classes = np.asarray(
    [class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray(
    [clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Per-cluster occurrence counts (cluster id -> number of occurrences).
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))


def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Generate one count matrix per set.

    Rows are clusters and columns are classes.  A cell contains the number
    of occurrences of a class (character) inside a cluster.  Both matrices
    share the same row/column indexing so they can be compared cell by cell.
    '''
    # Common class vocabulary over both sets.
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))

    # Label encoder maps class labels to column indices.
    le = preprocessing.LabelEncoder()
    le.fit(all_classes)

    # Common cluster vocabulary over both sets.
    all_clusters = np.unique(np.concatenate((np.unique(clusters1),
                                             np.unique(clusters2))))

    # Matrices: rows = clusters, columns = classes.
    # NOTE(review): rows are addressed by the raw cluster id, which assumes
    # cluster ids are contiguous integers 0..K-1 — confirm upstream.
    counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
    counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))

    for cluster in all_clusters:
        # First extract the classes present in this cluster, per set.
        cc1 = np.extract(np.asarray(clusters1) == cluster, np.asarray(classes1))
        cc2 = np.extract(np.asarray(clusters2) == cluster, np.asarray(classes2))

        cc1_unique, cc1_counts = np.unique(cc1, return_counts=True)
        cc1_ind = dict(zip(cc1_unique, cc1_counts))

        cc2_unique, cc2_counts = np.unique(cc2, return_counts=True)
        cc2_ind = dict(zip(cc2_unique, cc2_counts))

        for class_ in all_classes:
            class_id = le.transform([class_])[0]
            if class_ in cc1_ind:
                counts_matrix1[int(cluster)][int(class_id)] = cc1_ind[class_]
            if class_ in cc2_ind:
                counts_matrix2[int(cluster)][int(class_id)] = cc2_ind[class_]
    return (counts_matrix1, counts_matrix2)


# -- CLUSTERING QUALITY SCORES
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(
    train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(
    counts_matrix1, counts_matrix2, isGlobal=False)

(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- DUMP RESULTS
results = {}
results["train"] = {}
results["train"]["vscore"] = train_vscore
results["train"]["homogeneity"] = train_homogeneity
# BUG FIX: this previously stored val_completeness under the "train" key.
results["train"]["completeness"] = train_completeness

results["val"] = {}
results["val"]["vscore"] = val_vscore
results["val"]["homogeneity"] = val_homogeneity
results["val"]["completeness"] = val_completeness

results["disequilibrium"] = dis_measures

with open(OUTFILE, "w") as f:
    f.write(json.dumps(results))


# ============================================================================
# File: bin/measures.py
# ============================================================================
'''
This module is a part of my library.
It aims to compute some measures for clustering.
'''

import numpy as np


def _safe_divide(matrix, dividers):
    '''
    Divide matrix cells by a scalar or per-row divider, leaving zeros
    where the divider is zero (avoids division-by-zero warnings/inf).
    '''
    matrix = np.asarray(matrix, dtype=float)
    dividers = np.asarray(dividers, dtype=float)
    if dividers.ndim == 1:
        # Per-row divider: broadcast along the columns.
        dividers = dividers[:, np.newaxis]
    return np.divide(matrix, dividers,
                     out=np.zeros_like(matrix),
                     where=dividers != 0)


def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the per-cell disequilibrium for all the clusters.

    The disequilibrium is computed from the difference between the two
    clustering count matrices, each normalized first.

    isGlobal selects the denominator of the normalization:
        - True: divide each value by the total number of elements,
        - False: divide each value by the number of elements of its
          cluster (row).

    mod is an optional space-separated list of post-processings applied
    in order: "power" squares the difference, "human" scales it by 100,
    "abs" takes the absolute value.

    Returns (mask, result) where mask flags cells that are zero in BOTH
    input matrices.
    '''
    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    # Normalized difference (dtype=float replaces the removed np.float).
    diff = _safe_divide(matrix1, dividers1) - _safe_divide(matrix2, dividers2)

    mask = (matrix2 == 0) & (matrix1 == 0)
    result = diff

    # BUG FIX: the previous test (mod != None or mod == "") entered the loop
    # for mod == "" and then raised on the empty word; only apply mods when
    # a non-empty mod string is given.
    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception(
                    "Need to specify an accepted mod of the disequilibrium "
                    "(\"power\", \"human\" or \"abs\")")
    return (mask, result)


def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of disequilibrium per cluster.

    matrix is the disequilibrium calculated from the number of occurrences
    belonging to a class, for each cluster; mask flags cells that are zero
    in both count matrices.

    NOTE(review): each row sum is divided by the number of masked
    (both-zero) cells, which yields nan when a row has none — confirm
    whether (~mask) was intended here.
    '''
    nb_k = len(matrix)
    results = np.zeros((nb_k))
    for i in range(nb_k):
        results[i] = matrix[i].sum() / mask[i].sum()
    return results


def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Return (mask, human-readable disequilibrium matrix in percent,
    global scalar disequilibrium computed from the squared matrix).
    '''
    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
    result_human = result * 100
    result_power = np.power(result, 2)

    return (
        mask,
        result_human,
        disequilibrium_mean_by_cluster(mask, result_power).sum()
    )


def entropy(count_matrix):
    '''
    Shannon entropy of the class distribution of each cluster.

    Returns (result_matrix, result): the per-cell entropy terms, and the
    per-cluster entropies weighted by the relative cluster sizes, summed
    into a single scalar.
    '''
    dividers = count_matrix.sum(axis=1)
    matrix_divided = _safe_divide(count_matrix, dividers)

    # BUG FIX: np.log2(..., where=...) without out= leaves the excluded
    # cells UNINITIALIZED; the old code relied on garbage memory being
    # multiplied by 0.  out=zeros pins those cells to 0 (and 0*log term
    # is 0 by convention).
    log_matrix = np.log2(matrix_divided,
                         out=np.zeros_like(matrix_divided),
                         where=matrix_divided != 0)
    result_matrix = -1 * matrix_divided * log_matrix
    # Cluster entropies weighted by relative cluster size.
    result = (result_matrix.sum(axis=1) * dividers / dividers.sum()).sum()
    return (result_matrix, result)