Commit 60d1f63cd51b15456f730cfe81e439c297c21995

Authored by Mathias Quillot
1 parent b3371498cf
Exists in master

Script and lib that help measuring clustering

Showing 2 changed files with 252 additions and 0 deletions Side-by-side Diff

bin/measure_clustering.py
  1 +'''
  2 +Compute some measures from clustering like
  3 +disequilibrium and gini.
  4 +'''
  5 +# TODO: Start with the per-character disequilibrium only.
  6 +
  7 +import argparse
  8 +from data import read_file, index_by_id
  9 +import numpy as np
  10 +from sklearn import preprocessing
  11 +from measures import disequilibrium, entropy
  12 +from sklearn import metrics
  13 +import json
  14 +
# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str,
                    help="clustering file")
parser.add_argument("classlst", type=str,
                    help="List used for its classes.")
parser.add_argument("trainlst", type=str,
                    help="train list")
parser.add_argument("vallst", type=str,
                    help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")

args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile


# -- READ FILES
# read_file / index_by_id come from the project's `data` module.
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)


# -- GET CLASSES AND CLUSTERS
def _labels_for(index, entries, dtype=None):
    '''
    Look up, for every entry of a list file, the label stored in `index`.

    The lookup key is built from fields 0 and 3 of the first element of
    each entry, and the label is read at position [0][1] of the match.
    NOTE(review): the meaning of those fields is defined by
    data.read_file / data.index_by_id -- confirm against that module.
    '''
    return np.asarray(
        [index[x[0][0]][x[0][3]][0][1] for x in entries],
        dtype=dtype,
    )


# The builtin `int` replaces `np.int`: the alias was deprecated in
# NumPy 1.20 and removed in 1.24, and it always meant the builtin.
train_classes = _labels_for(class_lst_ind, train_lst)
train_clusters = _labels_for(clustering_ind, train_lst, dtype=int)

val_classes = _labels_for(class_lst_ind, val_lst)
val_clusters = _labels_for(clustering_ind, val_lst, dtype=int)

# Number of elements per cluster id, for each set.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
  58 +
  59 +
  60 +#print(np.unique(train_classes, return_counts=True))
  61 +
  62 +#sub = np.extract(train_clusters == 1, train_classes)
  63 +#print(np.unique(sub, return_counts=True))
  64 +
  65 +
  66 +
  67 +
  68 +
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Build one (cluster x class) count matrix per set.

    Rows are the sorted unique cluster labels found in either set and
    columns are the sorted unique class labels found in either set, so
    both matrices share the same shape and the same row/column meaning.
    A cell holds the number of elements of that class assigned to that
    cluster in the corresponding set.

    Rows are addressed by the position of the cluster label in the
    sorted union, not by the label value itself, so cluster ids do not
    need to be the contiguous range 0..K-1.

    classes1, clusters1: class and cluster label of each element of the
        first set (same length).
    classes2, clusters2: same for the second set.

    Returns a tuple (counts_matrix1, counts_matrix2) of float arrays.
    Raises ValueError when a set has a different number of class and
    cluster labels.
    '''
    classes1, clusters1 = np.asarray(classes1), np.asarray(clusters1)
    classes2, clusters2 = np.asarray(classes2), np.asarray(clusters2)

    if len(classes1) != len(clusters1) or len(classes2) != len(clusters2):
        raise ValueError("classes and clusters must have the same length in each set")

    # `return_inverse` gives, for every element, the position of its label
    # in the sorted unique array: that position is the row/column index.
    all_classes, class_idx = np.unique(
        np.concatenate((classes1, classes2)), return_inverse=True)
    all_clusters, cluster_idx = np.unique(
        np.concatenate((clusters1, clusters2)), return_inverse=True)
    class_idx = class_idx.ravel()
    cluster_idx = cluster_idx.ravel()

    shape = (len(all_clusters), len(all_classes))
    counts_matrix1 = np.zeros(shape)
    counts_matrix2 = np.zeros(shape)

    # The first len(classes1) indices belong to set 1, the rest to set 2.
    split = len(classes1)
    # np.add.at accumulates repeated (row, col) pairs, unlike `m[r, c] += 1`.
    np.add.at(counts_matrix1, (cluster_idx[:split], class_idx[:split]), 1)
    np.add.at(counts_matrix2, (cluster_idx[split:], class_idx[split:]), 1)

    return (counts_matrix1, counts_matrix2)
  115 +
  116 +
# -- SCIKIT-LEARN CLUSTERING SCORES (classes are the reference labels)
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

# -- DISEQUILIBRIUM BETWEEN TRAIN AND VAL
counts_matrix1, counts_matrix2 = generate_count_matrix(
    train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(
    counts_matrix1, counts_matrix2, isGlobal=False)

# -- ENTROPY
# Computed for both sets but currently not written to the output file.
(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- RESULTS
results = {
    "train": {
        "vscore": train_vscore,
        "homogeneity": train_homogeneity,
        # Was val_completeness: the train entry reported the val score.
        "completeness": train_completeness,
    },
    "val": {
        "vscore": val_vscore,
        "homogeneity": val_homogeneity,
        "completeness": val_completeness,
    },
    "disequilibrium": dis_measures,
}

with open(OUTFILE, "w") as f:
    json.dump(results, f)
  1 +'''
  2 +This module is a part of my library.
  3 +It aims to compute some measures for clustering.
  4 +'''
  5 +
  6 +import numpy as np
  7 +
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium matrix between two count matrices.

    Each matrix is first normalised, then the second is subtracted from
    the first cell by cell.

    matrix1, matrix2: 2-D count matrices of the same shape.
    isGlobal selects the denominator of the normalisation:
      - True  : every cell is divided by the total of the whole matrix
      - False : every cell is divided by the total of its own row
    A zero denominator yields 0 instead of a division error.

    mod is an optional space-separated list of post-processing steps,
    applied in the order given:
      - "power" : square the values
      - "abs"   : take the absolute value
      - "human" : multiply by 100
    None or an empty string applies no step.

    Returns (mask, result) where mask is True for the cells that are
    zero in both input matrices and result is the (post-processed)
    difference matrix.
    Raises ValueError when mod contains an unknown step.
    '''
    # Builtin `float` replaces `np.float`, removed in NumPy 1.24.
    counts1 = np.asarray(matrix1, dtype=float)
    counts2 = np.asarray(matrix2, dtype=float)

    def normalise(counts):
        '''
        Divide counts by the matrix total (global) or by the row totals.
        '''
        dividers = counts.sum(axis=None if isGlobal else 1, keepdims=True)
        return np.divide(counts, dividers,
                         out=np.zeros_like(counts), where=dividers != 0)

    result = normalise(counts1) - normalise(counts2)
    mask = (counts2 == 0) & (counts1 == 0)

    # Truthiness test: the former `mod != None or mod == ""` let the empty
    # string through, which then hit the "unknown mod" error below.
    if mod:
        for word in mod.split():
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)
  58 +
  59 +
  60 +
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean disequilibrium of each cluster (one value per row).

    mask: boolean matrix, True for the cells to leave out of the mean
        (as built by disequilibrium_: cells empty in both sets).
    matrix: disequilibrium matrix, one row per cluster and one column
        per class, same shape as mask.

    Each row sum is divided by the number of cells of that row that are
    NOT masked. The previous version divided by the number of masked
    cells, which is not a mean and divided by zero for any row without
    an empty cell. A row whose cells are all masked yields 0.

    Returns a 1-D float array with one entry per row of matrix.
    '''
    mask = np.asarray(mask, dtype=bool)
    matrix = np.asarray(matrix, dtype=float)

    if matrix.size == 0:
        return np.zeros(len(matrix))

    row_totals = matrix.sum(axis=1)
    populated = (~mask).sum(axis=1)
    return np.divide(row_totals, populated,
                     out=np.zeros_like(row_totals), where=populated != 0)
  73 +
  74 +
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and overall disequilibrium value.

    Delegates to disequilibrium_ for the raw difference matrix, then
    returns a tuple of three items:
      - the mask returned by disequilibrium_,
      - the raw difference matrix multiplied by 100,
      - the sum, over all clusters, of disequilibrium_mean_by_cluster
        applied to the squared difference matrix.
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    as_percent = raw * 100
    squared = np.power(raw, 2)
    overall = disequilibrium_mean_by_cluster(mask, squared).sum()

    return (mask, as_percent, overall)
  89 +
  90 +
def entropy(count_matrix):
    '''
    Entropy of the class distribution inside each cluster.

    count_matrix: 2-D count matrix, one row per cluster and one column
        per class.

    Every row is turned into proportions (cell / row total), and each
    cell contributes -p * log2(p). Cells with a zero proportion
    contribute exactly 0.

    Returns (result_matrix, result):
      - result_matrix: the per-cell contributions, same shape as input,
      - result: the sum of the row entropies, each weighted by the share
        of elements held by that row (row total / grand total). It is
        0.0 when the matrix holds no element at all.
    '''
    # Builtin `float` replaces `np.float`, removed in NumPy 1.24.
    counts = np.asarray(count_matrix, dtype=float)
    dividers = counts.sum(axis=1)
    row_totals = dividers[:, np.newaxis]

    proportions = np.divide(counts, row_totals,
                            out=np.zeros_like(counts), where=row_totals != 0)

    # `out=` is required with `where=`: without it the skipped cells hold
    # uninitialised memory, and 0 * garbage can become NaN.
    logs = np.log2(proportions,
                   out=np.zeros_like(proportions), where=proportions > 0)
    result_matrix = -1 * proportions * logs

    total = dividers.sum()
    if total == 0:
        return (result_matrix, 0.0)

    result = (result_matrix.sum(axis=1) * dividers / total).sum()
    return (result_matrix, result)