# measure_clustering.py
'''
Compute some measures from clustering like
disequilibrium and gini.
'''
# TODO: Just per-character disequilibrium to start.

import argparse
from data import read_file, index_by_id
import numpy as np
from sklearn import preprocessing
from measures import disequilibrium, entropy
from sklearn import metrics
import json

# -- ARGPARSE
# Command-line interface: four positional list/clustering files plus an
# optional output path. The UPPER_CASE names below are the module-level
# constants the rest of the script reads.
parser = argparse.ArgumentParser(description="Compute metrics from clustering")
parser.add_argument("clustering", type=str, help="clustering file")
parser.add_argument("classlst", type=str, help="List used for its classes.")
parser.add_argument("trainlst", type=str, help="train list")
parser.add_argument("vallst", type=str, help="val lst")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")
args = parser.parse_args()

CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile


# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)

class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)

train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)

# -- GET CLASSES AND CLUSTERS
# x[0][0] / x[0][3] appear to be the keys into the id-indexed lists and
# [0][1] the label field — TODO(review): confirm against data.index_by_id.
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
# BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int gives the same default integer dtype.
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)


val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)

# Per-set cluster sizes: {cluster_id: number of samples in that cluster}.
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))

unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))


#print(np.unique(train_classes, return_counts=True))

#sub = np.extract(train_clusters == 1, train_classes)
#print(np.unique(sub, return_counts=True))





def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Build one cluster-by-class count matrix per set.

    Rows are clusters and columns are classes; cell (i, j) holds the
    number of occurrences of class j inside cluster i for that set.
    Both matrices share the same indexing, built from the union of the
    clusters/classes seen in either set; columns follow the sorted
    order of the class labels.

    Returns the tuple (counts_matrix1, counts_matrix2).
    '''
    # Hoisted out of the per-cluster loop (the original converted on
    # every iteration).
    classes1 = np.asarray(classes1)
    classes2 = np.asarray(classes2)
    clusters1 = np.asarray(clusters1)
    clusters2 = np.asarray(clusters2)

    # Union across both sets; np.unique returns sorted unique values.
    all_classes = np.unique(np.concatenate((np.unique(classes1),
                                            np.unique(classes2))))
    all_clusters = np.unique(np.concatenate((np.unique(clusters1),
                                             np.unique(clusters2))))

    # BUG FIX: the emptiness check compared against the cluster count of
    # set 1 only, so it fired spuriously whenever set 2 contained extra
    # cluster ids. Compare against the union instead.
    if np.max(all_clusters) != len(all_clusters) - 1:
        print("WARNING: Some clusters are empty. Value max : " + str(np.max(all_clusters)) + " Nb values : " + str(len(all_clusters)))

    # Create matrix lin(clust) col(class)
    n_rows = int(np.max(all_clusters)) + 1
    counts_matrix1 = np.zeros((n_rows, len(all_classes)))
    counts_matrix2 = np.zeros((n_rows, len(all_classes)))

    # Position in the sorted-unique array is exactly what sklearn's
    # LabelEncoder would assign, so a plain dict replaces the per-class
    # transform() calls (and drops the sklearn dependency here).
    class_index = {label: i for i, label in enumerate(all_classes)}

    for cluster in all_clusters:
        row = int(cluster)
        # Count the classes present in this cluster, for each set.
        for labels, matrix in ((classes1[clusters1 == cluster], counts_matrix1),
                               (classes2[clusters2 == cluster], counts_matrix2)):
            uniq, counts = np.unique(labels, return_counts=True)
            for label, count in zip(uniq, counts):
                matrix[row, class_index[label]] = count

    return (counts_matrix1, counts_matrix2)


# -- CLUSTERING QUALITY SCORES (sklearn) on both splits.
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
# FIX: dropped the stray chained alias `homogeneity_val` (assigned once,
# never read anywhere in the file).
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)

counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)

# Per-cluster (isGlobal=False) disequilibrium between train/val counts.
mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)


(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)

# -- ASSEMBLE AND WRITE RESULTS as JSON to OUTFILE.
results = {}
results["train"] = {}
results["train"]["entropy"] = train_entropy
results["train"]["vscore"] = train_vscore
results["train"]["homogeneity"] = train_homogeneity
# BUG FIX: was val_completeness — the train section must report the
# train completeness score.
results["train"]["completeness"] = train_completeness

results["val"] = {}
results["val"]["entropy"] = val_entropy
results["val"]["vscore"] = val_vscore
results["val"]["homogeneity"] = val_homogeneity
results["val"]["completeness"] = val_completeness

results["disequilibrium"] = dis_measures

with open(OUTFILE, "w") as f:
    json.dump(results, f)