# clustering.py 9.6 KB
import argparse
from os import path, mkdir
from utils import SubCommandRunner
from core.data import read_features, read_lst, read_labels, write_line
import numpy as np
from sklearn.cluster import KMeans
import pickle
from clustering_modules.kmeans import kmeans
from clustering_modules.kmeans_mahalanobis import  kmeansMahalanobis
from clustering_modules.kmeans_multidistance import kmeansMultidistance

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

import core.measures
import json


# Registry of the clustering back-ends, keyed by the name accepted by the
# --modeltype command line options. Every entry is instantiated once, when
# this module is imported, and the same instance is reused by all commands.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
    "k-means-basic-mahalanobis": kmeansMultidistance(distance="mahalanobis"),
    "k-means-basic-cosine": kmeansMultidistance(distance="cosine"),
}

# Names of the registered back-ends whose key starts with "k-means"
# (offered as choices by the "kmeans" sub-command).
KMEANS_METHODS = [
    method_name
    for method_name in CLUSTERING_METHODS
    if method_name.startswith("k-means")
]

# Scoring functions keyed by the name accepted by --measure. Each one is
# called as scorer(true_labels, predicted_labels).
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}


def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, modeltype=None):
    """
    Placeholder for the "disequilibrium" sub-command; it currently does nothing.

    The parameters mirror the options declared on the "disequilibrium"
    sub-parser so that the handler can receive the parsed arguments as keyword
    arguments (NOTE(review): this assumes the command runner forwards them that
    way, as the signatures of the other handlers suggest - confirm against
    SubCommandRunner). All of them are optional so that a call without
    arguments keeps working.

    @param features: value of --features (unused)
    @param lstrain: value of --lstrain (unused)
    @param lstest: value of --lstest (unused)
    @param model: value of --model (unused)
    @param modeltype: value of --modeltype (unused)
    @return: None
    """
    # TODO: implement the disequilibrium computation.
    return None


def measure_run(measure, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a saved clustering model against reference labels.

    The model is loaded, the entries selected by the list file are predicted
    once, and every requested measure is computed on the same pair
    (encoded true labels, predicted labels). The scores are printed on stdout
    as a JSON object mapping each measure name to its value.

    @param measure: measure name, or list of measure names, taken from the keys
        of EVALUATION_METHODS
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the
        entries to evaluate
    @param truelabels: labels file, read with read_labels; the first label of
        each entry is used as ground truth
    @param model: saved model, passed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @return: None (the result is printed)
    """
    # The command line provides a list (nargs="+"); also accept a bare name so
    # that a single string is not iterated character by character.
    if isinstance(measure, str):
        measure = [measure]

    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Resolve every scorer up front: an unknown name fails before any data is
    # read or any prediction is computed.
    scorers = {name: EVALUATION_METHODS[name] for name in measure}

    scores = {}
    if scorers:
        # Reading the files, predicting and encoding the labels do not depend
        # on the measure, so they are done once for all of them.
        feats_dict = read_features(features)
        labels_dict = read_labels(truelabels)
        lst_keys = list(read_lst(lst))

        feats = np.asarray([feats_dict[key] for key in lst_keys])
        Y_pred = module.predict(feats)

        # Map the raw true labels to integer codes before scoring.
        Y_truth = LabelEncoder().fit_transform(
            [labels_dict[key][0] for key in lst_keys])

        for name, scorer in scorers.items():
            scores[name] = scorer(Y_truth, Y_pred)

    print(json.dumps(scores))


def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               modeltype: str,
               debug: bool = False):
    """
    Fit one or several clustering models and print a JSON summary on stdout.

    Three modes are supported:
      - neither kmax nor klist: a single model with k clusters is saved to the
        file "output";
      - kmax given: one model per value in [k, kmax] is saved in the directory
        "output" as clustering_<k>.pkl;
      - klist given: one model per listed value is saved the same way.
    When both kmax and klist are given, the kmax range is processed first and
    the klist values afterwards.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select the training entries
    @param k: number of clusters (kmin if kmax is specified)
    @param kmax: maximum k to compute, or None
    @param klist: list of k values to compute (each converted with int()), or None
    @param maxiter: maximum number of iterations, forwarded to the model's fit()
    @param ninit: number of initialisations, forwarded to the model's fit()
    @param output: output file in the single-value mode, else output directory
        (created if it does not exist)
    @param tol: convergence tolerance, forwarded to the model's fit()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @param debug: if True, print progress messages and forward the flag to fit()
    @return: None (the summary is printed)
    @raise ValueError: if "output" is an existing directory in the single-value
        mode, or an existing file in a multi-value mode
    """
    multi_values = kmax is not None or klist is not None

    # Validate the output location first, so that a wrong path is reported
    # before the (potentially large) feature files are read.
    if not multi_values and path.isdir(output):
        raise ValueError("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise ValueError("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Summary echoed at the end: the call parameters plus the fitted models.
    # Built explicitly rather than from locals(), so that adding a local
    # variable can never leak into the printed JSON.
    json_content = {
        "features": features,
        "lst": lst,
        "k": k,
        "kmax": kmax,
        "klist": klist,
        "maxiter": maxiter,
        "ninit": ninit,
        "output": output,
        "tol": tol,
        "modeltype": modeltype,
        "debug": debug,
        "models": [],
    }

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[key] for key in lst_dict])

    def fit_model(n_clusters: int, output_file: str):
        """Fit the selected model with n_clusters clusters, save it and record it."""
        if debug:
            # NOTE(review): this goes to stdout, like the final JSON summary.
            print(f"Computing clustering with k={n_clusters}")
        # The registry holds one shared instance per model type: it is re-fitted
        # and saved for every requested value.
        model = CLUSTERING_METHODS[modeltype]
        model.fit(X, n_clusters, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": n_clusters,
        })

    if not multi_values:
        # Mono value case
        fit_model(k, output)
    else:
        # Multi values cases: the range [k, kmax] first, then the klist values.
        # klist entries are converted up front so that an invalid value fails
        # before any model is fitted.
        requested = []
        if kmax is not None:
            requested.extend(range(k, kmax + 1))
        if klist is not None:
            requested.extend(int(value) for value in klist)

        if not path.isdir(output):
            mkdir(output)
        for n_clusters in requested:
            fit_model(n_clusters, path.join(output, f"clustering_{n_clusters}.pkl"))

    print(json.dumps(json_content))


def extract_run(features, lst, model, modeltype, outfile):
    """
    Predict the cluster of every listed entry and write the result to a file.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the entries
    @param model: saved model, passed to the clustering module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @param outfile: file to write; one write_line(key, prediction, file) call per entry
    @return: None (a JSON object holding the output file name is printed)
    """
    vectors_by_key = read_features(features)
    keys = list(read_lst(lst))
    matrix = np.asarray([vectors_by_key[key] for key in keys])

    clusterer = CLUSTERING_METHODS[modeltype]
    clusterer.load(model)
    assignments = clusterer.predict(matrix)

    with open(outfile, "w") as handle:
        for position, key in enumerate(keys):
            write_line(key, assignments[position], handle)

    print(json.dumps({"outfile": outfile}))


if __name__ == "__main__":
    # Command line entry point: one sub-command per action. Each sub-parser
    # stores its own name in "which", which selects the handler to run; the
    # remaining parsed options are handed to the runner with that handler.

    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    # Values are kept as strings here; kmeans_run converts them with int().
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter",
                               type=int,
                               default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit",
                               type=int,
                               default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol",
                               type=float,
                               default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output",
                               default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--modeltype",
                               required=True,
                               choices=KMEANS_METHODS,
                               help="type of model for learning")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=[key for key in EVALUATION_METHODS],
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=[key for key in CLUSTERING_METHODS],
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    # NOTE(review): the handler is still a stub and the --modeltype choices
    # below look like placeholders - confirm before relying on this command.
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--modeltype",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Extract
    parser_extract = subparsers.add_parser(
        "extract", help="extract cluster labels")

    parser_extract.add_argument("--features", required=True, type=str, help="...")
    parser_extract.add_argument("--lst", required=True, type=str, help="...")
    parser_extract.add_argument("--model", required=True, type=str, help="...")
    parser_extract.add_argument("--modeltype",
                                required=True,
                                choices=[key for key in CLUSTERING_METHODS],
                                help="type of model for learning")
    parser_extract.add_argument("--outfile", required=True, type=str, help="...")
    parser_extract.set_defaults(which="extract")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional for argparse: without one, "which" is never
    # set. Report a usage error instead of failing on the attribute access.
    if not hasattr(args, "which"):
        parser.error("an action is required: kmeans, measure, disequilibrium or extract")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run,
        "extract": extract_run
    })

    runner.run(args.which, args.__dict__, remove="which")