Quillot Mathias / volia

1

import argparse

1

import argparse

2

from os import path, mkdir

2

from os import path, mkdir

3

from utils import SubCommandRunner

3

from utils import SubCommandRunner

4

from core.data import read_features, read_lst, read_labels

4

from core.data import read_features, read_lst, read_labels

5

import numpy as np

5

import numpy as np

6

from sklearn.cluster import KMeans

6

from sklearn.cluster import KMeans

7

import pickle

7

import pickle

8

from clustering_modules.kmeans import kmeans

8

from clustering_modules.kmeans import kmeans

9

from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

9

from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

10

11

from sklearn.preprocessing import LabelEncoder

11

from sklearn.preprocessing import LabelEncoder

12

from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

12

from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

13

14

import core.measures

14

import core.measures

15

import json

15

import json

16

17

18

CLUSTERING_METHODS = {

18

CLUSTERING_METHODS = {

19

"k-means": kmeans(),

19

"k-means": kmeans(),

20

"k-means-mahalanobis": kmeansMahalanobis()

20

"k-means-mahalanobis": kmeansMahalanobis()

21

}

21

}

22

23

EVALUATION_METHODS = {

23

EVALUATION_METHODS = {

24

"entropy": core.measures.entropy_score,

24

"entropy": core.measures.entropy_score,

25

"purity": core.measures.purity_score,

25

"purity": core.measures.purity_score,

26

"v-measure": v_measure_score,

26

"v-measure": v_measure_score,

27

"homogeneity": homogeneity_score,

27

"homogeneity": homogeneity_score,

28

"completeness": completeness_score,

28

"completeness": completeness_score,

29

}

29

}

30

31

32

def disequilibrium_run():

32

def disequilibrium_run():

33

pass

33

pass

34

35

36

def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):

36

def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):

37

"""

37

"""

38

39

@param measure:

39

@param measure:

40

@param features:

40

@param features:

41

@param lst:

41

@param lst:

42

@param truelabels:

42

@param truelabels:

43

@param model:

43

@param model:

44

@param modeltype:

44

@param modeltype:

45

@return:

45

@return:

46

"""

46

"""

47

module = CLUSTERING_METHODS[modeltype]

47

module = CLUSTERING_METHODS[modeltype]

48

module.load(model)

48

module.load(model)

49

50

eval = {}

50

eval = {}

51

for ms in measure:

51

for ms in measure:

52

evaluation = EVALUATION_METHODS[ms]

52

evaluation = EVALUATION_METHODS[ms]

53

feats_dict = read_features(features)

53

feats_dict = read_features(features)

54

labels_dict = read_labels(truelabels)

54

labels_dict = read_labels(truelabels)

55

lst_dict = read_lst(lst)

55

lst_dict = read_lst(lst)

56

lst_keys = [key for key in lst_dict]

56

lst_keys = [key for key in lst_dict]

57

feats = np.asarray([feats_dict[key] for key in lst_keys])

57

feats = np.asarray([feats_dict[key] for key in lst_keys])

58

Y_pred = module.predict(feats)

58

Y_pred = module.predict(feats)

59

Y_truth = [labels_dict[key][0] for key in lst_keys]

59

Y_truth = [labels_dict[key][0] for key in lst_keys]

60

61

le = LabelEncoder()

61

le = LabelEncoder()

62

le.fit(Y_truth)

62

le.fit(Y_truth)

63

Y_truth = le.transform(Y_truth)

63

Y_truth = le.transform(Y_truth)

64

65

eval[ms] = evaluation(Y_truth, Y_pred)

65

eval[ms] = evaluation(Y_truth, Y_pred)

66

67

print(json.dumps(eval))

67

print(json.dumps(eval))

68

69

70

def kmeans_run(features: str,

70

def kmeans_run(features: str,

71

lst: str,

71

lst: str,

72

k:int,

72

k:int,

73

kmax: int,

73

kmax: int,

74

klist,

74

klist,

75

maxiter: int,

75

maxiter: int,

76

ninit: int,

76

ninit: int,

77

output: str,

77

output: str,

78

tol: float,

78

tol: float,

79

debug: bool = False,

79

debug: bool = False,

80

mahalanobis: str = False):

80

mahalanobis: str = False):

81

"""

81

"""

82

83

@param features: output features

83

@param features: output features

84

@param lst: list file

84

@param lst: list file

85

@param k: k (kmin if kmax specified)

85

@param k: k (kmin if kmax specified)

86

@param kmax: maximum k to compute

86

@param kmax: maximum k to compute

87

@param klist: list of k values to compute, ignore k value

87

@param klist: list of k values to compute, ignore k value

88

@param output: output file if kmax not specified, else, output directory

88

@param output: output file if kmax not specified, else, output directory

89

@param mahalanobis: distance option of k-means.

89

@param mahalanobis: distance option of k-means.

90

"""

90

"""

91

json_content = locals().copy()

91

json_content = locals().copy()

92

93

def fit_model(k: int, output_file):

93

def fit_model(k: int, output_file):

94

if debug:

94

if debug:

95

print(f"Computing clustering with k={k}")

95

print(f"Computing clustering with k={k}")

96

model = CLUSTERING_METHODS["k-means"]

96

model = CLUSTERING_METHODS["k-means"]

97

if mahalanobis:

97

if mahalanobis:

98

if debug:

98

if debug:

99

print("Mahalanobis activated")

99

print("Mahalanobis activated")

100

model = CLUSTERING_METHODS["k-means-mahalanobis"]

100

model = CLUSTERING_METHODS["k-means-mahalanobis"]

101

model.fit(X, k, tol, ninit, maxiter, debug)

101

model.fit(X, k, tol, ninit, maxiter, debug)

102

model.save(output_file)

102

model.save(output_file)

103

json_content["models"].append({

103

json_content["models"].append({

104

"model_file": output_file,

104

"model_file": output_file,

105

"k": k,

105

"k": k,

106

})

106

})

107

108

json_content["models"] = []

108

json_content["models"] = []

109

110

# -- READ FILES --

110

# -- READ FILES --

111

features_dict = read_features(features)

111

features_dict = read_features(features)

112

lst_dict = read_lst(lst)

112

lst_dict = read_lst(lst)

113

X = np.asarray([features_dict[x] for x in lst_dict])

113

X = np.asarray([features_dict[x] for x in lst_dict])

114

115

# Exception cases

115

# Exception cases

116

if kmax is None and klist is None and path.isdir(output):

116

if kmax is None and klist is None and path.isdir(output):

117

raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

117

raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

118

119

if (kmax is not None or klist is not None) and path.isfile(output):

119

if (kmax is not None or klist is not None) and path.isfile(output):

120

raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

120

raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

121

122

# Mono value case

122

# Mono value case

123

if kmax is None and klist is None:

123

if kmax is None and klist is None:

124

fit_model(k, output)

124

fit_model(k, output)

125

126

# Multi values case with kmax

126

# Multi values case with kmax

127

if kmax is not None:

127

if kmax is not None:

128

if not path.isdir(output):

128

if not path.isdir(output):

129

mkdir(output)

129

mkdir(output)

130

Ks = range(k, kmax + 1)

130

Ks = range(k, kmax + 1)

131

for i in Ks:

131

for i in Ks:

132

fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))

132

fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl"))

133

134

# Second multi values case with klist

134

# Second multi values case with klist

135

if klist is not None:

135

if klist is not None:

136

if not path.isdir(output):

136

if not path.isdir(output):

137

mkdir(output)

137

mkdir(output)

138

for k in klist:

138

for k in klist:

139

k = int(k)

139

k = int(k)

140

fit_model(k, path.join(output, "clustering_" + str(i) + ".pkl"))

140

fit_model(k, path.join(output, "clustering_" + str(i) + ".pkl"))

141

142

print(json_content)

142

print(json_content)

143

# TODO: compute loss with k-means mahalanobis.

144

# TODO: n_init has to be taken into account for the Mahalanobis case of the k-means algorithm.

145

143

146

144

147

if __name__ == "__main__":

145

if __name__ == "__main__":

148

# Main parser

146

# Main parser

149

parser = argparse.ArgumentParser(description="Clustering methods to apply")

147

parser = argparse.ArgumentParser(description="Clustering methods to apply")

150

subparsers = parser.add_subparsers(title="action")

148

subparsers = parser.add_subparsers(title="action")

151

149

152

# kmeans

150

# kmeans

153

parser_kmeans = subparsers.add_parser(

151

parser_kmeans = subparsers.add_parser(

154

"kmeans", help="Compute clustering using k-means algorithm")

152

"kmeans", help="Compute clustering using k-means algorithm")

155

153

156

parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")

154

parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")

157

parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")

155

parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")

158

parser_kmeans.add_argument("-k", default=2, type=int,

156

parser_kmeans.add_argument("-k", default=2, type=int,

159

help="number of clusters to compute. It is kmin if kmax is specified.")

157

help="number of clusters to compute. It is kmin if kmax is specified.")

160

parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")

158

parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")

161

parser_kmeans.add_argument("--klist", nargs="+",

159

parser_kmeans.add_argument("--klist", nargs="+",

162

help="List of k values to test. As kmax, activate the multi values mod.")

160

help="List of k values to test. As kmax, activate the multi values mod.")

163

parser_kmeans.add_argument("--maxiter",

161

parser_kmeans.add_argument("--maxiter",

164

type=int,

162

type=int,

165

default=300,

163

default=300,

166

help="Max number of iteration before stoping if not converging")

164

help="Max number of iteration before stoping if not converging")

167

parser_kmeans.add_argument("--ninit",

165

parser_kmeans.add_argument("--ninit",

168

type=int,

166

type=int,

169

default=10,

167

default=10,

170

help="Number of time the k-means algorithm will be run with different centroid seeds.")

168

help="Number of time the k-means algorithm will be run with different centroid seeds.")

171

parser_kmeans.add_argument("--tol",

169

parser_kmeans.add_argument("--tol",

172

type=float,

170

type=float,

173

default=0.0001,

171

default=0.0001,

174

help="Tolerance to finish of distance between centroids and their updates.")

172

help="Tolerance to finish of distance between centroids and their updates.")

175

parser_kmeans.add_argument("--debug", action="store_true")

173

parser_kmeans.add_argument("--debug", action="store_true")

176

parser_kmeans.add_argument("--output",

174

parser_kmeans.add_argument("--output",

177

default=".kmeans",

175

default=".kmeans",

178

help="output file if only k. Output directory if multiple kmax specified.")

176

help="output file if only k. Output directory if multiple kmax specified.")

179

parser_kmeans.add_argument("--mahalanobis", action="store_true")

177

parser_kmeans.add_argument("--mahalanobis", action="store_true")

180

parser_kmeans.set_defaults(which="kmeans")

178

parser_kmeans.set_defaults(which="kmeans")

181

179

182

# measure

180

# measure

183

parser_measure = subparsers.add_parser(

181

parser_measure = subparsers.add_parser(

184

"measure", help="compute the entropy")

182

"measure", help="compute the entropy")

185

183

186

parser_measure.add_argument("--measure",

184

parser_measure.add_argument("--measure",

187

required=True,

185

required=True,

188

nargs="+",

186

nargs="+",

189

choices=[key for key in EVALUATION_METHODS],

187

choices=[key for key in EVALUATION_METHODS],

190

help="...")

188

help="...")

191

parser_measure.add_argument("--features", required=True, type=str, help="...")

189

parser_measure.add_argument("--features", required=True, type=str, help="...")

192

parser_measure.add_argument("--lst", required=True, type=str, help="...")

190

parser_measure.add_argument("--lst", required=True, type=str, help="...")

193

parser_measure.add_argument("--truelabels", required=True, type=str, help="...")

191

parser_measure.add_argument("--truelabels", required=True, type=str, help="...")

194

parser_measure.add_argument("--model", required=True, type=str, help="...")

192

parser_measure.add_argument("--model", required=True, type=str, help="...")

195

parser_measure.add_argument("--modeltype",

193

parser_measure.add_argument("--modeltype",

196

required=True,

194

required=True,

197

choices=[key for key in CLUSTERING_METHODS],

195

choices=[key for key in CLUSTERING_METHODS],

198

help="type of model for learning")

196

help="type of model for learning")

199

parser_measure.set_defaults(which="measure")

197

parser_measure.set_defaults(which="measure")

200

198

201

# disequilibrium

199

# disequilibrium

202

parser_disequilibrium = subparsers.add_parser(

200

parser_disequilibrium = subparsers.add_parser(

203

"disequilibrium", help="...")

201

"disequilibrium", help="...")

204

202

205

parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")

203

parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")

206

parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")

204

parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")

207

parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")

205

parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")

208

parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")

206

parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")

209

parser_disequilibrium.add_argument("--model-type",

207

parser_disequilibrium.add_argument("--model-type",

210

required=True,

208

required=True,

211

choices=["kmeans", "2", "3"],

209

choices=["kmeans", "2", "3"],

212

help="...")

210

help="...")

213

parser_disequilibrium.set_defaults(which="disequilibrium")

211

parser_disequilibrium.set_defaults(which="disequilibrium")

214

212

215

# Parse

213

# Parse

216

args = parser.parse_args()

214

args = parser.parse_args()

217

215

218

# Run commands

216

# Run commands

219

runner = SubCommandRunner({

217

runner = SubCommandRunner({

220

"kmeans": kmeans_run,

218

"kmeans": kmeans_run,

221

"measure": measure_run,

219

"measure": measure_run,

222

"disequilibrium": disequilibrium_run

220

"disequilibrium": disequilibrium_run

223

})

221

})

224

222

225

runner.run(args.which, args.__dict__, remove="which")

223

runner.run(args.which, args.__dict__, remove="which")

226

224

GITLAB

Quillot Mathias / volia

Removing todo comments

 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import  kmeansMahalanobis
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
 import core.measures
 import json
 # Clustering back-ends, keyed by the names offered as --modeltype choices.
 # Each value is a single instance created at import time; kmeans_run and
 # measure_run both operate on these shared instances (fit/save, load/predict).
 CLUSTERING_METHODS = {
     "k-means": kmeans(),
     "k-means-mahalanobis": kmeansMahalanobis()
 }
 # Scoring functions, keyed by the names offered as --measure choices.
 # measure_run calls each one as score(Y_truth, Y_pred).
 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }
 def disequilibrium_run():
     """Stub handler registered for the "disequilibrium" sub-command; it does nothing yet."""
     # NOTE(review): the "disequilibrium" parser declares several options while
     # this handler accepts none - confirm how SubCommandRunner forwards them.
     return None
 def measure_run(measure: list, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """
     Score a saved clustering model against reference labels and print the scores as JSON.
     @param measure: names of the measures to compute (keys of EVALUATION_METHODS)
     @param features: features file, read with read_features
     @param lst: list file, read with read_lst; its keys select and order the samples
     @param truelabels: labels file, read with read_labels; the first label of each entry is used
     @param model: model location handed to the clustering module's load()
     @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
     @return: None; a JSON object mapping each measure name to its score is printed
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)
     # Resolve every measure name up front so that an unknown name fails before
     # any data file is read.
     evaluations = [(ms, EVALUATION_METHODS[ms]) for ms in measure]
     scores = {}
     if evaluations:
         # The inputs are the same for every measure: read the files, predict
         # and encode the labels once instead of once per measure.
         feats_dict = read_features(features)
         labels_dict = read_labels(truelabels)
         lst_keys = list(read_lst(lst))
         feats = np.asarray([feats_dict[key] for key in lst_keys])
         Y_pred = module.predict(feats)
         # Encode the reference labels as integers before scoring.
         Y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in lst_keys])
         for ms, evaluation in evaluations:
             scores[ms] = evaluation(Y_truth, Y_pred)
     print(json.dumps(scores))
 def kmeans_run(features: str,
                lst: str,
                k: int,
                kmax: int,
                klist,
                maxiter: int,
                ninit: int,
                output: str,
                tol: float,
                debug: bool = False,
                mahalanobis: bool = False):
     """
     Fit k-means for one or several values of k, save every model and print a summary of the run.
     @param features: features file, read with read_features
     @param lst: list file, read with read_lst; its keys select the feature vectors to cluster
     @param k: k (kmin if kmax specified)
     @param kmax: maximum k to compute; every value from k to kmax included is fitted
     @param klist: list of k values to compute (each converted with int()), ignore k value
     @param maxiter: forwarded to the clustering module's fit()
     @param ninit: forwarded to the clustering module's fit()
     @param output: output file if neither kmax nor klist is specified, else output directory
     @param tol: forwarded to the clustering module's fit()
     @param debug: print progress messages; also forwarded to fit()
     @param mahalanobis: use the "k-means-mahalanobis" module instead of "k-means"
     @raise Exception: if output is an existing directory while a file is expected, or the reverse
     """
     # Snapshot of the call arguments. This must stay the first statement so
     # that locals() contains nothing but the parameters.
     json_content = locals().copy()
     json_content["models"] = []
     multi_values = kmax is not None or klist is not None
     def model_path(n_clusters: int):
         # One file per k inside the output directory.
         return path.join(output, f"clustering_{n_clusters}.pkl")
     def fit_model(k: int, output_file):
         if debug:
             print(f"Computing clustering with k={k}")
         model = CLUSTERING_METHODS["k-means"]
         if mahalanobis:
             if debug:
                 print("Mahalanobis activated")
             model = CLUSTERING_METHODS["k-means-mahalanobis"]
         model.fit(X, k, tol, ninit, maxiter, debug)
         model.save(output_file)
         json_content["models"].append({
             "model_file": output_file,
             "k": k,
         })
     # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])
     # Exception cases
     if not multi_values and path.isdir(output):
         raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
     if multi_values and path.isfile(output):
         raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
     if not multi_values:
         # Mono value case
         fit_model(k, output)
     else:
         if not path.isdir(output):
             mkdir(output)
         # Multi values case with kmax
         if kmax is not None:
             for n_clusters in range(k, kmax + 1):
                 fit_model(n_clusters, model_path(n_clusters))
         # Second multi values case with klist
         if klist is not None:
             for value in klist:
                 n_clusters = int(value)
                 # Name the file after the k being fitted; the previous code
                 # used the loop variable of the kmax branch here.
                 fit_model(n_clusters, model_path(n_clusters))
     print(json_content)
-    # TODO: compute loss with k-means mahalanobis.
-    # TODO: n_init has to be taken into account for the Mahalanobis case of the k-means algorithm.
 if __name__ == "__main__":
     # Command-line entry point: one sub-parser per action. Each sub-parser
     # stores its action name in "which", which selects the function to run.
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")
     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using k-means algorithm")
     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. As kmax, activate the multi values mod.")
     parser_kmeans.add_argument("--maxiter",
                                type=int,
                                default=300,
                                help="Max number of iteration before stoping if not converging")
     parser_kmeans.add_argument("--ninit",
                                type=int,
                                default=10,
                                help="Number of time the k-means algorithm will be run with different centroid seeds.")
     parser_kmeans.add_argument("--tol",
                                type=float,
                                default=0.0001,
                                help="Tolerance to finish of distance between centroids and their updates.")
     parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="output file if only k. Output directory if multiple kmax specified.")
     parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")
     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="compute the entropy")
     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=list(EVALUATION_METHODS),
                                 help="...")
     parser_measure.add_argument("--features", required=True, type=str, help="...")
     parser_measure.add_argument("--lst", required=True, type=str, help="...")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
     parser_measure.add_argument("--model", required=True, type=str, help="...")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=list(CLUSTERING_METHODS),
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")
     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")
     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                        required=True,
                                        choices=["kmeans", "2", "3"],
                                        help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")
     # Parse
     args = parser.parse_args()
     # Sub-commands are optional for argparse: when none is given, no
     # set_defaults(which=...) has run and "which" is absent from the
     # namespace. Report a usage error instead of failing on args.which.
     if getattr(args, "which", None) is None:
         parser.error("an action is required: kmeans, measure or disequilibrium")
     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run
     })
     runner.run(args.which, vars(args), remove="which")