Quillot Mathias / volia

1

import argparse

1

import argparse

2

from os import path, mkdir

2

from os import path, mkdir

3

from utils import SubCommandRunner

3

from utils import SubCommandRunner

4

from core.data import read_features, read_lst, read_labels, write_line

4

from core.data import read_features, read_lst, read_labels, write_line

5

import numpy as np

5

import numpy as np

6

from sklearn.cluster import KMeans

6

from sklearn.cluster import KMeans

7

import pickle

7

import pickle

8

from clustering_modules.kmeans import kmeans

8

from clustering_modules.kmeans import kmeans

9

from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

9

from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

10

11

from sklearn.preprocessing import LabelEncoder

11

from sklearn.preprocessing import LabelEncoder

12

from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

12

from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

13

14

import core.measures

14

import core.measures

15

import json

15

import json

16

17

18

# Registry of the clustering back-ends, keyed by the names offered as
# ``--modeltype`` choices on the command line. Every value is one shared
# instance created when this module is imported.
CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True),
}

23

24

# Registry of the evaluation measures, keyed by the names offered as
# ``--measure`` choices on the command line. Each value is a callable that is
# invoked as ``measure(true_labels, predicted_labels)``.
EVALUATION_METHODS = {
    "entropy": core.measures.entropy_score,
    "purity": core.measures.purity_score,
    "v-measure": v_measure_score,
    "homogeneity": homogeneity_score,
    "completeness": completeness_score,
}

31

32

33

def disequilibrium_run(features=None, lstrain=None, lstest=None, model=None, model_type=None):
    """
    Placeholder for the "disequilibrium" sub-command; it currently does nothing.

    The parameters mirror the options declared on the "disequilibrium"
    sub-parser so that the function can be called with them as keyword
    arguments. All of them are optional and ignored for now.
    NOTE(review): presumably SubCommandRunner forwards the parsed options as
    keyword arguments - confirm against its implementation.

    @param features: value of --features (unused)
    @param lstrain: value of --lstrain (unused)
    @param lstest: value of --lstest (unused)
    @param model: value of --model (unused)
    @param model_type: value of --model-type (unused)
    @return: None
    """
    return None

35

36

37

def measure_run(measure: list, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Score a saved clustering model against reference labels and print the
    scores as a JSON object.

    @param measure: names of the measures to compute (keys of EVALUATION_METHODS)
    @param features: features file, read with read_features
    @param lst: list file selecting the entries to evaluate, read with read_lst
    @param truelabels: reference labels file, read with read_labels
    @param model: path of the saved model, handed to the module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @return: None; a JSON object mapping each measure name to its score is printed
    """
    module = CLUSTERING_METHODS[modeltype]
    module.load(model)

    # Resolve the measures first so that an unknown name fails before any
    # input file is read.
    evaluations = [(ms, EVALUATION_METHODS[ms]) for ms in measure]

    # Inputs, predictions and encoded labels do not depend on the measure, so
    # they are computed once instead of once per requested measure.
    # NOTE(review): assumes module.predict gives the same result on repeated
    # calls with the same features - confirm for the clustering modules.
    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    lst_dict = read_lst(lst)
    lst_keys = list(lst_dict)
    feats = np.asarray([feats_dict[key] for key in lst_keys])
    Y_pred = module.predict(feats)

    # Only the first label of each entry is used as the reference class.
    Y_truth = [labels_dict[key][0] for key in lst_keys]
    le = LabelEncoder()
    le.fit(Y_truth)
    Y_truth = le.transform(Y_truth)

    scores = {}
    for ms, evaluation in evaluations:
        scores[ms] = evaluation(Y_truth, Y_pred)

    print(json.dumps(scores))

69

70

71

def kmeans_run(features: str,
               lst: str,
               k: int,
               kmax: int,
               klist,
               maxiter: int,
               ninit: int,
               output: str,
               tol: float,
               debug: bool = False,
               mahalanobis: bool = False):
    """
    Fit one or several k-means models, save them and print a JSON summary.

    Three modes exist: a single model for k (neither kmax nor klist given),
    one model per value from k to kmax inclusive, and one model per value of
    klist. When both kmax and klist are given, both series are computed.

    @param features: features file, read with read_features
    @param lst: list file selecting the entries used for training
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute
    @param klist: list of k values to compute, ignore k value
    @param maxiter: forwarded to the model's fit()
    @param ninit: forwarded to the model's fit()
    @param output: output file if neither kmax nor klist is specified, else, output directory
    @param tol: forwarded to the model's fit()
    @param debug: print progress messages; also forwarded to the model's fit()
    @param mahalanobis: use the constrained Mahalanobis k-means module instead of plain k-means
    @return: None; the call arguments and the list of saved models are printed as JSON
    """
    # Must stay the first statement: it snapshots the call arguments only.
    json_content = locals().copy()
    json_content["models"] = []

    def fit_model(n_clusters: int, output_file):
        """Fit the selected module with n_clusters clusters and save it to output_file."""
        if debug:
            print(f"Computing clustering with k={n_clusters}")
        model = CLUSTERING_METHODS["k-means"]
        if mahalanobis:
            if debug:
                print("Mahalanobis activated")
            model = CLUSTERING_METHODS["k-means-mahalanobis-constrained"]
        model.fit(X, n_clusters, tol, ninit, maxiter, debug)
        model.save(output_file)
        json_content["models"].append({
            "model_file": output_file,
            "k": n_clusters,
        })

    def fit_into_directory(n_clusters: int):
        """Fit a model and save it in the output directory under a name derived from n_clusters."""
        if not path.isdir(output):
            mkdir(output)
        fit_model(n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    single_value = kmax is None and klist is None

    # Exception cases
    if single_value and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if not single_value and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if single_value:
        fit_model(k, output)

    # Multi values case with kmax
    if kmax is not None:
        if not path.isdir(output):
            mkdir(output)
        for n_clusters in range(k, kmax + 1):
            fit_into_directory(n_clusters)

    # Second multi values case with klist
    if klist is not None:
        if not path.isdir(output):
            mkdir(output)
        for value in klist:
            # The command line delivers the values as strings.
            fit_into_directory(int(value))

    print(json.dumps(json_content))

144

145

146

def extract_run(features, lst, model, modeltype, outfile):
    """
    Predict a cluster for every entry of a list file and write the
    predictions to a file.

    @param features: features file, read with read_features
    @param lst: list file selecting the entries to label, read with read_lst
    @param model: path of the saved model, handed to the module's load()
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    @param outfile: path of the file the (key, prediction) lines are written to
    @return: None; a JSON object holding the output path is printed
    """
    feature_table = read_features(features)
    entry_keys = list(read_lst(lst))
    matrix = np.asarray([feature_table[entry] for entry in entry_keys])

    clusterer = CLUSTERING_METHODS[modeltype]
    clusterer.load(model)
    predicted = clusterer.predict(matrix)

    # One line per entry, in the order given by the list file.
    with open(outfile, "w") as handle:
        for position, entry in enumerate(entry_keys):
            write_line(entry, predicted[position], handle)

    print(json.dumps({"outfile": outfile}))

162

163

164

if __name__ == "__main__":
    # Main parser: one sub-parser per action. Each sub-parser stores its own
    # name in "which" so that the matching handler can be selected below.
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    parser_kmeans.add_argument("--klist", nargs="+",
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--maxiter",
                               type=int,
                               default=300,
                               help="Max number of iteration before stoping if not converging")
    parser_kmeans.add_argument("--ninit",
                               type=int,
                               default=10,
                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
    parser_kmeans.add_argument("--tol",
                               type=float,
                               default=0.0001,
                               help="Tolerance to finish of distance between centroids and their updates.")
    parser_kmeans.add_argument("--debug", action="store_true")
    parser_kmeans.add_argument("--output",
                               default=".kmeans",
                               help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.add_argument("--mahalanobis", action="store_true")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                nargs="+",
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Extract
    parser_extract = subparsers.add_parser(
        "extract", help="extract cluster labels")

    parser_extract.add_argument("--features", required=True, type=str, help="...")
    parser_extract.add_argument("--lst", required=True, type=str, help="...")
    parser_extract.add_argument("--model", required=True, type=str, help="...")
    parser_extract.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_extract.add_argument("--outfile", required=True, type=str, help="...")
    parser_extract.set_defaults(which="extract")

    # Parse
    args = parser.parse_args()

    # argparse treats the sub-command as optional by default, so "which" only
    # exists when an action was given. Report a usage error instead of
    # crashing with AttributeError on args.which.
    if not hasattr(args, "which"):
        parser.error("an action is required (kmeans, measure, disequilibrium or extract)")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run,
        "extract": extract_run
    })

    runner.run(args.which, args.__dict__, remove="which")

258

GITLAB

Quillot Mathias / volia

By default, kmeans mahalanobis uses constraints