Commit d4507c2683cd0fcec7ee302ad1a4e430b61cd6cd
1 parent
4152e83df2
Exists in
master
We do not update a cluster if it is not associated with any data point.
Showing 1 changed file with 3 additions and 3 deletions Inline Diff
volia/clustering_modules/kmeans_mahalanobis.py
1 | 1 | ||
2 | 2 | ||
3 | from sklearn.cluster import KMeans | 3 | from sklearn.cluster import KMeans |
4 | import pickle | 4 | import pickle |
5 | import numpy as np | 5 | import numpy as np |
6 | import matplotlib.pyplot as plt | 6 | import matplotlib.pyplot as plt |
7 | from sklearn.manifold import TSNE | 7 | from sklearn.manifold import TSNE |
8 | from abstract_clustering import AbstractClustering | 8 | from abstract_clustering import AbstractClustering |
9 | 9 | ||
class kmeansMahalanobis():
    """K-means-style clustering where each cluster has its own Mahalanobis metric.

    Attributes:
        C: (K, d) array of cluster centroids.
        L: (K, d, d) array of per-cluster precision matrices (inverse
           covariance) used by the distance; initialized to identity.
        K: number of clusters.
    """

    def __init__(self):
        """Create an untrained model; call fit() or load() before predict()."""
        self.C = None
        self.L = None
        self.K = None

    def predict(self, features):
        """Assign each sample to its closest cluster.

        @param features: (N, d) array of samples.
        @return: (N,) array of cluster indices (argmin of the per-cluster
                 Mahalanobis distances).
        """
        N = features.shape[0]
        distances = np.zeros((N, self.K))
        for n in range(N):
            for k in range(self.K):
                distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
        closest_cluster = np.argmin(distances, axis=1)
        return closest_cluster

    def load(self, model_path):
        """Load model parameters previously written by save().

        @param model_path: path to the pickled model file.
        @return: None
        @raise Exception: if the unpickled payload is None.

        BUG FIX: the original neither bound the file handle nor passed it to
        pickle.load(), so load() always raised a TypeError.
        """
        data = None
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        if data is None:
            raise Exception("Le modèle n'a pas pu être chargé")
        else:
            self.C = data["C"]
            self.L = data["L"]
            self.K = data["K"]

    def save(self, modelpath: str):
        """Serialize the model parameters (C, L, K) to a pickle file.

        @param modelpath: destination file path.
        @return: None
        """
        data = {
            "C": self.C,
            "L": self.L,
            "K": self.K
        }
        with open(modelpath, "wb") as f:
            pickle.dump(data, f)

    def fit(self, features, K: int):
        """Train the model on `features` with K clusters."""
        self._train(features, K)

    def _initialize_model(self, X, number_clusters):
        """Pick random samples as initial centroids; identity precisions.

        replace=False guarantees distinct initial centroids (the original
        could draw the same sample twice, yielding duplicate clusters).
        """
        d = X.shape[1]
        C = X[np.random.choice(X.shape[0], number_clusters, replace=False)]
        L = np.zeros((number_clusters, d, d))
        for k in range(number_clusters):
            L[k] = np.identity(d)
        return C, L

    def _dist(self, a, b, l):
        """Squared Mahalanobis distance (a-b)^T l (a-b).

        (The original comment said "Euclidean distance", which is only true
        when l is the identity matrix.)
        """
        a = np.reshape(a, (-1, 1))
        b = np.reshape(b, (-1, 1))
        result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
        return result

    def _plot_iteration(self, iteration, points, clusters, centers):
        """Debug helper: scatter-plot a 2-D embedding colored by cluster and
        save it to test_<iteration>.pdf."""
        fig = plt.figure()
        ax = fig.add_subplot(111)
        scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
        ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        plt.colorbar(scatter)
        plt.savefig("test_" + str(iteration) + ".pdf")

    def _train(self, features, K: int, max_iter: int = 10):
        """Alternating optimization of centroids and per-cluster precisions.

        @param features: (N, d) training samples.
        @param K: number of clusters.
        @param max_iter: iteration cap (generalizes the original hard-coded 10).

        BUG FIXES vs. original:
          * the iteration counter was never incremented, so the cap never
            fired and the loop could spin forever — we now advance `i`;
          * hitting the cap called exit(1), killing the whole interpreter —
            we now simply stop training.
        """
        X = features
        N = X.shape[0]

        self.C, self.L = self._initialize_model(X, K)
        self.K = K

        end_algo = False
        i = 0
        while not end_algo:
            if i == max_iter:
                break
            print("Iteration: ", i)

            # Distance of every sample to every centroid under its metric.
            distances = np.zeros((N, K))
            for n in range(N):
                for k in range(self.K):
                    distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
            closest_cluster = np.argmin(distances, axis=1)

            if i % 1 == 0:
                # -- Debug tool ----------------------------------------------
                # TSNE embedding of points + current centroids, then plot.
                X_embedded = TSNE(n_components=2).fit_transform(
                    np.concatenate((X, self.C), axis=0))
                self._plot_iteration(
                    i,
                    X_embedded[:X.shape[0]],
                    closest_cluster,
                    X_embedded[X.shape[0]:]
                )
                # ------------------------------------------------------------

            end_algo = True
            for k in range(K):
                # Subset of X currently assigned to centroid k.
                sub_idx = np.where(closest_cluster == k)
                X_sub = np.take(X, sub_idx[0], axis=0)
                if X_sub.shape[0] == 0:
                    # Empty cluster: keep its previous parameters untouched.
                    continue
                C_new = np.mean(X_sub, axis=0)

                # -- COMPUTE NEW LAMBDA (here named K_new) --
                # Precision = pseudo-inverse of the cluster scatter matrix
                # (pinv tolerates singular scatter, e.g. tiny clusters).
                K_new = np.zeros((self.L.shape[1], self.L.shape[2]))
                c_tmp = np.reshape(C_new, (-1, 1))
                for x in X_sub:
                    x = np.reshape(x, (-1, 1))
                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
                K_new = K_new / X_sub.shape[0]
                K_new = np.linalg.pinv(K_new)

                # Converged only if no centroid moved this iteration.
                if end_algo and (not (self.C[k] == C_new).all()):
                    end_algo = False
                self.C[k] = C_new
                self.L[k] = K_new
            i += 1