'''
A small test of clustering with a Mahalanobis distance.
From the paper:
"Convergence problems of Mahalanobis distance-based k-means clustering",
Itshak Lapidot

Note: rows and columns are swapped in this script relative to the paper.
TODO: Random selection from the set
'''
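
# The model implemented below: each cluster k has a centroid c_k and a
# precision matrix Lambda_k, and a point x is scored with the Gaussian
#     G_k(x) = |Lambda_k|^(1/2) / (2*pi)^(d/2) * exp(-1/2 (x - c_k)^T Lambda_k (x - c_k))
# The assignment step minimizes the negative log-likelihood
#     d(x, c_k) = -log G_k(x)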

import matplotlib.pyplot as plt
import numpy as np
# from sklearn.manifold import TSNE

N = 50  # number of samples
d = 2  # number of dimensions
K = 3  # number of clusters


def initialize_model(X, number_clusters):
    '''Pick random rows of X as initial centroids; start with identity precisions.'''
    rand = np.random.choice(X.shape[0], number_clusters, replace=False)
    print(rand)
    C = X[rand]
    L = np.zeros((number_clusters, X.shape[1], X.shape[1]))
    for k in range(number_clusters):
        L[k] = np.identity(X.shape[1])
    return C, L


X = np.random.rand(N, d)  # Features
C, L = initialize_model(X, K)


def gaussian(x, c, l):
    '''
    Gaussian density with mean c and precision matrix l (the inverse covariance):
    G(x) = |l|^(1/2) / (2*pi)^(d/2) * exp(-1/2 * (x - c)^T l (x - c))
    '''
    p1 = np.power(np.linalg.det(l), 0.5) / np.power(2 * np.pi, d / 2)
    p2 = np.exp(-0.5 * np.transpose(x - c).dot(l).dot(x - c))
    return (p1 * p2).item()
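
# Sanity check (mine, not from the paper): with l = identity and x == c,
# the density reduces to 1 / (2*pi)^(d/2).
assert np.isclose(
    gaussian(np.zeros((d, 1)), np.zeros((d, 1)), np.identity(d)),
    (2 * np.pi) ** (-d / 2),
)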


def dist(a, b, l):
    '''
    Distance from a point to a centroid: the negative Gaussian log-likelihood
    (not Euclidean), so a smaller value means a more likely cluster.
    '''
    a = np.reshape(a, (-1, 1))
    b = np.reshape(b, (-1, 1))
    return -np.log(gaussian(a, b, l))
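
# Sanity check (mine): with l = identity, dist is half the squared Euclidean
# distance plus a constant, so the argmin over clusters matches plain k-means.
_a, _b = np.zeros(d), np.ones(d)
assert np.isclose(
    dist(_a, _b, np.identity(d)),
    d / 2 * np.log(2 * np.pi) + np.sum((_a - _b) ** 2) / 2,
)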


def plot_iteration(iteration, points, clusters, centers):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
    for cx, cy in centers:
        ax.scatter(cx, cy, s=50, c='red', marker='+')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.colorbar(scatter)
    plt.ylim(0, 1)
    plt.xlim(0, 1)
    plt.savefig("test_" + str(iteration) + ".pdf")
    plt.close(fig)  # avoid accumulating open figures across iterations


end_algo = False
i = 0
while not end_algo:
    if i == 10:  # safety cap on the number of iterations
        break
    print("Iteration: ", i)
    # -- Compute the distance matrix
    distances = np.zeros((N, K))

    # -- Find the closest cluster for each point
    for n in range(N):
        for k in range(K):
            distances[n][k] = dist(X[n], C[k], L[k])
    closest_cluster = np.argmin(distances, axis=1)
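
    # An equivalent vectorized assignment step (a sketch, not in the original):
    # diff = X[:, None, :] - C[None, :, :]                # (N, K, d)
    # quad = np.einsum('nkd,kde,nke->nk', diff, L, diff)  # quadratic forms
    # logdet = np.linalg.slogdet(L)[1]                    # (K,) log-determinants
    # distances = d / 2 * np.log(2 * np.pi) - logdet / 2 + quad / 2
    # closest_cluster = np.argmin(distances, axis=1)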

    # -- Debug tool ----------------------
    if i % 1 == 0:  # plot every iteration (raise the modulus to plot less often)
        # Optional t-SNE embedding before plotting (unneeded here since d == 2)
        X_embedded = np.concatenate((X, C), axis=0)
        # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
        # Then plot
        plot_iteration(
            i,
            X_embedded[:X.shape[0]],
            closest_cluster,
            X_embedded[X.shape[0]:]
            )
    # ------------------------------------

    end_algo = True
    for k in range(K):
        # Subset of X assigned to centroid c_k
        X_sub = np.where(closest_cluster == k)
        X_sub = np.take(X, X_sub[0], axis=0)
        if X_sub.shape[0] == 0:
            continue  # empty cluster: keep the previous centroid and precision
        C_new = np.mean(X_sub, axis=0)

        # -- Compute the new precision Lambda_k: the inverse of the cluster
        # sample covariance Sigma_k = 1/|X_k| * sum_x (x - c_k)(x - c_k)^T
        L_new = np.zeros((L.shape[1], L.shape[2]))
        for x in X_sub:
            x = np.reshape(x, (-1, 1))
            c_tmp = np.reshape(C_new, (-1, 1))
            L_new = L_new + (x - c_tmp).dot((x - c_tmp).transpose())
        L_new = L_new / X_sub.shape[0]
        # Note: this inversion fails if a cluster holds fewer than d + 1 points
        # (singular covariance); a small ridge term would make it robust.
        L_new = np.linalg.inv(L_new)

        if end_algo and (not (C[k] == C_new).all()):  # a centroid moved: iterate again
            end_algo = False
        C[k] = C_new
        L[k] = L_new
    i = i + 1

plot_iteration(
    i,
    X_embedded[:X.shape[0]],
    closest_cluster,
    X_embedded[X.shape[0]:]
)