Commit 9191399c3b15f017c4a84edeacdb799b490c07e4

Authored by quillotm
1 parent 40650f20d7
Exists in master

Clustering and evaluation are now available and can be configured through global variables.

Showing 3 changed files with 95 additions and 7 deletions Inline Diff

1 import argparse 1 import argparse
2 from os import path, mkdir 2 from os import path, mkdir
3 from utils import SubCommandRunner 3 from utils import SubCommandRunner
4 from core.data import read_features, read_lst 4 from core.data import read_features, read_lst, read_labels
5
6 import numpy as np 5 import numpy as np
7 from sklearn.cluster import KMeans 6 from sklearn.cluster import KMeans
8 import pickle 7 import pickle
8 from clustering_modules.kmeans import kmeans
9 9
10 from sklearn.preprocessing import LabelEncoder
11 from sklearn.metrics import v_measure_score
10 12
13 import core.measures
14
15
# Clustering back-ends selectable with ``--modeltype``; each value is a wrapper
# instance on which ``load`` and ``predict`` are called by ``measure_run``.
CLUSTERING_METHODS = {"k-means": kmeans()}

# Evaluation functions selectable with ``--measure``; each one is called as
# ``function(y_truth, y_pred)`` by ``measure_run``.
EVALUATION_METHODS = {"entropy": core.measures.entropy_score, "v-measure": v_measure_score}
24
25
def disequilibrium_run(features: str = None, lstrain: str = None, lstest: str = None,
                       model: str = None, model_type: str = None):
    """
    Placeholder for the "disequilibrium" sub-command: it does nothing yet.

    The parameters mirror the options declared on the "disequilibrium"
    sub-parser so the parsed arguments can be forwarded to this function.
    NOTE(review): presumably the sub-command runner passes them as keyword
    arguments - confirm against SubCommandRunner.

    @param features: value of --features
    @param lstrain: value of --lstrain
    @param lstest: value of --lstest
    @param model: value of --model
    @param model_type: value of --model-type
    """
    return None
28
29
def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
    """
    Evaluate a stored clustering model against reference labels and print the score.

    @param measure: key of EVALUATION_METHODS selecting the evaluation function
    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select and order the samples
    @param truelabels: labels file, read with read_labels
    @param model: path handed to the clustering module's load method
    @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
    """
    clustering = CLUSTERING_METHODS[modeltype]
    clustering.load(model)
    score_function = EVALUATION_METHODS[measure]

    feats_dict = read_features(features)
    labels_dict = read_labels(truelabels)
    keys = list(read_lst(lst))

    feats = np.asarray([feats_dict[key] for key in keys])
    y_pred = clustering.predict(feats)

    # Only the first element of each label entry is used as the reference
    # label; it is then encoded as an integer for the evaluation function.
    y_truth = LabelEncoder().fit_transform([labels_dict[key][0] for key in keys])

    # Named "score" rather than "eval" so the builtin is not shadowed.
    score = score_function(y_truth, y_pred)
    print(score)
48
49
50
def _fit_and_save_kmeans(X, n_clusters: int, destination: str):
    """Fit a k-means model with n_clusters clusters on X and pickle it to destination."""
    print(f"Computing clustering with k={n_clusters}")
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # The context manager guarantees the file is flushed and closed.
    with open(destination, "wb") as model_file:
        pickle.dump(model, model_file)


def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
    """
    Fit one or several k-means models and pickle them.

    @param features: features file, read with read_features
    @param lst: list file, read with read_lst; its keys select the samples
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute (None to disable the range mode)
    @param klist: list of k values to compute (None to disable); each value is
                  converted with int()
    @param output: output file if neither kmax nor klist is specified, else
                   output directory (created if missing) receiving one
                   "clustering_<k>.pkl" file per value
    @raise Exception: if output is an existing directory in single-value mode,
                      or an existing file in multi-values mode
    """
    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    X = np.asarray([features_dict[x] for x in lst_dict])

    multi_values = kmax is not None or klist is not None

    # Exception cases
    if not multi_values and path.isdir(output):
        raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")

    if multi_values and path.isfile(output):
        raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # Mono value case
    if not multi_values:
        _fit_and_save_kmeans(X, k, output)
        return

    if not path.isdir(output):
        mkdir(output)

    # Multi values case with kmax
    if kmax is not None:
        for n_clusters in range(k, kmax + 1):
            _fit_and_save_kmeans(X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))

    # Second multi values case with klist (runs in addition to the kmax range
    # when both are given)
    if klist is not None:
        for value in klist:
            n_clusters = int(value)
            _fit_and_save_kmeans(X, n_clusters, path.join(output, "clustering_" + str(n_clusters) + ".pkl"))
61 101
62 102
if __name__ == "__main__":
    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")

    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    # type=int lets argparse reject non-numeric values up front.
    parser_kmeans.add_argument("--klist", nargs="+", type=int,
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
    parser_kmeans.set_defaults(which="kmeans")

    # measure
    parser_measure = subparsers.add_parser(
        "measure", help="compute the entropy")

    parser_measure.add_argument("--measure",
                                required=True,
                                type=str,
                                choices=list(EVALUATION_METHODS),
                                help="...")
    parser_measure.add_argument("--features", required=True, type=str, help="...")
    parser_measure.add_argument("--lst", required=True, type=str, help="...")
    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
    parser_measure.add_argument("--model", required=True, type=str, help="...")
    parser_measure.add_argument("--modeltype",
                                required=True,
                                choices=list(CLUSTERING_METHODS),
                                help="type of model for learning")
    parser_measure.set_defaults(which="measure")

    # disequilibrium
    parser_disequilibrium = subparsers.add_parser(
        "disequilibrium", help="...")

    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
    parser_disequilibrium.add_argument("--model-type",
                                       required=True,
                                       choices=["kmeans", "2", "3"],
                                       help="...")
    # Without this default, "args.which" does not exist for this action and
    # the dispatch below fails with AttributeError.
    parser_disequilibrium.set_defaults(which="disequilibrium")

    # Parse
    args = parser.parse_args()

    # "which" is only set by a sub-parser; report a usage error when no action
    # was given instead of crashing on the missing attribute.
    if not hasattr(args, "which"):
        parser.error("an action is required (kmeans, measure or disequilibrium)")

    # Run commands
    runner = SubCommandRunner({
        "kmeans": kmeans_run,
        "measure": measure_run,
        "disequilibrium": disequilibrium_run
    })

    runner.run(args.which, args.__dict__, remove="which")
volia/clustering_modules/kmeans.py
File was created 1
2 from sklearn.cluster import KMeans
3 import pickle
4 from abstract_clustering import AbstractClustering
5
class kmeans():
    """
    Thin wrapper around a pickled model object exposing ``load`` and ``predict``.
    """

    def __init__(self):
        # No model is available until load() has been called.
        self.kmeans_model = None

    def predict(self, features):
        """
        Return the result of the loaded model's ``predict`` on ``features``.

        load() must have been called before; otherwise ``kmeans_model`` is
        still None and the attribute access fails.
        """
        return self.kmeans_model.predict(features)

    def load(self, model_path):
        """
        Unpickle the model stored at ``model_path`` into ``kmeans_model``.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files from a trusted source.
        # The context manager closes the file handle once unpickling is done.
        with open(model_path, "rb") as model_file:
            self.kmeans_model = pickle.load(model_file)
15
volia/core/measures.py
1 ''' 1 '''
2 This module is a part of my library. 2 This module is a part of my library.
3 It aims to compute some measures for clustering. 3 It aims to compute some measures for clustering.
4 ''' 4 '''
5 5
6 import numpy as np 6 import numpy as np
7 7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium for all the clusters.
    The disequilibrium is computed from the difference between two
    normalised count matrices (two clustering sets).

    isGlobal selects the denominator used to normalise each matrix:
        - True  : every value is divided by the sum of the whole matrix
        - False : every value is divided by the sum of its own row
    A zero denominator yields 0 instead of a division error.

    mod is an optional string of space-separated transformations applied in
    order to the difference: "power" (square), "human" (multiply by 100) and
    "abs" (absolute value). None or an empty string applies nothing; any
    other word raises an Exception.

    Return a tuple (mask, result):
        - mask   : boolean matrix, False where both input matrices are 0
        - result : the (optionally transformed) difference matrix
    '''

    def normalise(counts, divider):
        '''
        Divide counts by divider (broadcast), leaving 0 where divider is 0.
        '''
        divider = np.asarray(divider)
        normalised = np.zeros_like(counts)
        np.divide(counts, divider, out=normalised, where=divider != 0)
        return normalised

    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)

    # The builtin float replaces the np.float alias removed from recent NumPy.
    counts1 = matrix1.astype(float)
    counts2 = matrix2.astype(float)

    if isGlobal:
        dividers1 = counts1.sum()
        dividers2 = counts2.sum()
    else:
        # keepdims gives a column vector so each row is divided by its own sum.
        dividers1 = counts1.sum(axis=1, keepdims=True)
        dividers2 = counts2.sum(axis=1, keepdims=True)

    result = normalise(counts1, dividers1) - normalise(counts2, dividers2)

    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    # A missing or empty mod means "no transformation".
    if mod:
        for word in mod.split():
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
    return (mask, result)
59 59
60 60
61 61
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium for each cluster (row).

    matrix is the disequilibrium matrix, one row per cluster.
    mask is the boolean matrix of the same shape; the number of True cells
    in a row is the denominator of that row's mean.

    Return a vector with one value per row: the row sum of matrix divided by
    the number of True cells of mask on that row. A row with no True cell
    yields 0 rather than NaN, so it cannot poison an aggregated score.
    '''
    values = np.asarray(matrix, dtype=float)
    row_totals = values.sum(axis=1)
    row_counts = np.asarray(mask).sum(axis=1)

    results = np.zeros(len(values))
    np.divide(row_totals, row_counts, out=results, where=row_counts != 0)
    return results
75 75
76 76
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium between two count matrices.

    Return a tuple with:
        - the mask computed by disequilibrium_
        - the raw disequilibrium matrix multiplied by 100
        - a scalar: the per-cluster means of the squared disequilibrium,
          summed and divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared = np.power(raw, 2)
    per_cluster_mean = disequilibrium_mean_by_cluster(mask, squared)
    score = per_cluster_mean.sum() / matrix1.shape[0]

    percent = raw * 100
    return (mask, percent, score)
91 91
92 92
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix between predicted and true labels.

    y_truth and y_hat are sequences (lists or arrays) of non-negative integer
    labels of the same length.

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1) whose
    cell [i][j] is the number of samples with y_hat == i and y_truth == j.
    Empty inputs give an empty (0, 0) matrix.

    Raise ValueError if the two sequences do not have the same length (an
    explicit raise is used so the check survives "python -O").
    '''
    y_truth = np.asarray(y_truth)
    y_hat = np.asarray(y_hat)

    # Check size of the lists
    if len(y_hat) != len(y_truth):
        raise ValueError(
            f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}")

    if len(y_hat) == 0:
        return np.zeros((0, 0))

    # Build count matrix; np.add.at accumulates correctly on repeated indices.
    count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
    np.add.at(count_matrix, (y_hat, y_truth), 1)
    return count_matrix
105 105
106 106
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering with respect to the true classes.

    Need to use a label encoder before giving y_hat and y_truth
    (integer labels). Don't use one hot labels.

    For every cluster (row of the count matrix) the entropy of its class
    distribution is computed as -sum(P(x) * log2(P(x))); the returned value
    is the sum of these entropies weighted by the relative cluster sizes.

    Return the final entropy measure of the clustering (a single number).
    Raise ValueError if a NaN appears in the per-cluster entropies.
    '''
    # Build count matrix
    count_matrix = compute_count_matrix(np.asarray(y_truth), np.asarray(y_hat))

    # Build dividers vector (number of samples per cluster)
    dividers = count_matrix.sum(axis=1)
    column_dividers = dividers[:, np.newaxis]

    # Row-wise probabilities; rows of an empty cluster stay at 0.
    matrix_divided = np.zeros(count_matrix.shape, dtype=float)
    np.divide(count_matrix, column_dividers, out=matrix_divided, where=column_dividers != 0)

    # log2 is only evaluated on non-empty cells, so log2(0) never occurs.
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise with the diagnostics instead of terminating the interpreter
        # from inside a library function.
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}")

    result = result_vector * dividers / dividers.sum()
    return result.sum()
152 152
153 153
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
        - purity_class_score: the purity score of the class (asp)
        - purity_cluster_score: the purity score of the cluster (acp)
        - K: the overall evaluation criterion (sqrt(asp * acp))

    y_truth and y_hat are integer-encoded label sequences of the same length.

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Weighted average purity along one direction of the count matrix.

        Counts are summed along axis to get the group sizes; the purity of a
        group is the sum of (cell / group size) ** 2 over its cells, and the
        result is the average of the group purities weighted by group size.
        Return a single number.
        '''
        # keepdims keeps the totals broadcastable against the matrix.
        totals = count_matrix.sum(axis=axis, keepdims=True)

        squared_ratio = np.zeros(count_matrix.shape, dtype=float)
        np.divide(np.square(count_matrix), np.square(totals),
                  out=squared_ratio, where=totals != 0)

        vector_purity = squared_ratio.sum(axis=axis)
        return np.average(vector_purity, weights=totals.ravel())

    count_matrix = compute_count_matrix(np.asarray(y_truth), np.asarray(y_hat))

    # The helper returns a scalar, so the results are bound directly.
    purity_cluster_score = compute_purity_score(count_matrix, 1)
    purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
202 202
203 203
if __name__ == "__main__":
    # Small manual checks of the measures on hand-written labels.
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    # entropy_score returns a single value, so it is bound to one name.
    result = entropy_score(y, y_hat)
    print("Result: ", result)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    result = entropy_score(y, y_hat)
    print("Result: ", result)