Commit 9191399c3b15f017c4a84edeacdb799b490c07e4
1 parent
40650f20d7
Exists in
master
Clustering and evaluation are now available and can be configured through global variables.
Showing 3 changed files with 95 additions and 7 deletions Inline Diff
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst | 4 | from core.data import read_features, read_lst, read_labels |
5 | |||
6 | import numpy as np | 5 | import numpy as np |
7 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
8 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | ||
9 | 9 | ||
10 | from sklearn.preprocessing import LabelEncoder | ||
11 | from sklearn.metrics import v_measure_score | ||
10 | 12 | ||
13 | import core.measures | ||
14 | |||
15 | |||
16 | CLUSTERING_METHODS = { | ||
17 | "k-means": kmeans() | ||
18 | } | ||
19 | |||
20 | EVALUATION_METHODS = { | ||
21 | "entropy": core.measures.entropy_score, | ||
22 | "v-measure": v_measure_score | ||
23 | } | ||
24 | |||
25 | |||
26 | def disequilibrium_run(): | ||
27 | pass | ||
28 | |||
29 | |||
30 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): | ||
31 | module = CLUSTERING_METHODS[modeltype] | ||
32 | module.load(model) | ||
33 | evaluation = EVALUATION_METHODS[measure] | ||
34 | feats_dict = read_features(features) | ||
35 | labels_dict = read_labels(truelabels) | ||
36 | lst_dict = read_lst(lst) | ||
37 | lst_keys = [key for key in lst_dict] | ||
38 | feats = np.asarray([feats_dict[key] for key in lst_keys]) | ||
39 | Y_pred = module.predict(feats) | ||
40 | Y_truth = [labels_dict[key][0] for key in lst_keys] | ||
41 | |||
42 | le = LabelEncoder() | ||
43 | le.fit(Y_truth) | ||
44 | Y_truth = le.transform(Y_truth) | ||
45 | |||
46 | eval = evaluation(Y_truth, Y_pred) | ||
47 | print(eval) | ||
48 | |||
49 | |||
50 | |||
11 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): | 51 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): |
12 | """ | 52 | """ |
13 | 53 | ||
14 | @param features: output features | 54 | @param features: output features |
15 | @param lst: list file | 55 | @param lst: list file |
16 | @param k: k (kmin if kmax specified) | 56 | @param k: k (kmin if kmax specified) |
17 | @param kmax: maximum k to compute | 57 | @param kmax: maximum k to compute |
18 | @param klist: list of k values to compute, ignore k value | 58 | @param klist: list of k values to compute, ignore k value |
19 | @param output: output file if kmax not specified, else, output directory | 59 | @param output: output file if kmax not specified, else, output directory |
20 | """ | 60 | """ |
21 | # -- READE FILES -- | 61 | # -- READ FILES -- |
22 | features_dict = read_features(features) | 62 | features_dict = read_features(features) |
23 | lst_dict = read_lst(lst) | 63 | lst_dict = read_lst(lst) |
24 | X = np.asarray([features_dict[x] for x in lst_dict]) | 64 | X = np.asarray([features_dict[x] for x in lst_dict]) |
25 | 65 | ||
26 | # Exception cases | 66 | # Exception cases |
27 | if kmax is None and klist is None and path.isdir(output): | 67 | if kmax is None and klist is None and path.isdir(output): |
28 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") | 68 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") |
29 | 69 | ||
30 | if (kmax is not None or klist is not None) and path.isfile(output): | 70 | if (kmax is not None or klist is not None) and path.isfile(output): |
31 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") | 71 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") |
32 | 72 | ||
33 | # Mono value case | 73 | # Mono value case |
34 | if kmax is None and klist is None: | 74 | if kmax is None and klist is None: |
35 | print(f"Computing clustering with k={k}") | 75 | print(f"Computing clustering with k={k}") |
36 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 76 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
37 | preds = kmeans.predict(X) | 77 | preds = kmeans.predict(X) |
38 | pickle.dump(kmeans, open(output, "wb")) | 78 | pickle.dump(kmeans, open(output, "wb")) |
39 | 79 | ||
40 | # Multi values case with kmax | 80 | # Multi values case with kmax |
41 | if kmax is not None: | 81 | if kmax is not None: |
42 | if not path.isdir(output): | 82 | if not path.isdir(output): |
43 | mkdir(output) | 83 | mkdir(output) |
44 | Ks = range(k, kmax + 1) | 84 | Ks = range(k, kmax + 1) |
45 | for i in Ks: | 85 | for i in Ks: |
46 | print(f"Computing clustering with k={i}") | 86 | print(f"Computing clustering with k={i}") |
47 | kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) | 87 | kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) |
48 | preds = kmeans.predict(X) | 88 | preds = kmeans.predict(X) |
49 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) | 89 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) |
50 | 90 | ||
51 | # Second multi values case with klist | 91 | # Second multi values case with klist |
52 | if klist is not None: | 92 | if klist is not None: |
53 | if not path.isdir(output): | 93 | if not path.isdir(output): |
54 | mkdir(output) | 94 | mkdir(output) |
55 | for k in klist: | 95 | for k in klist: |
56 | k = int(k) | 96 | k = int(k) |
57 | print(f"Computing clustering with k={k}") | 97 | print(f"Computing clustering with k={k}") |
58 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 98 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
59 | preds = kmeans.predict(X) | 99 | preds = kmeans.predict(X) |
60 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) | 100 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) |
61 | 101 | ||
62 | 102 | ||
63 | if __name__ == "__main__": | 103 | if __name__ == "__main__": |
64 | # Main parser | 104 | # Main parser |
65 | parser = argparse.ArgumentParser(description="Clustering methods to apply") | 105 | parser = argparse.ArgumentParser(description="Clustering methods to apply") |
66 | subparsers = parser.add_subparsers(title="action") | 106 | subparsers = parser.add_subparsers(title="action") |
67 | 107 | ||
68 | # kmeans | 108 | # kmeans |
69 | parser_kmeans = subparsers.add_parser( | 109 | parser_kmeans = subparsers.add_parser( |
70 | "kmeans", help="Compute clustering using k-means algorithm") | 110 | "kmeans", help="Compute clustering using k-means algorithm") |
71 | 111 | ||
72 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") | 112 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") |
73 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") | 113 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") |
74 | parser_kmeans.add_argument("-k", default=2, type=int, | 114 | parser_kmeans.add_argument("-k", default=2, type=int, |
75 | help="number of clusters to compute. It is kmin if kmax is specified.") | 115 | help="number of clusters to compute. It is kmin if kmax is specified.") |
76 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") | 116 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") |
77 | parser_kmeans.add_argument("--klist", nargs="+", | 117 | parser_kmeans.add_argument("--klist", nargs="+", |
78 | help="List of k values to test. As kmax, activate the multi values mod.") | 118 | help="List of k values to test. As kmax, activate the multi values mod.") |
79 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") | 119 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") |
80 | parser_kmeans.set_defaults(which="kmeans") | 120 | parser_kmeans.set_defaults(which="kmeans") |
81 | 121 | ||
122 | # measure | ||
123 | parser_measure = subparsers.add_parser( | ||
124 | "measure", help="compute the entropy") | ||
125 | |||
126 | parser_measure.add_argument("--measure", | ||
127 | required=True, | ||
128 | type=str, | ||
129 | choices=[key for key in EVALUATION_METHODS], | ||
130 | help="...") | ||
131 | parser_measure.add_argument("--features", required=True, type=str, help="...") | ||
132 | parser_measure.add_argument("--lst", required=True, type=str, help="...") | ||
133 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") | ||
134 | parser_measure.add_argument("--model", required=True, type=str, help="...") | ||
135 | parser_measure.add_argument("--modeltype", | ||
136 | required=True, | ||
137 | choices=[key for key in CLUSTERING_METHODS], | ||
138 | help="type of model for learning") | ||
139 | parser_measure.set_defaults(which="measure") | ||
140 | |||
141 | # disequilibrium | ||
142 | parser_disequilibrium = subparsers.add_parser( | ||
143 | "disequilibrium", help="...") | ||
144 | |||
145 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") | ||
146 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") | ||
147 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") | ||
148 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") | ||
149 | parser_disequilibrium.add_argument("--model-type", | ||
150 | required=True, | ||
151 | choices=["kmeans", "2", "3"], | ||
152 | help="...") | ||
153 | |||
82 | # Parse | 154 | # Parse |
83 | args = parser.parse_args() | 155 | args = parser.parse_args() |
84 | 156 | ||
85 | # Run commands | 157 | # Run commands |
86 | runner = SubCommandRunner({ | 158 | runner = SubCommandRunner({ |
87 | "kmeans": kmeans_run | 159 | "kmeans": kmeans_run, |
160 | "measure": measure_run, | ||
161 | "disequilibrium": disequilibrium_run | ||
88 | }) | 162 | }) |
89 | 163 | ||
90 | runner.run(args.which, args.__dict__, remove="which") | 164 | runner.run(args.which, args.__dict__, remove="which") |
volia/clustering_modules/kmeans.py
File was created | 1 | ||
2 | from sklearn.cluster import KMeans | ||
3 | import pickle | ||
4 | from abstract_clustering import AbstractClustering | ||
5 | |||
6 | class kmeans(): | ||
7 | def __init__(self): | ||
8 | self.kmeans_model = None | ||
9 | |||
10 | def predict(self, features): | ||
11 | return self.kmeans_model.predict(features) | ||
12 | |||
13 | def load(self, model_path): | ||
14 | self.kmeans_model = pickle.load(open(model_path, "rb")) | ||
15 |
volia/core/measures.py
1 | ''' | 1 | ''' |
2 | This module is a part of my library. | 2 | This module is a part of my library. |
3 | It aims to compute some measures for clustering. | 3 | It aims to compute some measures for clustering. |
4 | ''' | 4 | ''' |
5 | 5 | ||
6 | import numpy as np | 6 | import numpy as np |
7 | 7 | ||
8 | def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None): | 8 | def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None): |
9 | ''' | 9 | ''' |
10 | Compute disequilibrium for all the clusters. | 10 | Compute disequilibrium for all the clusters. |
11 | The disequilibrium is compute from the difference | 11 | The disequilibrium is compute from the difference |
12 | between two clustering sets. | 12 | between two clustering sets. |
13 | isGlobal permet à l'utilisateur de choisir le dénominateur de | 13 | isGlobal permet à l'utilisateur de choisir le dénominateur de |
14 | la fonction : | 14 | la fonction : |
15 | - True : divise la valeur par le nombre d'élément du cluster | 15 | - True : divise la valeur par le nombre d'élément du cluster |
16 | - False : divise la valeur par le nombre d'élément total | 16 | - False : divise la valeur par le nombre d'élément total |
17 | 17 | ||
18 | withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou | 18 | withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou |
19 | une valeur absolue. | 19 | une valeur absolue. |
20 | ''' | 20 | ''' |
21 | 21 | ||
22 | def divide_line(a, divider): | 22 | def divide_line(a, divider): |
23 | ''' | 23 | ''' |
24 | Sub function used for dividing matrix by a vector line by line. | 24 | Sub function used for dividing matrix by a vector line by line. |
25 | ''' | 25 | ''' |
26 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | 26 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) |
27 | 27 | ||
28 | dividers1 = 0 | 28 | dividers1 = 0 |
29 | dividers2 = 0 | 29 | dividers2 = 0 |
30 | 30 | ||
31 | if isGlobal: | 31 | if isGlobal: |
32 | dividers1 = matrix1.sum() | 32 | dividers1 = matrix1.sum() |
33 | dividers2 = matrix2.sum() | 33 | dividers2 = matrix2.sum() |
34 | else: | 34 | else: |
35 | dividers1 = matrix1.sum(axis=1) | 35 | dividers1 = matrix1.sum(axis=1) |
36 | dividers2 = matrix2.sum(axis=1) | 36 | dividers2 = matrix2.sum(axis=1) |
37 | 37 | ||
38 | matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1) | 38 | matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1) |
39 | 39 | ||
40 | matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2) | 40 | matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2) |
41 | 41 | ||
42 | diff = matrix1_divided - matrix2_divided | 42 | diff = matrix1_divided - matrix2_divided |
43 | 43 | ||
44 | mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0)) | 44 | mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0)) |
45 | 45 | ||
46 | result = diff | 46 | result = diff |
47 | 47 | ||
48 | if mod != None or mod == "": | 48 | if mod != None or mod == "": |
49 | for word in mod.split(" "): | 49 | for word in mod.split(" "): |
50 | if word == "power": | 50 | if word == "power": |
51 | result = np.power(result,2) | 51 | result = np.power(result,2) |
52 | elif word == "human": | 52 | elif word == "human": |
53 | result = result * 100 | 53 | result = result * 100 |
54 | elif word == "abs": | 54 | elif word == "abs": |
55 | result = np.absolute(result) | 55 | result = np.absolute(result) |
56 | else: | 56 | else: |
57 | raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"") | 57 | raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"") |
58 | return (mask, result) | 58 | return (mask, result) |
59 | 59 | ||
60 | 60 | ||
61 | 61 | ||
62 | def disequilibrium_mean_by_cluster(mask, matrix): | 62 | def disequilibrium_mean_by_cluster(mask, matrix): |
63 | ''' | 63 | ''' |
64 | Mean of disequilibrium | 64 | Mean of disequilibrium |
65 | matrix is the disequilibrium calculated | 65 | matrix is the disequilibrium calculated |
66 | from number of occurences belonging to a class, | 66 | from number of occurences belonging to a class, |
67 | for each cluster. | 67 | for each cluster. |
68 | ''' | 68 | ''' |
69 | nb_k = len(matrix) | 69 | nb_k = len(matrix) |
70 | results = np.zeros((nb_k)) | 70 | results = np.zeros((nb_k)) |
71 | 71 | ||
72 | for i in range(nb_k): | 72 | for i in range(nb_k): |
73 | results[i] = matrix[i].sum() / mask[i].sum() | 73 | results[i] = matrix[i].sum() / mask[i].sum() |
74 | return results | 74 | return results |
75 | 75 | ||
76 | 76 | ||
77 | def disequilibrium(matrix1, matrix2, isGlobal=False): | 77 | def disequilibrium(matrix1, matrix2, isGlobal=False): |
78 | ''' | 78 | ''' |
79 | Disequilibrium matrix | 79 | Disequilibrium matrix |
80 | And Disequilibrium value | 80 | And Disequilibrium value |
81 | ''' | 81 | ''' |
82 | mask, result = disequilibrium_(matrix1, matrix2, isGlobal) | 82 | mask, result = disequilibrium_(matrix1, matrix2, isGlobal) |
83 | result_human = result * 100 | 83 | result_human = result * 100 |
84 | result_power = np.power(result, 2) | 84 | result_power = np.power(result, 2) |
85 | 85 | ||
86 | return ( | 86 | return ( |
87 | mask, | 87 | mask, |
88 | result_human, | 88 | result_human, |
89 | disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0] | 89 | disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0] |
90 | ) | 90 | ) |
91 | 91 | ||
92 | 92 | ||
93 | def compute_count_matrix(y_truth, y_hat): | 93 | def compute_count_matrix(y_truth, y_hat): |
94 | ''' | 94 | ''' |
95 | Check the size of the lists with assertion | 95 | Check the size of the lists with assertion |
96 | ''' | 96 | ''' |
97 | # Check size of the lists | 97 | # Check size of the lists |
98 | assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}" | 98 | assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}" |
99 | 99 | ||
100 | # Build count matrix | 100 | # Build count matrix |
101 | count_matrix = np.zeros((max(y_hat+1), max(y_truth+1))) | 101 | count_matrix = np.zeros((max(y_hat+1), max(y_truth+1))) |
102 | for i in range(len(y_hat)): | 102 | for i in range(len(y_hat)): |
103 | count_matrix[y_hat[i]][y_truth[i]] += 1 | 103 | count_matrix[y_hat[i]][y_truth[i]] += 1 |
104 | return count_matrix | 104 | return count_matrix |
105 | 105 | ||
106 | 106 | ||
107 | def entropy_score(y_truth, y_hat): | 107 | def entropy_score(y_truth, y_hat): |
108 | ''' | 108 | ''' |
109 | Need to use label encoder before givin y_hat and y_truth | 109 | Need to use label encoder before givin y_hat and y_truth |
110 | Don't use one hot labels | 110 | Don't use one hot labels |
111 | 111 | ||
112 | Return a tuple with: | 112 | Return a tuple with: |
113 | - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x))) | 113 | - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x))) |
114 | - result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster. | 114 | - result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster. |
115 | - result : the final entropy measure of the clustering | 115 | - result : the final entropy measure of the clustering |
116 | ''' | 116 | ''' |
117 | def divide_line(a, divider): | 117 | def divide_line(a, divider): |
118 | ''' | 118 | ''' |
119 | Sub function used for dividing matrix by a vector line by line. | 119 | Sub function used for dividing matrix by a vector line by line. |
120 | ''' | 120 | ''' |
121 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | 121 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) |
122 | 122 | ||
123 | # Build count matrix | 123 | # Build count matrix |
124 | count_matrix = compute_count_matrix(y_truth, y_hat) | 124 | count_matrix = compute_count_matrix(y_truth, y_hat) |
125 | 125 | ||
126 | # Build dividers vector | 126 | # Build dividers vector |
127 | dividers = count_matrix.sum(axis=1) | 127 | dividers = count_matrix.sum(axis=1) |
128 | 128 | ||
129 | matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers) | 129 | matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers) |
130 | 130 | ||
131 | log_matrix = np.zeros(matrix_divided.shape) | 131 | log_matrix = np.zeros(matrix_divided.shape) |
132 | np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0) | 132 | np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0) |
133 | result_matrix = -1 * np.multiply(matrix_divided, log_matrix) | 133 | result_matrix = -1 * np.multiply(matrix_divided, log_matrix) |
134 | result_vector = result_matrix.sum(axis=1) | 134 | result_vector = result_matrix.sum(axis=1) |
135 | result_vector.sum() | 135 | result_vector.sum() |
136 | 136 | ||
137 | if np.isnan(np.sum(result_vector)): | 137 | if np.isnan(np.sum(result_vector)): |
138 | print("COUNT MATRIX") | 138 | print("COUNT MATRIX") |
139 | print(count_matrix) | 139 | print(count_matrix) |
140 | print("MATRIX DIVIDED") | 140 | print("MATRIX DIVIDED") |
141 | print(matrix_divided) | 141 | print(matrix_divided) |
142 | print("RESULT MATRIX") | 142 | print("RESULT MATRIX") |
143 | print(result_matrix) | 143 | print(result_matrix) |
144 | print("VECTOR MATRIX") | 144 | print("VECTOR MATRIX") |
145 | print(result_vector) | 145 | print(result_vector) |
146 | print("An error occured due to nan value, some values are printed before") | 146 | print("An error occured due to nan value, some values are printed before") |
147 | exit(1) | 147 | exit(1) |
148 | 148 | ||
149 | result = result_vector * dividers / dividers.sum() | 149 | result = result_vector * dividers / dividers.sum() |
150 | result = result.sum() | 150 | result = result.sum() |
151 | return (result_matrix, result_vector, result) | 151 | return result |
152 | 152 | ||
153 | 153 | ||
154 | def purity_score(y_truth, y_hat): | 154 | def purity_score(y_truth, y_hat): |
155 | ''' | 155 | ''' |
156 | Return three values in a dictionary: | 156 | Return three values in a dictionary: |
157 | - purity_class_score: the purity score of the class (asp) | 157 | - purity_class_score: the purity score of the class (asp) |
158 | - purity_cluster_score: the purity score of the cluster (acp) | 158 | - purity_cluster_score: the purity score of the cluster (acp) |
159 | - K: the overall evaluation criterion (sqrt(asp * acp)) | 159 | - K: the overall evaluation criterion (sqrt(asp * acp)) |
160 | 160 | ||
161 | This function is based on the following article: | 161 | This function is based on the following article: |
162 | Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan | 162 | Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan |
163 | ''' | 163 | ''' |
164 | 164 | ||
165 | def divide_line(a, divider): | 165 | def divide_line(a, divider): |
166 | ''' | 166 | ''' |
167 | Sub function used for dividing matrix by a vector line by line. | 167 | Sub function used for dividing matrix by a vector line by line. |
168 | ''' | 168 | ''' |
169 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | 169 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) |
170 | 170 | ||
171 | def compute_purity_score(count_matrix, axis=0): | 171 | def compute_purity_score(count_matrix, axis=0): |
172 | if axis==0: | 172 | if axis==0: |
173 | other_axis = 1 | 173 | other_axis = 1 |
174 | else: | 174 | else: |
175 | other_axis = 0 | 175 | other_axis = 0 |
176 | count_per_row = count_matrix.sum(axis=axis) | 176 | count_per_row = count_matrix.sum(axis=axis) |
177 | dividers = np.square(count_per_row) | 177 | dividers = np.square(count_per_row) |
178 | 178 | ||
179 | count_matrix_squared = np.square(count_matrix) | 179 | count_matrix_squared = np.square(count_matrix) |
180 | matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers) | 180 | matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers) |
181 | vector_purity = np.sum(matrix_divided, axis=axis) | 181 | vector_purity = np.sum(matrix_divided, axis=axis) |
182 | 182 | ||
183 | scalar_purity = np.average(vector_purity, weights=count_per_row) | 183 | scalar_purity = np.average(vector_purity, weights=count_per_row) |
184 | return (vector_purity, scalar_purity) | 184 | return scalar_purity |
185 | 185 | ||
186 | 186 | ||
187 | count_matrix = compute_count_matrix(y_truth, y_hat) | 187 | count_matrix = compute_count_matrix(y_truth, y_hat) |
188 | _, purity_cluster_score = compute_purity_score(count_matrix, 1) | 188 | _, purity_cluster_score = compute_purity_score(count_matrix, 1) |
189 | _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0) | 189 | _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0) |
190 | 190 | ||
191 | K = np.sqrt(purity_cluster_score * purity_class_score) | 191 | K = np.sqrt(purity_cluster_score * purity_class_score) |
192 | 192 | ||
193 | for i in range(count_matrix.shape[0]): | 193 | for i in range(count_matrix.shape[0]): |
194 | for j in range(count_matrix.shape[1]): | 194 | for j in range(count_matrix.shape[1]): |
195 | count_matrix[i][j] | 195 | count_matrix[i][j] |
196 | count_matrix[i] | 196 | count_matrix[i] |
197 | return { | 197 | return { |
198 | "purity_class_score": purity_class_score, | 198 | "purity_class_score": purity_class_score, |
199 | "purity_cluster_score": purity_cluster_score, | 199 | "purity_cluster_score": purity_cluster_score, |
200 | "K": K | 200 | "K": K |
201 | } | 201 | } |
202 | 202 | ||
203 | 203 | ||
204 | if __name__ == "__main__": | 204 | if __name__ == "__main__": |
205 | print("Purity test #1") | 205 | print("Purity test #1") |
206 | # Hypothesis | 206 | # Hypothesis |
207 | y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0]) | 207 | y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0]) |
208 | # Truth | 208 | # Truth |
209 | y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) | 209 | y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) |
210 | 210 | ||
211 | (result_matrix, result_vector, result) = entropy_score(y, y_hat) | 211 | (result_matrix, result_vector, result) = entropy_score(y, y_hat) |
212 | print(purity_score(y, y_hat)) | 212 | print(purity_score(y, y_hat)) |
213 | 213 | ||
214 | exit(1) | 214 | exit(1) |
215 | print("Purity test #2") | 215 | print("Purity test #2") |
216 | # Hypothesis | 216 | # Hypothesis |
217 | y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4]) | 217 | y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4]) |
218 | # Truth | 218 | # Truth |
219 | y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3]) | 219 | y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3]) |
220 | 220 | ||
221 | (result_matrix, result_vector, result) = entropy_score(y, y_hat) | 221 | (result_matrix, result_vector, result) = entropy_score(y, y_hat) |
222 | exit(1) | 222 | exit(1) |
223 | print("Result matrix: ") | 223 | print("Result matrix: ") |
224 | print(result_matrix) | 224 | print(result_matrix) |
225 | print("Result vector: ") | 225 | print("Result vector: ") |
226 | print(result_vector) | 226 | print(result_vector) |
227 | print("Result: ", result) | 227 | print("Result: ", result) |