Quillot Mathias / volia

Browse Code »

Commit 1f8612ebfd7fe8173f5e7f5374192182a1064da3

Authored by Mathias 2020-09-15 16:26:15 +0200

1 parent adbca3b1ce

Exists in master

repaired memory error due to np.log2 behaviour

Showing 1 changed file with 167 additions and 0 deletions Side-by-side Diff

volia/measures.py

volia/measures.py

Diff comments View file @ 1f8612e

	1	+'''
	2	+This module is a part of my library.
	3	+It aims to compute some measures for clustering.
	4	+'''
	5	+
	6	+import numpy as np
	7	+
	8	+def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
	9	+ '''
	10	+ Compute disequilibrium for all the clusters.
	11	+ The disequilibrium is compute from the difference
	12	+ between two clustering sets.
	13	+ isGlobal permet à l'utilisateur de choisir le dénominateur de
	14	+ la fonction :
	15	+ - True : divise la valeur par le nombre d'élément du cluster
	16	+ - False : divise la valeur par le nombre d'élément total
	17	+
	18	+ withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou
	19	+ une valeur absolue.
	20	+ '''
	21	+
	22	+ def divide_line(a, divider):
	23	+ '''
	24	+ Sub function used for dividing matrix by a vector line by line.
	25	+ '''
	26	+ return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
	27	+
	28	+ dividers1 = 0
	29	+ dividers2 = 0
	30	+
	31	+ if isGlobal:
	32	+ dividers1 = matrix1.sum()
	33	+ dividers2 = matrix2.sum()
	34	+ else:
	35	+ dividers1 = matrix1.sum(axis=1)
	36	+ dividers2 = matrix2.sum(axis=1)
	37	+
	38	+ matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
	39	+
	40	+ matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
	41	+
	42	+ diff = matrix1_divided - matrix2_divided
	43	+
	44	+ mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
	45	+
	46	+ result = diff
	47	+
	48	+ if mod != None or mod == "":
	49	+ for word in mod.split(" "):
	50	+ if word == "power":
	51	+ result = np.power(result,2)
	52	+ elif word == "human":
	53	+ result = result * 100
	54	+ elif word == "abs":
	55	+ result = np.absolute(result)
	56	+ else:
	57	+ raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
	58	+ return (mask, result)
	59	+
	60	+
	61	+
	62	+def disequilibrium_mean_by_cluster(mask, matrix):
	63	+ '''
	64	+ Mean of disequilibrium
	65	+ matrix is the disequilibrium calculated
	66	+ from number of occurences belonging to a class,
	67	+ for each cluster.
	68	+ '''
	69	+ nb_k = len(matrix)
	70	+ results = np.zeros((nb_k))
	71	+
	72	+ for i in range(nb_k):
	73	+ results[i] = matrix[i].sum() / mask[i].sum()
	74	+ return results
	75	+
	76	+
	77	+def disequilibrium(matrix1, matrix2, isGlobal=False):
	78	+ '''
	79	+ Disequilibrium matrix
	80	+ And Disequilibrium value
	81	+ '''
	82	+ mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
	83	+ result_human = result * 100
	84	+ result_power = np.power(result, 2)
	85	+
	86	+ return (
	87	+ mask,
	88	+ result_human,
	89	+ disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
	90	+ )
	91	+
	92	+
	93	+def compute_count_matrix(y_hat, y_truth):
	94	+ '''
	95	+ Check the size of the lists with assertion
	96	+ '''
	97	+ # Check size of the lists
	98	+ assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
	99	+
	100	+ # Build count matrix
	101	+ count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
	102	+ for i in range(len(y_hat)):
	103	+ count_matrix[y_hat[i]][y_truth[i]] += 1
	104	+ return count_matrix
	105	+
	106	+
	107	+def entropy_score(y_truth, y_hat):
	108	+ '''
	109	+ Need to use label encoder before givin y_hat and y_truth
	110	+ Don't use one hot labels
	111	+
	112	+ Return a tuple with:
	113	+ - result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))
	114	+ - result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.
	115	+ - result : the final entropy measure of the clustering
	116	+ '''
	117	+ def divide_line(a, divider):
	118	+ '''
	119	+ Sub function used for dividing matrix by a vector line by line.
	120	+ '''
	121	+ return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
	122	+
	123	+ # Build count matrix
	124	+ count_matrix = compute_count_matrix(y_hat, y_truth)
	125	+
	126	+ # Build dividers vector
	127	+ dividers = count_matrix.sum(axis=1)
	128	+
	129	+ matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)
	130	+
	131	+ log_matrix = np.zeros(matrix_divided.shape)
	132	+ np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
	133	+ result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
	134	+ result_vector = result_matrix.sum(axis=1)
	135	+ result_vector.sum()
	136	+
	137	+ if np.isnan(np.sum(result_vector)):
	138	+ print("COUNT MATRIX")
	139	+ print(count_matrix)
	140	+ print("MATRIX DIVIDED")
	141	+ print(matrix_divided)
	142	+ print("RESULT MATRIX")
	143	+ print(result_matrix)
	144	+ print("VECTOR MATRIX")
	145	+ print(result_vector)
	146	+ print("An error occured due to nan value, some values are printed before")
	147	+ exit(1)
	148	+
	149	+ result = result_vector * dividers / dividers.sum()
	150	+ result = result.sum()
	151	+ return (result_matrix, result_vector, result)
	152	+
	153	+
	154	+
	155	+if __name__ == "__main__":
	156	+ # Hypothesis
	157	+ y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
	158	+ # Truth
	159	+ y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
	160	+
	161	+ (result_matrix, result_vector, result) = entropy(y, y_hat)
	162	+
	163	+ print("Result matrix: ")
	164	+ print(result_matrix)
	165	+ print("Result vector: ")
	166	+ print(result_vector)
	167	+ print("Result: ", result)