Quillot Mathias / volia

1

'''

1

'''

2

This script allows the user to evaluate a classification system on new labels using clustering methods.

2

This script allows the user to evaluate a classification system on new labels using clustering methods.

3

The algorithms are applied on the given latent space (embedding).

3

The algorithms are applied on the given latent space (embedding).

4

'''

4

'''

5

import argparse

5

import argparse

6

import numpy as np

6

import numpy as np

7

import pandas as pd

7

import pandas as pd

8

import os

8

import os

9

import time

9

import time

10

from sklearn.preprocessing import LabelEncoder

10

from sklearn.preprocessing import LabelEncoder

11

from sklearn.metrics.pairwise import pairwise_distances

11

from sklearn.metrics.pairwise import pairwise_distances

12

from sklearn.metrics import f1_score

12

from sklearn.metrics import f1_score

13

from sklearn.cluster import KMeans

13

from sklearn.cluster import KMeans

14

from sklearn.manifold import TSNE

14

from sklearn.manifold import TSNE

15

import matplotlib.pyplot as plt

15

import matplotlib.pyplot as plt

16

17

from volia.data_io import read_features,read_lst

17

from volia.data_io import read_features,read_lst

18

19

if __name__ == "__main__":

19

if __name__ == "__main__":

20

# Argparse

20

# Argparse

21

parser = argparse.ArgumentParser("Compute clustering on a latent space")

21

parser = argparse.ArgumentParser("Compute clustering on a latent space")

22

parser.add_argument("features")

22

parser.add_argument("features")

23

parser.add_argument("utt2",

23

parser.add_argument("utt2",

24

type=str,

24

type=str,

25

help="file with [utt] [value]")

25

help="file with [utt] [value]")

26

parser.add_argument("--idsfrom",

26

parser.add_argument("--idsfrom",

27

type=str,

27

type=str,

28

default="utt2",

28

default="utt2",

29

choices=[

29

choices=[

30

"features",

30

"features",

31

"utt2"

31

"utt2"

32

],

32

],

33

help="from features or from utt2?")

33

help="from features or from utt2?")

34

parser.add_argument("--prefix",

34

parser.add_argument("--prefix",

35

default="",

35

type=str,

36

type=str,

36

help="prefix of saved files")

37

help="prefix of saved files")

37

parser.add_argument("--outdir",

38

parser.add_argument("--outdir",

38

default=None,

39

default=None,

39

type=str,

40

type=str,

40

help="Output directory")

41

help="Output directory")

41

42

args = parser.parse_args()

43

args = parser.parse_args()

43

44

assert args.outdir

45

assert args.outdir

45

46

start = time.time()

47

start = time.time()

47

48

# Load features and utt2

49

# Load features and utt2

49

features = read_features(args.features)

50

features = read_features(args.features)

50

utt2 = read_lst(args.utt2)

51

utt2 = read_lst(args.utt2)

51

52

# Take id list

53

# Take id list

53

if args.idsfrom == "features":

54

if args.idsfrom == "features":

54

ids = list(features.keys())

55

ids = list(features.keys())

55

elif args.idsfrom == "utt2":

56

elif args.idsfrom == "utt2":

56

ids = list(utt2.keys())

57

ids = list(utt2.keys())

57

else:

58

else:

58

print(f"idsfrom is not good: {args.idsfrom}")

59

print(f"idsfrom is not good: {args.idsfrom}")

59

exit(1)

60

exit(1)

60

61

feats = np.vstack([ features[id_] for id_ in ids ])

62

feats = np.vstack([ features[id_] for id_ in ids ])

62

classes = [ utt2[id_] for id_ in ids ]

63

classes = [ utt2[id_] for id_ in ids ]

63

64

# Encode labels

65

# Encode labels

65

le = LabelEncoder()

66

le = LabelEncoder()

66

labels = le.fit_transform(classes)

67

labels = le.fit_transform(classes)

67

num_classes = len(le.classes_)

68

num_classes = len(le.classes_)

68

69

# Compute KMEANS clustering on data

70

# Compute KMEANS clustering on data

70

estimator = KMeans(

71

estimator = KMeans(

71

n_clusters=num_classes,

72

n_clusters=num_classes,

72

n_init=100,

73

n_init=100,

73

tol=10-6,

74

tol=10-6,

74

algorithm="elkan"

75

algorithm="elkan"

75

)

76

)

76

estimator.fit(feats)

77

estimator.fit(feats)

77

print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")

78

print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")

78

79

# contains distance to each cluster for each sample

80

# contains distance to each cluster for each sample

80

dist_space = estimator.transform(feats)

81

dist_space = estimator.transform(feats)

81

predictions = np.argmin(dist_space, axis=1)

82

predictions = np.argmin(dist_space, axis=1)

82

83

# gives each cluster a name (considering most represented character)

84

# gives each cluster a name (considering most represented character)

84

dataframe = pd.DataFrame({

85

dataframe = pd.DataFrame({

85

"label": pd.Series(list(map(lambda x: le.classes_[x], labels))),

86

"label": pd.Series(list(map(lambda x: le.classes_[x], labels))),

86

"prediction": pd.Series(predictions)

87

"prediction": pd.Series(predictions)

87

})

88

})

88

89

def find_cluster_name_fn(c):

90

def find_cluster_name_fn(c):

90

mask = dataframe["prediction"] == c

91

mask = dataframe["prediction"] == c

91

return dataframe[mask]["label"].value_counts(sort=False).idxmax()

92

return dataframe[mask]["label"].value_counts(sort=False).idxmax()

92

93

cluster_names = list(map(find_cluster_name_fn, range(num_classes)))

94

cluster_names = list(map(find_cluster_name_fn, range(num_classes)))

94

predicted_labels = le.transform(

95

predicted_labels = le.transform(

95

[cluster_names[pred] for pred in predictions])

96

[cluster_names[pred] for pred in predictions])

96

97

# F-measure

98

# F-measure

98

fscores = f1_score(labels, predicted_labels, average=None)

99

fscores = f1_score(labels, predicted_labels, average=None)

99

fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))

100

fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))

100

print(f"F1-scores for each classes:\n{fscores_str}")

101

print(f"F1-scores for each classes:\n{fscores_str}")

101

print(f"Global score : {np.mean(fscores)}")

102

print(f"Global score : {np.mean(fscores)}")

102

with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:

103

with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:

103

print(f"F1-scores for each classes:\n{fscores_str}", file=fd)

104

print(f"F1-scores for each classes:\n{fscores_str}", file=fd)

104

print(f"Global score : {np.mean(fscores)}", file=fd)

105

print(f"Global score : {np.mean(fscores)}", file=fd)

105

106

# Process t-SNE and plot

107

# Process t-SNE and plot

107

tsne_estimator = TSNE()

108

tsne_estimator = TSNE()

108

embeddings = tsne_estimator.fit_transform(feats)

109

embeddings = tsne_estimator.fit_transform(feats)

109

print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(

110

print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(

110

tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

111

tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

111

112

fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))

113

fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))

113

for c, name in enumerate(le.classes_):

114

for c, name in enumerate(le.classes_):

114

c_mask = np.where(labels == c)

115

c_mask = np.where(labels == c)

115

axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

116

axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

116

117

try:

118

try:

118

id_cluster = cluster_names.index(name)

119

id_cluster = cluster_names.index(name)

119

except ValueError:

120

except ValueError:

120

print("WARNING: no cluster found for {}".format(name))

121

print("WARNING: no cluster found for {}".format(name))

121

continue

122

continue

122

c_mask = np.where(predictions == id_cluster)

123

c_mask = np.where(predictions == id_cluster)

123

axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

124

axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

124

125

axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))

126

axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))

126

axe1.set_title("true labels")

127

axe1.set_title("true labels")

127

axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))

128

axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))

128

axe2.set_title("predicted cluster label")

129

axe2.set_title("predicted cluster label")

129

130

plt.suptitle("Kmeans Clustering")

131

plt.suptitle("Kmeans Clustering")

131

132

loc = os.path.join(

133

loc = os.path.join(

133

args.outdir,

134

args.outdir,

134

args.prefix + "kmeans.pdf"

135

args.prefix + "kmeans.pdf"

135

)

136

)

136

plt.savefig(loc, bbox_inches="tight")

137

plt.savefig(loc, bbox_inches="tight")

137

plt.close()

138

plt.close()

138

139

print("INFO: figure saved at {}".format(loc))

140

print("INFO: figure saved at {}".format(loc))

140

141

end = time.time()

142

end = time.time()

142

print("program ended in {0:.2f} seconds".format(end-start))

143

print("program ended in {0:.2f} seconds".format(end-start))

GITLAB

Quillot Mathias / volia

Add default value to prefix

 '''
 This script allows the user to evaluate a classification system on new labels using clustering methods.
 The algorithms are applied to the given latent space (embedding).
 '''
 import argparse
 import numpy as np
 import pandas as pd
 import os
 import time
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.metrics import f1_score
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
 import matplotlib.pyplot as plt
 from volia.data_io import read_features,read_lst
 if __name__ == "__main__":
     # Script entry point. Steps:
     #   1. load the features and the utt -> value file,
     #   2. run K-means with as many clusters as there are distinct values,
     #   3. name each cluster after its most represented value,
     #   4. print and save per-class F1 scores and their mean,
     #   5. project the features with t-SNE and save a two-panel scatter plot.
     # Argparse
     parser = argparse.ArgumentParser("Compute clustering on a latent space")
     parser.add_argument("features")
     parser.add_argument("utt2",
                         type=str,
                         help="file with [utt] [value]")
     parser.add_argument("--idsfrom",
                         type=str,
                         default="utt2",
                         choices=[
                             "features",
                             "utt2"
                         ],
                         help="from features or from utt2?")
     parser.add_argument("--prefix",
                         default="",
                         type=str,
                         help="prefix of saved files")
     parser.add_argument("--outdir",
                         default=None,
                         type=str,
                         help="Output directory")
     args = parser.parse_args()
     # Validate explicitly: an `assert` is stripped when Python runs with -O,
     # which would defer the failure to the first file write.
     if not args.outdir:
         parser.error("--outdir is required")
     start = time.time()
     # Load features and utt2
     # NOTE(review): both readers are used as mappings keyed by utterance id
     # (`.keys()` and `[id_]`) -- confirm against volia.data_io.
     features = read_features(args.features)
     utt2 = read_lst(args.utt2)
     # Take id list
     if args.idsfrom == "features":
         ids = list(features.keys())
     elif args.idsfrom == "utt2":
         ids = list(utt2.keys())
     else:
         # Defensive only: argparse `choices` already rejects other values.
         print(f"idsfrom is not good: {args.idsfrom}")
         raise SystemExit(1)
     # One row per id; rows of `feats` and entries of `classes` share the
     # order of `ids`.
     feats = np.vstack([features[id_] for id_ in ids])
     classes = [utt2[id_] for id_ in ids]
     # Encode labels
     le = LabelEncoder()
     labels = le.fit_transform(classes)
     num_classes = len(le.classes_)
     # Compute KMEANS clustering on data
     estimator = KMeans(
         n_clusters=num_classes,
         n_init=100,
         # Was `10-6`, which is the integer 4 rather than the intended 1e-6.
         tol=1e-6,
         algorithm="elkan"
     )
     estimator.fit(feats)
     print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
     # contains distance to each cluster for each sample
     dist_space = estimator.transform(feats)
     # Each sample is assigned to its closest cluster.
     predictions = np.argmin(dist_space, axis=1)
     # gives each cluster a name (considering most represented character)
     dataframe = pd.DataFrame({
         "label": pd.Series([le.classes_[x] for x in labels]),
         "prediction": pd.Series(predictions)
     })
     def find_cluster_name_fn(c):
         # Most frequent true label among the samples assigned to cluster `c`.
         mask = dataframe["prediction"] == c
         return dataframe[mask]["label"].value_counts(sort=False).idxmax()
     cluster_names = [find_cluster_name_fn(c) for c in range(num_classes)]
     # Map every sample's cluster id back to an encoded class id so it can be
     # compared with `labels`.
     predicted_labels = le.transform(
         [cluster_names[pred] for pred in predictions])
     # F-measure
     fscores = f1_score(labels, predicted_labels, average=None)
     fscores_str = "\n".join(
         "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i])
         for i in range(len(fscores)))
     # Unweighted mean of the per-class F1 scores, computed once and reused.
     global_score = np.mean(fscores)
     print(f"F1-scores for each classes:\n{fscores_str}")
     print(f"Global score : {global_score}")
     with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:
         print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
         print(f"Global score : {global_score}", file=fd)
     # Process t-SNE and plot
     tsne_estimator = TSNE()
     embeddings = tsne_estimator.fit_transform(feats)
     print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
         tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
     # Left panel: points grouped by true label; right panel: by named cluster.
     fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
     for c, name in enumerate(le.classes_):
         c_mask = np.where(labels == c)
         axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
         # NOTE(review): `index` returns only the first cluster carrying this
         # name; if several clusters share a name, the others are not drawn
         # on the right panel.
         try:
             id_cluster = cluster_names.index(name)
         except ValueError:
             print("WARNING: no cluster found for {}".format(name))
             continue
         c_mask = np.where(predictions == id_cluster)
         axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
     axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
     axe1.set_title("true labels")
     axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
     axe2.set_title("predicted cluster label")
     plt.suptitle("Kmeans Clustering")
     loc = os.path.join(
         args.outdir,
         args.prefix + "kmeans.pdf"
     )
     plt.savefig(loc, bbox_inches="tight")
     plt.close()
     print("INFO: figure saved at {}".format(loc))
     end = time.time()
     print("program ended in {0:.2f} seconds".format(end-start))