agglomerative-clustering.h 7.67 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190


// ivector/agglomerative-clustering.h

// Copyright  2017-2018  Matthew Maciejewski
//                 2018  David Snyder
//                 2019  Dogan Can

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_
#define KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_

#include <vector>
#include <queue>
#include <set>
#include <unordered_map>
#include <functional>
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
#include "util/stl-utils.h"

namespace kaldi {

/// AhcCluster is the cluster object for the agglomerative clustering. It
/// contains three integer IDs: its own ID and the IDs of its "parents", i.e.
/// the clusters that were merged to form it. It also contains the size (the
/// number of points in the cluster) and a vector of the IDs of the utterances
/// contained in the cluster.
struct AhcCluster {
  int32 id,
    parent1,
    parent2,
    size;
  std::vector<int32> utt_ids;
  AhcCluster(int32 id, int32 p1, int32 p2, std::vector<int32> utts)
      : id(id), parent1(p1), parent2(p2), utt_ids(utts) {
    size = utts.size();
  }
};

/// The AgglomerativeClusterer class contains the necessary mechanisms for the
/// actual clustering algorithm.
class AgglomerativeClusterer {
 public:
  AgglomerativeClusterer(
      const Matrix<BaseFloat> &costs,
      BaseFloat threshold,
      int32 min_clusters,
      int32 first_pass_max_points,
      BaseFloat max_cluster_fraction,
      std::vector<int32> *assignments_out)
      : costs_(costs), threshold_(threshold), min_clusters_(min_clusters),
        first_pass_max_points_(first_pass_max_points),
        assignments_(assignments_out) {
    num_points_ = costs.NumRows();

    // The max_cluster_size_ is a hard limit on the number points in a cluster.
    // This is useful for handling degenerate cases where some outlier points
    // form their own clusters and force everything else to be clustered
    // together, e.g. when min-clusters is provided instead of a threshold.
    max_cluster_size_ = ceil(num_points_ * max_cluster_fraction);

    // The count_, which is used for identifying clusters, is initialized to
    // num_points_ because cluster IDs 1..num_points_ are reserved for input
    // points, which are the initial set of clusters.
    count_ = num_points_;

    // The second_pass_count_, which is used for identifying the initial set of
    // second pass clusters and initializing count_ before the second pass, is
    // initialized to 0 and incremented whenever a new cluster is added to the
    // initial set of second pass clusters.
    second_pass_count_ = 0;
  }

  // Clusters points. Chooses single pass or two pass algorithm.
  void Cluster();

  // Clusters points using single pass algorithm.
  void ClusterSinglePass();

  // Clusters points using two pass algorithm.
  void ClusterTwoPass();

 private:
  // Encodes cluster pair into a 32bit unsigned integer.
  uint32 EncodePair(int32 i, int32 j);
  // Decodes cluster pair from a 32bit unsigned integer.
  std::pair<int32, int32> DecodePair(uint32 key);
  // Initializes the clustering queue with singleton clusters
  void InitializeClusters(int32 first, int32 last);
  // Does hierarchical agglomerative clustering
  void ComputeClusters(int32 min_clusters);
  // Adds clusters created in first pass to second pass clusters
  void AddClustersToSecondPass();
  // Assigns points to clusters
  void AssignClusters();
  // Merges clusters with IDs i and j and updates cost map and queue
  void MergeClusters(int32 i, int32 j);

  const Matrix<BaseFloat> &costs_;  // cost matrix
  BaseFloat threshold_;  // stopping criterion threshold
  int32 min_clusters_;  // minimum number of clusters
  int32 first_pass_max_points_;  // maximum number of points in each subset
  std::vector<int32> *assignments_;  // assignments out

  int32 num_points_;  // total number of points to cluster
  int32 max_cluster_size_;  // maximum number of points in a cluster
  int32 count_;  // count of first pass clusters, used for identifying clusters
  int32 second_pass_count_;  // count of second pass clusters

  // Priority queue using greater (lowest costs are highest priority).
  // Elements contain pairs of cluster IDs and their cost.
  typedef std::pair<BaseFloat, uint32> QueueElement;
  typedef std::priority_queue<QueueElement, std::vector<QueueElement>,
    std::greater<QueueElement>  > QueueType;
  QueueType queue_, second_pass_queue_;

  // Map from cluster IDs to cost between them
  std::unordered_map<uint32, BaseFloat> cluster_cost_map_;
  // Map from cluster ID to cluster object address
  std::unordered_map<int32, AhcCluster*> clusters_map_;
  // Set of unmerged cluster IDs
  std::set<int32> active_clusters_;

  // Map from second pass cluster IDs to cost between them
  std::unordered_map<uint32, BaseFloat> second_pass_cluster_cost_map_;
  // Map from second pass cluster ID to cluster object address
  std::unordered_map<int32, AhcCluster*> second_pass_clusters_map_;
  // Set of unmerged second pass cluster IDs
  std::set<int32> second_pass_active_clusters_;
};

/** This is the function that is called to perform the agglomerative
 *  clustering. It takes the following arguments:
 *   - A matrix of all pairwise costs, with each row/column corresponding
 *      to an utterance ID, and the elements of the matrix containing the
 *      cost for pairing the utterances for its row and column
 *   - A threshold which is used as the stopping criterion for the clusters
 *   - A minimum number of clusters that will not be merged past
 *   - A maximum fraction of points that can be in a cluster
 *   - A vector which will be filled with integer IDs corresponding to each
 *      of the rows/columns of the score matrix.
 *
 *  The basic algorithm is as follows:
 *  \code
 *      while (num-clusters > min-clusters && smallest-merge-cost <= threshold)
 *          if (size-of-new-cluster <= max-cluster-size)
 *              merge the two clusters with lowest cost
 *  \endcode
 *
 *  The cost between two clusters is the average cost of all pairwise
 *  costs between points across the two clusters.
 *
 *  The algorithm takes advantage of the fact that the sum of the pairwise
 *  costs between the points of clusters I and J is equiavlent to the
 *  sum of the pairwise costs between cluster I and the parents of cluster
 *  J. In other words, the total cost between I and J is the sum of the
 *  costs between clusters I and M and clusters I and N, where
 *  cluster J was formed by merging clusters M and N.
 *
 *  If the number of points to cluster is larger than first-pass-max-points,
 *  then clustering is done in two passes. In the first pass, input points are
 *  divided into contiguous subsets of size at most first-pass-max-points and
 *  each subset is clustered separately. In the second pass, the first pass
 *  clusters are merged into the final set of clusters.
 *
 */
void AgglomerativeCluster(
    const Matrix<BaseFloat> &costs,
    BaseFloat threshold,
    int32 min_clusters,
    int32 first_pass_max_points,
    BaseFloat max_cluster_fraction,
    std::vector<int32> *assignments_out);

}  // end namespace kaldi.

#endif  // KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_