Blame view
src/ivector/agglomerative-clustering.h
7.67 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
// ivector/agglomerative-clustering.h // Copyright 2017-2018 Matthew Maciejewski // 2018 David Snyder // 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_ #define KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_ #include <vector> #include <queue> #include <set> #include <unordered_map> #include <functional> #include "base/kaldi-common.h" #include "matrix/matrix-lib.h" #include "util/stl-utils.h" namespace kaldi { /// AhcCluster is the cluster object for the agglomerative clustering. It /// contains three integer IDs: its own ID and the IDs of its "parents", i.e. /// the clusters that were merged to form it. It also contains the size (the /// number of points in the cluster) and a vector of the IDs of the utterances /// contained in the cluster. struct AhcCluster { int32 id, parent1, parent2, size; std::vector<int32> utt_ids; AhcCluster(int32 id, int32 p1, int32 p2, std::vector<int32> utts) : id(id), parent1(p1), parent2(p2), utt_ids(utts) { size = utts.size(); } }; /// The AgglomerativeClusterer class contains the necessary mechanisms for the /// actual clustering algorithm. class AgglomerativeClusterer { public: AgglomerativeClusterer( const Matrix<BaseFloat> &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector<int32> *assignments_out) : costs_(costs), threshold_(threshold), min_clusters_(min_clusters), first_pass_max_points_(first_pass_max_points), assignments_(assignments_out) { num_points_ = costs.NumRows(); // The max_cluster_size_ is a hard limit on the number points in a cluster. // This is useful for handling degenerate cases where some outlier points // form their own clusters and force everything else to be clustered // together, e.g. when min-clusters is provided instead of a threshold. max_cluster_size_ = ceil(num_points_ * max_cluster_fraction); // The count_, which is used for identifying clusters, is initialized to // num_points_ because cluster IDs 1..num_points_ are reserved for input // points, which are the initial set of clusters. count_ = num_points_; // The second_pass_count_, which is used for identifying the initial set of // second pass clusters and initializing count_ before the second pass, is // initialized to 0 and incremented whenever a new cluster is added to the // initial set of second pass clusters. second_pass_count_ = 0; } // Clusters points. Chooses single pass or two pass algorithm. void Cluster(); // Clusters points using single pass algorithm. void ClusterSinglePass(); // Clusters points using two pass algorithm. void ClusterTwoPass(); private: // Encodes cluster pair into a 32bit unsigned integer. uint32 EncodePair(int32 i, int32 j); // Decodes cluster pair from a 32bit unsigned integer. std::pair<int32, int32> DecodePair(uint32 key); // Initializes the clustering queue with singleton clusters void InitializeClusters(int32 first, int32 last); // Does hierarchical agglomerative clustering void ComputeClusters(int32 min_clusters); // Adds clusters created in first pass to second pass clusters void AddClustersToSecondPass(); // Assigns points to clusters void AssignClusters(); // Merges clusters with IDs i and j and updates cost map and queue void MergeClusters(int32 i, int32 j); const Matrix<BaseFloat> &costs_; // cost matrix BaseFloat threshold_; // stopping criterion threshold int32 min_clusters_; // minimum number of clusters int32 first_pass_max_points_; // maximum number of points in each subset std::vector<int32> *assignments_; // assignments out int32 num_points_; // total number of points to cluster int32 max_cluster_size_; // maximum number of points in a cluster int32 count_; // count of first pass clusters, used for identifying clusters int32 second_pass_count_; // count of second pass clusters // Priority queue using greater (lowest costs are highest priority). // Elements contain pairs of cluster IDs and their cost. typedef std::pair<BaseFloat, uint32> QueueElement; typedef std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > QueueType; QueueType queue_, second_pass_queue_; // Map from cluster IDs to cost between them std::unordered_map<uint32, BaseFloat> cluster_cost_map_; // Map from cluster ID to cluster object address std::unordered_map<int32, AhcCluster*> clusters_map_; // Set of unmerged cluster IDs std::set<int32> active_clusters_; // Map from second pass cluster IDs to cost between them std::unordered_map<uint32, BaseFloat> second_pass_cluster_cost_map_; // Map from second pass cluster ID to cluster object address std::unordered_map<int32, AhcCluster*> second_pass_clusters_map_; // Set of unmerged second pass cluster IDs std::set<int32> second_pass_active_clusters_; }; /** This is the function that is called to perform the agglomerative * clustering. It takes the following arguments: * - A matrix of all pairwise costs, with each row/column corresponding * to an utterance ID, and the elements of the matrix containing the * cost for pairing the utterances for its row and column * - A threshold which is used as the stopping criterion for the clusters * - A minimum number of clusters that will not be merged past * - A maximum fraction of points that can be in a cluster * - A vector which will be filled with integer IDs corresponding to each * of the rows/columns of the score matrix. * * The basic algorithm is as follows: * \code * while (num-clusters > min-clusters && smallest-merge-cost <= threshold) * if (size-of-new-cluster <= max-cluster-size) * merge the two clusters with lowest cost * \endcode * * The cost between two clusters is the average cost of all pairwise * costs between points across the two clusters. * * The algorithm takes advantage of the fact that the sum of the pairwise * costs between the points of clusters I and J is equiavlent to the * sum of the pairwise costs between cluster I and the parents of cluster * J. In other words, the total cost between I and J is the sum of the * costs between clusters I and M and clusters I and N, where * cluster J was formed by merging clusters M and N. * * If the number of points to cluster is larger than first-pass-max-points, * then clustering is done in two passes. In the first pass, input points are * divided into contiguous subsets of size at most first-pass-max-points and * each subset is clustered separately. In the second pass, the first pass * clusters are merged into the final set of clusters. * */ void AgglomerativeCluster( const Matrix<BaseFloat> &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector<int32> *assignments_out); } // end namespace kaldi. #endif // KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_ |