// ivector/agglomerative-clustering.h // Copyright 2017-2018 Matthew Maciejewski // 2018 David Snyder // 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_ #define KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_ #include #include #include #include #include #include "base/kaldi-common.h" #include "matrix/matrix-lib.h" #include "util/stl-utils.h" namespace kaldi { /// AhcCluster is the cluster object for the agglomerative clustering. It /// contains three integer IDs: its own ID and the IDs of its "parents", i.e. /// the clusters that were merged to form it. It also contains the size (the /// number of points in the cluster) and a vector of the IDs of the utterances /// contained in the cluster. struct AhcCluster { int32 id, parent1, parent2, size; std::vector utt_ids; AhcCluster(int32 id, int32 p1, int32 p2, std::vector utts) : id(id), parent1(p1), parent2(p2), utt_ids(utts) { size = utts.size(); } }; /// The AgglomerativeClusterer class contains the necessary mechanisms for the /// actual clustering algorithm. class AgglomerativeClusterer { public: AgglomerativeClusterer( const Matrix &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector *assignments_out) : costs_(costs), threshold_(threshold), min_clusters_(min_clusters), first_pass_max_points_(first_pass_max_points), assignments_(assignments_out) { num_points_ = costs.NumRows(); // The max_cluster_size_ is a hard limit on the number points in a cluster. // This is useful for handling degenerate cases where some outlier points // form their own clusters and force everything else to be clustered // together, e.g. when min-clusters is provided instead of a threshold. max_cluster_size_ = ceil(num_points_ * max_cluster_fraction); // The count_, which is used for identifying clusters, is initialized to // num_points_ because cluster IDs 1..num_points_ are reserved for input // points, which are the initial set of clusters. count_ = num_points_; // The second_pass_count_, which is used for identifying the initial set of // second pass clusters and initializing count_ before the second pass, is // initialized to 0 and incremented whenever a new cluster is added to the // initial set of second pass clusters. second_pass_count_ = 0; } // Clusters points. Chooses single pass or two pass algorithm. void Cluster(); // Clusters points using single pass algorithm. void ClusterSinglePass(); // Clusters points using two pass algorithm. void ClusterTwoPass(); private: // Encodes cluster pair into a 32bit unsigned integer. uint32 EncodePair(int32 i, int32 j); // Decodes cluster pair from a 32bit unsigned integer. std::pair DecodePair(uint32 key); // Initializes the clustering queue with singleton clusters void InitializeClusters(int32 first, int32 last); // Does hierarchical agglomerative clustering void ComputeClusters(int32 min_clusters); // Adds clusters created in first pass to second pass clusters void AddClustersToSecondPass(); // Assigns points to clusters void AssignClusters(); // Merges clusters with IDs i and j and updates cost map and queue void MergeClusters(int32 i, int32 j); const Matrix &costs_; // cost matrix BaseFloat threshold_; // stopping criterion threshold int32 min_clusters_; // minimum number of clusters int32 first_pass_max_points_; // maximum number of points in each subset std::vector *assignments_; // assignments out int32 num_points_; // total number of points to cluster int32 max_cluster_size_; // maximum number of points in a cluster int32 count_; // count of first pass clusters, used for identifying clusters int32 second_pass_count_; // count of second pass clusters // Priority queue using greater (lowest costs are highest priority). // Elements contain pairs of cluster IDs and their cost. typedef std::pair QueueElement; typedef std::priority_queue, std::greater > QueueType; QueueType queue_, second_pass_queue_; // Map from cluster IDs to cost between them std::unordered_map cluster_cost_map_; // Map from cluster ID to cluster object address std::unordered_map clusters_map_; // Set of unmerged cluster IDs std::set active_clusters_; // Map from second pass cluster IDs to cost between them std::unordered_map second_pass_cluster_cost_map_; // Map from second pass cluster ID to cluster object address std::unordered_map second_pass_clusters_map_; // Set of unmerged second pass cluster IDs std::set second_pass_active_clusters_; }; /** This is the function that is called to perform the agglomerative * clustering. It takes the following arguments: * - A matrix of all pairwise costs, with each row/column corresponding * to an utterance ID, and the elements of the matrix containing the * cost for pairing the utterances for its row and column * - A threshold which is used as the stopping criterion for the clusters * - A minimum number of clusters that will not be merged past * - A maximum fraction of points that can be in a cluster * - A vector which will be filled with integer IDs corresponding to each * of the rows/columns of the score matrix. * * The basic algorithm is as follows: * \code * while (num-clusters > min-clusters && smallest-merge-cost <= threshold) * if (size-of-new-cluster <= max-cluster-size) * merge the two clusters with lowest cost * \endcode * * The cost between two clusters is the average cost of all pairwise * costs between points across the two clusters. * * The algorithm takes advantage of the fact that the sum of the pairwise * costs between the points of clusters I and J is equiavlent to the * sum of the pairwise costs between cluster I and the parents of cluster * J. In other words, the total cost between I and J is the sum of the * costs between clusters I and M and clusters I and N, where * cluster J was formed by merging clusters M and N. * * If the number of points to cluster is larger than first-pass-max-points, * then clustering is done in two passes. In the first pass, input points are * divided into contiguous subsets of size at most first-pass-max-points and * each subset is clustered separately. In the second pass, the first pass * clusters are merged into the final set of clusters. * */ void AgglomerativeCluster( const Matrix &costs, BaseFloat threshold, int32 min_clusters, int32 first_pass_max_points, BaseFloat max_cluster_fraction, std::vector *assignments_out); } // end namespace kaldi. #endif // KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_