Yannick Estève / ONTRAC-Kaldi

Blame view

src/ivector/agglomerative-clustering.h 7.67 KB
  // ivector/agglomerative-clustering.h
  
  // Copyright  2017-2018  Matthew Maciejewski
  //                 2018  David Snyder
  //                 2019  Dogan Can
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  #ifndef KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_
  #define KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_
  
  #include <vector>
  #include <queue>
  #include <set>
  #include <unordered_map>
  #include <functional>
  #include "base/kaldi-common.h"
  #include "matrix/matrix-lib.h"
  #include "util/stl-utils.h"
  
  namespace kaldi {
  
  /// AhcCluster is the cluster object for the agglomerative clustering. It
  /// contains three integer IDs: its own ID and the IDs of its "parents", i.e.
  /// the clusters that were merged to form it. It also contains the size (the
  /// number of points in the cluster) and a vector of the IDs of the utterances
  /// contained in the cluster.
  struct AhcCluster {
    int32 id,
      parent1,
      parent2,
      size;
    std::vector<int32> utt_ids;
    AhcCluster(int32 id, int32 p1, int32 p2, std::vector<int32> utts)
        : id(id), parent1(p1), parent2(p2), utt_ids(utts) {
      size = utts.size();
    }
  };
  
  /// The AgglomerativeClusterer class contains the necessary mechanisms for the
  /// actual clustering algorithm.
  class AgglomerativeClusterer {
   public:
    AgglomerativeClusterer(
        const Matrix<BaseFloat> &costs,
        BaseFloat threshold,
        int32 min_clusters,
        int32 first_pass_max_points,
        BaseFloat max_cluster_fraction,
        std::vector<int32> *assignments_out)
        : costs_(costs), threshold_(threshold), min_clusters_(min_clusters),
          first_pass_max_points_(first_pass_max_points),
          assignments_(assignments_out) {
      num_points_ = costs.NumRows();
  
      // The max_cluster_size_ is a hard limit on the number points in a cluster.
      // This is useful for handling degenerate cases where some outlier points
      // form their own clusters and force everything else to be clustered
      // together, e.g. when min-clusters is provided instead of a threshold.
      max_cluster_size_ = ceil(num_points_ * max_cluster_fraction);
  
      // The count_, which is used for identifying clusters, is initialized to
      // num_points_ because cluster IDs 1..num_points_ are reserved for input
      // points, which are the initial set of clusters.
      count_ = num_points_;
  
      // The second_pass_count_, which is used for identifying the initial set of
      // second pass clusters and initializing count_ before the second pass, is
      // initialized to 0 and incremented whenever a new cluster is added to the
      // initial set of second pass clusters.
      second_pass_count_ = 0;
    }
  
    // Clusters points. Chooses single pass or two pass algorithm.
    void Cluster();
  
    // Clusters points using single pass algorithm.
    void ClusterSinglePass();
  
    // Clusters points using two pass algorithm.
    void ClusterTwoPass();
  
   private:
    // Encodes cluster pair into a 32bit unsigned integer.
    uint32 EncodePair(int32 i, int32 j);
    // Decodes cluster pair from a 32bit unsigned integer.
    std::pair<int32, int32> DecodePair(uint32 key);
    // Initializes the clustering queue with singleton clusters
    void InitializeClusters(int32 first, int32 last);
    // Does hierarchical agglomerative clustering
    void ComputeClusters(int32 min_clusters);
    // Adds clusters created in first pass to second pass clusters
    void AddClustersToSecondPass();
    // Assigns points to clusters
    void AssignClusters();
    // Merges clusters with IDs i and j and updates cost map and queue
    void MergeClusters(int32 i, int32 j);
  
    const Matrix<BaseFloat> &costs_;  // cost matrix
    BaseFloat threshold_;  // stopping criterion threshold
    int32 min_clusters_;  // minimum number of clusters
    int32 first_pass_max_points_;  // maximum number of points in each subset
    std::vector<int32> *assignments_;  // assignments out
  
    int32 num_points_;  // total number of points to cluster
    int32 max_cluster_size_;  // maximum number of points in a cluster
    int32 count_;  // count of first pass clusters, used for identifying clusters
    int32 second_pass_count_;  // count of second pass clusters
  
    // Priority queue using greater (lowest costs are highest priority).
    // Elements contain pairs of cluster IDs and their cost.
    typedef std::pair<BaseFloat, uint32> QueueElement;
    typedef std::priority_queue<QueueElement, std::vector<QueueElement>,
      std::greater<QueueElement>  > QueueType;
    QueueType queue_, second_pass_queue_;
  
    // Map from cluster IDs to cost between them
    std::unordered_map<uint32, BaseFloat> cluster_cost_map_;
    // Map from cluster ID to cluster object address
    std::unordered_map<int32, AhcCluster*> clusters_map_;
    // Set of unmerged cluster IDs
    std::set<int32> active_clusters_;
  
    // Map from second pass cluster IDs to cost between them
    std::unordered_map<uint32, BaseFloat> second_pass_cluster_cost_map_;
    // Map from second pass cluster ID to cluster object address
    std::unordered_map<int32, AhcCluster*> second_pass_clusters_map_;
    // Set of unmerged second pass cluster IDs
    std::set<int32> second_pass_active_clusters_;
  };
  
  /** This is the function that is called to perform the agglomerative
   *  clustering. It takes the following arguments:
   *   - A matrix of all pairwise costs, with each row/column corresponding
   *      to an utterance ID, and the elements of the matrix containing the
   *      cost for pairing the utterances for its row and column
   *   - A threshold which is used as the stopping criterion for the clusters
   *   - A minimum number of clusters that will not be merged past
   *   - A maximum fraction of points that can be in a cluster
   *   - A vector which will be filled with integer IDs corresponding to each
   *      of the rows/columns of the score matrix.
   *
   *  The basic algorithm is as follows:
   *  \code
   *      while (num-clusters > min-clusters && smallest-merge-cost <= threshold)
   *          if (size-of-new-cluster <= max-cluster-size)
   *              merge the two clusters with lowest cost
   *  \endcode
   *
   *  The cost between two clusters is the average cost of all pairwise
   *  costs between points across the two clusters.
   *
   *  The algorithm takes advantage of the fact that the sum of the pairwise
   *  costs between the points of clusters I and J is equiavlent to the
   *  sum of the pairwise costs between cluster I and the parents of cluster
   *  J. In other words, the total cost between I and J is the sum of the
   *  costs between clusters I and M and clusters I and N, where
   *  cluster J was formed by merging clusters M and N.
   *
   *  If the number of points to cluster is larger than first-pass-max-points,
   *  then clustering is done in two passes. In the first pass, input points are
   *  divided into contiguous subsets of size at most first-pass-max-points and
   *  each subset is clustered separately. In the second pass, the first pass
   *  clusters are merged into the final set of clusters.
   *
   */
  void AgglomerativeCluster(
      const Matrix<BaseFloat> &costs,
      BaseFloat threshold,
      int32 min_clusters,
      int32 first_pass_max_points,
      BaseFloat max_cluster_fraction,
      std::vector<int32> *assignments_out);
  
  }  // end namespace kaldi.
  
  #endif  // KALDI_IVECTOR_AGGLOMERATIVE_CLUSTERING_H_