// ivector/voice-activity-detection.cc // Copyright 2013 Daniel Povey // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "ivector/voice-activity-detection.h" #include "matrix/matrix-functions.h" namespace kaldi { void ComputeVadEnergy(const VadEnergyOptions &opts, const MatrixBase &feats, Vector *output_voiced) { int32 T = feats.NumRows(); output_voiced->Resize(T); if (T == 0) { KALDI_WARN << "Empty features"; return; } Vector log_energy(T); log_energy.CopyColFromMat(feats, 0); // column zero is log-energy. BaseFloat energy_threshold = opts.vad_energy_threshold; if (opts.vad_energy_mean_scale != 0.0) { KALDI_ASSERT(opts.vad_energy_mean_scale > 0.0); energy_threshold += opts.vad_energy_mean_scale * log_energy.Sum() / T; } KALDI_ASSERT(opts.vad_frames_context >= 0); KALDI_ASSERT(opts.vad_proportion_threshold > 0.0 && opts.vad_proportion_threshold < 1.0); for (int32 t = 0; t < T; t++) { const BaseFloat *log_energy_data = log_energy.Data(); int32 num_count = 0, den_count = 0, context = opts.vad_frames_context; for (int32 t2 = t - context; t2 <= t + context; t2++) { if (t2 >= 0 && t2 < T) { den_count++; if (log_energy_data[t2] > energy_threshold) num_count++; } } if (num_count >= den_count * opts.vad_proportion_threshold) (*output_voiced)(t) = 1.0; else (*output_voiced)(t) = 0.0; } } }