pitch-functions.h 20.4 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
// feat/pitch-functions.h

// Copyright     2013  Pegah Ghahremani
//               2014  IMSL, PKU-HKUST (author: Wei Shi)
//               2014  Yanqing Sun, Junjie Wang,
//                     Daniel Povey, Korbinian Riedhammer
//                     Xin Lei

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
#define KALDI_FEAT_PITCH_FUNCTIONS_H_

#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>

#include "base/kaldi-error.h"
#include "feat/mel-computations.h"
#include "itf/online-feature-itf.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"

namespace kaldi {
/// @addtogroup  feat FeatureExtraction
/// @{

struct PitchExtractionOptions {
  // FrameExtractionOptions frame_opts;
  BaseFloat samp_freq;          // sample frequency in hertz
  BaseFloat frame_shift_ms;     // in milliseconds.
  BaseFloat frame_length_ms;    // in milliseconds.
  BaseFloat preemph_coeff;      // Preemphasis coefficient. [use is deprecated.]
  BaseFloat min_f0;             // min f0 to search (Hz)
  BaseFloat max_f0;             // max f0 to search (Hz)
  BaseFloat soft_min_f0;        // Minimum f0, applied in soft way, must not
                                // exceed min-f0
  BaseFloat penalty_factor;     // cost factor for FO change
  BaseFloat lowpass_cutoff;     // cutoff frequency for Low pass filter
  BaseFloat resample_freq;      // Integer that determines filter width when
                                // upsampling NCCF
  BaseFloat delta_pitch;        // the pitch tolerance in pruning lags
  BaseFloat nccf_ballast;       // Increasing this factor reduces NCCF for
                                // quiet frames, helping ensure pitch
                                // continuity in unvoiced region
  int32 lowpass_filter_width;   // Integer that determines filter width of
                                // lowpass filter
  int32 upsample_filter_width;  // Integer that determines filter width when
                                // upsampling NCCF

  // Below are newer config variables, not present in the original paper,
  // that relate to the online pitch extraction algorithm.

  // The maximum number of frames of latency that we allow the pitch-processing
  // to introduce, for online operation. If you set this to a large value,
  // there would be no inaccuracy from the Viterbi traceback (but it might make
  // you wait to see the pitch). This is not very relevant for the online
  // operation: normalization-right-context is more relevant, you
  // can just leave this value at zero.
  int32 max_frames_latency;

  // Only relevant for the function ComputeKaldiPitch which is called by
  // compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
  // this size. This affects the energy normalization which has a small effect
  // on the resulting features, especially at the beginning of a file. For best
  // compatibility with online operation (e.g. if you plan to train models for
  // the online-deocding setup), you might want to set this to a small value,
  // like one frame.
  int32 frames_per_chunk;

  // Only relevant for the function ComputeKaldiPitch which is called by
  // compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
  // nonzero. If true, it will query the features as soon as they are
  // available, which simulates the first-pass features you would get in online
  // decoding. If false, the features you will get will be the same as those
  // available at the end of the utterance, after InputFinished() has been
  // called: e.g. during lattice rescoring.
  bool simulate_first_pass_online;

  // Only relevant for online operation or when emulating online operation
  // (e.g. when setting frames_per_chunk). This is the frame-index on which we
  // recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
  // segment ends before this we do it when the segment ends. We do this by
  // re-computing the signal average energy, which affects the NCCF via the
  // "ballast term", scaling the resampled NCCF by a factor derived from the
  // average change in the "ballast term", and re-doing the backtrace
  // computation. Making this infinity would be the most exact, but would
  // introduce unwanted latency at the end of long utterances, for little
  // benefit.
  int32 recompute_frame;

  // This is a "hidden config" used only for testing the online pitch
  // extraction. If true, we compute the signal root-mean-squared for the
  // ballast term, only up to the current frame, rather than the end of the
  // current chunk of signal. This makes the output insensitive to the
  // chunking, which is useful for testing purposes.
  bool nccf_ballast_online;
  bool snip_edges;
  PitchExtractionOptions():
      samp_freq(16000),
      frame_shift_ms(10.0),
      frame_length_ms(25.0),
      preemph_coeff(0.0),
      min_f0(50),
      max_f0(400),
      soft_min_f0(10.0),
      penalty_factor(0.1),
      lowpass_cutoff(1000),
      resample_freq(4000),
      delta_pitch(0.005),
      nccf_ballast(7000),
      lowpass_filter_width(1),
      upsample_filter_width(5),
      max_frames_latency(0),
      frames_per_chunk(0),
      simulate_first_pass_online(false),
      recompute_frame(500),
      nccf_ballast_online(false),
      snip_edges(true) { }

  void Register(OptionsItf *opts) {
    opts->Register("sample-frequency", &samp_freq,
                   "Waveform data sample frequency (must match the waveform "
                   "file, if specified there)");
    opts->Register("frame-length", &frame_length_ms, "Frame length in "
                   "milliseconds");
    opts->Register("frame-shift", &frame_shift_ms, "Frame shift in "
                   "milliseconds");
    opts->Register("preemphasis-coefficient", &preemph_coeff,
                   "Coefficient for use in signal preemphasis (deprecated)");
    opts->Register("min-f0", &min_f0,
                   "min. F0 to search for (Hz)");
    opts->Register("max-f0", &max_f0,
                   "max. F0 to search for (Hz)");
    opts->Register("soft-min-f0", &soft_min_f0,
                   "Minimum f0, applied in soft way, must not exceed min-f0");
    opts->Register("penalty-factor", &penalty_factor,
                   "cost factor for FO change.");
    opts->Register("lowpass-cutoff", &lowpass_cutoff,
                   "cutoff frequency for LowPass filter (Hz) ");
    opts->Register("resample-frequency", &resample_freq,
                   "Frequency that we down-sample the signal to.  Must be "
                   "more than twice lowpass-cutoff");
    opts->Register("delta-pitch", &delta_pitch,
                   "Smallest relative change in pitch that our algorithm "
                   "measures");
    opts->Register("nccf-ballast", &nccf_ballast,
                   "Increasing this factor reduces NCCF for quiet frames");
    opts->Register("nccf-ballast-online", &nccf_ballast_online,
                   "This is useful mainly for debug; it affects how the NCCF "
                   "ballast is computed.");
    opts->Register("lowpass-filter-width", &lowpass_filter_width,
                   "Integer that determines filter width of "
                   "lowpass filter, more gives sharper filter");
    opts->Register("upsample-filter-width", &upsample_filter_width,
                   "Integer that determines filter width when upsampling NCCF");
    opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
                   "offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
                   "you can set it to a small nonzero value, such as 10, for "
                   "better feature compatibility with online decoding (affects "
                   "energy normalization in the algorithm)");
    opts->Register("simulate-first-pass-online", &simulate_first_pass_online,
                   "If true, compute-kaldi-pitch-feats will output features "
                   "that correspond to what an online decoder would see in the "
                   "first pass of decoding-- not the final version of the "
                   "features, which is the default.  Relevant if "
                   "--frames-per-chunk > 0");
    opts->Register("recompute-frame", &recompute_frame, "Only relevant for "
                   "online pitch extraction, or for compatibility with online "
                   "pitch extraction.  A non-critical parameter; the frame at "
                   "which we recompute some of the forward pointers, after "
                   "revising our estimate of the signal energy.  Relevant if"
                   "--frames-per-chunk > 0");
    opts->Register("max-frames-latency", &max_frames_latency, "Maximum number "
                   "of frames of latency that we allow pitch tracking to "
                   "introduce into the feature processing (affects output only "
                   "if --frames-per-chunk > 0 and "
                   "--simulate-first-pass-online=true");
    opts->Register("snip-edges", &snip_edges, "If this is set to false, the "
                   "incomplete frames near the ending edge won't be snipped, "
                   "so that the number of frames is the file size divided by "
                   "the frame-shift. This makes different types of features "
                   "give the same number of frames.");
  }
  /// Returns the window-size in samples, after resampling.  This is the
  /// "basic window size", not the full window size after extending by max-lag.
  // Because of floating point representation, it is more reliable to divide
  // by 1000 instead of multiplying by 0.001, but it is a bit slower.
  int32 NccfWindowSize() const {
    return static_cast<int32>(resample_freq * frame_length_ms / 1000.0);
  }
  /// Returns the window-shift in samples, after resampling.
  int32 NccfWindowShift() const {
    return static_cast<int32>(resample_freq * frame_shift_ms / 1000.0);
  }
};

struct ProcessPitchOptions {
  BaseFloat pitch_scale;  // the final normalized-log-pitch feature is scaled
                          // with this value
  BaseFloat pov_scale;    // the final POV feature is scaled with this value
  BaseFloat pov_offset;   // An offset that can be added to the final POV
                          // feature (useful for online-decoding, where we don't
                          // do CMN to the pitch-derived features.

  BaseFloat delta_pitch_scale;
  BaseFloat delta_pitch_noise_stddev;  // stddev of noise we add to delta-pitch
  int32 normalization_left_context;    // left-context used for sliding-window
                                       // normalization
  int32 normalization_right_context;   // this should be reduced in online
                                       // decoding to reduce latency

  int32 delta_window;
  int32 delay;

  bool add_pov_feature;
  bool add_normalized_log_pitch;
  bool add_delta_pitch;
  bool add_raw_log_pitch;

  ProcessPitchOptions() :
      pitch_scale(2.0),
      pov_scale(2.0),
      pov_offset(0.0),
      delta_pitch_scale(10.0),
      delta_pitch_noise_stddev(0.005),
      normalization_left_context(75),
      normalization_right_context(75),
      delta_window(2),
      delay(0),
      add_pov_feature(true),
      add_normalized_log_pitch(true),
      add_delta_pitch(true),
      add_raw_log_pitch(false) { }


  void Register(ParseOptions *opts) {
    opts->Register("pitch-scale", &pitch_scale,
                   "Scaling factor for the final normalized log-pitch value");
    opts->Register("pov-scale", &pov_scale,
                   "Scaling factor for final POV (probability of voicing) "
                   "feature");
    opts->Register("pov-offset", &pov_offset,
                   "This can be used to add an offset to the POV feature. "
                   "Intended for use in online decoding as a substitute for "
                   " CMN.");
    opts->Register("delta-pitch-scale", &delta_pitch_scale,
                   "Term to scale the final delta log-pitch feature");
    opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
                   "Standard deviation for noise we add to the delta log-pitch "
                   "(before scaling); should be about the same as delta-pitch "
                   "option to pitch creation.  The purpose is to get rid of "
                   "peaks in the delta-pitch caused by discretization of pitch "
                   "values.");
    opts->Register("normalization-left-context", &normalization_left_context,
                   "Left-context (in frames) for moving window normalization");
    opts->Register("normalization-right-context", &normalization_right_context,
                   "Right-context (in frames) for moving window normalization");
    opts->Register("delta-window", &delta_window,
                   "Number of frames on each side of central frame, to use for "
                   "delta window.");
    opts->Register("delay", &delay,
                   "Number of frames by which the pitch information is "
                   "delayed.");
    opts->Register("add-pov-feature", &add_pov_feature,
                   "If true, the warped NCCF is added to output features");
    opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
                   "If true, the log-pitch with POV-weighted mean subtraction "
                   "over 1.5 second window is added to output features");
    opts->Register("add-delta-pitch", &add_delta_pitch,
                   "If true, time derivative of log-pitch is added to output "
                   "features");
    opts->Register("add-raw-log-pitch", &add_raw_log_pitch,
                   "If true, log(pitch) is added to output features");
  }
};


// We don't want to expose the pitch-extraction internals here as it's
// quite complex, so we use a private implementation.
class OnlinePitchFeatureImpl;


// Note: to start on a new waveform, just construct a new version
// of this object.
class OnlinePitchFeature: public OnlineBaseFeature {
 public:
  explicit OnlinePitchFeature(const PitchExtractionOptions &opts);

  virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ }

  virtual int32 NumFramesReady() const;

  virtual BaseFloat FrameShiftInSeconds() const;

  virtual bool IsLastFrame(int32 frame) const;

  /// Outputs the two-dimensional feature consisting of (pitch, NCCF).  You
  /// should probably post-process this using class OnlineProcessPitch.
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual void AcceptWaveform(BaseFloat sampling_rate,
                              const VectorBase<BaseFloat> &waveform);

  virtual void InputFinished();

  virtual ~OnlinePitchFeature();

 private:
  OnlinePitchFeatureImpl *impl_;
};


/// This online-feature class implements post processing of pitch features.
/// Inputs are original 2 dims (nccf, pitch).  It can produce various
/// kinds of outputs, using the default options it will be (pov-feature,
/// normalized-log-pitch, delta-log-pitch).
class OnlineProcessPitch: public OnlineFeatureInterface {
 public:
  virtual int32 Dim() const { return dim_; }

  virtual bool IsLastFrame(int32 frame) const {
    if (frame <= -1)
      return src_->IsLastFrame(-1);
    else if (frame < opts_.delay)
      return src_->IsLastFrame(-1) == true ? false : src_->IsLastFrame(0);
    else
      return src_->IsLastFrame(frame - opts_.delay);
  }
  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const;

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual ~OnlineProcessPitch() {  }

  // Does not take ownership of "src".
  OnlineProcessPitch(const ProcessPitchOptions &opts,
                     OnlineFeatureInterface *src);

 private:
  enum { kRawFeatureDim = 2};  // anonymous enum to define a constant.
                               // kRawFeatureDim defines the dimension
                               // of the input: (nccf, pitch)

  ProcessPitchOptions opts_;
  OnlineFeatureInterface *src_;
  int32 dim_;  // Output feature dimension, set in initializer.

  struct NormalizationStats {
    int32 cur_num_frames;      // value of src_->NumFramesReady() when
                               // "mean_pitch" was set.
    bool input_finished;       // true if input data was finished when
                               // "mean_pitch" was computed.
    double sum_pov;            // sum of pov over relevant range
    double sum_log_pitch_pov;  // sum of log(pitch) * pov over relevant range

    NormalizationStats(): cur_num_frames(-1), input_finished(false),
                          sum_pov(0.0), sum_log_pitch_pov(0.0) { }
  };

  std::vector<BaseFloat> delta_feature_noise_;

  std::vector<NormalizationStats> normalization_stats_;

  /// Computes and returns the POV feature for this frame.
  /// Called from GetFrame().
  inline BaseFloat GetPovFeature(int32 frame) const;

  /// Computes and returns the delta-log-pitch feature for this frame.
  /// Called from GetFrame().
  inline BaseFloat GetDeltaPitchFeature(int32 frame);

  /// Computes and returns the raw log-pitch feature for this frame.
  /// Called from GetFrame().
  inline BaseFloat GetRawLogPitchFeature(int32 frame) const;

  /// Computes and returns the mean-subtracted log-pitch feature for this frame.
  /// Called from GetFrame().
  inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);

  /// Computes the normalization window sizes.
  inline void GetNormalizationWindow(int32 frame,
                                     int32 src_frames_ready,
                                     int32 *window_begin,
                                     int32 *window_end) const;

  /// Makes sure the entry in normalization_stats_ for this frame is up to date;
  /// called from GetNormalizedLogPitchFeature.
  inline void UpdateNormalizationStats(int32 frame);
};


/// This function extracts (pitch, NCCF) per frame, using the pitch extraction
/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech
/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian
/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014.  The output will
/// have as many rows as there are frames, and two columns corresponding to
/// (NCCF, pitch)
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
                       const VectorBase<BaseFloat> &wave,
                       Matrix<BaseFloat> *output);

/// This function processes the raw (NCCF, pitch) quantities computed by
/// ComputeKaldiPitch, and processes them into features.  By default it will
/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch,
/// delta-of-raw-pitch), but this is configurable in the options.  The number of
/// rows of "output" will be the number of frames (rows) in "input", and the
/// number of columns will be the number of different types of features
/// requested (by default, 3; 4 is the max).  The four config variables
/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch,
/// --add-raw-log-pitch determine which features we create; by default we create
/// the first three.
void ProcessPitch(const ProcessPitchOptions &opts,
                  const MatrixBase<BaseFloat> &input,
                  Matrix<BaseFloat> *output);

/// This function combines ComputeKaldiPitch and ProcessPitch.  The reason
/// why we need a separate function to do this is in order to be able to
/// accurately simulate the online pitch-processing, for testing and for
/// training models matched to the "first-pass" features.  It is sensitive to
/// the variables in pitch_opts that relate to online processing,
/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online,
/// recompute_frame.
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts,
                                 const ProcessPitchOptions &process_opts,
                                 const VectorBase<BaseFloat> &wave,
                                 Matrix<BaseFloat> *output);


/// @} End of "addtogroup feat"
}  // namespace kaldi
#endif  // KALDI_FEAT_PITCH_FUNCTIONS_H_