// gst-plugin/gst-online-gmm-decode-faster.cc
// Copyright 2013 Tanel Alumae, Tallinn University of Technology
// Copyright 2012 Cisco Systems (author: Matthias Paulik)
// Modifications to the original contribution by Cisco Systems made by:
// Vassil Panayotov
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
/**
* GStreamer plugin for automatic speech recognition.
* Based on Kaldi's OnlineGmmDecodeFaster decoder.
*
*
* Example launch line
* |[
* gst-launch-1.0 filesrc location=test.wav \
* ! decodebin ! audioconvert ! audioresample \
* ! onlinegmmdecodefaster rt-min=0.8 rt-max=0.85 max-active=4000 beam=12.0 acoustic-scale=0.0769 \
* model=$ac_model/model fst=$ac_model/HCLG.fst \
* word-syms=$ac_model/words.txt silence-phones="1:2:3:4:5" \
* lda-mat=$trans_matrix \
* ! filesink location=$resultfile
* ]|
*
*/
#ifdef HAVE_CONFIG_H
# include
#else
# define VERSION "1.0"
#endif
#include
#include
#include
#include "gst-plugin/kaldimarshal.h"
#include "gst-plugin/gst-online-gmm-decode-faster.h"
#include "feat/feature-mfcc.h"
#include "online/online-audio-source.h"
#include "online/online-feat-input.h"
#include "online/online-decodable.h"
#include "online/online-faster-decoder.h"
#include "online/onlinebin-util.h"
#include "util/simple-options.h"
#include "util/parse-options.h"
namespace kaldi {
/* Debug category used by every GST_*_OBJECT logging call in this file;
 * it is initialised in onlinegmmdecodefaster_init(). */
GST_DEBUG_CATEGORY_STATIC(gst_online_gmm_decode_faster_debug);
#define GST_CAT_DEFAULT gst_online_gmm_decode_faster_debug
/* Indices into gst_online_gmm_decode_faster_signals[]. */
enum {
/* "hyp-word": emitted once for every word pushed downstream */
HYP_WORD_SIGNAL,
/* number of signals; sizes the signal id table */
LAST_SIGNAL
};
/* Ids of the statically installed GObject properties. */
enum {
/* placeholder so that real property ids start at 1 */
PROP_0,
PROP_SILENT,
PROP_MODEL,
PROP_FST,
PROP_WORD_SYMS,
PROP_SILENCE_PHONES,
PROP_LDA_MAT,
/* first id handed out to the properties generated from the Kaldi options
 * (they are installed as PROP_LAST + i) */
PROP_LAST
};
/* Default values of the element properties. */
#define DEFAULT_MODEL "final.mdl"
#define DEFAULT_FST "HCLG.fst"
#define DEFAULT_WORD_SYMS "words.txt"
#define DEFAULT_SILENCE_PHONES "1:2:3:4:5"
/* Parenthesised so that the macro expands to a single value in any
 * expression (e.g. `1 / DEFAULT_ACOUSTIC_SCALE`). */
#define DEFAULT_ACOUSTIC_SCALE (1.0 / 13)
#define DEFAULT_LEFT_CONTEXT 4
#define DEFAULT_RIGHT_CONTEXT 4
/* Pad templates of the element.
 *
 * sink: raw audio, signed 16-bit little-endian, mono, 16000 Hz.
 * src:  raw UTF-8 text.
 */
static GstStaticPadTemplate sink_factory =
GST_STATIC_PAD_TEMPLATE("sink",
GST_PAD_SINK,
GST_PAD_ALWAYS,
GST_STATIC_CAPS(
"audio/x-raw, "
"format = (string) S16LE, "
"channels = (int) 1, "
"rate = (int) 16000 "));
static GstStaticPadTemplate src_factory =
GST_STATIC_PAD_TEMPLATE("src",
GST_PAD_SRC,
GST_PAD_ALWAYS,
GST_STATIC_CAPS("text/x-raw, format= { utf8 }"));
/* Signal ids returned by g_signal_new(), indexed by the *_SIGNAL enum. */
static guint gst_online_gmm_decode_faster_signals[LAST_SIGNAL];
/* G_DEFINE_TYPE generates a variable named after the type; alias it to the
 * conventional `parent_class` name used below. */
#define gst_online_gmm_decode_faster_parent_class parent_class
G_DEFINE_TYPE(GstOnlineGmmDecodeFaster, gst_online_gmm_decode_faster, GST_TYPE_ELEMENT);
/* Forward declarations of the vmethods and pad callbacks defined below. */
static void
gst_online_gmm_decode_faster_set_property(GObject * object, guint prop_id,
const GValue * value,
GParamSpec * pspec);
static void
gst_online_gmm_decode_faster_get_property(GObject * object, guint prop_id,
GValue * value, GParamSpec * pspec);
static GstStateChangeReturn
gst_online_gmm_decode_faster_change_state(GstElement *element,
GstStateChange transition);
static void
gst_online_gmm_decode_faster_finalize(GObject * object);
static gboolean
gst_online_gmm_decode_faster_sink_event(GstPad * pad, GstObject * parent,
GstEvent * event);
static GstFlowReturn gst_online_gmm_decode_faster_chain(GstPad * pad,
GstObject * parent,
GstBuffer * buf);
/* Body of the decoding task started on the src pad. */
static void
gst_online_gmm_decode_faster_loop(GstOnlineGmmDecodeFaster * filter);
/* GObject vmethod implementations */
/* initialize the onlinegmmdecodefaster's class */
static void gst_online_gmm_decode_faster_class_init(GstOnlineGmmDecodeFasterClass * klass) {
GObjectClass *gobject_class;
GstElementClass *gstelement_class;
gobject_class = (GObjectClass *) klass;
gstelement_class = (GstElementClass *) klass;
gobject_class->set_property = gst_online_gmm_decode_faster_set_property;
gobject_class->get_property = gst_online_gmm_decode_faster_get_property;
gobject_class->finalize = gst_online_gmm_decode_faster_finalize;
gstelement_class->change_state = gst_online_gmm_decode_faster_change_state;
g_object_class_install_property(G_OBJECT_CLASS(klass),
PROP_SILENT,
g_param_spec_boolean("silent",
"Silence the decoder",
"Determines whether incoming audio is sent to the decoder or not",
false,
(GParamFlags) G_PARAM_READWRITE));
g_object_class_install_property(G_OBJECT_CLASS(klass),
PROP_MODEL,
g_param_spec_string("model",
"Acoustic model",
"Filename of the acoustic model",
DEFAULT_MODEL,
(GParamFlags) G_PARAM_READWRITE));
g_object_class_install_property(G_OBJECT_CLASS(klass),
PROP_FST,
g_param_spec_string("fst",
"Decoding FST",
"Filename of the HCLG FST",
DEFAULT_FST,
(GParamFlags) G_PARAM_READWRITE));
g_object_class_install_property(G_OBJECT_CLASS(klass),
PROP_WORD_SYMS,
g_param_spec_string("word-syms",
"Word symbols",
"Name of word symbols file (typically words.txt)",
DEFAULT_WORD_SYMS,
(GParamFlags) G_PARAM_READWRITE));
g_object_class_install_property(G_OBJECT_CLASS(klass),
PROP_SILENCE_PHONES,
g_param_spec_string("silence-phones",
"Silence phones",
"Colon-separated IDs of silence phones, e.g. '1:2:3:4:5'",
DEFAULT_SILENCE_PHONES,
(GParamFlags) G_PARAM_READWRITE));
g_object_class_install_property(G_OBJECT_CLASS(klass),
PROP_LDA_MAT,
g_param_spec_string("lda-mat",
"LDA matrix",
"Filename of the LDA transform data",
"",
(GParamFlags) G_PARAM_READWRITE));
gst_element_class_set_details_simple(gstelement_class,
"OnlineGmmDecodeFaster",
"Speech/Audio",
"Convert speech to text",
"Tanel Alumae ");
gst_element_class_add_pad_template(gstelement_class,
gst_static_pad_template_get(&src_factory));
gst_element_class_add_pad_template(gstelement_class,
gst_static_pad_template_get(&sink_factory));
gst_online_gmm_decode_faster_signals[HYP_WORD_SIGNAL]
= g_signal_new("hyp-word", G_TYPE_FROM_CLASS(klass), G_SIGNAL_RUN_LAST,
G_STRUCT_OFFSET(GstOnlineGmmDecodeFasterClass, hyp_word),
NULL, NULL, kaldi_marshal_VOID__STRING, G_TYPE_NONE, 1,
G_TYPE_STRING);
}
/* Instance initialiser.
 *
 * - fills the instance structure with default settings
 * - creates the Kaldi option structures and registers them, together with
 *   the element's own numeric options, in a SimpleOptions registry
 * - creates the sink and src pads and sets the sink pad callbacks
 * - exposes every registered Kaldi option as a GObject property whose id is
 *   PROP_LAST + <position of the option in the registry listing>
 *
 * The option properties are installed on the class from here because the
 * option list is only known once the option structures exist.  Options whose
 * property is already installed (i.e. every instance after the first one)
 * are skipped, so no duplicate property is installed.
 */
static void
gst_online_gmm_decode_faster_init(GstOnlineGmmDecodeFaster * filter) {
  bool tmp_bool;
  int32 tmp_int;
  uint32 tmp_uint;
  float tmp_float;
  double tmp_double;
  std::string tmp_string;

  filter->silent_ = false;
  filter->model_rspecifier_ = g_strdup(DEFAULT_MODEL);
  filter->fst_rspecifier_ = g_strdup(DEFAULT_FST);
  filter->word_syms_filename_ = g_strdup(DEFAULT_WORD_SYMS);
  filter->lda_mat_rspecifier_ = g_strdup("");
  // NOTE(review): element type assumed to be int32 (declared in the header) - confirm.
  filter->silence_phones_ = new std::vector<int32>;
  SplitStringToIntegers(DEFAULT_SILENCE_PHONES, ":", false, filter->silence_phones_);

  filter->simple_options_ = new SimpleOptions();
  filter->decoder_opts_ = new OnlineFasterDecoderOpts();
  filter->decoder_opts_->Register(filter->simple_options_, true);
  filter->feature_reading_opts_ = new OnlineFeatureMatrixOptions();
  filter->feature_reading_opts_->Register(filter->simple_options_);

  filter->acoustic_scale_ = DEFAULT_ACOUSTIC_SCALE;
  filter->cmn_window_ = 600;
  filter->min_cmn_window_ = 100;  // adds 1 second latency, only at utterance start.
  filter->right_context_ = DEFAULT_RIGHT_CONTEXT;
  filter->left_context_ = DEFAULT_LEFT_CONTEXT;

  filter->simple_options_->Register("left-context", &filter->left_context_,
                                    "Number of frames of left context");
  filter->simple_options_->Register("right-context", &filter->right_context_,
                                    "Number of frames of right context");
  filter->simple_options_->Register("acoustic-scale", &filter->acoustic_scale_,
                                    "Scaling factor for acoustic likelihoods");
  filter->simple_options_->Register("cmn-window", &(filter->cmn_window_),
                                    "Number of feat. vectors used in the running average CMN calculation");
  filter->simple_options_->Register("min-cmn-window", &filter->min_cmn_window_,
                                    "Minumum CMN window used at start of decoding (adds "
                                    "latency only at start)");

  filter->sinkpad_ = gst_pad_new_from_static_template(&sink_factory, "sink");
  gst_pad_set_event_function(filter->sinkpad_,
                             GST_DEBUG_FUNCPTR(gst_online_gmm_decode_faster_sink_event));
  gst_pad_set_chain_function(filter->sinkpad_,
                             GST_DEBUG_FUNCPTR(gst_online_gmm_decode_faster_chain));
  gst_pad_use_fixed_caps(filter->sinkpad_);
  gst_element_add_pad(GST_ELEMENT(filter), filter->sinkpad_);

  filter->srcpad_ = gst_pad_new_from_static_template(&src_factory, "src");
  gst_pad_use_fixed_caps(filter->srcpad_);
  gst_element_add_pad(GST_ELEMENT(filter), filter->srcpad_);

  // init properties from various Kaldi Opts
  GstElementClass * klass = GST_ELEMENT_GET_CLASS(filter);
  GObjectClass * gobject_class = G_OBJECT_CLASS(klass);
  typedef std::vector<std::pair<std::string, SimpleOptions::OptionInfo> > OptionInfoList;
  OptionInfoList option_info_list = filter->simple_options_->GetOptionInfoList();

  guint prop_id = PROP_LAST;
  for (OptionInfoList::iterator dx = option_info_list.begin();
       dx != option_info_list.end(); ++dx, ++prop_id) {
    const std::string &name = dx->first;
    const SimpleOptions::OptionInfo &option_info = dx->second;

    // Already installed by an earlier instance of this element: installing
    // it again would only trigger a GLib warning and leak the new pspec.
    if (g_object_class_find_property(gobject_class, name.c_str()) != NULL) {
      continue;
    }

    GParamSpec *pspec = NULL;
    switch (option_info.type) {
      case SimpleOptions::kBool:
        filter->simple_options_->GetOption(name, &tmp_bool);
        pspec = g_param_spec_boolean(name.c_str(),
                                     option_info.doc.c_str(),
                                     option_info.doc.c_str(),
                                     tmp_bool,
                                     (GParamFlags) G_PARAM_READWRITE);
        break;
      case SimpleOptions::kInt32:
        filter->simple_options_->GetOption(name, &tmp_int);
        pspec = g_param_spec_int(name.c_str(),
                                 option_info.doc.c_str(),
                                 option_info.doc.c_str(),
                                 G_MININT,
                                 G_MAXINT,
                                 tmp_int,
                                 (GParamFlags) G_PARAM_READWRITE);
        break;
      case SimpleOptions::kUint32:
        filter->simple_options_->GetOption(name, &tmp_uint);
        pspec = g_param_spec_uint(name.c_str(),
                                  option_info.doc.c_str(),
                                  option_info.doc.c_str(),
                                  0,
                                  G_MAXUINT,
                                  tmp_uint,
                                  (GParamFlags) G_PARAM_READWRITE);
        break;
      case SimpleOptions::kFloat:
        filter->simple_options_->GetOption(name, &tmp_float);
        // G_MINFLOAT is the smallest *positive* float; the lowest
        // representable value is -G_MAXFLOAT.
        pspec = g_param_spec_float(name.c_str(),
                                   option_info.doc.c_str(),
                                   option_info.doc.c_str(),
                                   -G_MAXFLOAT,
                                   G_MAXFLOAT,
                                   tmp_float,
                                   (GParamFlags) G_PARAM_READWRITE);
        break;
      case SimpleOptions::kDouble:
        filter->simple_options_->GetOption(name, &tmp_double);
        // Same reasoning as for floats: G_MINDOUBLE is positive.
        pspec = g_param_spec_double(name.c_str(),
                                    option_info.doc.c_str(),
                                    option_info.doc.c_str(),
                                    -G_MAXDOUBLE,
                                    G_MAXDOUBLE,
                                    tmp_double,
                                    (GParamFlags) G_PARAM_READWRITE);
        break;
      case SimpleOptions::kString:
        filter->simple_options_->GetOption(name, &tmp_string);
        pspec = g_param_spec_string(name.c_str(),
                                    option_info.doc.c_str(),
                                    option_info.doc.c_str(),
                                    tmp_string.c_str(),
                                    (GParamFlags) G_PARAM_READWRITE);
        break;
      default:
        break;
    }
    if (pspec != NULL) {
      g_object_class_install_property(gobject_class, prop_id, pspec);
    }
  }
}
/* Loads the models and creates the decoder, unless a decoder already exists.
 *
 * Reads the optional LDA matrix, the transition model plus acoustic model,
 * the word symbol table and the decoding graph, then creates the output FST,
 * the audio source and the decoder itself.
 *
 * Returns false when the word symbol table cannot be read, true otherwise
 * (including when the decoder was already allocated).
 *
 * NOTE(review): on the `return false` path the objects created so far stay
 * assigned while decoder_ is still NULL, so a later call would allocate them
 * again - looks like a leak, confirm.
 * NOTE(review): errors raised inside the Kaldi readers are not handled here;
 * verify how they surface to the caller.
 */
static bool
gst_online_gmm_decode_faster_allocate(GstOnlineGmmDecodeFaster * filter) {
if (!filter->decoder_) {
GST_INFO_OBJECT(filter, "Loading Kaldi decoder");
filter->lda_transform_ = new Matrix;
// the LDA matrix is optional: an empty rspecifier leaves the matrix empty
if (strlen(filter->lda_mat_rspecifier_) > 0) {
bool binary_in;
Input ki(filter->lda_mat_rspecifier_, &binary_in);
filter->lda_transform_->Read(ki.Stream(), binary_in);
}
filter->trans_model_ = new TransitionModel();
filter->am_gmm_ = new AmDiagGmm();
{
// transition model and GMMs are read, in this order, from the same stream
bool binary;
Input ki(filter->model_rspecifier_, &binary);
filter->trans_model_->Read(ki.Stream(), binary);
filter->am_gmm_->Read(ki.Stream(), binary);
}
filter->word_syms_ = NULL;
if (!(filter->word_syms_ = fst::SymbolTable::ReadText(filter->word_syms_filename_))) {
GST_ERROR_OBJECT(filter, "Could not read symbol table from file %s", filter->word_syms_filename_);
return false;
}
filter->decode_fst_ = ReadDecodeGraph(filter->fst_rspecifier_);
// the decoder batch size is raised to at least one full context window
int32 window_size = filter->right_context_ + filter->left_context_ + 1;
filter->decoder_opts_->batch_size = std::max(filter->decoder_opts_->batch_size, window_size);
filter->out_fst_ = new fst::VectorFst ();
filter->au_src_ = new GstBufferSource();
filter->decoder_ = new OnlineFasterDecoder(*(filter->decode_fst_),
*(filter->decoder_opts_),
*(filter->silence_phones_),
*(filter->trans_model_));
GST_INFO_OBJECT(filter, "Finished loading Kaldi decoder");
}
return true;
}
/* GObject finalize vmethod: releases everything the element owns.
 *
 * Frees the property strings, then the decoder and the objects created in
 * gst_online_gmm_decode_faster_allocate() (including the transition model),
 * then the option registry and the option structures it points into, and
 * finally chains up to the parent class.
 *
 * `delete` on a null pointer is a no-op, so members that were never
 * allocated need no guard.
 */
static void
gst_online_gmm_decode_faster_finalize(GObject * object) {
  GstOnlineGmmDecodeFaster *filter = GST_ONLINEGMMDECODEFASTER(object);

  g_free(filter->model_rspecifier_);
  g_free(filter->fst_rspecifier_);
  g_free(filter->word_syms_filename_);
  g_free(filter->lda_mat_rspecifier_);

  // The decoder was constructed from references to the graph, the options,
  // the silence phones and the transition model, so it goes first.
  delete filter->decoder_;
  filter->decoder_ = NULL;

  delete filter->au_src_;
  filter->au_src_ = NULL;
  delete filter->out_fst_;
  filter->out_fst_ = NULL;
  delete filter->decode_fst_;
  filter->decode_fst_ = NULL;
  delete filter->word_syms_;
  filter->word_syms_ = NULL;
  delete filter->am_gmm_;
  filter->am_gmm_ = NULL;
  // allocated in gst_online_gmm_decode_faster_allocate(); was never released
  delete filter->trans_model_;
  filter->trans_model_ = NULL;
  delete filter->lda_transform_;
  filter->lda_transform_ = NULL;

  // The registry stores pointers into the option structures and into the
  // instance, so it is destroyed before the structures themselves.
  delete filter->simple_options_;
  filter->simple_options_ = NULL;
  delete filter->silence_phones_;
  delete filter->decoder_opts_;
  delete filter->feature_reading_opts_;

  G_OBJECT_CLASS(parent_class)->finalize(object);
}
/* Counterpart of gst_online_gmm_decode_faster_allocate().
 *
 * Deliberately releases nothing: loading the models can take a long time,
 * so an allocated decoder is kept for reuse.  Always returns true.
 */
static bool
gst_online_gmm_decode_faster_deallocate(GstOnlineGmmDecodeFaster * filter) {
  const bool succeeded = true;
  GST_INFO_OBJECT(filter, "Refusing to unload decoder");
  return succeeded;
}
static void
gst_online_gmm_decode_faster_set_property(GObject * object, guint prop_id,
const GValue * value, GParamSpec * pspec) {
GstOnlineGmmDecodeFaster *filter = GST_ONLINEGMMDECODEFASTER(object);
if (prop_id == PROP_SILENT) {
filter->silent_ = g_value_get_boolean(value);
return;
}
// All other props cannot be changed after initialization
if (filter->decoder_) {
GST_WARNING_OBJECT(filter, "Decoder already initialized, cannot change it's properties");
return;
}
switch (prop_id) {
case PROP_MODEL:
g_free(filter->model_rspecifier_);
filter->model_rspecifier_ = g_value_dup_string(value);
break;
case PROP_FST:
g_free(filter->fst_rspecifier_);
filter->fst_rspecifier_ = g_value_dup_string(value);
break;
case PROP_WORD_SYMS:
g_free(filter->word_syms_filename_);
filter->word_syms_filename_ = g_value_dup_string(value);
break;
case PROP_LDA_MAT:
g_free(filter->lda_mat_rspecifier_);
filter->lda_mat_rspecifier_ = g_value_dup_string(value);
break;
case PROP_SILENCE_PHONES:
SplitStringToIntegers(g_value_get_string(value), ":", false, filter->silence_phones_);
break;
default:
if (prop_id >= PROP_LAST) {
const gchar* name = g_param_spec_get_name(pspec);
SimpleOptions::OptionType option_type;
if (filter->simple_options_->GetOptionType(std::string(name), &option_type)) {
switch (option_type) {
case SimpleOptions::kBool:
filter->simple_options_->SetOption(name, g_value_get_boolean(value));
break;
case SimpleOptions::kInt32:
filter->simple_options_->SetOption(name, g_value_get_int(value));
break;
case SimpleOptions::kUint32:
filter->simple_options_->SetOption(name, g_value_get_uint(value));
break;
case SimpleOptions::kFloat:
filter->simple_options_->SetOption(name, g_value_get_float(value));
break;
case SimpleOptions::kDouble:
filter->simple_options_->SetOption(name, g_value_get_double(value));
break;
case SimpleOptions::kString:
filter->simple_options_->SetOption(name, g_value_dup_string(value));
break;
}
break;
}
}
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
}
}
/* GObject get_property vmethod.
 *
 * The static properties are read straight from the instance; "silence-phones"
 * is rendered as a colon-separated list.  Ids at or above PROP_LAST are
 * looked up by name in the Kaldi option registry.  Anything else is reported
 * as an invalid property id.
 */
static void
gst_online_gmm_decode_faster_get_property(GObject * object, guint prop_id,
                                          GValue * value, GParamSpec * pspec) {
  GstOnlineGmmDecodeFaster *filter = GST_ONLINEGMMDECODEFASTER(object);

  // statically installed properties
  switch (prop_id) {
    case PROP_SILENT:
      g_value_set_boolean(value, filter->silent_);
      return;
    case PROP_MODEL:
      g_value_set_string(value, filter->model_rspecifier_);
      return;
    case PROP_FST:
      g_value_set_string(value, filter->fst_rspecifier_);
      return;
    case PROP_WORD_SYMS:
      g_value_set_string(value, filter->word_syms_filename_);
      return;
    case PROP_LDA_MAT:
      g_value_set_string(value, filter->lda_mat_rspecifier_);
      return;
    case PROP_SILENCE_PHONES: {
      std::ostringstream joined;
      const size_t count = filter->silence_phones_->size();
      for (size_t idx = 0; idx < count; ++idx) {
        if (idx != 0) {
          joined << ":";
        }
        joined << (*filter->silence_phones_)[idx];
      }
      g_value_set_string(value, joined.str().c_str());
      return;
    }
    default:
      break;
  }

  // properties generated from the Kaldi options
  if (prop_id >= PROP_LAST) {
    const gchar* option_name = g_param_spec_get_name(pspec);
    SimpleOptions::OptionType kind;
    if (filter->simple_options_->GetOptionType(std::string(option_name), &kind)) {
      switch (kind) {
        case SimpleOptions::kBool: {
          bool flag;
          filter->simple_options_->GetOption(option_name, &flag);
          g_value_set_boolean(value, flag);
          break;
        }
        case SimpleOptions::kInt32: {
          int32 number;
          filter->simple_options_->GetOption(option_name, &number);
          g_value_set_int(value, number);
          break;
        }
        case SimpleOptions::kUint32: {
          uint32 number;
          filter->simple_options_->GetOption(option_name, &number);
          g_value_set_uint(value, number);
          break;
        }
        case SimpleOptions::kFloat: {
          float number;
          filter->simple_options_->GetOption(option_name, &number);
          g_value_set_float(value, number);
          break;
        }
        case SimpleOptions::kDouble: {
          double number;
          filter->simple_options_->GetOption(option_name, &number);
          g_value_set_double(value, number);
          break;
        }
        case SimpleOptions::kString: {
          std::string text;
          filter->simple_options_->GetOption(option_name, &text);
          g_value_set_string(value, text.c_str());
          break;
        }
      }
      return;
    }
  }

  G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
}
/* GstElement change_state vmethod.
 *
 * NULL->READY: the decoder is allocated before chaining up; a failed
 * allocation fails the state change.
 * READY->NULL: the deallocation hook runs after the parent class has
 * handled the transition successfully.
 */
static GstStateChangeReturn
gst_online_gmm_decode_faster_change_state(GstElement *element, GstStateChange transition) {
  GstOnlineGmmDecodeFaster *filter = GST_ONLINEGMMDECODEFASTER(element);

  // upward transition: resources must exist before the parent runs
  if (transition == GST_STATE_CHANGE_NULL_TO_READY &&
      !gst_online_gmm_decode_faster_allocate(filter)) {
    return GST_STATE_CHANGE_FAILURE;
  }

  const GstStateChangeReturn result =
      GST_ELEMENT_CLASS(parent_class)->change_state(element, transition);
  if (result == GST_STATE_CHANGE_FAILURE) {
    return result;
  }

  // downward transition: clean up after the parent is done
  if (transition == GST_STATE_CHANGE_READY_TO_NULL) {
    gst_online_gmm_decode_faster_deallocate(filter);
  }
  return result;
}
/*
 * Emits a single recognized word in two ways:
 *  * as a buffer pushed on `pad`, holding the word followed by one space
 *  * as the argument of the "hyp-word" signal
 *
 * The buffer is allocated one byte larger than its final size: a NUL byte is
 * written right after the space, and the buffer size is then set so that it
 * ends at the space.
 */
static void
gst_online_gmm_decode_faster_push_word(GstOnlineGmmDecodeFaster * filter, GstPad *pad, std::string word) {
  const gchar *text = word.c_str();
  const guint text_len = strlen(text);
  const guint space_pos = text_len;
  const guint nul_pos = text_len + 1;

  GST_DEBUG_OBJECT(filter, "WORD: %s", text);

  /* room for the word, the separating space and the NUL byte */
  GstBuffer *out_buf = gst_buffer_new_and_alloc(text_len + 2);
  gst_buffer_fill(out_buf, 0, text, text_len);
  gst_buffer_memset(out_buf, space_pos, ' ', 1);
  gst_buffer_memset(out_buf, nul_pos, '\0', 1);
  /* the visible payload ends at the space */
  gst_buffer_set_size(out_buf, text_len + 1);
  gst_pad_push(pad, out_buf);

  /* Emit a signal for applications. */
  g_signal_emit(filter, gst_online_gmm_decode_faster_signals[HYP_WORD_SIGNAL], 0, text);
}
/* Pushes a sequence of recognized words, one at a time.
 *
 * Every id in `words` is looked up in `word_syms` and the resulting string is
 * handed to gst_online_gmm_decode_faster_push_word().  An id that is missing
 * from the symbol table is logged as an error; the empty string returned by
 * the lookup is still pushed.  When `line_break` is true, the marker "<#s>"
 * is pushed after the words.
 *
 * `word_syms` must not be NULL (asserted).
 */
static void
gst_online_gmm_decode_faster_push_words(GstOnlineGmmDecodeFaster * filter, GstPad *pad,
const std::vector& words,
const fst::SymbolTable *word_syms,
bool line_break) {
KALDI_ASSERT(word_syms != NULL);
// NOTE(review): `ss` is never used in this function - candidate for removal.
std::stringstream ss;
for (size_t i = 0; i < words.size(); i++) {
std::string word = word_syms->Find(words[i]);
if (word == "") {
GST_ERROR_OBJECT(filter, "Word-id %d not in symbol table!", words[i]);
}
gst_online_gmm_decode_faster_push_word(filter, pad, word);
}
if (line_break) {
gst_online_gmm_decode_faster_push_word(filter, pad, "<#s>");
}
}
/* Body of the decoding task that is started on the src pad when a SEGMENT
 * event arrives on the sink pad.
 *
 * Builds the feature pipeline on top of the audio source (MFCC -> CMN ->
 * either LDA or delta features), wraps it in a decodable object and runs the
 * decoder until it reports the end of the features.  Recognized words are
 * pushed downstream as they become available.  Afterwards an EOS event is
 * pushed, the task is paused and the audio source is replaced by a fresh
 * one.
 */
static void
gst_online_gmm_decode_faster_loop(GstOnlineGmmDecodeFaster * filter) {
// We are not properly registering/exposing MFCC and frame extraction options,
// because there are parts of the online decoding code, where some of these
// options are hardwired(ToDo: we should fix this at some point)
MfccOptions mfcc_opts;
mfcc_opts.use_energy = false;
int32 frame_length = mfcc_opts.frame_opts.frame_length_ms = 25;
int32 frame_shift = mfcc_opts.frame_opts.frame_shift_ms = 10;
// Up to delta-delta derivative features are calculated (unless LDA is used)
const int32 kDeltaOrder = 2;
Mfcc mfcc(mfcc_opts);
// presumably converts the frame length/shift from milliseconds to samples;
// kSampleFreq is defined outside this file - verify
FeInput fe_input(filter->au_src_, &mfcc,
frame_length * (kSampleFreq / 1000),
frame_shift * (kSampleFreq / 1000));
OnlineCmnInput cmn_input(&fe_input, filter->cmn_window_, filter->min_cmn_window_);
// the last stage is heap-allocated here and deleted at the end of this function
OnlineFeatInputItf *feat_transform = 0;
if (strlen(filter->lda_mat_rspecifier_) > 0) {
// an LDA matrix was configured: use it with the configured context
feat_transform = new OnlineLdaInput(&cmn_input, *(filter->lda_transform_),
filter->left_context_,
filter->right_context_);
} else {
DeltaFeaturesOptions opts;
opts.order = kDeltaOrder;
// Note from Dan: keeping the next statement for back-compatibility,
// but I don't think this is really the right way to set the window-size
// in the delta computation: it should be a separate config.
opts.window = filter->left_context_ / 2;
feat_transform = new OnlineDeltaInput(opts, &cmn_input);
}
// feature_reading_opts contains timeout, batch size.
OnlineFeatureMatrix feature_matrix(*(filter->feature_reading_opts_),
feat_transform);
OnlineDecodableDiagGmmScaled decodable(*(filter->am_gmm_), *(filter->trans_model_),
filter->acoustic_scale_, &feature_matrix);
GST_DEBUG_OBJECT(filter, "starting decoding loop");
// true once a partial traceback has produced words since the last
// finished traceback
bool partial_res = false;
filter->decoder_->InitDecoding();
while (1) {
OnlineFasterDecoder::DecodeState dstate = filter->decoder_->Decode(&decodable);
if (dstate & (filter->decoder_->kEndFeats | filter->decoder_->kEndUtt)) {
// end of utterance or end of features: finish the traceback and push
// the words
std::vector word_ids;
filter->decoder_->FinishTraceBack(filter->out_fst_);
fst::GetLinearSymbolSequence(*(filter->out_fst_),
static_cast *>(0),
&word_ids,
static_cast(0));
// the line-break marker is requested when words were produced, either
// now or by an earlier partial traceback
gst_online_gmm_decode_faster_push_words(filter, filter->srcpad_, word_ids, filter->word_syms_, partial_res || word_ids.size());
partial_res = false;
// only the end of the features terminates the loop
if (dstate == filter->decoder_->kEndFeats)
break;
} else {
// still inside an utterance: push whatever the partial traceback yields
std::vector word_ids;
if (filter->decoder_->PartialTraceback(filter->out_fst_)) {
fst::GetLinearSymbolSequence(*(filter->out_fst_),
static_cast *>(0),
&word_ids,
static_cast(0));
gst_online_gmm_decode_faster_push_words(filter, filter->srcpad_, word_ids, filter->word_syms_, false);
if (!partial_res)
partial_res = (word_ids.size() > 0);
}
}
}
GST_DEBUG_OBJECT(filter, "Finished decoding loop");
GST_DEBUG_OBJECT(filter, "Pushing EOS event");
gst_pad_push_event(filter->srcpad_, gst_event_new_eos());
GST_DEBUG_OBJECT(filter, "Pausing decoding task");
gst_pad_pause_task(filter->srcpad_);
delete feat_transform;
// replace the consumed audio source with a fresh one
delete filter->au_src_;
filter->au_src_ = new GstBufferSource();
}
/* GstElement vmethod implementations */

/* Event handler of the sink pad.
 *
 * SEGMENT: starts the decoding task on the src pad.
 * CAPS:    accepted as is.
 * EOS:     marks the audio source as ended; the decoding task pushes its own
 *          EOS downstream once it has finished.
 * Other events are passed to the default handler.
 *
 * The handler owns the event it receives.  SEGMENT, CAPS and EOS are
 * consumed here and are not forwarded, so they are released explicitly;
 * gst_pad_event_default() takes care of all other events.
 *
 * Returns TRUE for the events handled here, otherwise the result of the
 * default handler.
 */
static gboolean
gst_online_gmm_decode_faster_sink_event(GstPad * pad, GstObject * parent, GstEvent * event) {
  gboolean ret = TRUE;
  GstOnlineGmmDecodeFaster *filter = GST_ONLINEGMMDECODEFASTER(parent);

  GST_DEBUG_OBJECT(filter, "Handling %s event", GST_EVENT_TYPE_NAME(event));

  switch (GST_EVENT_TYPE(event)) {
    case GST_EVENT_SEGMENT:
      GST_DEBUG_OBJECT(filter, "Starting decoding task");
      gst_pad_start_task(filter->srcpad_,
                         (GstTaskFunction) gst_online_gmm_decode_faster_loop, filter, NULL);
      GST_DEBUG_OBJECT(filter, "Started decoding task");
      gst_event_unref(event);
      break;
    case GST_EVENT_CAPS:
      gst_event_unref(event);
      break;
    case GST_EVENT_EOS:
      /* end-of-stream, we should close down all stream leftovers here */
      GST_DEBUG_OBJECT(filter, "EOS received");
      filter->au_src_->SetEnded(true);
      gst_event_unref(event);
      break;
    default:
      ret = gst_pad_event_default(pad, parent, event);
      break;
  }
  return ret;
}
/* Chain function of the sink pad: receives one buffer of audio.
 *
 * With a decoder in place, the buffer is handed to the audio source unless
 * the element is silenced, and GST_FLOW_OK is returned.  Without a decoder
 * an element error is posted and GST_FLOW_NOT_NEGOTIATED is returned.
 * The buffer is unreffed in both cases.
 */
static GstFlowReturn gst_online_gmm_decode_faster_chain(GstPad * pad,
                                                        GstObject * parent,
                                                        GstBuffer * buf) {
  GstOnlineGmmDecodeFaster *filter = GST_ONLINEGMMDECODEFASTER(parent);
  GstFlowReturn flow = GST_FLOW_OK;

  if (G_LIKELY(filter->decoder_)) {
    if (!filter->silent_) {
      filter->au_src_->PushBuffer(buf);
    }
  } else {
    /* special case: data arrived before the decoder was allocated */
    GST_ELEMENT_ERROR(filter, CORE, NEGOTIATION, (NULL),
                      ("decoder wasn't allocated before chain function"));
    flow = GST_FLOW_NOT_NEGOTIATED;
  }

  gst_buffer_unref(buf);
  return flow;
}
/* Plugin entry point: sets up the debug category used for log filtering and
 * registers the "onlinegmmdecodefaster" element factory.
 *
 * Returns the result of the element registration.
 */
static gboolean
onlinegmmdecodefaster_init(GstPlugin * onlinegmmdecodefaster) {
  GST_DEBUG_CATEGORY_INIT(gst_online_gmm_decode_faster_debug, "onlinegmmdecodefaster",
                          0, "Automatic Speech Recognition");

  const gboolean registered =
      gst_element_register(onlinegmmdecodefaster, "onlinegmmdecodefaster", GST_RANK_NONE,
                           GST_TYPE_ONLINEGMMDECODEFASTER);
  return registered;
}
/* PACKAGE: this is usually set by autotools depending on some _INIT macro
 * in configure.ac and then written into and defined in config.h, but we can
 * just set it ourselves here in case someone doesn't use autotools to
 * compile this code. GST_PLUGIN_DEFINE needs PACKAGE to be defined.
 */
#ifndef PACKAGE
#define PACKAGE "myfirstonlinegmmdecodefaster"
#endif
/* Plugin descriptor that GStreamer looks for when loading the shared
 * object: plugin name, description, init function, version, license,
 * package name and origin URL.
 */
GST_PLUGIN_DEFINE(
GST_VERSION_MAJOR,
GST_VERSION_MINOR,
onlinegmmdecodefaster,
"Online speech recognizer based on the Kaldi toolkit",
onlinegmmdecodefaster_init,
VERSION,
"LGPL", // Changing it into Apache prevents the plugin from loading, see gst/gstplugin.c in GStreamer source
"Kaldi",
"http://kaldi.sourceforge.net/"
)
}