discriminative-supervision.cc 16.4 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436


// nnet3/discriminative-supervision.cc

// Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
//           2014-2015  Vimal Manohar

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "nnet3/discriminative-supervision.h"
#include "lat/lattice-functions.h"

namespace kaldi {
namespace discriminative {


// Copy constructor: copies every member of 'other' (weight, the sequence
// counts, the numerator alignment and the denominator lattice).
DiscriminativeSupervision::DiscriminativeSupervision(
    const DiscriminativeSupervision &other)
    : weight(other.weight),
      num_sequences(other.num_sequences),
      frames_per_sequence(other.frames_per_sequence),
      num_ali(other.num_ali),
      den_lat(other.den_lat) {
}

// Exchanges the full contents of this object with those of '*other',
// member by member.
void DiscriminativeSupervision::Swap(DiscriminativeSupervision *other) {
  DiscriminativeSupervision &that = *other;

  // Scalar bookkeeping fields.
  std::swap(weight, that.weight);
  std::swap(num_sequences, that.num_sequences);
  std::swap(frames_per_sequence, that.frames_per_sequence);

  // Numerator alignment and denominator lattice.
  std::swap(num_ali, that.num_ali);
  std::swap(den_lat, that.den_lat);
}

// Equality test.  The scalar members and the numerator alignment are compared
// with their own operators; the denominator lattices are compared last, via
// fst::Equal, and only if everything else already matched.
bool DiscriminativeSupervision::operator == (
    const DiscriminativeSupervision &other) const {
  if (!(weight == other.weight)) return false;
  if (!(num_sequences == other.num_sequences)) return false;
  if (!(frames_per_sequence == other.frames_per_sequence)) return false;
  if (!(num_ali == other.num_ali)) return false;
  return fst::Equal(den_lat, other.den_lat);
}

// Serializes this object to 'os' (binary or text according to 'binary') as a
// sequence of <Token> value pairs enclosed in <DiscriminativeSupervision> ...
// </DiscriminativeSupervision>.  Throws (via KALDI_ERR) if the denominator
// lattice cannot be written.
void DiscriminativeSupervision::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<DiscriminativeSupervision>");

  // Scalar header fields.
  WriteToken(os, binary, "<Weight>");
  WriteBasicType(os, binary, weight);

  WriteToken(os, binary, "<NumSequences>");
  WriteBasicType(os, binary, num_sequences);

  WriteToken(os, binary, "<FramesPerSeq>");
  WriteBasicType(os, binary, frames_per_sequence);

  // Sanity check on the counts that were just written.
  KALDI_ASSERT(frames_per_sequence > 0 &&
               num_sequences > 0);

  // Numerator alignment.
  WriteToken(os, binary, "<NumAli>");
  WriteIntegerVector(os, binary, num_ali);

  // Denominator lattice.  This function has no way to return an error status,
  // so a failed write is reported by throwing.
  WriteToken(os, binary, "<DenLat>");
  const bool lattice_written = WriteLattice(os, binary, den_lat);
  if (!lattice_written) {
    KALDI_ERR << "Error writing denominator lattice to stream";
  }

  WriteToken(os, binary, "</DiscriminativeSupervision>");
}

// Deserializes this object from 'is' (binary or text according to 'binary'),
// expecting exactly the token layout produced by Write().  After reading, the
// denominator lattice is topologically sorted.
//
// Throws (via KALDI_ERR) if the lattice cannot be read or cannot be
// topologically sorted; this function has no way to return an error status.
void DiscriminativeSupervision::Read(std::istream &is, bool binary) {
  ExpectToken(is, binary, "<DiscriminativeSupervision>");
  ExpectToken(is, binary, "<Weight>");
  ReadBasicType(is, binary, &weight);
  ExpectToken(is, binary, "<NumSequences>");
  ReadBasicType(is, binary, &num_sequences);
  ExpectToken(is, binary, "<FramesPerSeq>");
  ReadBasicType(is, binary, &frames_per_sequence);
  KALDI_ASSERT(frames_per_sequence > 0 &&
               num_sequences > 0);

  ExpectToken(is, binary, "<NumAli>");
  ReadIntegerVector(is, binary, &num_ali);

  ExpectToken(is, binary, "<DenLat>");
  {
    Lattice *lat = NULL;
    if (!ReadLattice(is, binary, &lat) || lat == NULL) {
      // We can't return error status from this function so we
      // throw an exception.
      KALDI_ERR << "Error reading Lattice from stream";
    }
    // Copy the lattice into our member and release the temporary that
    // ReadLattice handed to us.
    den_lat = *lat;
    delete lat;
    // Initialize() insists that the denominator lattice can be top-sorted;
    // apply the same requirement to lattices that come from a stream instead
    // of silently keeping an unsorted lattice.
    if (!TopSort(&den_lat)) {
      KALDI_ERR << "Denominator lattice read from stream could not be "
                << "topologically sorted";
    }
  }

  ExpectToken(is, binary, "</DiscriminativeSupervision>");
}

bool DiscriminativeSupervision::Initialize(const std::vector<int32> &num_ali,
                                           const Lattice &den_lat,
                                           BaseFloat weight) {
  if (num_ali.size() == 0) return false;
  if (den_lat.NumStates() == 0) return false;

  this->weight = weight;
  this->num_sequences = 1;
  this->frames_per_sequence = num_ali.size();
  this->num_ali = num_ali;
  this->den_lat = den_lat;
  KALDI_ASSERT(TopSort(&(this->den_lat)));

  // Checks if num frames in alignment matches lattice
  Check();

  return true;
}

void DiscriminativeSupervision::Check() const {
  int32 num_frames_subsampled = num_ali.size();
  KALDI_ASSERT(num_frames_subsampled ==
               num_sequences * frames_per_sequence);

  {
    std::vector<int32> state_times;
    int32 max_time = LatticeStateTimes(den_lat, &state_times);
    KALDI_ASSERT(max_time == num_frames_subsampled);
  }
}

// Stores the options, transition model and supervision, then makes a private
// copy of the denominator lattice and prepares it for splitting (see
// PrepareLattice()).  The remaining statements are sanity checks on the
// prepared lattice.
DiscriminativeSupervisionSplitter::DiscriminativeSupervisionSplitter(
    const SplitDiscriminativeSupervisionOptions &config,
    const TransitionModel &tmodel,
    const DiscriminativeSupervision &supervision):
    config_(config), tmodel_(tmodel), supervision_(supervision) {
  const bool is_single_sequence = (supervision_.num_sequences == 1);
  if (!is_single_sequence) {
    KALDI_WARN << "Splitting already-reattached sequence (only expected in "
               << "testing code)";
  }

  // For now, don't allow splitting already merged examples
  KALDI_ASSERT(supervision_.num_sequences == 1);

  den_lat_ = supervision_.den_lat;
  PrepareLattice(&den_lat_, &den_lat_scores_);

  const int32 num_states = den_lat_.NumStates();
  KALDI_ASSERT(num_states > 0);

  // Lattice should be top-sorted and connected, so start-state must be 0.
  const int32 start_state = den_lat_.Start();
  KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0");

  // One time per state, starting at frame 0 and ending at the total number of
  // frames covered by the supervision.
  const int32 num_frames =
      supervision_.frames_per_sequence * supervision_.num_sequences;
  const std::vector<int32> &times = den_lat_scores_.state_times;
  KALDI_ASSERT(num_states == times.size());
  KALDI_ASSERT(times[start_state] == 0);
  KALDI_ASSERT(times.back() == num_frames);
}

// Make sure that for any given pdf-id and any given frame, the den-lat has
// only one transition-id mapping to that pdf-id, on the same frame.
// It helps us to more completely minimize the lattice.  Note: we
// can't do this if the criterion is MPFE, because in that case the
// objective function will be affected by the phone-identities being
// different even if the pdf-ids are the same.
void DiscriminativeSupervisionSplitter::CollapseTransitionIds(
    const std::vector<int32> &state_times, Lattice *lat) const {
  typedef Lattice::StateId StateId;
  typedef Lattice::Arc Arc;

  int32 num_frames = state_times.back();   // TODO: Check if this is always true
  StateId num_states = lat->NumStates();

  std::vector<std::map<int32, int32> > pdf_to_tid(num_frames);
  for (StateId s = 0; s < num_states; s++) {
    int32 t = state_times[s];
    for (fst::MutableArcIterator<Lattice> aiter(lat, s);
         !aiter.Done(); aiter.Next()) {
      KALDI_ASSERT(t >= 0 && t < num_frames);
      Arc arc = aiter.Value();
      KALDI_ASSERT(arc.ilabel != 0 && arc.ilabel == arc.olabel);
      int32 pdf = tmodel_.TransitionIdToPdf(arc.ilabel);
      if (pdf_to_tid[t].count(pdf) != 0) {
        arc.ilabel = arc.olabel = pdf_to_tid[t][pdf];
        aiter.SetValue(arc);
      } else {
        pdf_to_tid[t][pdf] = arc.ilabel;
      }
    }
  }
}

// Consistency check (assertions only) on the per-state quantities.
void DiscriminativeSupervisionSplitter::LatticeInfo::Check() const {
  // state_times, alpha and beta must all have the same number of entries
  // (one per lattice state).
  const size_t num_entries = state_times.size();
  KALDI_ASSERT(alpha.size() == num_entries);
  KALDI_ASSERT(beta.size() == num_entries);

  // The states must be ordered by non-decreasing state time.  This must be
  // true since the states are in breadth-first search order.
  KALDI_ASSERT(IsSorted(state_times));
}

// Extracts the supervision for frames [begin_frame, begin_frame + num_frames)
// into '*out_supervision': a range lattice built by CreateRangeLattice() and
// the matching slice of the numerator alignment.  The output always describes
// a single sequence of 'num_frames' frames with the weight of the source
// supervision.  'normalize' is forwarded unchanged to CreateRangeLattice().
void DiscriminativeSupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, bool normalize,
                                                      DiscriminativeSupervision *out_supervision) const {
  // Note: end_frame is one past the last frame that the output covers.
  const int32 end_frame = begin_frame + num_frames;
  const int32 total_frames =
      supervision_.num_sequences * supervision_.frames_per_sequence;
  KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 &&
               begin_frame + num_frames <= total_frames);

  // Denominator lattice restricted to the requested frames.
  CreateRangeLattice(den_lat_,
                     den_lat_scores_,
                     begin_frame, end_frame, normalize,
                     &(out_supervision->den_lat));

  // Matching slice of the numerator alignment.
  const std::vector<int32> &full_ali = supervision_.num_ali;
  out_supervision->num_ali.assign(full_ali.begin() + begin_frame,
                                  full_ali.begin() + end_frame);

  out_supervision->frames_per_sequence = num_frames;
  out_supervision->weight = supervision_.weight;
  out_supervision->num_sequences = 1;

  out_supervision->Check();
}

void DiscriminativeSupervisionSplitter::CreateRangeLattice(
    const Lattice &in_lat, const LatticeInfo &scores,
    int32 begin_frame, int32 end_frame, bool normalize,
    Lattice *out_lat) const {
  typedef Lattice::StateId StateId;

  const std::vector<int32> &state_times = scores.state_times;

  // Some checks to ensure the lattice and scores are prepared properly
  KALDI_ASSERT(state_times.size() == in_lat.NumStates());
  if (!in_lat.Properties(fst::kTopSorted, true))
    KALDI_ERR << "Input lattice must be topologically sorted.";

  std::vector<int32>::const_iterator begin_iter =
      std::lower_bound(state_times.begin(), state_times.end(), begin_frame),
      end_iter = std::lower_bound(begin_iter,
                                  state_times.end(), end_frame);

  KALDI_ASSERT(*begin_iter == begin_frame &&
               (begin_iter == state_times.begin() ||
                begin_iter[-1] < begin_frame));
  // even if end_frame == supervision_.num_frames, there should be a state with
  // that frame index.
  KALDI_ASSERT(end_iter[-1] < end_frame &&
               (end_iter < state_times.end() || *end_iter == end_frame));
  StateId begin_state = begin_iter - state_times.begin(),
          end_state = end_iter - state_times.begin();

  KALDI_ASSERT(end_state > begin_state);
  out_lat->DeleteStates();
  out_lat->ReserveStates(end_state - begin_state + 2);

  // Add special start state
  StateId start_state = out_lat->AddState();
  out_lat->SetStart(start_state);

  for (StateId i = begin_state; i < end_state; i++)
    out_lat->AddState();

  // Add the special final-state.
  StateId final_state = out_lat->AddState();
  out_lat->SetFinal(final_state, LatticeWeight::One());

  for (StateId state = begin_state; state < end_state; state++) {
    StateId output_state = state - begin_state + 1;
    if (state_times[state] == begin_frame) {
      // we'd like to make this an initial state, but OpenFst doesn't allow
      // multiple initial states.  Instead we add an epsilon transition to it
      // from our actual initial state.  The weight on this
      // transition is the forward probability of the said 'initial state'
      LatticeWeight weight = LatticeWeight::One();
      weight.SetValue1((normalize ? scores.beta[0] : 0.0) - scores.alpha[state]);
      // Add negative of the forward log-probability to the graph cost score,
      // since the acoustic scores would be changed later.
      // Assuming that the lattice is scaled with appropriate acoustic
      // scale.
      // We additionally normalize using the total lattice score. Since the
      // same score is added as normalizer to all the paths in the lattice,
      // the relative probabilities of the paths in the lattice is not affected.
      // Note: Doing a forward-backward on this split must result in a total
      // score of 0 because of the normalization.

      out_lat->AddArc(start_state,
                      LatticeArc(0, 0, weight, output_state));
    } else {
      KALDI_ASSERT(scores.state_times[state] < end_frame);
    }
    for (fst::ArcIterator<Lattice> aiter(in_lat, state);
          !aiter.Done(); aiter.Next()) {
      const LatticeArc &arc = aiter.Value();
      StateId nextstate = arc.nextstate;
      if (nextstate >= end_state) {
        // A transition to any state outside the range becomes a transition to
        // our special final-state.
        // The weight is just the negative of the backward log-probability +
        // the arc cost. We again normalize with the total lattice score.
        LatticeWeight weight;
        //KALDI_ASSERT(scores.beta[state] < 0);
        weight.SetValue1(arc.weight.Value1() - scores.beta[nextstate]);
        weight.SetValue2(arc.weight.Value2());
        // Add negative of the backward log-probability to the LM score, since
        // the acoustic scores would be changed later.
        // Note: We don't normalize here because that is already done with the
        // initial cost.

        out_lat->AddArc(output_state,
            LatticeArc(arc.ilabel, arc.olabel, weight, final_state));
      } else {
        StateId output_nextstate = nextstate - begin_state + 1;
        out_lat->AddArc(output_state,
            LatticeArc(arc.ilabel, arc.olabel, arc.weight, output_nextstate));
      }
    }
  }

  // Get rid of the word labels and put the
  // transition-ids on both sides.
  fst::Project(out_lat, fst::PROJECT_INPUT);
  fst::RmEpsilon(out_lat);

  if (config_.collapse_transition_ids)
    CollapseTransitionIds(state_times, out_lat);

  if (config_.determinize) {
    if (!config_.minimize) {
      Lattice tmp_lat;
      fst::Determinize(*out_lat, &tmp_lat);
      std::swap(*out_lat, tmp_lat);
    } else {
      Lattice tmp_lat;
      fst::Reverse(*out_lat, &tmp_lat);
      fst::Determinize(tmp_lat, out_lat);
      fst::Reverse(*out_lat, &tmp_lat);
      fst::Determinize(tmp_lat, out_lat);
      fst::RmEpsilon(out_lat);
    }
  }

  fst::TopSort(out_lat);
  std::vector<int32> state_times_tmp;
  KALDI_ASSERT(LatticeStateTimes(*out_lat, &state_times_tmp) ==
                                            end_frame - begin_frame);

  // Remove the acoustic scale that was previously added
  if (config_.acoustic_scale != 1.0) {
    fst::ScaleLattice(fst::AcousticLatticeScale(
          1 / config_.acoustic_scale), out_lat);
  }
}

// Prepares '*lat' for splitting: applies the configured acoustic scale,
// renumbers the states so that they are ordered by state time, and fills
// '*scores' with the state times, alphas and betas of the reordered lattice.
void DiscriminativeSupervisionSplitter::PrepareLattice(
    Lattice *lat, LatticeInfo *scores) const {
  // Scale the lattice to appropriate acoustic scale. It is important to
  // ensure this is equal to the acoustic scale used while training. This is
  // because, on splitting lattices, the initial and final costs are added
  // into the graph cost.
  KALDI_ASSERT(config_.acoustic_scale != 0.0);
  if (config_.acoustic_scale != 1.0) {
    fst::ScaleLattice(fst::AcousticLatticeScale(config_.acoustic_scale), lat);
  }

  std::vector<int32> &times = scores->state_times;
  LatticeStateTimes(*lat, &times);

  // Order the states based on the state times (ties broken by the current
  // state-id). This is stronger than just topological sort. This is required
  // by the lattice splitting code.
  typedef std::pair<int32, int32> TimeAndState;
  const int32 num_states = lat->NumStates();
  std::vector<TimeAndState> by_time;
  by_time.reserve(num_states);
  for (int32 s = 0; s < num_states; s++)
    by_time.push_back(TimeAndState(times[s], s));
  std::sort(by_time.begin(), by_time.end());

  // state_order[old_id] = new_id, the form that fst::StateSort expects.
  std::vector<int32> state_order(num_states);
  for (int32 new_id = 0; new_id < num_states; new_id++) {
    const int32 old_id = by_time[new_id].second;
    state_order[old_id] = new_id;
  }
  fst::StateSort(lat, state_order);

  // The state times are recomputed here for the reordered lattice.
  ComputeLatticeScores(*lat, scores);
}

void DiscriminativeSupervisionSplitter::ComputeLatticeScores(const Lattice &lat,
    LatticeInfo *scores) const {
  LatticeStateTimes(lat, &(scores->state_times));
  ComputeLatticeAlphasAndBetas(lat, false,
                               &(scores->alpha), &(scores->beta));
  scores->Check();
  // This check will fail if the lattice is not breadth-first search sorted
}

// Merges the supervision objects in 'input' into '*output_supervision'.
//
// A single input is simply copied.  Otherwise the output starts as a copy of
// the last input and the remaining inputs are prepended one at a time, going
// from the second-to-last down to the first, so that the merged alignment and
// lattice follow the order of 'input'.  Every prepended input must hold
// exactly one sequence and must agree with the output on weight and
// frames_per_sequence; a mismatch is reported with KALDI_ERR.  The merged
// lattice is top-sorted and the result validated with Check().
void MergeSupervision(const std::vector<const DiscriminativeSupervision*> &input,
    DiscriminativeSupervision *output_supervision) {
  KALDI_ASSERT(!input.empty());
  const int32 num_inputs = input.size();
  DiscriminativeSupervision &merged = *output_supervision;

  if (num_inputs == 1) {
    merged = *(input[0]);
    return;
  }

  merged = *(input[num_inputs - 1]);
  for (int32 i = num_inputs - 2; i >= 0; i--) {
    const DiscriminativeSupervision &src = *(input[i]);
    KALDI_ASSERT(src.num_sequences == 1);

    const bool compatible =
        (merged.weight == src.weight &&
         merged.frames_per_sequence == src.frames_per_sequence);
    if (!compatible) {
      KALDI_ERR << "Mismatch weight or frames_per_sequence  between inputs";
    } else {
      // Put src.den_lat in front of the lattice accumulated so far.
      fst::Concat(src.den_lat, &merged.den_lat);

      // Likewise put the alignment of 'src' in front of the merged one.
      merged.num_ali.insert(merged.num_ali.begin(),
                            src.num_ali.begin(), src.num_ali.end());

      merged.num_sequences++;
    }
  }

  fst::TopSort(&(merged.den_lat));
  merged.Check();
}

} // namespace discriminative
} // namespace kaldi