// sampling-lm.cc

// Copyright 2017  Ke Li
//           2017  Johns Hopkins University (author: Daniel Povey)

// See ../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "rnnlm/sampling-lm.h"

namespace kaldi {
namespace rnnlm {

// This function is called for each n-gram line read from the ARPA file.
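// Storage convention: unigram_probs_[w] holds the unigram probability of
// word w, while higher_order_probs_[n-1] maps each history of length n to
// its HistoryState (the explicitly-listed (word, prob) pairs plus the
// backoff weight of that history).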
void SamplingLm::ConsumeNGram(const NGram& ngram) {
  int32 cur_order = ngram.words.size(),
      max_order = Order();
  int32 word = ngram.words.back();  // 'word' is the last word in the n-gram.
  KALDI_ASSERT(cur_order > 0 && word > 0);

  if (cur_order == 1) {
    // unigram
    if (unigram_probs_.size() <= static_cast<size_t>(word))
      unigram_probs_.resize(static_cast<size_t>(word + 1), 0.0);
  KALDI_ASSERT(unigram_probs_[word] == 0.0);  // fails if a unigram is repeated.
    unigram_probs_[word] = Exp(ngram.logprob);
    if (ngram.backoff != 0.0)
      higher_order_probs_[cur_order - 1][ngram.words].backoff_prob =
        Exp(ngram.backoff);
  } else {
    HistType history(ngram.words.begin(), ngram.words.end() - 1);
    // Note: we'll later on change the probability, subtracting the
    // part that is due to backoff.  This change of format is
    // convenient for our application.
    // ngram.logprob has already been converted to log-base e at
    // this point.
    higher_order_probs_[cur_order - 2][history].words_and_probs.push_back(
        std::pair<int32, BaseFloat>(word, Exp(ngram.logprob)));
    if (ngram.backoff != 0.0) {
      KALDI_ASSERT(cur_order != max_order);
      higher_order_probs_[cur_order - 1][ngram.words].backoff_prob =
          Exp(ngram.backoff);
    }
  }
}

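// Called by the ArpaFileParser base class after the \data\ section of the
// ARPA file (which contains the n-gram counts) has been read; we use those
// counts to size our data structures before the n-grams themselves arrive.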
void SamplingLm::HeaderAvailable() {
  unigram_probs_.reserve(NgramCounts()[0] + 100);
  // e.g. for a trigram LM we store the bigram and trigram history
  // states in higher_order_probs_, while unigram_probs_ stores
  // the unigram probabilities.
  int32 ngram_order = NgramCounts().size();
  higher_order_probs_.resize(ngram_order - 1);
}

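// Returns p(word | history) including any backoff contribution, using the
// probabilities as stored at the time of the call (during ReadComplete()
// these are still the full ARPA probabilities).  If 'state' is non-NULL it
// must be the history state corresponding to 'history'; if it is NULL the
// state is looked up, or unigram_probs_ is used when the history is empty.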
BaseFloat SamplingLm::GetProbWithBackoff(
    const std::vector<int32> &history,
    const HistoryState *state,
    int32 word) const {
  if (state == NULL) {
    int32 order = history.size() + 1;
    if (order == 1) {
      KALDI_ASSERT(static_cast<size_t>(word) < unigram_probs_.size());
      return unigram_probs_[word];
    } else {
      std::unordered_map<HistType, HistoryState, VectorHasher<int32> >::const_iterator
          hist_iter = higher_order_probs_[order - 2].find(history);
      KALDI_ASSERT(hist_iter != higher_order_probs_[order - 2].end());
      // it's not optimally efficient to recurse here, but this is on a code
      // path that will rarely be taken in practice.
      return GetProbWithBackoff(history, &(hist_iter->second), word);
    }
  } else {
    std::pair<int32, BaseFloat> p(word, 0.0);
    std::vector<std::pair<int32, BaseFloat> >::const_iterator iter =
        std::lower_bound(state->words_and_probs.begin(),
                         state->words_and_probs.end(), p);
    if (iter != state->words_and_probs.end() && iter->first == word) {
      // the probability for this word was given in this history state.  (Note:
      // we assume that at the time this function is called, the entire
      // probability is present here, as it is in the ARPA format LM.  See the
      // documentation for this function for more explanation.)
      return iter->second;
    } else {
      // we have to back off.
      std::vector<int32> backoff_history(history.begin() + 1,
                                         history.end());
      return state->backoff_prob *
          GetProbWithBackoff(backoff_history, NULL, word);
    }
  }
}

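// Sorts the (word, prob) pairs of every history state on word-id, so that
// GetProbWithBackoff() can locate a word with std::lower_bound.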
void SamplingLm::EnsureHistoryStatesSorted() {
  for (size_t i = 0; i < higher_order_probs_.size(); i++) {
    std::unordered_map<HistType, HistoryState, VectorHasher<int32> >::iterator
        iter = higher_order_probs_[i].begin(),
        end = higher_order_probs_[i].end();
    for (; iter != end; ++iter)
      std::sort(iter->second.words_and_probs.begin(),
                iter->second.words_and_probs.end());
  }
}

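// Called by the ArpaFileParser base class once the whole ARPA file has been
// read.  Working from the highest order downwards, we replace each stored
// probability with its "excess" probability, i.e. the part not already
// accounted for by backing off; after this, the excess probabilities of a
// state plus its backoff weight should sum to (approximately) one.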
void SamplingLm::ReadComplete() {
  EnsureHistoryStatesSorted();
  int32 max_order = Order();
  for (int32 order = max_order; order >= 2; order--) {
    std::unordered_map<HistType, HistoryState, VectorHasher<int32> >
        &this_map = higher_order_probs_[order - 2];
    std::unordered_map<HistType, HistoryState,
        VectorHasher<int32> >::iterator
        hist_iter = this_map.begin(), hist_end = this_map.end();
    for (; hist_iter != hist_end; ++hist_iter) {
      const HistType &history = hist_iter->first;
      HistoryState &history_state = hist_iter->second;
      BaseFloat backoff_prob = history_state.backoff_prob;
      HistoryState *backoff_state;
      HistType backoff_history(history.begin() + 1, history.end());
      if (order == 2) backoff_state = NULL;  // unigram has different format.
      else backoff_state = &(higher_order_probs_[order - 3][backoff_history]);

      std::vector<std::pair<int32, BaseFloat> >::iterator
          word_iter = history_state.words_and_probs.begin(),
          word_end = history_state.words_and_probs.end();
      double total_prob_after_subtracting = 0.0;
      for (; word_iter != word_end; ++word_iter) {
        int32 word = word_iter->first;
        BaseFloat prob = word_iter->second;
        // OK, we want to subtract the backoff part.
        BaseFloat backoff_part_of_prob = backoff_prob *
            GetProbWithBackoff(backoff_history, backoff_state, word);
        if (backoff_part_of_prob > 1.01 * prob) {
          KALDI_WARN << "Backoff part of prob is larger than prob itself: "
                     << backoff_part_of_prob << " > " << prob
                     << ".  This may mean your language model was not "
                     << "Kneser-Ney 'with addition'.  We advise to use "
                     << "Kneser-Ney with addition or some other type of "
                     << "LM 'with addition'.";
        }
        // OK, this could now be negative; that shouldn't matter.
        BaseFloat new_prob = prob - backoff_part_of_prob;
        word_iter->second = new_prob;
        total_prob_after_subtracting += new_prob;
      }
      BaseFloat new_total = total_prob_after_subtracting + backoff_prob;
      if (fabs(new_total - 1.0) > 0.01)
        KALDI_WARN << "Expected LM-state to sum to one, got "
                   << new_total;
    }
  }
}

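// Computes the backoff "closure" of the weighted histories in 'histories':
// each input history is first backed off (dropping its leftmost word) until
// a history state that actually exists is found; its weight is then added
// to that state and, scaled by the successive backoff probabilities, to all
// lower-order states, with whatever reaches the empty history accumulated
// into 'unigram_weight_out'.  'total_weight_out' is the sum of the input
// weights.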
void SamplingLm::AddBackoffToHistoryStates(
    const WeightedHistType &histories,
    WeightedHistType *histories_closure,
    BaseFloat *total_weight_out,
    BaseFloat *unigram_weight_out) const {
  // the implementation of this function is not as efficient as it could be,
  // but it should not dominate.
  std::vector<std::pair<HistType, BaseFloat> >::const_iterator
      histories_iter = histories.begin(), histories_end = histories.end();
  int32 max_order = Order();
  std::unordered_map<HistType, BaseFloat,
      VectorHasher<int32> > hist_to_weight_map;
  double total_weight = 0.0, total_unigram_weight = 0.0;
  for (; histories_iter != histories_end; ++histories_iter) {
    std::vector<int32> history = histories_iter->first;
    int32 cur_hist_len = history.size();
    BaseFloat weight = histories_iter->second;
    total_weight += weight;
    KALDI_ASSERT(history.size() <= max_order - 1 && weight > 0);

    // back off until the history exists or until we reach the unigram state.
    while (cur_hist_len > 0 &&
           higher_order_probs_[cur_hist_len - 1].count(history) == 0) {
      history.erase(history.begin());
      cur_hist_len--;
    }
    // At this point either the history-state exists, or we have backed off
    // all the way to the empty (unigram) history.
    while (cur_hist_len > 0) {
      hist_to_weight_map[history] += weight;
      std::unordered_map<HistType, HistoryState, VectorHasher<int32> >::const_iterator
          iter = higher_order_probs_[cur_hist_len - 1].find(history);
      KALDI_ASSERT(iter != higher_order_probs_[cur_hist_len - 1].end());
      weight *= iter->second.backoff_prob;
      history.erase(history.begin());
      cur_hist_len--;
    }
    // at this point, 'history' is empty and 'weight' is the unigram
    // backoff weight for this history state.
    total_unigram_weight += weight;
  }
  histories_closure->clear();
  histories_closure->resize(hist_to_weight_map.size());
  std::unordered_map<HistType, BaseFloat, VectorHasher<int32> >::iterator
      hist_to_weight_iter = hist_to_weight_map.begin(),
      hist_to_weight_end = hist_to_weight_map.end();
  size_t pos = 0;
  for (; hist_to_weight_iter != hist_to_weight_end; ++hist_to_weight_iter) {
    (*histories_closure)[pos].first = hist_to_weight_iter->first;
    (*histories_closure)[pos].second = hist_to_weight_iter->second;
    pos++;
  }
  *total_weight_out = total_weight;
  *unigram_weight_out = total_unigram_weight;
  KALDI_ASSERT(pos == hist_to_weight_map.size());
}


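// This overload returns the non-unigram part of the distribution as a vector
// of (word, prob) pairs sorted on word-id; it is a thin wrapper around the
// unordered_map version of GetDistribution() below.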
BaseFloat SamplingLm::GetDistribution(
    const WeightedHistType &histories,
    std::vector<std::pair<int32, BaseFloat> > *non_unigram_probs_out) const {
  std::unordered_map<int32, BaseFloat> non_unigram_probs_temp;
  // Call the other version of GetDistribution().
  BaseFloat ans = GetDistribution(histories, &non_unigram_probs_temp);
  non_unigram_probs_out->clear();
  non_unigram_probs_out->reserve(non_unigram_probs_temp.size());
  non_unigram_probs_out->insert(non_unigram_probs_out->end(),
                                non_unigram_probs_temp.begin(),
                                non_unigram_probs_temp.end());
  std::sort(non_unigram_probs_out->begin(),
            non_unigram_probs_out->end());
  return ans;
}

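// Computes the distribution over words for the weighted set of histories:
// the explicitly-listed ("excess") probabilities are accumulated per word
// into 'non_unigram_probs', and the return value is the total weight that
// backs off to the unigram distribution.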
BaseFloat SamplingLm::GetDistribution(
    const WeightedHistType &histories,
    std::unordered_map<int32, BaseFloat> *non_unigram_probs) const {
  WeightedHistType histories_closure;
  BaseFloat total_weight, total_unigram_weight;
  AddBackoffToHistoryStates(histories, &histories_closure,
                            &total_weight, &total_unigram_weight);
  non_unigram_probs->clear();
  double total_weight_check = total_unigram_weight;
  WeightedHistType::const_iterator iter = histories_closure.begin(),
      end = histories_closure.end();
  for (; iter != end; ++iter) {
    const HistType &history = iter->first;
    BaseFloat hist_weight = iter->second;
    int32 order = history.size() + 1;
    KALDI_ASSERT(order > 1);  // unigram history is not included at this point.
    std::unordered_map<HistType, HistoryState,
        VectorHasher<int32> >::const_iterator it_hist =
           higher_order_probs_[order - 2].find(history);
    KALDI_ASSERT(it_hist != higher_order_probs_[order - 2].end());
    std::vector<std::pair<int32, BaseFloat> >::const_iterator
        word_iter = it_hist->second.words_and_probs.begin(),
        word_end = it_hist->second.words_and_probs.end();
    for (; word_iter != word_end; ++word_iter) {
      int32 word = word_iter->first;
      BaseFloat prob = word_iter->second;
      // note: if 'word' was not yet in the map, operator[] value-initializes
      // the entry to zero (guaranteed for std::unordered_map since C++11),
      // so the += below is well-defined.
      (*non_unigram_probs)[word] += prob * hist_weight;
      total_weight_check += prob * hist_weight;
    }
  }
  // Check that 'total_weight' and 'total_weight_check' are
  // the same.  'total_weight' is the total of the .second
  // members of the input 'histories', and 'total_weight_check' is the
  // total weight of 'non_unigram_probs' plus 'total_unigram_weight'.
  // Essentially this is a check that the distribution given
  // by the ARPA file (and as processed by us) sums to one for each
  // history state.  If this check fails, it could either be
  // a problem with this code, or an issue with the software that
  // created the ARPA file.
  if (fabs(total_weight - total_weight_check) >
      0.01 * total_weight) {
    static int32 num_times_warned = 0;
    if (num_times_warned < 10) {
      KALDI_WARN << "Total weight does not have expected value (problem in "
          "your ARPA file, or this code).  Won't warn >10 times.";
      num_times_warned++;
    }
  }
  KALDI_ASSERT(total_unigram_weight > 0.0);
  return total_unigram_weight;
}

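// Constructs a SamplingLm directly from a SamplingLmEstimator: each history
// state's counts are turned into probabilities by dividing by that state's
// total count, and its backoff count becomes the backoff probability in the
// same way.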
SamplingLm::SamplingLm(const SamplingLmEstimator &estimator):
    ArpaFileParser(ArpaParseOptions(), NULL),
    unigram_probs_(estimator.unigram_probs_),
    higher_order_probs_(estimator.history_states_.size() - 1) {
  for (int32 o = 2;
       o <= static_cast<int32>(estimator.history_states_.size()); o++) {
    higher_order_probs_[o-2].reserve(estimator.history_states_[o-1].size());
    unordered_map<std::vector<int32>, SamplingLmEstimator::HistoryState*,
                  VectorHasher<int32> >::const_iterator
        iter = estimator.history_states_[o-1].begin(),
        end =  estimator.history_states_[o-1].end();
    for (; iter != end; ++iter) {
      const std::vector<int32> &history = iter->first;
      const SamplingLmEstimator::HistoryState &src_state = *(iter->second);
      // the next statement adds a history state to the map.
      HistoryState &dest_state = higher_order_probs_[o-2][history];
      BaseFloat inv_total_count = BaseFloat(1.0) / src_state.total_count;
      dest_state.backoff_prob = src_state.backoff_count * inv_total_count;
      dest_state.words_and_probs.resize(src_state.counts.size());
      std::vector<SamplingLmEstimator::Count>::const_iterator
          src_iter = src_state.counts.begin(),
          src_end = src_state.counts.end();
      std::vector<std::pair<int32, BaseFloat> >::iterator
          dest_iter = dest_state.words_and_probs.begin();
      for (; src_iter != src_end; ++src_iter, ++dest_iter) {
        dest_iter->first = src_iter->word;
        dest_iter->second = inv_total_count * src_iter->count;
      }
    }
  }
}

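// Writes the model in Kaldi binary or text format: the order, the vocabulary
// size, the unigram probabilities (as a Vector), and then, for each order
// >= 2, the history states with their backoff probabilities and
// (word, prob) lists.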
void SamplingLm::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<SamplingLm>");
  WriteToken(os, binary, "<Order>");
  int32 order = higher_order_probs_.size() + 1;
  WriteBasicType(os, binary, order);
  WriteToken(os, binary, "<VocabSize>");
  int32 vocab_size = unigram_probs_.size();
  WriteBasicType(os, binary, vocab_size);
  KALDI_ASSERT(!unigram_probs_.empty());
  // class Vector has Read and Write functions, so we use those.
  SubVector<BaseFloat> probs(const_cast<BaseFloat*>(&(unigram_probs_[0])),
                             static_cast<int32>(unigram_probs_.size()));
  probs.Write(os, binary);
  for (int32 o = 2; o <= order; o++) {
    WriteToken(os, binary, "<StatesOfOrder>");
    WriteBasicType(os, binary, o);
    WriteToken(os, binary, "<NumStates>");
    int32 num_states = higher_order_probs_[o-2].size();
    WriteBasicType(os, binary, num_states);

    unordered_map<std::vector<int32>, HistoryState,
                  VectorHasher<int32> >::const_iterator
        iter = higher_order_probs_[o-2].begin(),
        end = higher_order_probs_[o-2].end();
    for (; iter != end; ++iter) {
      const std::vector<int32> &history = iter->first;
      const HistoryState &state = iter->second;
      WriteIntegerVector(os, binary, history);
      WriteBasicType(os, binary, state.backoff_prob);
      int32 num_words = state.words_and_probs.size();
      WriteBasicType(os, binary, num_words);
      for (int32 i = 0; i < num_words; i++) {
        WriteBasicType(os, binary, state.words_and_probs[i].first);
        WriteBasicType(os, binary, state.words_and_probs[i].second);
      }
      if (!binary) os << std::endl;
    }
  }
  WriteToken(os, binary, "</SamplingLm>");
}


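// Reads a model previously written by Write(), reconstructing
// unigram_probs_ and higher_order_probs_ in the same layout.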
void SamplingLm::Read(std::istream &is, bool binary) {
  ExpectToken(is, binary, "<SamplingLm>");
  ExpectToken(is, binary, "<Order>");
  int32 order;
  ReadBasicType(is, binary, &order);
  KALDI_ASSERT(order >= 1 && order < 100);
  higher_order_probs_.resize(order - 1);
  ExpectToken(is, binary, "<VocabSize>");
  int32 vocab_size;
  ReadBasicType(is, binary, &vocab_size);
  unigram_probs_.resize(vocab_size);
  // class Vector has Read and Write functions, so we use those.
  SubVector<BaseFloat> probs(&(unigram_probs_[0]), vocab_size);
  probs.Read(is, binary);
  for (int32 o = 2; o <= order; o++) {
    ExpectToken(is, binary, "<StatesOfOrder>");
    int32 o2;
    ReadBasicType(is, binary, &o2);
    KALDI_ASSERT(o2 == o);
    int32 num_states;
    ExpectToken(is, binary, "<NumStates>");
    ReadBasicType(is, binary, &num_states);
    higher_order_probs_[o-2].reserve(num_states);
    for (int32 s = 0; s < num_states; s++) {
      std::vector<int32> history;
      ReadIntegerVector(is, binary, &history);
      HistoryState &state = higher_order_probs_[o-2][history];
      ReadBasicType(is, binary, &(state.backoff_prob));
      int32 num_words;
      ReadBasicType(is, binary, &num_words);
      KALDI_ASSERT(num_words >= 0);
      state.words_and_probs.resize(num_words);
      for (int32 i = 0; i < num_words; i++) {
        ReadBasicType(is, binary, &(state.words_and_probs[i].first));
        ReadBasicType(is, binary, &(state.words_and_probs[i].second));
      }
    }
  }
  ExpectToken(is, binary, "</SamplingLm>");
}

// TODO: delete if unused.
void SamplingLm::Swap(SamplingLm *other) {
  unigram_probs_.swap(other->unigram_probs_);
  higher_order_probs_.swap(other->higher_order_probs_);
}

}  // namespace rnnlm
}  // namespace kaldi