// lmbin/arpa-to-const-arpa.cc
// Copyright 2014 Guoguo Chen
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "lm/const-arpa-lm.h"
#include "util/parse-options.h"
// Command-line entry point. Parses the options, then converts the ARPA
// language model named by the first positional argument into ConstArpaLm
// format, written to the location named by the second positional argument.
//
// Exit status:
//    0  BuildConstArpaLm() reported success.
//    1  BuildConstArpaLm() reported failure, or the number of positional
//       arguments was not exactly two.
//   -1  a std::exception was caught (this includes the error raised when
//       --bos-symbol or --eos-symbol is left at -1).
int main(int argc, char *argv[]) {
  using namespace kaldi;
  try {
    const char *usage =
        "Converts an Arpa format language model into ConstArpaLm format,\n"
        "which is an in-memory representation of the pre-built Arpa language\n"
        "model. The output language model can then be read in by a program\n"
        "that wants to rescore lattices. We assume that the words in the\n"
        "input arpa language model has been converted to integers.\n"
        "\n"
        "The program is used jointly with utils/map_arpa_lm.pl to build\n"
        "ConstArpaLm format language model. We first map the words in an Arpa\n"
        "format language model to integers using utils/map_arpa_lm.pl, and\n"
        "then use this program to build a ConstArpaLm format language model.\n"
        "\n"
        "Usage: arpa-to-const-arpa [opts] <input-arpa> <const-arpa>\n"
        " e.g.: arpa-to-const-arpa --bos-symbol=1 --eos-symbol=2 \\\n"
        " arpa.txt const_arpa";

    kaldi::ParseOptions po(usage);

    ArpaParseOptions options;
    options.Register(&po);

    // Ideally, these registrations would be in ArpaParseOptions, but some
    // programs want integers and others want symbols, so we register them
    // outside instead.
    po.Register("unk-symbol", &options.unk_symbol,
                "Integer corresponds to unknown-word in language model. -1 if "
                "no such word is provided.");
    po.Register("bos-symbol", &options.bos_symbol,
                "Integer corresponds to <s>. You must set this to your actual "
                "BOS integer.");
    po.Register("eos-symbol", &options.eos_symbol,
                "Integer corresponds to </s>. You must set this to your actual "
                "EOS integer.");

    po.Read(argc, argv);

    // Both the input and the output location are mandatory.
    if (po.NumArgs() != 2) {
      po.PrintUsage();
      exit(1);
    }

    // -1 is treated as "not set" for the sentence-boundary symbols.
    // KALDI_ERR throws, so control leaves through the catch block below and
    // no explicit exit is needed after it.
    if (options.bos_symbol == -1 || options.eos_symbol == -1) {
      KALDI_ERR << "Please set --bos-symbol and --eos-symbol.";
    }

    // Exactly two arguments are guaranteed at this point, so both are read
    // with GetArg() rather than the optional-argument accessor.
    const std::string arpa_rxfilename = po.GetArg(1);
    const std::string const_arpa_wxfilename = po.GetArg(2);

    const bool ans = BuildConstArpaLm(options, arpa_rxfilename,
                                      const_arpa_wxfilename);
    return ans ? 0 : 1;
  } catch (const std::exception &e) {
    std::cerr << e.what() << '\n';
    return -1;
  }
}