Blame view

src/util/text-utils.h 10.8 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
  // util/text-utils.h
  
  // Copyright 2009-2011  Saarland University;  Microsoft Corporation
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  #ifndef KALDI_UTIL_TEXT_UTILS_H_
  #define KALDI_UTIL_TEXT_UTILS_H_
  
  #include <errno.h>
  #include <string>
  #include <algorithm>
  #include <map>
  #include <set>
  #include <vector>
  #include <limits>
  #include "base/kaldi-common.h"
  
  
  namespace kaldi {
  
  /// Split a string using any of the single character delimiters.
  /// If omit_empty_strings == true, the output will contain only the
  /// nonempty strings that result from splitting on any of the
  /// characters in "delim".  If omit_empty_strings == false,
  /// the output will contain n+1 strings if there are n characters
  /// from the set "delim" within the input string.  In this case
  /// the empty string is split to a single empty string.
  void SplitStringToVector(const std::string &full, const char *delim,
                           bool omit_empty_strings,
                           std::vector<std::string> *out);
  
  /// Joins the elements of a vector of strings into a single string using
  /// "delim" as the delimiter.  If omit_empty_strings == true, any empty strings
  /// in the vector are skipped.  A vector of empty strings results in an empty
  /// string on the output.
  /// \param [in]  vec_in   The strings to join.
  /// \param [in]  delim    The delimiter used between joined elements.
  /// \param [in]  omit_empty_strings  If true, empty elements are skipped.
  /// \param [out] str_out  Receives the joined string.
  void JoinVectorToString(const std::vector<std::string> &vec_in,
                          const char *delim, bool omit_empty_strings,
                          std::string *str_out);
  
  /**
    \brief Split a string (e.g. 1:2:3) into a vector of integers.
  
    \param [in]  delim  String containing a list of characters, any of which
                        is allowed as a delimiter.
    \param [in] omit_empty_strings If true, empty strings between delimiters are
                        allowed and will not produce an output integer; if false,
                        instances of characters in 'delim' that are consecutive or
                        at the start or end of the string would be an error.
                        You'll normally want this to be true if 'delim' consists
                        of spaces, and false otherwise.
    \param [out] out   The output list of integers.
  */
  template<class I>
  bool SplitStringToIntegers(const std::string &full,
                             const char *delim,
                             bool omit_empty_strings,  // typically false [but
                                                       // should probably be true
                                                       // if "delim" is spaces].
                             std::vector<I> *out) {
    KALDI_ASSERT(out != NULL);
    KALDI_ASSERT_IS_INTEGER_TYPE(I);
    if (*(full.c_str()) == '\0') {
      out->clear();
      return true;
    }
    std::vector<std::string> split;
    SplitStringToVector(full, delim, omit_empty_strings, &split);
    out->resize(split.size());
    for (size_t i = 0; i < split.size(); i++) {
      const char *this_str = split[i].c_str();
      char *end = NULL;
      int64 j = 0;
      j = KALDI_STRTOLL(this_str, &end);
      if (end == this_str || *end != '\0') {
        out->clear();
        return false;
      } else {
        I jI = static_cast<I>(j);
        if (static_cast<int64>(jI) != j) {
          // output type cannot fit this integer.
          out->clear();
          return false;
        }
        (*out)[i] = jI;
      }
    }
    return true;
  }
  
  /// Splits a string into a vector of floating-point values.
  /// This is defined for F = float and double.
  /// NOTE(review): only the declaration is visible here; presumably 'full' is
  /// split on any character in 'delim' and each field is parsed as an F, in
  /// the same way SplitStringToIntegers() handles integers -- confirm against
  /// the definition.
  template<class F>
  bool SplitStringToFloats(const std::string &full,
                           const char *delim,
                           bool omit_empty_strings,  // typically false
                           std::vector<F> *out);
  
  
  /// Converts a string into an integer via strtoll and returns false if there was
  /// any kind of problem (i.e. the string was not an integer or contained extra
  /// non-whitespace junk, or the integer was too large to fit into the type it is
  /// being converted into).  Trailing whitespace after the integer is accepted.
  /// Only sets *out if everything was OK and it returns true.
  /// \param [in]  str  The text to convert.
  /// \param [out] out  Receives the converted value on success; left untouched
  ///                   on failure.
  /// \return  True on success, false on any parsing or range problem
  ///          (including a negative value when Int is unsigned).
  template<class Int>
  bool ConvertStringToInteger(const std::string &str,
                              Int *out) {
    KALDI_ASSERT_IS_INTEGER_TYPE(Int);
    const char *this_str = str.c_str();
    char *end = NULL;
    // Cleared first so a nonzero errno below can only come from the conversion.
    errno = 0;
    int64 i = KALDI_STRTOLL(this_str, &end);
    if (end != this_str) {
      // Skip trailing whitespace.  The cast to unsigned char is required:
      // calling isspace() with a negative char value is undefined behaviour.
      while (isspace(static_cast<unsigned char>(*end))) end++;
    }
    if (end == this_str || *end != '\0' || errno != 0)
      return false;
    Int iInt = static_cast<Int>(i);
    // Reject values that do not survive the round trip through Int, and
    // negative values when Int is unsigned (a 64-bit unsigned Int would
    // otherwise pass the round-trip test).
    if (static_cast<int64>(iInt) != i ||
        (i < 0 && !std::numeric_limits<Int>::is_signed)) {
      return false;
    }
    *out = iInt;
    return true;
  }
  
  
  /// ConvertStringToReal converts a string into either float or double
  /// and returns false if there was any kind of problem (i.e. the string
  /// was not a floating point number or contained extra non-whitespace junk).
  /// Be careful: this function will successfully read infs and nans.
  /// Only the declaration appears in this header.
  template <typename T>
  bool ConvertStringToReal(const std::string &str,
                           T *out);
  
  /// Removes leading and trailing whitespace from the string pointed to by
  /// 'str'.
  void Trim(std::string *str);
  
  
  /// Removes leading and trailing white space from the string, then splits on the
  /// first section of whitespace found (if present), putting the part before the
  /// whitespace in "first" and the rest in "rest".  If there is no such space,
  /// everything that remains after removing leading and trailing whitespace goes
  /// in "first".
  /// NOTE(review): presumably "rest" is left empty in that case -- confirm
  /// against the definition.
  void SplitStringOnFirstSpace(const std::string &line,
                               std::string *first,
                               std::string *rest);
  
  
  /// Returns true if "token" is nonempty and every character in it is
  /// printable and not whitespace.
  bool IsToken(const std::string &token);
  
  
  /// Returns true if "line" is free of \n characters and unprintable
  /// characters, and does not contain leading or trailing whitespace.
  bool IsLine(const std::string &line);
  
  
  
  /**
     This function returns true when two text strings are approximately equal, and
     false when they are not.  The definition of 'equal' is normal string
     equality, except that two substrings like "0.31134" and "0.311341" would be
     considered equal.  'decimal_places_check' controls how many digits after
     the '.' have to match up (it defaults to 2).
     E.g. StringsApproxEqual("hello 0.23 there", "hello 0.24 there", 2) would
     return false because there is a difference in the 2nd decimal, but with
     an argument of 1 it would return true.
   */
  bool StringsApproxEqual(const std::string &a,
                          const std::string &b,
                          int32 decimal_places_check = 2);
  
  /**
     This class is responsible for parsing input like
      hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e"
     and giving you access to the fields, in this case

     FirstToken() == "hi-there", and key->value pairs:

     xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123",
     bing->"a b c", baz->"a b c d='a b' e"

     The first token is optional; if the line started with a key-value pair then
     FirstToken() will be empty.

     Note: it can parse value fields with space inside them only if they are free of the '='
     character.  If values are going to contain the '=' character, you need to quote them
     with either single or double quotes.

     Key names may contain -_a-zA-Z0-9, but must begin with a-zA-Z_.
   */
  class ConfigLine {
   public:
    // Tries to parse the line as a config-file line.  Returns false
    // if it could not for some reason, e.g. parsing failure.  In most cases
    // prints no warnings; the user should do this.  Does not expect comments.
    bool ParseLine(const std::string &line);

    // the GetValue functions are overloaded for various types.  They return true
    // if the key exists with value that can be converted to that type, and false
    // otherwise.  They also mark the key-value pair as having been read.  It is
    // not an error to read values twice.
    bool GetValue(const std::string &key, std::string *value);
    bool GetValue(const std::string &key, BaseFloat *value);
    bool GetValue(const std::string &key, int32 *value);
    // Values may be separated by ":" or by ",".
    bool GetValue(const std::string &key, std::vector<int32> *value);
    bool GetValue(const std::string &key, bool *value);

    /// Returns true if some key-value pair has not been consumed.
    /// NOTE(review): inferred from the name and from UnusedValues() below;
    /// confirm against the definition.
    bool HasUnusedValues() const;
    /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one
    /// of the GetValue() functions.
    std::string UnusedValues() const;

    /// Returns the first token of the line (empty if the line started with a
    /// key-value pair).
    const std::string &FirstToken() const { return first_token_; }

    /// Returns whole_line_ (presumably the text that was given to ParseLine()
    /// -- confirm against the definition).  Const-qualified and returning a
    /// reference, like FirstToken(), so it can be called on a const ConfigLine
    /// without copying; the reference is valid for the lifetime of this object.
    const std::string &WholeLine() const { return whole_line_; }
    // use default assignment operator and copy constructor.
   private:
    std::string whole_line_;
    // the first token of the line, e.g. if line is
    // foo-bar baz=bing
    // then first_token_ would be "foo-bar".
    std::string first_token_;

    // data_ maps from key to (value, is-this-value-consumed?).
    std::map<std::string, std::pair<std::string, bool> > data_;
  };
  
  /// This function is like ExpectToken but for two tokens, and it will either
  /// accept token1 and then token2, or just token2.  This is useful in Read
  /// functions where the first token may already have been consumed.
  /// \param [in,out] is      The stream the token(s) are read from.
  /// \param [in]     binary  NOTE(review): presumably selects binary versus
  ///                         text mode, as for ExpectToken -- confirm.
  /// \param [in]     token1  The optional leading token.
  /// \param [in]     token2  The token that must be present.
  void ExpectOneOrTwoTokens(std::istream &is, bool binary,
                            const std::string &token1,
                            const std::string &token2);
  
  
  /**
     This function reads in a config file and *appends* its contents to a vector of
     lines; it is responsible for removing comments (anything after '#') and
     stripping out any lines that contain only whitespace after comment removal.

     \param [in,out] is     The stream the config file is read from.
     \param [in,out] lines  The vector the surviving lines are appended to;
                            existing contents are kept.
   */
  void ReadConfigLines(std::istream &is,
                       std::vector<std::string> *lines);
  
  
  /**
     This function converts config-lines from a simple sequence of strings
     as output by ReadConfigLines(), into a sequence of first-tokens and
     name-value pairs.  The general format is:
        "command-type bar=baz xx=yyy"
     etc., although there are subtleties as to what exactly is allowed; see the
     documentation for class ConfigLine for details.
     This function will die if there was a parsing failure.

     \param [in]  lines         The config lines, e.g. from ReadConfigLines().
     \param [out] config_lines  Receives the parsed lines.
   */
  void ParseConfigLines(const std::vector<std::string> &lines,
                        std::vector<ConfigLine> *config_lines);
  
  
  /// Returns true if 'name' would be a valid name for a component or node in a
  /// nnet3 Nnet.  This is a nonempty string beginning with A-Za-z_, and
  /// containing only '-', '_', '.', A-Z, a-z, or 0-9.
  bool IsValidName(const std::string &name);
  
  }  // namespace kaldi
  
  #endif  // KALDI_UTIL_TEXT_UTILS_H_