Blame view
tools/openfst-1.6.7/include/fst/icu.h
3.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
// See www.openfst.org for extensive documentation on this weighted // finite-state transducer library. // // This library implements an unrestricted Thompson/Pike UTF-8 parser and // serializer. UTF-8 is a restricted subset of this byte stream encoding. See // http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding // details. #ifndef FST_ICU_H_ #define FST_ICU_H_ #include <sstream> #include <vector> #include <fst/log.h> namespace fst { // This function writes UTF-8 codepoints into a vector of Labels, truncating if // necessary. It is possible to use this sensibly with as little as 16 bits of // Label precision (i.e., when all characters are within the Basic Multilingual // Plane). With 21 bits, one can encode all UTF-8 codepoints, including those // from the various Astral Planes. Naturally, it is safe to use this with larger // Labels (e.g., 64 bits). template <class Label> bool UTF8StringToLabels(const string &str, std::vector<Label> *labels) { const auto *data = str.data(); const auto length = str.size(); for (size_t i = 0; i < length;) { int c = data[i++] & 0xff; if ((c & 0x80) == 0) { labels->push_back(c); } else { if ((c & 0xc0) == 0x80) { LOG(ERROR) << "UTF8StringToLabels: Continuation byte as lead byte"; return false; } int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + (c >= 0xfc); int32 code = c & ((1 << (6 - count)) - 1); while (count != 0) { if (i == length) { LOG(ERROR) << "UTF8StringToLabels: Truncated UTF-8 byte sequence"; return false; } char cb = data[i++]; if ((cb & 0xc0) != 0x80) { LOG(ERROR) << "UTF8StringToLabels: Missing/invalid continuation byte"; return false; } code = (code << 6) | (cb & 0x3f); count--; } if (code < 0) { // Should be unreachable. LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c; return false; } labels->push_back(code); } } return true; } template <class Label> bool LabelsToUTF8String(const std::vector<Label> &labels, string *str) { std::ostringstream ostr; for (size_t i = 0; i < labels.size(); ++i) { int32 code = labels[i]; if (code < 0) { LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code; return false; } else if (code < 0x80) { ostr << static_cast<char>(code); } else if (code < 0x800) { ostr << static_cast<char>((code >> 6) | 0xc0); ostr << static_cast<char>((code & 0x3f) | 0x80); } else if (code < 0x10000) { ostr << static_cast<char>((code >> 12) | 0xe0); ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); ostr << static_cast<char>((code & 0x3f) | 0x80); } else if (code < 0x200000) { ostr << static_cast<char>((code >> 18) | 0xf0); ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); ostr << static_cast<char>((code & 0x3f) | 0x80); } else if (code < 0x4000000) { ostr << static_cast<char>((code >> 24) | 0xf8); ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80); ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); ostr << static_cast<char>((code & 0x3f) | 0x80); } else { ostr << static_cast<char>((code >> 30) | 0xfc); ostr << static_cast<char>(((code >> 24) & 0x3f) | 0x80); ostr << static_cast<char>(((code >> 18) & 0x3f) | 0x80); ostr << static_cast<char>(((code >> 12) & 0x3f) | 0x80); ostr << static_cast<char>(((code >> 6) & 0x3f) | 0x80); ostr << static_cast<char>((code & 0x3f) | 0x80); } } *str = ostr.str(); return true; } } // namespace fst #endif // FST_ICU_H_ |