// See www.openfst.org for extensive documentation on this weighted // finite-state transducer library. // // This library implements an unrestricted Thompson/Pike UTF-8 parser and // serializer. UTF-8 is a restricted subset of this byte stream encoding. See // http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding // details. #ifndef FST_ICU_H_ #define FST_ICU_H_ #include #include #include namespace fst { // This function decodes a UTF-8 string into a vector of Labels (one code point // per Label), truncating if necessary. It is possible to use this sensibly with as few as 16 bits of // Label precision (i.e., when all characters are within the Basic Multilingual // Plane). With 21 bits, one can encode all Unicode code points, including those // from the various Astral Planes. Naturally, it is safe to use this with larger // Labels (e.g., 64 bits). template bool UTF8StringToLabels(const string &str, std::vector