// lat/kaldi-lattice.cc // Copyright 2009-2011 Microsoft Corporation // 2013 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "lat/kaldi-lattice.h" #include "fst/script/print-impl.h" namespace kaldi { /// Converts lattice types if necessary, deleting its input. template CompactLattice* ConvertToCompactLattice(fst::VectorFst *ifst) { if (!ifst) return NULL; CompactLattice *ofst = new CompactLattice(); ConvertLattice(*ifst, ofst); delete ifst; return ofst; } // This overrides the template if there is no type conversion going on // (for efficiency). template<> CompactLattice* ConvertToCompactLattice(CompactLattice *ifst) { return ifst; } /// Converts lattice types if necessary, deleting its input. template Lattice* ConvertToLattice(fst::VectorFst *ifst) { if (!ifst) return NULL; Lattice *ofst = new Lattice(); ConvertLattice(*ifst, ofst); delete ifst; return ofst; } // This overrides the template if there is no type conversion going on // (for efficiency). template<> Lattice* ConvertToLattice(Lattice *ifst) { return ifst; } bool WriteCompactLattice(std::ostream &os, bool binary, const CompactLattice &t) { if (binary) { fst::FstWriteOptions opts; // Leave all the options default. Normally these lattices wouldn't have any // osymbols/isymbols so no point directing it not to write them (who knows what // we'd want to if we had them). return t.Write(os, opts); } else { // Text-mode output. Note: we expect that t.InputSymbols() and // t.OutputSymbols() would always return NULL. The corresponding input // routine would not work if the FST actually had symbols attached. // Write a newline after the key, so the first line of the FST appears // on its own line. os << '\n'; bool acceptor = true, write_one = false; fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); printer.Print(&os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; // Write another newline as a terminating character. The read routine will // detect this [this is a Kaldi mechanism, not somethig in the original // OpenFst code]. os << '\n'; return os.good(); } } /// LatticeReader provides (static) functions for reading both Lattice /// and CompactLattice, in text form. class LatticeReader { typedef LatticeArc Arc; typedef LatticeWeight Weight; typedef CompactLatticeArc CArc; typedef CompactLatticeWeight CWeight; typedef Arc::Label Label; typedef Arc::StateId StateId; public: // everything is static in this class. /** This function reads from the FST text format; it does not know in advance whether it's a Lattice or CompactLattice in the stream so it tries to read both formats until it becomes clear which is the correct one. */ static std::pair ReadText( std::istream &is) { typedef std::pair PairT; using std::string; using std::vector; Lattice *fst = new Lattice(); CompactLattice *cfst = new CompactLattice(); string line; size_t nline = 0; string separator = FLAGS_fst_field_separator + "\r\n"; while (std::getline(is, line)) { nline++; vector col; // on Windows we'll write in text and read in binary mode. SplitStringToVector(line, separator.c_str(), true, &col); if (col.size() == 0) break; // Empty line is a signal to stop, in our // archive format. if (col.size() > 5) { KALDI_WARN << "Reading lattice: bad line in FST: " << line; delete fst; delete cfst; return PairT(static_cast(NULL), static_cast(NULL)); } StateId s; if (!ConvertStringToInteger(col[0], &s)) { KALDI_WARN << "FstCompiler: bad line in FST: " << line; delete fst; delete cfst; return PairT(static_cast(NULL), static_cast(NULL)); } if (fst) while (s >= fst->NumStates()) fst->AddState(); if (cfst) while (s >= cfst->NumStates()) cfst->AddState(); if (nline == 1) { if (fst) fst->SetStart(s); if (cfst) cfst->SetStart(s); } if (fst) { // we still have fst; try to read that arc. bool ok = true; Arc arc; Weight w; StateId d = s; switch (col.size()) { case 1 : fst->SetFinal(s, Weight::One()); break; case 2: if (!StrToWeight(col[1], true, &w)) ok = false; else fst->SetFinal(s, w); break; case 3: // 3 columns not ok for Lattice format; it's not an acceptor. ok = false; break; case 4: ok = ConvertStringToInteger(col[1], &arc.nextstate) && ConvertStringToInteger(col[2], &arc.ilabel) && ConvertStringToInteger(col[3], &arc.olabel); if (ok) { d = arc.nextstate; arc.weight = Weight::One(); fst->AddArc(s, arc); } break; case 5: ok = ConvertStringToInteger(col[1], &arc.nextstate) && ConvertStringToInteger(col[2], &arc.ilabel) && ConvertStringToInteger(col[3], &arc.olabel) && StrToWeight(col[4], false, &arc.weight); if (ok) { d = arc.nextstate; fst->AddArc(s, arc); } break; default: ok = false; } while (d >= fst->NumStates()) fst->AddState(); if (!ok) { delete fst; fst = NULL; } } if (cfst) { bool ok = true; CArc arc; CWeight w; StateId d = s; switch (col.size()) { case 1 : cfst->SetFinal(s, CWeight::One()); break; case 2: if (!StrToCWeight(col[1], true, &w)) ok = false; else cfst->SetFinal(s, w); break; case 3: // compact-lattice is acceptor format: state, next-state, label. ok = ConvertStringToInteger(col[1], &arc.nextstate) && ConvertStringToInteger(col[2], &arc.ilabel); if (ok) { d = arc.nextstate; arc.olabel = arc.ilabel; arc.weight = CWeight::One(); cfst->AddArc(s, arc); } break; case 4: ok = ConvertStringToInteger(col[1], &arc.nextstate) && ConvertStringToInteger(col[2], &arc.ilabel) && StrToCWeight(col[3], false, &arc.weight); if (ok) { d = arc.nextstate; arc.olabel = arc.ilabel; cfst->AddArc(s, arc); } break; case 5: default: ok = false; } while (d >= cfst->NumStates()) cfst->AddState(); if (!ok) { delete cfst; cfst = NULL; } } if (!fst && !cfst) { KALDI_WARN << "Bad line in lattice text format: " << line; // read until we get an empty line, so at least we // have a chance to read the next one (although this might // be a bit futile since the calling code will get unhappy // about failing to read this one. while (std::getline(is, line)) { SplitStringToVector(line, separator.c_str(), true, &col); if (col.empty()) break; } return PairT(static_cast(NULL), static_cast(NULL)); } } return PairT(fst, cfst); } static bool StrToWeight(const std::string &s, bool allow_zero, Weight *w) { std::istringstream strm(s); strm >> *w; if (!strm || (!allow_zero && *w == Weight::Zero())) { return false; } return true; } static bool StrToCWeight(const std::string &s, bool allow_zero, CWeight *w) { std::istringstream strm(s); strm >> *w; if (!strm || (!allow_zero && *w == CWeight::Zero())) { return false; } return true; } }; CompactLattice *ReadCompactLatticeText(std::istream &is) { std::pair lat_pair = LatticeReader::ReadText(is); if (lat_pair.second != NULL) { delete lat_pair.first; return lat_pair.second; } else if (lat_pair.first != NULL) { // note: ConvertToCompactLattice frees its input. return ConvertToCompactLattice(lat_pair.first); } else { return NULL; } } Lattice *ReadLatticeText(std::istream &is) { std::pair lat_pair = LatticeReader::ReadText(is); if (lat_pair.first != NULL) { delete lat_pair.second; return lat_pair.first; } else if (lat_pair.second != NULL) { // note: ConvertToLattice frees its input. return ConvertToLattice(lat_pair.second); } else { return NULL; } } bool ReadCompactLattice(std::istream &is, bool binary, CompactLattice **clat) { KALDI_ASSERT(*clat == NULL); if (binary) { fst::FstHeader hdr; if (!hdr.Read(is, "")) { KALDI_WARN << "Reading compact lattice: error reading FST header."; return false; } if (hdr.FstType() != "vector") { KALDI_WARN << "Reading compact lattice: unsupported FST type: " << hdr.FstType(); return false; } fst::FstReadOptions ropts("", &hdr); typedef fst::CompactLatticeWeightTpl, int32> T1; typedef fst::CompactLatticeWeightTpl, int32> T2; typedef fst::LatticeWeightTpl T3; typedef fst::LatticeWeightTpl T4; typedef fst::VectorFst > F1; typedef fst::VectorFst > F2; typedef fst::VectorFst > F3; typedef fst::VectorFst > F4; CompactLattice *ans = NULL; if (hdr.ArcType() == T1::Type()) { ans = ConvertToCompactLattice(F1::Read(is, ropts)); } else if (hdr.ArcType() == T2::Type()) { ans = ConvertToCompactLattice(F2::Read(is, ropts)); } else if (hdr.ArcType() == T3::Type()) { ans = ConvertToCompactLattice(F3::Read(is, ropts)); } else if (hdr.ArcType() == T4::Type()) { ans = ConvertToCompactLattice(F4::Read(is, ropts)); } else { KALDI_WARN << "FST with arc type " << hdr.ArcType() << " cannot be converted to CompactLattice.\n"; return false; } if (ans == NULL) { KALDI_WARN << "Error reading compact lattice (after reading header)."; return false; } *clat = ans; return true; } else { // The next line would normally consume the \r on Windows, plus any // extra spaces that might have got in there somehow. while (std::isspace(is.peek()) && is.peek() != '\n') is.get(); if (is.peek() == '\n') is.get(); // consume the newline. else { // saw spaces but no newline.. this is not expected. KALDI_WARN << "Reading compact lattice: unexpected sequence of spaces " << " at file position " << is.tellg(); return false; } *clat = ReadCompactLatticeText(is); // that routine will warn on error. return (*clat != NULL); } } bool CompactLatticeHolder::Read(std::istream &is) { Clear(); // in case anything currently stored. int c = is.peek(); if (c == -1) { KALDI_WARN << "End of stream detected reading CompactLattice."; return false; } else if (isspace(c)) { // The text form of the lattice begins // with space (normally, '\n'), so this means it's text (the binary form // cannot begin with space because it starts with the FST Type() which is not // space). return ReadCompactLattice(is, false, &t_); } else if (c != 214) { // 214 is first char of FST magic number, // on little-endian machines which is all we support (\326 octal) KALDI_WARN << "Reading compact lattice: does not appear to be an FST " << " [non-space but no magic number detected], file pos is " << is.tellg(); return false; } else { return ReadCompactLattice(is, true, &t_); } } bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) { if (binary) { fst::FstWriteOptions opts; // Leave all the options default. Normally these lattices wouldn't have any // osymbols/isymbols so no point directing it not to write them (who knows what // we'd want to do if we had them). return t.Write(os, opts); } else { // Text-mode output. Note: we expect that t.InputSymbols() and // t.OutputSymbols() would always return NULL. The corresponding input // routine would not work if the FST actually had symbols attached. // Write a newline after the key, so the first line of the FST appears // on its own line. os << '\n'; bool acceptor = false, write_one = false; fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); printer.Print(&os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; // Write another newline as a terminating character. The read routine will // detect this [this is a Kaldi mechanism, not somethig in the original // OpenFst code]. os << '\n'; return os.good(); } } bool ReadLattice(std::istream &is, bool binary, Lattice **lat) { KALDI_ASSERT(*lat == NULL); if (binary) { fst::FstHeader hdr; if (!hdr.Read(is, "")) { KALDI_WARN << "Reading lattice: error reading FST header."; return false; } if (hdr.FstType() != "vector") { KALDI_WARN << "Reading lattice: unsupported FST type: " << hdr.FstType(); return false; } fst::FstReadOptions ropts("", &hdr); typedef fst::CompactLatticeWeightTpl, int32> T1; typedef fst::CompactLatticeWeightTpl, int32> T2; typedef fst::LatticeWeightTpl T3; typedef fst::LatticeWeightTpl T4; typedef fst::VectorFst > F1; typedef fst::VectorFst > F2; typedef fst::VectorFst > F3; typedef fst::VectorFst > F4; Lattice *ans = NULL; if (hdr.ArcType() == T1::Type()) { ans = ConvertToLattice(F1::Read(is, ropts)); } else if (hdr.ArcType() == T2::Type()) { ans = ConvertToLattice(F2::Read(is, ropts)); } else if (hdr.ArcType() == T3::Type()) { ans = ConvertToLattice(F3::Read(is, ropts)); } else if (hdr.ArcType() == T4::Type()) { ans = ConvertToLattice(F4::Read(is, ropts)); } else { KALDI_WARN << "FST with arc type " << hdr.ArcType() << " cannot be converted to Lattice.\n"; return false; } if (ans == NULL) { KALDI_WARN << "Error reading lattice (after reading header)."; return false; } *lat = ans; return true; } else { // The next line would normally consume the \r on Windows, plus any // extra spaces that might have got in there somehow. while (std::isspace(is.peek()) && is.peek() != '\n') is.get(); if (is.peek() == '\n') is.get(); // consume the newline. else { // saw spaces but no newline.. this is not expected. KALDI_WARN << "Reading compact lattice: unexpected sequence of spaces " << " at file position " << is.tellg(); return false; } *lat = ReadLatticeText(is); // that routine will warn on error. return (*lat != NULL); } } /* Since we don't write the binary headers for this type of holder, we use a different method to work out whether we're in binary mode. */ bool LatticeHolder::Read(std::istream &is) { Clear(); // in case anything currently stored. int c = is.peek(); if (c == -1) { KALDI_WARN << "End of stream detected reading Lattice."; return false; } else if (isspace(c)) { // The text form of the lattice begins // with space (normally, '\n'), so this means it's text (the binary form // cannot begin with space because it starts with the FST Type() which is not // space). return ReadLattice(is, false, &t_); } else if (c != 214) { // 214 is first char of FST magic number, // on little-endian machines which is all we support (\326 octal) KALDI_WARN << "Reading compact lattice: does not appear to be an FST " << " [non-space but no magic number detected], file pos is " << is.tellg(); return false; } else { return ReadLattice(is, true, &t_); } } } // end namespace kaldi