From d26d4b076ef87703ed9c3948fb3dc7e50d5eb6db Mon Sep 17 00:00:00 2001 From: Ric Date: Tue, 22 Jan 2019 09:42:27 -0500 Subject: [PATCH] Support for parsing/writing SGroups in SD Mol files. (#2138) * Implementation of SGroups * remove sample files test * update gitignore with test outputs * fix RevisionModifier * re-enable tests * backup commit; things seem to work so far * some refactoring; obvious s group tests pass now * more refactoring * everything now out of the public API * not sure why this was still in there * rename functions; all tests now pass * remove getNextFreeSGroupId; readd comment in copy SGroups * clang-format * squash-merge current master * squash merge master * Address comments on PR - Update to current master. - Move SGroup parse time checks to SGroupChecks namespace. - Store SGroups in ROMOl as vector. - SGroup methods return referenes instead of pointers. - Use atom/bond/sgroup indexes for properties instead of pointers. - Have SGroups inherit from RDProps; move properties to RDProps. - Remove trivial/unused methods. - Add a link to the SD specification atop SGroup.h --- .gitignore | 1 + CMakeLists.txt | 2 +- CTestCustom.ctest.in | 8 +- Code/GraphMol/CMakeLists.txt | 8 +- Code/GraphMol/FileParsers/CMakeLists.txt | 5 +- Code/GraphMol/FileParsers/FileParsers.h | 15 + Code/GraphMol/FileParsers/MolFileParser.cpp | 202 ++-- Code/GraphMol/FileParsers/MolFileWriter.cpp | 42 +- .../GraphMol/FileParsers/MolSGroupParsing.cpp | 888 ++++++++++++++++++ Code/GraphMol/FileParsers/MolSGroupParsing.h | 103 ++ .../GraphMol/FileParsers/MolSGroupWriting.cpp | 678 +++++++++++++ Code/GraphMol/FileParsers/MolSGroupWriting.h | 136 +++ Code/GraphMol/FileParsers/test1.cpp | 11 +- Code/GraphMol/FileParsers/testMolSupplier.cpp | 224 +++-- .../test_data/Issue3432136_2.v3k.mol | 1 + Code/GraphMol/MolPickler.cpp | 220 ++++- Code/GraphMol/MolPickler.h | 11 + Code/GraphMol/ROMol.cpp | 38 +- Code/GraphMol/ROMol.h | 20 +- Code/GraphMol/RWMol.cpp | 31 + Code/GraphMol/Sgroup.cpp | 200 ++++ Code/GraphMol/Sgroup.h | 180 ++++ Code/GraphMol/catch_sgroups.cpp | 289 ++++++ Code/GraphMol/testSGroup.cpp | 567 +++++++++++ Docs/Code/SGroups.md | 43 + 25 files changed, 3653 insertions(+), 270 deletions(-) create mode 100644 Code/GraphMol/FileParsers/MolSGroupParsing.cpp create mode 100644 Code/GraphMol/FileParsers/MolSGroupParsing.h create mode 100644 Code/GraphMol/FileParsers/MolSGroupWriting.cpp create mode 100644 Code/GraphMol/FileParsers/MolSGroupWriting.h create mode 100644 Code/GraphMol/Sgroup.cpp create mode 100644 Code/GraphMol/Sgroup.h create mode 100644 Code/GraphMol/catch_sgroups.cpp create mode 100644 Code/GraphMol/testSGroup.cpp create mode 100644 Docs/Code/SGroups.md diff --git a/.gitignore b/.gitignore index 046ce5fd6..cde11f848 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,7 @@ __pycache__/ /Code/GraphMol/FileParsers/test_data/outNCI_few.tdt /Code/GraphMol/FileParsers/test_data/outNCI_first_200.props.sdf /Code/GraphMol/FileParsers/test_data/outSmiles.csv +/Code/GraphMol/FileParsers/test_data/testSGroupsSample_V?000.mol /Code/GraphMol/ForceFieldHelpers/UFF/test_data/Issue62.sdf diff --git a/CMakeLists.txt b/CMakeLists.txt index a9215704a..b4c0ee972 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -484,7 +484,7 @@ endif(RDK_BUILD_CONTRIB) # Memory testing setup FIND_PROGRAM(MEMORYCHECK_COMMAND valgrind) -CONFIGURE_FILE(CTestCustom.ctest.in ${CMAKE_CURRENT_BINARY_DIR}/CTestCustom.ctest) +CONFIGURE_FILE(CTestCustom.ctest.in ${RDKit_BINARY_DIR}/CTestCustom.ctest) # Packaging SET(CPACK_GENERATOR "TGZ;DEB;RPM") diff --git a/CTestCustom.ctest.in b/CTestCustom.ctest.in index 016f008c0..207095bd6 100644 --- a/CTestCustom.ctest.in +++ b/CTestCustom.ctest.in @@ -1,9 +1,11 @@ +# Require us to have the $RDBASE env var set to run the tests +IF(NOT DEFINED ENV{RDBASE}) + MESSAGE(FATAL_ERROR "\n\nPlease set your RDBASE env variable before running the tests.\n\n") +ENDIF(NOT DEFINED ENV{RDBASE}) + SET(CTEST_CUSTOM_MEMCHECK_IGNORE ${CTEST_CUSTOM_MEMCHECK_IGNORE} # python tests are not memchecked: these are slow, and difficult to interpret ${RDKIT_PYTEST_CACHE} ) - - - diff --git a/Code/GraphMol/CMakeLists.txt b/Code/GraphMol/CMakeLists.txt index f9906e0c5..621cd2405 100644 --- a/Code/GraphMol/CMakeLists.txt +++ b/Code/GraphMol/CMakeLists.txt @@ -8,7 +8,7 @@ rdkit_library(GraphMol MolDiscriminators.cpp ConjugHybrid.cpp AddHs.cpp Matrices.cpp Chirality.cpp RingInfo.cpp Conformer.cpp Renumber.cpp AdjustQuery.cpp Resonance.cpp StereoGroup.cpp - new_canon.cpp + new_canon.cpp Sgroup.cpp SHARED LINK_LIBRARIES RDGeometryLib RDGeneral ${RDKit_THREAD_LIBS}) @@ -39,6 +39,7 @@ rdkit_headers(Atom.h ROMol.h RWMol.h SanitException.h + Sgroup.h StereoGroup.h MonomerInfo.h new_canon.h @@ -137,7 +138,12 @@ rdkit_test(resMolSupplierTest resMolSupplierTest.cpp rdkit_test(molBundleTest testMolBundle.cpp LINK_LIBRARIES SmilesParse GraphMol RDGeometryLib RDGeneral SubstructMatch FileParsers) +rdkit_test(testSGroup testSGroup.cpp LINK_LIBRARIES FileParsers GraphMol) + rdkit_test(test-valgrind test-valgrind.cpp LINK_LIBRARIES SmilesParse GraphMol RDGeneral) rdkit_catch_test(graphmolTestsCatch catch_tests.cpp LINK_LIBRARIES SubstructMatch FileParsers SmilesParse GraphMol RDGeometryLib RDGeneral) + +rdkit_catch_test(graphmolSGroupCatch catch_sgroups.cpp + LINK_LIBRARIES SmilesParse GraphMol RDGeometryLib RDGeneral) diff --git a/Code/GraphMol/FileParsers/CMakeLists.txt b/Code/GraphMol/FileParsers/CMakeLists.txt index 9831a537f..d8803394a 100644 --- a/Code/GraphMol/FileParsers/CMakeLists.txt +++ b/Code/GraphMol/FileParsers/CMakeLists.txt @@ -14,8 +14,9 @@ if(RDK_USE_BOOST_REGEX) endif() rdkit_library(FileParsers - Mol2FileParser.cpp - MolFileParser.cpp MolFileStereochem.cpp MolFileWriter.cpp + Mol2FileParser.cpp MolFileParser.cpp + MolSGroupParsing.cpp MolSGroupWriting.cpp + MolFileStereochem.cpp MolFileWriter.cpp ForwardSDMolSupplier.cpp SDMolSupplier.cpp SDWriter.cpp SmilesMolSupplier.cpp SmilesWriter.cpp TDTMolSupplier.cpp TDTWriter.cpp diff --git a/Code/GraphMol/FileParsers/FileParsers.h b/Code/GraphMol/FileParsers/FileParsers.h index 731d5bb77..ca5d4dbe3 100644 --- a/Code/GraphMol/FileParsers/FileParsers.h +++ b/Code/GraphMol/FileParsers/FileParsers.h @@ -25,6 +25,21 @@ namespace RDKit { const int MOLFILE_MAXLINE = 256; RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig); +class MolFileUnhandledFeatureException : public std::exception { + public: + //! construct with an error message + explicit MolFileUnhandledFeatureException(const char *msg) : _msg(msg){}; + //! construct with an error message + explicit MolFileUnhandledFeatureException(const std::string msg) + : _msg(msg){}; + //! get the error message + const char *message() const { return _msg.c_str(); }; + ~MolFileUnhandledFeatureException() throw() override{}; + + private: + std::string _msg; +}; + //----- // mol files //----- diff --git a/Code/GraphMol/FileParsers/MolFileParser.cpp b/Code/GraphMol/FileParsers/MolFileParser.cpp index 654014fa8..cf7007690 100644 --- a/Code/GraphMol/FileParsers/MolFileParser.cpp +++ b/Code/GraphMol/FileParsers/MolFileParser.cpp @@ -16,11 +16,13 @@ #include "FileParsers.h" #include "FileParserUtils.h" +#include "MolSGroupParsing.h" #include "MolFileStereochem.h" #include #include #include +#include #include #include @@ -46,23 +48,12 @@ using std::smatch; #include #include -namespace RDKit { -class MolFileUnhandledFeatureException : public std::exception { - public: - //! construct with an error message - explicit MolFileUnhandledFeatureException(const char *msg) : _msg(msg){}; - //! construct with an error message - explicit MolFileUnhandledFeatureException(const std::string msg) - : _msg(msg){}; - //! get the error message - const char *message() const { return _msg.c_str(); }; - ~MolFileUnhandledFeatureException() throw() override{}; +using namespace RDKit::SGroupParsing; - private: - std::string _msg; -}; +namespace RDKit { namespace FileParserUtils { + int toInt(const std::string &input, bool acceptSpaces) { int res = 0; // don't need to worry about locale stuff here because @@ -137,6 +128,7 @@ Atom *replaceAtomWithQueryAtom(RWMol *mol, Atom *atom) { using RDKit::FileParserUtils::getV3000Line; namespace { + void completeQueryAndChildren(ATOM_EQUALS_QUERY *query, Atom *tgt, int magicVal) { PRECONDITION(query, "no query"); @@ -300,7 +292,7 @@ void ParseOldAtomList(RWMol *mol, const std::string &text, unsigned int line) { a.setProp(common_properties::_MolFileAtomQuery, 1); mol->replaceAtom(idx, &a); -}; +} void ParseChargeLine(RWMol *mol, const std::string &text, bool firstCall, unsigned int line) { @@ -343,63 +335,6 @@ void ParseChargeLine(RWMol *mol, const std::string &text, bool firstCall, } } -bool SGroupOK(std::string typ) { - const char *cfailTyps[11] = { - // polymer sgroups: - "SRU", "MON", "COP", "CRO", "GRA", "MOD", "MER", "ANY", - // formulations/mixtures: - "COM", "MIX", "FOR"}; - std::vector failTyps(cfailTyps, cfailTyps + 11); - return std::find(failTyps.begin(), failTyps.end(), typ) == failTyps.end(); -} - -void ParseSGroup2000STYLine(RWMol *mol, const std::string &text, - unsigned int line) { - PRECONDITION(mol, "bad mol"); - PRECONDITION(text.substr(0, 6) == std::string("M STY"), "bad STY line"); - - int nent; - try { - nent = FileParserUtils::toInt(text.substr(6, 3)); - } catch (boost::bad_lexical_cast &) { - std::ostringstream errout; - errout << "Cannot convert " << text.substr(6, 3) << " to int on line " - << line; - throw FileParseException(errout.str()); - } - - unsigned int spos = 9; - for (int ie = 0; ie < nent; ie++) { - if (text.size() < spos + 8) { - std::ostringstream errout; - errout << "SGroup line too short: '" << text << "' on line " << line; - throw FileParseException(errout.str()); - } -#if 0 - int nbr; - try { - nbr = FileParserUtils::toInt(text.substr(spos,4)); - } - catch (boost::bad_lexical_cast &) { - std::ostringstream errout; - errout << "Cannot convert " << text.substr(spos,3) << " to int on line "<replaceAtom(idx, a); delete a; -}; +} void ParseV3000RGroups(RWMol *mol, Atom *&atom, const std::string &text, unsigned int line) { @@ -1103,7 +1040,7 @@ void ParseRGroupLabels(RWMol *mol, const std::string &text, unsigned int line) { qatom.setQuery(makeAtomNullQuery()); mol->replaceAtom(atIdx, &qatom); } -}; +} void ParseAtomAlias(RWMol *mol, std::string text, const std::string &nextLine, unsigned int line) { @@ -1123,7 +1060,7 @@ void ParseAtomAlias(RWMol *mol, std::string text, const std::string &nextLine, URANGE_CHECK(idx, mol->getNumAtoms()); Atom *at = mol->getAtomWithIdx(idx); at->setProp(common_properties::molFileAlias, nextLine); -}; +} void ParseAtomValue(RWMol *mol, std::string text, unsigned int line) { PRECONDITION(mol, "bad mol"); @@ -1143,7 +1080,7 @@ void ParseAtomValue(RWMol *mol, std::string text, unsigned int line) { Atom *at = mol->getAtomWithIdx(idx); at->setProp(common_properties::molFileValue, text.substr(7, text.length() - 7)); -}; +} Atom *ParseMolFileAtomLine(const std::string text, RDGeom::Point3D &pos, unsigned int line) { @@ -1386,7 +1323,7 @@ Atom *ParseMolFileAtomLine(const std::string text, RDGeom::Point3D &pos, res->setProp("molExactChangeFlag", exactChangeFlag); } return res; -}; +} Bond *ParseMolFileBondLine(const std::string &text, unsigned int line) { int idx1, idx2, bType, stereo; @@ -1549,14 +1486,14 @@ Bond *ParseMolFileBondLine(const std::string &text, unsigned int line) { } } return res; -}; +} void ParseMolBlockAtoms(std::istream *inStream, unsigned int &line, unsigned int nAtoms, RWMol *mol, Conformer *conf) { PRECONDITION(inStream, "bad stream"); PRECONDITION(mol, "bad molecule"); PRECONDITION(conf, "bad conformer"); - for (unsigned int i = 0; i < nAtoms; ++i) { + for (unsigned int i = 1; i <= nAtoms; ++i) { ++line; std::string tempStr = getLine(inStream); if (inStream->eof()) { @@ -1566,6 +1503,7 @@ void ParseMolBlockAtoms(std::istream *inStream, unsigned int &line, Atom *atom = ParseMolFileAtomLine(tempStr, pos, line); unsigned int aid = mol->addAtom(atom, false, true); conf->setAtomPos(aid, pos); + mol->setAtomBookmark(atom, i); } } @@ -1575,7 +1513,7 @@ void ParseMolBlockBonds(std::istream *inStream, unsigned int &line, bool &chiralityPossible) { PRECONDITION(inStream, "bad stream"); PRECONDITION(mol, "bad molecule"); - for (unsigned int i = 0; i < nBonds; ++i) { + for (unsigned int i = 1; i <= nBonds; ++i) { ++line; std::string tempStr = getLine(inStream); if (inStream->eof()) { @@ -1595,6 +1533,7 @@ void ParseMolBlockBonds(std::istream *inStream, unsigned int &line, chiralityPossible = true; } mol->addBond(bond, true); + mol->setBondBookmark(bond, i); } } @@ -1626,8 +1565,13 @@ bool ParseMolBlockProperties(std::istream *inStream, unsigned int &line, } } + IDX_TO_SGROUP_MAP sGroupMap; + IDX_TO_STR_VECT_MAP dataFieldsMap; bool fileComplete = false; bool firstChargeLine = true; + unsigned int SCDcounter = 0; + unsigned int lastDataSGroup = 0; + std::ostringstream currentDataField; std::string lineBeg = tempStr.substr(0, 6); while (!inStream->eof() && lineBeg != "M END" && tempStr.substr(0, 4) != "$$$$") { @@ -1675,8 +1619,54 @@ bool ParseMolBlockProperties(std::istream *inStream, unsigned int &line, } else if (lineBeg == "M RAD") { ParseRadicalLine(mol, tempStr, firstChargeLine, line); firstChargeLine = false; + + /* SGroup parsing start */ } else if (lineBeg == "M STY") { - ParseSGroup2000STYLine(mol, tempStr, line); + ParseSGroupV2000STYLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SST") { + ParseSGroupV2000SSTLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SLB") { + ParseSGroupV2000SLBLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SCN") { + ParseSGroupV2000SCNLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SDS") { + ParseSGroupV2000SDSLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SAL" || lineBeg == "M SBL" || + lineBeg == "M SPA") { + ParseSGroupV2000VectorDataLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SMT") { + ParseSGroupV2000SMTLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SDI") { + ParseSGroupV2000SDILine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M CRS") { + std::ostringstream errout; + errout << "Unsupported SGroup subtype '" << lineBeg << "' on line " + << line; + throw FileParseException(errout.str()); + } else if (lineBeg == "M SBV") { + ParseSGroupV2000SBVLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SDT") { + ParseSGroupV2000SDTLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SDD") { + ParseSGroupV2000SDDLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SCD" || lineBeg == "M SED") { + ParseSGroupV2000SCDSEDLine(sGroupMap, dataFieldsMap, mol, tempStr, line, + strictParsing, SCDcounter, lastDataSGroup, + currentDataField); + } else if (lineBeg == "M SPL") { + ParseSGroupV2000SPLLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SNC") { + ParseSGroupV2000SNCLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M PXA") { + ParseSGroupV2000PXALine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SAP") { + ParseSGroupV2000SAPLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SCL") { + ParseSGroupV2000SCLLine(sGroupMap, mol, tempStr, line); + } else if (lineBeg == "M SBT") { + ParseSGroupV2000SBTLine(sGroupMap, mol, tempStr, line); + + /* SGroup parsing end */ } else if (lineBeg == "M ZBO") ParseZBOLine(mol, tempStr, line); else if (lineBeg == "M ZCH") { @@ -1691,6 +1681,12 @@ bool ParseMolBlockProperties(std::istream *inStream, unsigned int &line, lineBeg = tempStr.substr(0, 6); } if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M END") { + // All went well, make final updates to SGroups, and add them to Mol + for (const auto &sgroup : sGroupMap) { + sgroup.second.setProp("DATAFIELDS", dataFieldsMap[sgroup.first]); + addSGroup(*mol, sgroup.second); + } + fileComplete = true; } return fileComplete; @@ -2348,37 +2344,9 @@ bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, } if (nSgroups) { - tempStr = getV3000Line(inStream, line); - boost::to_upper(tempStr); - if (tempStr.length() < 12 || tempStr.substr(0, 12) != "BEGIN SGROUP") { - std::ostringstream errout; - errout << "BEGIN SGROUP line not found on line " << line; - throw FileParseException(errout.str()); - } - for (unsigned int si = 0; si < nSgroups; ++si) { - tempStr = getV3000Line(inStream, line); - boost::to_upper(tempStr); - std::vector localSplitLine; - boost::split(localSplitLine, tempStr, boost::is_any_of(" \t"), - boost::token_compress_on); - std::string typ = localSplitLine[1]; - if (strictParsing && !SGroupOK(typ)) { - std::ostringstream errout; - errout << "S group " << typ << " on line " << line; - throw MolFileUnhandledFeatureException(errout.str()); - } else { - BOOST_LOG(rdWarningLog) << " S group " << typ << " ignored on line " - << line << "." << std::endl; - } - } - tempStr = getV3000Line(inStream, line); - boost::to_upper(tempStr); - if (tempStr.length() < 10 || tempStr.substr(0, 10) != "END SGROUP") { - std::ostringstream errout; - errout << "END SGROUP line not found on line " << line; - throw FileParseException(errout.str()); - } + ParseV3000SGroupsBlock(inStream, line, nSgroups, mol, strictParsing); } + if (n3DConstraints) { BOOST_LOG(rdWarningLog) << "3d constraint information in mol block igored at line " << line @@ -2458,7 +2426,7 @@ bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, if (mol->hasProp(common_properties::_3DConf)) { conf->set3D(true); mol->clearProp(common_properties::_3DConf); - } else { // default is 2D + } else { // default is 2D conf->set3D(hasNonZeroZCoords(*conf)); } } @@ -2649,8 +2617,7 @@ RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line, res = nullptr; conf = nullptr; BOOST_LOG(rdErrorLog) << " Unhandled CTAB feature: " << e.message() - << " on line: " << line << ". Molecule skipped." - << std::endl; + << ". Molecule skipped." << std::endl; if (!inStream->eof()) tempStr = getLine(inStream); ++line; @@ -2686,6 +2653,9 @@ RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line, } if (res) { + res->clearAllAtomBookmarks(); + res->clearAllBondBookmarks(); + // calculate explicit valence on each atom: for (RWMol::AtomIterator atomIt = res->beginAtoms(); atomIt != res->endAtoms(); ++atomIt) { @@ -2751,12 +2721,12 @@ RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line, } } return res; -}; +} RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line, bool sanitize, bool removeHs, bool strictParsing) { return MolDataStreamToMol(&inStream, line, sanitize, removeHs, strictParsing); -}; +} //------------------------------------------------ // // Read a molecule from a string diff --git a/Code/GraphMol/FileParsers/MolFileWriter.cpp b/Code/GraphMol/FileParsers/MolFileWriter.cpp index de9ca1432..f4dde5465 100644 --- a/Code/GraphMol/FileParsers/MolFileWriter.cpp +++ b/Code/GraphMol/FileParsers/MolFileWriter.cpp @@ -11,23 +11,29 @@ // V3000 mol block writer contributed by Jan Holst Jensen // #include "FileParsers.h" +#include "MolSGroupWriting.h" #include "MolFileStereochem.h" #include #include +#include #include #include + #include #include #include #include #include #include + #include #include #include #include #include +using namespace RDKit::SGroupWriting; + namespace RDKit { //************************************* @@ -1056,7 +1062,6 @@ void appendEnhancedStereoGroups(std::string &res, const RWMol &tmol) { std::string outputMolToMolBlock(const RWMol &tmol, int confId, bool forceV3000) { std::string res; - bool isV3000; unsigned int nAtoms, nBonds, nLists, chiralFlag, nsText, nRxnComponents; unsigned int nReactants, nProducts, nIntermediates; @@ -1064,6 +1069,9 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId, nBonds = tmol.getNumBonds(); nLists = 0; + const auto &sgroups = getSGroups(tmol); + unsigned int nSGroups = sgroups.size(); + chiralFlag = 0; nsText = 0; nRxnComponents = 0; @@ -1109,11 +1117,8 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId, } res += "\n"; - if (forceV3000) - isV3000 = true; - else - isV3000 = - (nAtoms > 999) || (nBonds > 999) || !tmol.getStereoGroups().empty(); + isV3000 = forceV3000 || nAtoms > 999 || nBonds > 999 || nSGroups > 999 || + !tmol.getStereoGroups().empty(); // the counts line: std::stringstream ss; @@ -1134,7 +1139,7 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId, ss << std::setw(3) << nAtoms; ss << std::setw(3) << nBonds; ss << std::setw(3) << nLists; - ss << std::setw(3) << 0; + ss << std::setw(3) << nSGroups; ss << std::setw(3) << chiralFlag; ss << std::setw(3) << nsText; ss << std::setw(3) << nRxnComponents; @@ -1167,17 +1172,18 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId, res += GetMolFileAliasInfo(tmol); res += GetMolFileZBOInfo(tmol); + res += GetMolFileSGroupInfo(tmol); + // FIX: R-group logic, SGroups and 3D features etc. } else { // V3000 output. res += "M V30 BEGIN CTAB\n"; std::stringstream ss; - // numSgroups (not implemented) - // | num3DConstraints (not - // +---------+ | implemented) - // | | - ss << "M V30 COUNTS " << nAtoms << " " << nBonds << " 0 0 " << chiralFlag - << "\n"; + ss << "M V30 COUNTS " << nAtoms << " " << nBonds << " " << nSGroups + << " 0 " << chiralFlag << "\n"; + // | + // num3DConstraints (not implemented) + res += ss.str(); res += "M V30 BEGIN ATOM\n"; @@ -1198,6 +1204,16 @@ std::string outputMolToMolBlock(const RWMol &tmol, int confId, } res += "M V30 END BOND\n"; } + + if (nSGroups > 0) { + res += "M V30 BEGIN SGROUP\n"; + unsigned int idx = 0; + for (const auto &sgroup : sgroups) { + res += GetV3000MolFileSGroupLines(++idx, sgroup); + } + res += "M V30 END SGROUP\n"; + } + appendEnhancedStereoGroups(res, tmol); res += "M V30 END CTAB\n"; diff --git a/Code/GraphMol/FileParsers/MolSGroupParsing.cpp b/Code/GraphMol/FileParsers/MolSGroupParsing.cpp new file mode 100644 index 000000000..6dc5d53d9 --- /dev/null +++ b/Code/GraphMol/FileParsers/MolSGroupParsing.cpp @@ -0,0 +1,888 @@ +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#include "FileParsers.h" +#include "FileParserUtils.h" +#include "MolSGroupParsing.h" + +#include + +namespace RDKit { +namespace SGroupParsing { + +/* ------------------ V2000 Utils ------------------ */ + +unsigned int ParseSGroupIntField(const std::string &text, unsigned int line, + unsigned int &pos, bool isFieldCounter) { + ++pos; // Account for separation space + unsigned int fieldValue; + size_t len = 3 - isFieldCounter; // field counters are smaller + try { + fieldValue = FileParserUtils::toInt(text.substr(pos, len)); + } catch (boost::bad_lexical_cast &) { + std::ostringstream errout; + errout << "Cannot convert '" << text.substr(pos, len) << "' to int on line " + << line; + throw FileParseException(errout.str()); + } + pos += len; + return fieldValue; +} + +double ParseSGroupDoubleField(const std::string &text, unsigned int line, + unsigned int &pos) { + size_t len = 10; + double fieldValue; + try { + fieldValue = FileParserUtils::toDouble(text.substr(pos, len)); + } catch (boost::bad_lexical_cast &) { + std::ostringstream errout; + errout << "Cannot convert '" << text.substr(pos, len) + << "' to double on line " << line; + throw FileParseException(errout.str()); + } + pos += len; + return fieldValue; +} + +void ParseSGroupV2000STYLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M STY", "bad STY line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup STY line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + int nbr = ParseSGroupIntField(text, line, pos); + + std::string typ = text.substr(pos + 1, 3); + if (SGroupChecks::isValidType(typ)) { + sGroupMap.emplace(nbr, SGroup(mol, typ)); + } else { + std::ostringstream errout; + errout << "S group " << typ << " on line " << line; + throw MolFileUnhandledFeatureException(errout.str()); + } + pos += 4; + } +} + +void ParseSGroupV2000VectorDataLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, + unsigned int line) { + PRECONDITION(mol, "bad mol"); + + std::string typ = text.substr(3, 3); + + void (SGroup::*sGroupAddIndexedElement)(const int) = nullptr; + + if (typ == "SAL") { + sGroupAddIndexedElement = &SGroup::addAtomWithBookmark; + } else if (typ == "SBL") { + sGroupAddIndexedElement = &SGroup::addBondWithBookmark; + } else if (typ == "SPA") { + sGroupAddIndexedElement = &SGroup::addParentAtomWithBookmark; + } else { + std::ostringstream errout; + errout << "Unsupported SGroup line '" << typ + << "' passed to Vector Data parser "; + throw FileParseException(errout.str()); + } + + unsigned int pos = 6; + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int i = 0; i < nent; ++i) { + if (text.size() < pos + 4) { + std::ostringstream errout; + errout << "SGroup line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + int nbr = ParseSGroupIntField(text, line, pos); + (sGroupMap.at(sgIdx).*sGroupAddIndexedElement)(nbr); + } +} + +void ParseSGroupV2000SDILine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SDI", "bad SDI line"); + + unsigned int pos = 6; + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + unsigned int nCoords = ParseSGroupIntField(text, line, pos, true); + if (nCoords != 4) { + std::ostringstream errout; + errout << "Unexpected number of coordinates for SDI on line " << line; + throw FileParseException(errout.str()); + } + + SGroup::Bracket bracket; + for (unsigned int i = 0; i < 2; ++i) { + double x = ParseSGroupDoubleField(text, line, pos); + double y = ParseSGroupDoubleField(text, line, pos); + double z = 0.; + bracket[i] = RDGeom::Point3D(x, y, z); + } + bracket[2] = RDGeom::Point3D(0., 0., 0.); + sGroupMap.at(sgIdx).addBracket(bracket); +} + +void ParseSGroupV2000SSTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SST", "bad SST line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SST line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + std::string subType = text.substr(++pos, 3); + + if (!SGroupChecks::isValidSubType(subType)) { + std::ostringstream errout; + errout << "Unsupported SGroup subtype '" << subType << "' on line " + << line; + throw FileParseException(errout.str()); + } + + sGroupMap.at(sgIdx).setProp("SUBTYPE", subType); + pos += 3; + } +} + +void ParseSGroupV2000SMTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SMT", "bad SMT line"); + + unsigned int pos = 6; + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + auto &sgroup = sGroupMap.at(sgIdx); + ++pos; + + std::string label = text.substr(pos, text.length() - pos); + + if (sgroup.getProp("TYPE") == + "MUL") { // Case of multiple groups + sgroup.setProp("MULT", label); + + } else { // Case of abbreviation groups, but we might not have seen a SCL + // line yet + sgroup.setProp("LABEL", label); + } +} + +void ParseSGroupV2000SLBLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SLB", "bad SLB line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SLB line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + unsigned int id = ParseSGroupIntField(text, line, pos); + + if (id != 0 && !SGroupChecks::isSGroupIdFree(*mol, id)) { + std::ostringstream errout; + errout << "SGroup ID '" << id + << "' is assigned to more than onge SGroup, on line " << line; + throw FileParseException(errout.str()); + } + + sGroupMap.at(sgIdx).setProp("ID", id); + } +} + +void ParseSGroupV2000SCNLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SCN", "bad SCN line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SCN line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + std::string connect = text.substr(++pos, 2); + + if (!SGroupChecks::isValidConnectType(connect)) { + std::ostringstream errout; + errout << "Unsupported SGroup connection type '" << connect + << "' on line " << line; + throw FileParseException(errout.str()); + } + + sGroupMap.at(sgIdx).setProp("CONNECT", connect); + pos += 3; + } +} + +void ParseSGroupV2000SDSLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 10) == "M SDS EXP", "bad SDS line"); + + unsigned int pos = 10; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 4) { + std::ostringstream errout; + errout << "SGroup SDS line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + sGroupMap.at(sgIdx).setProp("ESTATE", "E"); + } +} + +void ParseSGroupV2000SBVLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SBV", "bad SBV line"); + + unsigned int pos = 6; + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + auto &sgroup = sGroupMap.at(sgIdx); + + unsigned int bondMark = ParseSGroupIntField(text, line, pos); + Bond *bond = mol->getUniqueBondWithBookmark(bondMark); + + RDGeom::Point3D vector; + if (sgroup.getProp("TYPE") == "SUP") { + vector.x = ParseSGroupDoubleField(text, line, pos); + vector.y = ParseSGroupDoubleField(text, line, pos); + vector.z = 0.; + } + + sgroup.addCState(bond->getIdx(), vector); +} + +void ParseSGroupV2000SDTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SDT", "bad SDT line"); + + unsigned int pos = 6; + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + auto &sgroup = sGroupMap.at(sgIdx); + + std::string fieldName = text.substr(++pos, 30); + boost::trim_right(fieldName); + pos += 30; + std::string fieldType = text.substr(pos, 2); + boost::trim_right(fieldType); + pos += 2; + std::string fieldInfo = text.substr(pos, 20); + boost::trim_right(fieldInfo); + pos += 20; + std::string queryType = text.substr(pos, 2); + boost::trim_right(queryType); + pos += 2; + std::string queryOp = text.substr(pos, text.length() - pos); + boost::trim_right(queryOp); + + sgroup.setProp("FIELDNAME", fieldName); + sgroup.setProp("FIELDTYPE", fieldType); + sgroup.setProp("FIELDINFO", fieldInfo); + sgroup.setProp("QUERYTYPE", queryType); + sgroup.setProp("QUERYOP", queryOp); +} + +void ParseSGroupV2000SDDLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SDD", "bad SDD line"); + + unsigned int pos = 6; + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + // Store the rest of the line as is. + ++pos; + sGroupMap.at(sgIdx).setProp("FIELDDISP", + text.substr(pos, text.length() - pos)); +} + +void ParseSGroupV2000SCDSEDLine(IDX_TO_SGROUP_MAP &sGroupMap, + IDX_TO_STR_VECT_MAP &dataFieldsMap, RWMol *mol, + const std::string &text, unsigned int line, + bool strictParsing, unsigned int &counter, + unsigned int &lastDataSGroup, + std::ostringstream ¤tDataField) { + PRECONDITION(mol, "bad mol"); + + unsigned int pos = 3; + std::string type = text.substr(pos, 3); + pos += 3; + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + if (lastDataSGroup != 0 && lastDataSGroup != sgIdx) { + std::ostringstream errout; + errout << "Found a Data Field not matching the the SGroup of the last Data " + "Field at line " + << line; + throw FileParseException(errout.str()); + } else if (lastDataSGroup == 0 && type == "SCD") { + lastDataSGroup = sgIdx; + } else if (type == "SED") { + lastDataSGroup = 0; + } + + auto &sgroup = sGroupMap.at(sgIdx); + + // this group must already have seen an SDT line + if (!sgroup.hasProp("FIELDNAME")) { + std::ostringstream errout; + errout << "Found a SCD line without a previous SDT specification at line " + << line; + throw FileParseException(errout.str()); + } + + if (strictParsing) { + if (type == "SCD" && counter > 2) { + std::ostringstream errout; + errout << "Found too many consecutive SCD lines, (#" << (counter + 1) + << " at line " << line << ") for SGroup " << sgIdx; + throw FileParseException(errout.str()); + } + } + + currentDataField << text.substr(++pos, 69); + + if (type == "SED") { + std::string trimmedData = boost::trim_right_copy(currentDataField.str()); + dataFieldsMap[sgIdx].push_back(trimmedData.substr(0, 200)); + currentDataField.str(""); + counter = 0; + } else { + ++counter; + } +} + +void ParseSGroupV2000SPLLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SPL", "bad SPL line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SPL line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + unsigned int parentIdx = ParseSGroupIntField(text, line, pos); + + // both parentIdx and size are offset by +1, so it's fine + if (parentIdx > sGroupMap.size()) { + std::ostringstream errout; + errout << "SGroup SPL line contains wrong parent SGroup (" << parentIdx + << ") for SGroup " << sgIdx << "on line " << line; + throw FileParseException(errout.str()); + } + + sGroupMap.at(sgIdx).setProp("PARENT", parentIdx - 1); + } +} + +void ParseSGroupV2000SNCLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SNC", "bad SNC line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SNC line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + unsigned int compno = ParseSGroupIntField(text, line, pos); + if (compno > 256u) { + std::ostringstream errout; + errout << "SGroup SNC value over 256: '" << compno << "' on line " + << line; + throw FileParseException(errout.str()); + } + sGroupMap.at(sgIdx).setProp("COMPNO", compno); + } +} + +// PXA does not have a V3000 equivalent +void ParseSGroupV2000PXALine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M PXA", "bad PXA line"); + + unsigned int pos = 6; + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + ++pos; + sGroupMap.at(sgIdx).setProp("PXA", text.substr(pos, text.length() - pos)); +} + +void ParseSGroupV2000SAPLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SAP", "bad SAP line"); + + unsigned int pos = 6; + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SAP line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int aIdxMark = ParseSGroupIntField(text, line, pos); + unsigned int lvIdxMark = ParseSGroupIntField(text, line, pos); + + unsigned int aIdx = mol->getAtomWithBookmark(aIdxMark)->getIdx(); + int lvIdx = -1; + + if (lvIdxMark != 0) { + lvIdx = mol->getAtomWithBookmark(lvIdxMark)->getIdx(); + } + + sGroupMap.at(sgIdx).addAttachPoint(aIdx, lvIdx, text.substr(++pos, 2)); + } +} + +void ParseSGroupV2000SCLLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SCL", "bad SCL line"); + + unsigned int pos = 6; + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + + ++pos; + sGroupMap.at(sgIdx).setProp("CLASS", text.substr(pos, text.length() - pos)); +} + +void ParseSGroupV2000SBTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line) { + PRECONDITION(mol, "bad mol"); + PRECONDITION(text.substr(0, 6) == "M SBT", "bad SBT line"); + + unsigned int pos = 6; + unsigned int nent = ParseSGroupIntField(text, line, pos, true); + + for (unsigned int ie = 0; ie < nent; ++ie) { + if (text.size() < pos + 8) { + std::ostringstream errout; + errout << "SGroup SBT line too short: '" << text << "' on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int sgIdx = ParseSGroupIntField(text, line, pos); + unsigned int bracketType = ParseSGroupIntField(text, line, pos); + + auto &sgroup = sGroupMap.at(sgIdx); + + if (bracketType == 0) { + sgroup.setProp("BRKTYP", "BRACKET"); + } else if (bracketType == 1) { + sgroup.setProp("BRKTYP", "PAREN"); + } else { + std::ostringstream errout; + errout << "Invalid SBT value '" << bracketType << "' on line " << line; + throw FileParseException(errout.str()); + } + } +} + +/* ------------------ V3000 Utils ------------------ */ + +template +std::vector ParseV3000Array(std::stringstream &stream) { + stream.get(); // discard parentheses + + unsigned int count; + stream >> count; + std::vector values; + values.reserve(count); + T value; + for (unsigned i = 0; i < count; ++i) { + stream >> value; + values.push_back(value); + } + stream.get(); // discard parentheses + return values; +} + +void ParseV3000CStateLabel(RWMol *mol, SGroup &sgroup, + std::stringstream &stream, unsigned int line) { + stream.get(); // discard parentheses + + unsigned int count; + unsigned int bondMark; + stream >> count >> bondMark; + + std::string type = sgroup.getProp("TYPE"); + + if ((type != "SUP" && count != 1) || (type == "SUP" && count != 4)) { + std::ostringstream errout; + errout << "Unexpected number of fields for CSTATE field on line " << line; + throw FileParseException(errout.str()); + } + + Bond *bond = mol->getUniqueBondWithBookmark(bondMark); + + RDGeom::Point3D vector; + if (type == "SUP") { + stream >> vector.x >> vector.y >> vector.z; + } + sgroup.addCState(bond->getIdx(), vector); + + stream.get(); // discard final parentheses +} + +void ParseV3000SAPLabel(RWMol *mol, SGroup &sgroup, std::stringstream &stream) { + stream.get(); // discard parentheses + + unsigned int count; + unsigned int aIdxMark; + std::string lvIdxStr; // In V3000 this may be a string + std::string sapIdStr; + stream >> count >> aIdxMark >> lvIdxStr >> sapIdStr; + + // remove final parentheses that gets parsed into sapIdStr + sapIdStr.pop_back(); + + unsigned int aIdx = mol->getAtomWithBookmark(aIdxMark)->getIdx(); + int lvIdx = -1; + + boost::to_upper(lvIdxStr); + if (lvIdxStr == "AIDX") { + lvIdx = aIdx; + } else { + unsigned int lvIdxTmp = FileParserUtils::toInt(lvIdxStr); + if (lvIdxTmp > 0) { + lvIdx = mol->getAtomWithBookmark(lvIdxTmp)->getIdx(); + } + } + + sgroup.addAttachPoint(aIdx, lvIdx, sapIdStr); +} + +std::string ParseV3000StringPropLabel(std::stringstream &stream) { + std::string strValue; + + // TODO: this should be improved to be able to handle escaped quotes + + auto nextChar = stream.peek(); + if (nextChar == '"') { + stream.get(); + std::getline(stream, strValue, '"'); + } else if (nextChar == '\'') { + std::getline(stream, strValue, '\''); + } else { + stream >> strValue; + } + + boost::trim_right(strValue); + return strValue; +} + +void ParseV3000ParseLabel(const std::string &label, + std::stringstream &lineStream, STR_VECT &dataFields, + unsigned int &line, SGroup &sgroup, size_t nSgroups, + RWMol *mol, bool &strictParsing) { + // TODO: remove this once we find out how to handle XBHEAD & XBCORR + if (label == "XBHEAD" || label == "XBCORR") { + std::ostringstream errout; + errout << "XBHEAD or XBCORR labels (found on line " << line + << ") are not yet supported"; + throw FileParseException(errout.str()); + } + + if (label == "ATOMS") { + for (auto atomIdx : ParseV3000Array(lineStream)) { + sgroup.addAtomWithBookmark(atomIdx); + } + } else if (label == "PATOMS") { + for (auto patomIdx : ParseV3000Array(lineStream)) { + sgroup.addParentAtomWithBookmark(patomIdx); + } + } else if (label == "CBONDS" || label == "XBONDS") { + for (auto bondIdx : ParseV3000Array(lineStream)) { + sgroup.addBondWithBookmark(bondIdx); + } + } else if (label == "BRKXYZ") { + auto coords = ParseV3000Array(lineStream); + if (coords.size() != 9) { + std::ostringstream errout; + errout << "Unexpected number of coordinates for BRKXYZ on line " << line; + throw FileParseException(errout.str()); + } + + SGroup::Bracket bracket; + for (unsigned int i = 0; i < 3; ++i) { + bracket[i] = RDGeom::Point3D(*(coords.begin() + (3 * i)), + *(coords.begin() + (3 * i) + 1), + *(coords.begin() + (3 * i) + 2)); + } + sgroup.addBracket(bracket); + } else if (label == "CSTATE") { + ParseV3000CStateLabel(mol, sgroup, lineStream, line); + } else if (label == "SAP") { + ParseV3000SAPLabel(mol, sgroup, lineStream); + } else if (label == "PARENT") { + // Store relationship until all SGroups have been read + unsigned int parentIdx; + lineStream >> parentIdx; + + // both parentIdx and nSGroups are offset by +1, so it's fine + if (parentIdx > nSgroups) { + std::ostringstream errout; + errout << "Wrong parent SGroup '" << parentIdx << "' on line " << line; + throw FileParseException(errout.str()); + } + + sgroup.setProp("PARENT", parentIdx - 1); + } else if (label == "COMPNO") { + unsigned int compno; + lineStream >> compno; + if (compno > 256u) { + std::ostringstream errout; + errout << "SGroup SNC value over 256: '" << compno << "' on line " + << line; + throw FileParseException(errout.str()); + } + sgroup.setProp("COMPNO", compno); + } else if (label == "FIELDDATA") { + auto strValue = ParseV3000StringPropLabel(lineStream); + if (strictParsing) { + strValue = strValue.substr(0, 200); + } + dataFields.push_back(strValue); + + } else { + // Parse string props + auto strValue = ParseV3000StringPropLabel(lineStream); + + if (label == "SUBTYPE" && !SGroupChecks::isValidSubType(strValue)) { + std::ostringstream errout; + errout << "Unsupported SGroup subtype '" << strValue << "' on line " + << line; + throw FileParseException(errout.str()); + } else if (label == "CONNECT" && + !SGroupChecks::isValidConnectType(strValue)) { + std::ostringstream errout; + errout << "Unsupported SGroup connection type '" << strValue + << "' on line " << line; + throw FileParseException(errout.str()); + } + + sgroup.setProp(label, strValue); + } +} + +void ParseV3000SGroupsBlock(std::istream *inStream, unsigned int &line, + unsigned int nSgroups, RWMol *mol, + bool &strictParsing) { + std::string tempStr = FileParserUtils::getV3000Line(inStream, line); + boost::to_upper(tempStr); + if (tempStr.length() < 12 || tempStr.substr(0, 12) != "BEGIN SGROUP") { + std::ostringstream errout; + errout << "BEGIN SGROUP line not found on line " << line; + throw FileParseException(errout.str()); + } + + unsigned int defaultLineNum = 0; + std::string defaultString; + + // SGroups may be written in unsorted ID order, according to spec, so we will + // temporarily store them in a map before adding them to the mol + IDX_TO_SGROUP_MAP sGroupMap; + + std::unordered_map defaultLabels; + + tempStr = FileParserUtils::getV3000Line(inStream, line); + + // Store defaults + if (tempStr.substr(0, 7) == "DEFAULT") { + defaultString = tempStr.substr(7); + defaultLineNum = line; + boost::trim_right(defaultString); + tempStr = FileParserUtils::getV3000Line(inStream, line); + boost::trim_right(tempStr); + } + + for (unsigned int si = 0; si < nSgroups; ++si) { + unsigned int sequenceId; + unsigned int externalId; + std::string type; + + std::stringstream lineStream(tempStr); + lineStream >> sequenceId; + lineStream >> type; + lineStream >> externalId; + + std::set parsedLabels; + + if (strictParsing && !SGroupChecks::isValidType(type)) { + std::ostringstream errout; + errout << "Unsupported SGroup type '" << type << "' on line " << line; + throw MolFileUnhandledFeatureException(errout.str()); + } + + SGroup sgroup(mol, type); + STR_VECT dataFields; + + if (externalId > 0) { + if (!SGroupChecks::isSGroupIdFree(*mol, externalId)) { + std::ostringstream errout; + errout << "Existing SGroup ID '" << externalId + << "' assigned to a second SGroup on line " << line; + throw FileParseException(errout.str()); + } + + sgroup.setProp("ID", externalId); + } + + while (!lineStream.eof()) { + char spacer; + std::string label; + + lineStream.get(spacer); + if (lineStream.gcount() == 0) { + continue; + } else if (spacer != ' ') { + std::ostringstream errout; + errout << "Found character '" << spacer + << "' when expecting a separator (space) on line " << line; + throw FileParseException(errout.str()); + } + + std::getline(lineStream, label, '='); + ParseV3000ParseLabel(label, lineStream, dataFields, line, sgroup, + nSgroups, mol, strictParsing); + parsedLabels.insert(label); + } + + // Process defaults + lineStream.clear(); + lineStream.str(defaultString); + while (!lineStream.eof()) { + char spacer; + std::string label; + + lineStream.get(spacer); + if (lineStream.gcount() == 0) { + continue; + } else if (spacer != ' ') { + std::ostringstream errout; + errout << "Found character '" << spacer + << "' when expecting a separator (space) in DEFAULTS on line " + << defaultLineNum; + throw FileParseException(errout.str()); + } + + std::getline(lineStream, label, '='); + if (std::find(parsedLabels.begin(), parsedLabels.end(), label) == + parsedLabels.end()) { + ParseV3000ParseLabel(label, lineStream, dataFields, defaultLineNum, + sgroup, nSgroups, mol, strictParsing); + } else { + spacer = lineStream.peek(); + if (spacer == ' ') { + std::ostringstream errout; + errout << "Found unexpected whitespace at DEFAULT label " << label; + throw FileParseException(errout.str()); + } else if (spacer == '(') { + std::getline(lineStream, label, ')'); + lineStream.get(spacer); + } else if (spacer == '"') { + lineStream.get(spacer); + std::getline(lineStream, label, '"'); + } else { + std::getline(lineStream, label, ' '); + lineStream.putback(' '); + } + } + } + + sgroup.setProp("DATAFIELDS", dataFields); + sGroupMap.emplace(sequenceId, sgroup); + + tempStr = FileParserUtils::getV3000Line(inStream, line); + boost::trim_right(tempStr); + } + + boost::to_upper(tempStr); + if (tempStr.length() < 10 || tempStr.substr(0, 10) != "END SGROUP") { + std::ostringstream errout; + errout << "END SGROUP line not found on line " << line; + throw FileParseException(errout.str()); + } + + // SGroups successfully parsed, now add them to the molecule sorted by index + for (const auto &sg : sGroupMap) { + addSGroup(*mol, sg.second); + } +} + +} // namespace SGroupParsing +} // namespace RDKit diff --git a/Code/GraphMol/FileParsers/MolSGroupParsing.h b/Code/GraphMol/FileParsers/MolSGroupParsing.h new file mode 100644 index 000000000..1fe9420da --- /dev/null +++ b/Code/GraphMol/FileParsers/MolSGroupParsing.h @@ -0,0 +1,103 @@ +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +#pragma once +#include + +namespace RDKit { + +namespace SGroupParsing { +typedef std::map IDX_TO_SGROUP_MAP; +typedef std::map IDX_TO_STR_VECT_MAP; + +/* ------------------ V2000 Utils ------------------ */ + +unsigned int ParseSGroupIntField(const std::string &text, unsigned int line, + unsigned int &pos, + bool isFieldCounter = false); + +double ParseSGroupDoubleField(const std::string &text, unsigned int line, + unsigned int &pos); + +void ParseSGroupV2000STYLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000VectorDataLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SDILine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SSTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SMTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SLBLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SCNLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SDSLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); +void ParseSGroupV2000SBVLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SDTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SDDLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SCDSEDLine(IDX_TO_SGROUP_MAP &sGroupMap, + IDX_TO_STR_VECT_MAP &dataFieldsMap, RWMol *mol, + const std::string &text, unsigned int line, + bool strictParsing, unsigned int &counter, + unsigned int &lastDataSGroup, + std::ostringstream ¤tDataField); + +void ParseSGroupV2000SPLLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SNCLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +// PXA does not have a V3000 equivalent +void ParseSGroupV2000PXALine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SAPLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SCLLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +void ParseSGroupV2000SBTLine(IDX_TO_SGROUP_MAP &sGroupMap, RWMol *mol, + const std::string &text, unsigned int line); + +/* ------------------ V3000 Utils ------------------ */ + +template +std::vector ParseV3000Array(std::stringstream &stream); + +void ParseV3000CStateLabel(unsigned int line, const std::string &type, + SGroup *sgroup, std::stringstream &stream); + +void ParseV3000SAPLabel(RWMol *mol, SGroup *sgroup, std::stringstream &stream); + +std::string ParseV3000StringPropLabel(std::stringstream &stream); + +void ParseV3000SGroupsBlock(std::istream *inStream, unsigned int &line, + unsigned int nSgroups, RWMol *mol, + bool &strictParsing); + +} // namespace SGroupParsing +} // namespace RDKit diff --git a/Code/GraphMol/FileParsers/MolSGroupWriting.cpp b/Code/GraphMol/FileParsers/MolSGroupWriting.cpp new file mode 100644 index 000000000..f0a15ad47 --- /dev/null +++ b/Code/GraphMol/FileParsers/MolSGroupWriting.cpp @@ -0,0 +1,678 @@ +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#include + +#include "FileParsers.h" +#include "MolSGroupWriting.h" + +namespace RDKit { +namespace SGroupWriting { + +/* ------------------ V2000 Utils ------------------ */ + +std::string BuildV2000STYLines(const ROMol &mol) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + temp << FormatV2000IntField(1 + (sg - sgroups.begin())) + << FormatV2000StringField(sg->getProp("TYPE"), 3, true, + true); + if (++count == 8) { + ret << "M STY" << FormatV2000NumEntriesField(8) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + if (count) { + ret << "M STY" << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000StringPropLines(const unsigned int entriesPerLine, + const ROMol &mol, + const std::string &propName, + const std::string &propCode, + const unsigned int fieldWitdh) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + std::string propValue; + // Write field only if defined + if (sg->getPropIfPresent(propName, propValue)) { + temp << FormatV2000IntField(1 + (sg - sgroups.begin())) + << FormatV2000StringField(propValue, fieldWitdh, true, true); + if (++count == entriesPerLine) { + ret << "M " << propCode << FormatV2000NumEntriesField(entriesPerLine) + << temp.str() << std::endl; + temp.str(""); + count = 0; + } + } + } + if (count) { + ret << "M " << propCode << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SLBLines(const ROMol &mol) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + unsigned int id; + // Write value if assigned, else 0 + if (sg->getPropIfPresent("ID", id)) { + temp << FormatV2000IntField(1 + (sg - sgroups.begin())) + << FormatV2000IntField(id); + if (++count == 8) { + ret << "M SLB" << FormatV2000NumEntriesField(8) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + } + if (count) { + ret << "M SLB" << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SDSLines(const ROMol &mol) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + // Write field only if defined + std::string eState; + if (sg->getPropIfPresent("ESTATE", eState) && eState == "E") { + temp << FormatV2000IntField(1 + (sg - sgroups.begin())); + if (++count == 15) { + ret << "M SDS EXP" << FormatV2000NumEntriesField(15) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + } + if (count) { + ret << "M SDS EXP" << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SPLLines(const ROMol &mol) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + // Write field only if a parent is defined + unsigned int parentIdx = -1; + if (sg->getPropIfPresent("PARENT", parentIdx)) { + temp << FormatV2000IntField(1 + (sg - sgroups.begin())) + << FormatV2000IntField(1 + parentIdx); + if (++count == 8) { + ret << "M SPL" << FormatV2000NumEntriesField(8) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + } + if (count) { + ret << "M SPL" << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SNCLines(const ROMol &mol) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + unsigned int compno; + // Write field only if compno is set + if (sg->getPropIfPresent("COMPNO", compno)) { + temp << FormatV2000IntField(1 + (sg - sgroups.begin())) + << FormatV2000IntField(compno); + if (++count == 8) { + ret << "M SNC" << FormatV2000NumEntriesField(8) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + } + if (count) { + ret << "M SNC" << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SBTLines(const ROMol &mol) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + const auto &sgroups = getSGroups(mol); + for (auto sg = sgroups.begin(); sg != sgroups.end(); ++sg) { + std::string bracketType; + if (sg->getPropIfPresent("BRKTYP", bracketType)) { + unsigned int idx = 1 + (sg - sgroups.begin()); + if (bracketType == "BRACKET") { + temp << FormatV2000IntField(idx) << FormatV2000IntField(0); + } else if (bracketType == "PAREN") { + temp << FormatV2000IntField(idx) << FormatV2000IntField(1); + } else { + std::ostringstream errout; + errout << "Invalid BRKTYP value '" << bracketType << "' for SGroup " + << idx; + throw SGroupException(errout.str()); + } + if (++count == 8) { + ret << "M SBT" << FormatV2000NumEntriesField(8) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + } + if (count) { + ret << "M SBT" << FormatV2000NumEntriesField(count) << temp.str() + << std::endl; + } + + return ret.str(); +} + +template +std::string BuildV2000IdxVectorDataLines(const unsigned int entriesPerLine, + const unsigned int sGroupId, + const std::string &code, + const T &idxVector) { + std::ostringstream ret; + std::ostringstream temp; + + unsigned int count = 0; + for (const auto &idx : idxVector) { + temp << FormatV2000IntField(1 + idx); + if (++count == entriesPerLine) { + ret << "M " << code << FormatV2000IntField(sGroupId) + << FormatV2000NumEntriesField(entriesPerLine) << temp.str() + << std::endl; + temp.str(""); + count = 0; + } + } + if (count) { + ret << "M " << code << FormatV2000IntField(sGroupId) + << FormatV2000NumEntriesField(count) << temp.str() << std::endl; + } + return ret.str(); +} + +std::string BuildV2000SMTLine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + std::string smtValue; + if ((sgroup.getProp("TYPE") == "MUL" && + sgroup.getPropIfPresent("MULT", smtValue)) || + sgroup.getPropIfPresent("LABEL", smtValue)) { + ret << "M SMT" << FormatV2000IntField(idx) + << FormatV2000StringField(smtValue, 69, false, true) << std::endl; + } + return ret.str(); +} + +std::string BuildV2000SDILine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + const std::vector brackets = sgroup.getBrackets(); + + for (const auto &bracket : brackets) { + ret << "M SDI" << FormatV2000IntField(idx) + << FormatV2000NumEntriesField(4); + + for (unsigned int iPoint = 0; iPoint < 2; ++iPoint) { + ret << FormatV2000DoubleField(bracket.at(iPoint).x); + ret << FormatV2000DoubleField(bracket.at(iPoint).y); + } + ret << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SBVLine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + for (const auto &cstate : sgroup.getCStates()) { + ret << "M SBV" << FormatV2000IntField(idx) + << FormatV2000IntField(cstate.bondIdx + 1); + if (sgroup.getProp("TYPE") == "SUP") { + ret << FormatV2000DoubleField(cstate.vector.x); + ret << FormatV2000DoubleField(cstate.vector.y); + } + ret << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SDTLine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + std::string sdtValue; + if (sgroup.getPropIfPresent("FIELDNAME", sdtValue)) { + ret << "M SDT" << FormatV2000IntField(idx); + ret << FormatV2000StringField(sdtValue, 30, true, true); + + if (sgroup.getPropIfPresent("FIELDTYPE", sdtValue)) { + ret << FormatV2000StringField(sdtValue, 2, true, false); + } else { + ret << " T"; + } + + if (sgroup.getPropIfPresent("FIELDINFO", sdtValue)) { + ret << FormatV2000StringField(sdtValue, 20, true, false); + } + + if (sgroup.getPropIfPresent("QUERYTYPE", sdtValue)) { + ret << FormatV2000StringField(sdtValue, 2, true, false); + } + if (sgroup.getPropIfPresent("QUERYOP", sdtValue)) { + ret << FormatV2000StringField(sdtValue, 15, true, false); + } + + ret << std::endl; + } + return ret.str(); +} + +std::string BuildV2000SDDLine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + std::string sddValue; + if (sgroup.getPropIfPresent("FIELDDISP", sddValue)) { + ret << "M SDD" << FormatV2000IntField(idx); + ret << FormatV2000StringField(sddValue, 69, false, true); + ret << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SCDSEDLines(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + STR_VECT dataFields; + if (sgroup.getPropIfPresent("DATAFIELDS", dataFields)) { + for (const auto &data : dataFields) { + unsigned int length = data.size(); + if (length > 200) { + std::ostringstream errout; + errout << "Data field '" << data << "' in SGroup " << idx + << " is longer than limit of 200 characters."; + throw SGroupException(errout.str()); + } + unsigned int start = 0; + unsigned int end = 69; + for (; length > end; start += 69, end += 69) { + std::string dataChunk = data.substr(start, end); + ret << "M SCD" << FormatV2000IntField(idx) + << FormatV2000StringField(dataChunk, 69, true, true) << std::endl; + } + std::string dataChunk = data.substr(start, end); + ret << "M SED" << FormatV2000IntField(idx) + << FormatV2000StringField(dataChunk, 69, false, true) << std::endl; + } + } + + return ret.str(); +} + +std::string BuildV2000PXALine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + std::string pxaValue; + if (sgroup.getPropIfPresent("PXA", pxaValue)) { + ret << "M PXA" << FormatV2000IntField(idx); + ret << FormatV2000StringField(pxaValue, 69, false, true); + ret << std::endl; + } + + return ret.str(); +} + +std::string BuildV2000SAPLines(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + std::ostringstream temp; + + const std::vector saps = sgroup.getAttachPoints(); + + unsigned int count = 0; + unsigned int entriesPerLine = 6; + for (const auto &sap : saps) { + temp << FormatV2000IntField(1 + sap.aIdx); + + // lvIdx == -1 will turn into 0, which is right (see spec) + temp << FormatV2000IntField(1 + sap.lvIdx); + + temp << FormatV2000StringField(sap.id, 2, false, true); + if (++count == entriesPerLine) { + ret << "M SAP" << FormatV2000IntField(idx) + << FormatV2000IntField(entriesPerLine) << temp.str() << std::endl; + temp.str(""); + count = 0; + } + } + if (count) { + ret << "M SAP" << FormatV2000IntField(idx) + << FormatV2000NumEntriesField(count) << temp.str() << std::endl; + } + return ret.str(); +} + +std::string BuildV2000SCLLine(const int idx, const SGroup &sgroup) { + std::ostringstream ret; + + std::string sclValue; + if (sgroup.getPropIfPresent("CLASS", sclValue)) { + ret << "M SCL" << FormatV2000IntField(idx); + ret << FormatV2000StringField(sclValue, 69, false, true); + ret << std::endl; + } + + return ret.str(); +} + +const std::string GetMolFileSGroupInfo(const RWMol &mol) { + std::ostringstream ret; + + // multiple group per line properties + ret << BuildV2000STYLines(mol); + ret << BuildV2000SLBLines(mol); + ret << BuildV2000StringPropLines(8, mol, "SUBTYPE", "SST", 3); + ret << BuildV2000StringPropLines(8, mol, "CONNECT", "SCN", 3); + ret << BuildV2000SDSLines(mol); + ret << BuildV2000SPLLines(mol); + ret << BuildV2000SNCLines(mol); + ret << BuildV2000SBTLines(mol); + + // single group per line properties + unsigned int idx = 0; + for (const auto &sgroup : getSGroups(mol)) { + ++idx; + ret << BuildV2000IdxVectorDataLines(15, idx, "SAL", sgroup.getAtoms()); + + // SPA line must always come somewhere after the SAL line. + ret << BuildV2000IdxVectorDataLines(15, idx, "SPA", + sgroup.getParentAtoms()); + + ret << BuildV2000IdxVectorDataLines(15, idx, "SBL", sgroup.getBonds()); + // Write CRS line -- CRS still not supported + ret << BuildV2000SDILine(idx, sgroup); + + ret << BuildV2000SMTLine(idx, sgroup); + ret << BuildV2000SBVLine(idx, sgroup); + + ret << BuildV2000SDTLine(idx, sgroup); + ret << BuildV2000SDDLine(idx, sgroup); + // SCD/SED must come after SDT + ret << BuildV2000SCDSEDLines(idx, sgroup); + + ret << BuildV2000PXALine(idx, sgroup); + ret << BuildV2000SAPLines(idx, sgroup); + ret << BuildV2000SCLLine(idx, sgroup); + } + return ret.str(); +} + +/* ------------------ V3000 Utils ------------------ */ + +template +std::string BuildV3000IdxVectorDataBlock(const std::string &key, + const std::vector &dataVector) { + return BuildV3000IdxVectorDataBlock(key, dataVector.begin(), + dataVector.end()); +} + +template +std::string BuildV3000IdxVectorDataBlock(const std::string &key, + const Iterator &dataVectorBegin, + const Iterator &dataVectorEnd) { + std::ostringstream ret; + size_t size = dataVectorEnd - dataVectorBegin; + if (size) { + ret << ' ' << key << "=(" << size; + for (auto itr = dataVectorBegin; itr < dataVectorEnd; ++itr) { + ret << ' ' << 1 + *itr; + } + ret << ')'; + } + return ret.str(); +} + +std::string BuildV3000BondsBlock(const SGroup &sgroup) { + std::ostringstream ret; + + auto isXBond = [&sgroup](unsigned int bondIdx) { + return SGroup::BondType::XBOND == sgroup.getBondType(bondIdx); + }; + + auto bonds = sgroup.getBonds(); + auto first_cbond = std::stable_partition(bonds.begin(), bonds.end(), isXBond); + + ret << BuildV3000IdxVectorDataBlock("XBONDS", bonds.begin(), first_cbond); + ret << BuildV3000IdxVectorDataBlock("CBONDS", first_cbond, bonds.end()); + + return ret.str(); +} + +std::string FormatV3000StringPropertyBlock(const std::string &prop, + const SGroup &sgroup) { + std::ostringstream ret; + + std::string propValue; + if (sgroup.getPropIfPresent(prop, propValue)) { + ret << ' ' << prop << '='; + bool hasSpaces = + (propValue.end() != find(propValue.begin(), propValue.end(), ' ')); + + if (hasSpaces || propValue.empty()) { + ret << "\""; + } + + ret << propValue; + + if (hasSpaces || propValue.empty()) { + ret << "\""; + } + } + + return ret.str(); +} + +std::string FormatV3000ParentBlock(const SGroup &sgroup) { + std::ostringstream ret; + + unsigned int parentIdx = -1; + if (sgroup.getPropIfPresent("PARENT", parentIdx)) { + ret << " PARENT=" << (1 + parentIdx); + } + + return ret.str(); +} + +std::string FormatV3000CompNoBlock(const SGroup &sgroup) { + std::ostringstream ret; + + unsigned int compno; + + if (sgroup.getPropIfPresent("COMPNO", compno)) { + ret << " COMPNO=" << compno; + } + + return ret.str(); +} + +std::string FormatV3000BracketBlock( + const std::vector brackets) { + std::ostringstream ret; + + for (const auto &bracket : brackets) { + ret << " BRKXYZ=(9"; + for (unsigned int iPoint = 0; iPoint < 2; ++iPoint) { + ret << ' ' << FormatV3000DoubleField(bracket.at(iPoint).x); + ret << ' ' << FormatV3000DoubleField(bracket.at(iPoint).y); + ret << " 0"; // z coordinate is 0 by format specification + } + ret << " 0 0 0"; // 3rd point is 0 by format specification + ret << ")"; + } + + return ret.str(); +} + +std::string FormatV3000FieldDataBlock(const SGroup &sgroup) { + std::ostringstream ret; + + STR_VECT dataFields; + if (sgroup.getPropIfPresent("DATAFIELDS", dataFields)) { + for (const auto &data : dataFields) { + ret << " FIELDDATA=\"" << data << "\""; + } + } + + return ret.str(); +} + +std::string FormatV3000CStateBlock(const SGroup &sgroup) { + std::ostringstream ret; + + for (const auto &cstate : sgroup.getCStates()) { + unsigned int xbondIdx = 1 + cstate.bondIdx; + ret << " CSTATE=("; + if (sgroup.getProp("TYPE") == "SUP") { + ret << "4 " << xbondIdx; + ret << ' ' << FormatV3000DoubleField(cstate.vector.x); + ret << ' ' << FormatV3000DoubleField(cstate.vector.y); + ret << " 0"; + } else { + ret << "1 " << xbondIdx; + } + ret << ")"; + } + + return ret.str(); +} + +std::string FormatV3000AttachPointBlock( + const std::vector &saps) { + std::ostringstream ret; + + for (const auto &sap : saps) { + ret << " SAP=(3 " << (1 + sap.aIdx); + + if (sap.lvIdx != -1 && sap.aIdx == static_cast(sap.lvIdx)) { + ret << " aidx"; + } else { + // lvIdx == -1 will turn into 0, which is right (see spec) + ret << ' ' << (1 + sap.lvIdx); + } + + ret << ' ' << sap.id << ")"; + } + + return ret.str(); +} + +//! Write a SGroup line. The order of the labels is defined in the spec. +const std::string GetV3000MolFileSGroupLines(const unsigned int idx, + const SGroup &sgroup) { + std::ostringstream os; + + unsigned int id = 0; + sgroup.getPropIfPresent("ID", id); + + os << idx << ' ' << sgroup.getProp("TYPE") << ' ' << id; + + os << BuildV3000IdxVectorDataBlock("ATOMS", sgroup.getAtoms()); + os << BuildV3000BondsBlock(sgroup); + os << BuildV3000IdxVectorDataBlock("PATOMS", sgroup.getParentAtoms()); + os << FormatV3000StringPropertyBlock("SUBTYPE", sgroup); + os << FormatV3000StringPropertyBlock("MULT", sgroup); + os << FormatV3000StringPropertyBlock("CONNECT", sgroup); + os << FormatV3000ParentBlock(sgroup); + os << FormatV3000CompNoBlock(sgroup); + // XBHEAD -> part of V2000 CRS, not supported yet + // XBCORR -> part of V2000 CRS, not supported yet + os << FormatV3000StringPropertyBlock("LABEL", sgroup); + os << FormatV3000BracketBlock(sgroup.getBrackets()); + os << FormatV3000StringPropertyBlock("ESTATE", sgroup); + os << FormatV3000CStateBlock(sgroup); + os << FormatV3000StringPropertyBlock("FIELDNAME", sgroup); + os << FormatV3000StringPropertyBlock("FIELDINFO", sgroup); + os << FormatV3000StringPropertyBlock("FIELDDISP", sgroup); + os << FormatV3000StringPropertyBlock("QUERYTYPE", sgroup); + os << FormatV3000StringPropertyBlock("QUERYOP", sgroup); + os << FormatV3000FieldDataBlock(sgroup); + os << FormatV3000StringPropertyBlock("CLASS", sgroup); + os << FormatV3000AttachPointBlock(sgroup.getAttachPoints()); + os << FormatV3000StringPropertyBlock("BRKTYP", sgroup); + os << FormatV3000StringPropertyBlock("SEQID", sgroup); + + std::string sGroupBlock = os.str(); + unsigned int length = sGroupBlock.size(); + os.str(""); + + unsigned int start = 0; + while (length - start > 73) { + os << "M V30 " << sGroupBlock.substr(start, 72) << '-' << std::endl; + start += 72; + } + os << "M V30 " << sGroupBlock.substr(start, 73) << std::endl; + + return os.str(); +} + +} // namespace SGroupWriting +} // namespace RDKit diff --git a/Code/GraphMol/FileParsers/MolSGroupWriting.h b/Code/GraphMol/FileParsers/MolSGroupWriting.h new file mode 100644 index 000000000..f35fa1180 --- /dev/null +++ b/Code/GraphMol/FileParsers/MolSGroupWriting.h @@ -0,0 +1,136 @@ +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#pragma once + +#include +#include + +namespace RDKit { +namespace SGroupWriting { +typedef std::unordered_map IDX_TO_SGROUP_MAP; + +/* ------------------ Inlined Formatters ------------------ */ + +inline std::string FormatV2000IntField(int value) { + char output[5]; + snprintf(output, 5, " %3d", value); + return std::string(output); +} + +inline std::string FormatV2000NumEntriesField(int value) { + char output[4]; + snprintf(output, 4, " %2d", value); + return std::string(output); +} + +inline std::string FormatV2000DoubleField(double value) { + char output[11]; + snprintf(output, 11, "%10.4f", value); + return std::string(output); +} + +inline std::string FormatV2000StringField(const std::string &value, + unsigned int fieldSize, bool pad, + bool addSeparator) { + std::ostringstream os; + if (addSeparator) { + os << ' '; + } + if (value.size() >= fieldSize) { + os << value.substr(0, fieldSize); + } else if (pad) { + os << std::setw(fieldSize) << std::left << value; + } else { + os << value; + } + return os.str(); +} + +inline std::string FormatV3000DoubleField(double value) { + return boost::trim_copy(FormatV2000DoubleField(value)); +} + +/* ------------------ V2000 Utils ------------------ */ + +std::string BuildV2000STYLines(const ROMol &mol); + +std::string BuildV2000StringPropLines(const unsigned int entriesPerLine, + const ROMol &mol, + const std::string &propName, + const std::string &propCode, + const unsigned int fieldWitdh); + +std::string BuildV2000SLBLines(const ROMol &mol); + +std::string BuildV2000SDSLines(const ROMol &mol); + +std::string BuildV2000SPLLines(const ROMol &mol); + +std::string BuildV2000SNCLines(const ROMol &mol); + +std::string BuildV2000SBTLines(const ROMol &mol); + +template +std::string BuildV2000IdxVectorDataLines(const unsigned int entriesPerLine, + const unsigned int sGroupId, + const std::string &code, + const T &dataVector); + +std::string BuildV2000SMTLine(const int idx, const SGroup *sgroup); + +std::string BuildV2000SDILine(const int idx, const SGroup *sgroup); + +std::string BuildV2000SBVLine(const int idx, const SGroup *sgroup); + +std::string BuildV2000SDTLine(const int idx, const SGroup *sgroup); + +std::string BuildV2000SDDLine(const int idx, const SGroup *sgroup); + +std::string BuildV2000SCDSEDLines(const int idx, const SGroup *sgroup); + +std::string BuildV2000PXALine(const int idx, const SGroup *sgroup); + +std::string BuildV2000SAPLines(const int idx, const SGroup *sgroup); + +std::string BuildV2000SCLLine(const int idx, const SGroup *sgroup); +const std::string GetMolFileSGroupInfo(const RWMol &mol); + +/* ------------------ V3000 Utils ------------------ */ + +template +std::string BuildV3000IdxVectorDataBlock(const std::string &key, + const std::vector &dataVector); + +template +std::string BuildV3000IdxVectorDataBlock(const std::string &key, + const Iterator &dataVectorBegin, + const Iterator &dataVectorEnd); + +/* Classify bonds between XBONDS and CBOfindP work on a copy of + * bonds vector to prevent reordering of original vector */ +std::string BuildV3000BondsBlock(const SGroup &sgroup); + +std::string FormatV3000StringPropertyBlock(const std::string &prop, + const SGroup &sgroup); + +std::string FormatV3000ParentBlock(const SGroup &sgroup); + +std::string FormatV3000CompNoBlock(const SGroup &sgroup); + +std::string FormatV3000BracketBlock( + const std::vector brackets); + +std::string FormatV3000CStateBlock(const std::vector &cstates); + +const std::string GetV3000MolFileSGroupLines(const unsigned int idx, + const SGroup &sgroup); +} // namespace SGroupWriting +} // namespace RDKit diff --git a/Code/GraphMol/FileParsers/test1.cpp b/Code/GraphMol/FileParsers/test1.cpp index 881249515..14fa4267c 100644 --- a/Code/GraphMol/FileParsers/test1.cpp +++ b/Code/GraphMol/FileParsers/test1.cpp @@ -4466,9 +4466,12 @@ void testRCSBSdf() { RWMol *mol = MolFileToMol(pathName + "s58_rcsb.mol"); TEST_ASSERT(mol); delete mol; + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } void testParseCHG() { + BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl; + BOOST_LOG(rdInfoLog) << "Testing PDB charge parsing" << std::endl; // BAD PDB Ligand with CHG line too long (>8) and right and mid-justified // symbols const std::string molblock_chg = @@ -4611,9 +4614,12 @@ void testParseCHG() { TEST_ASSERT(positions.size() == 3); // 24/3 == 8 delete m; + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } void testMDLAtomProps() { + BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl; + BOOST_LOG(rdInfoLog) << "Testing MDL atom properties" << std::endl; std::string smi = "CC"; ROMOL_SPTR mol(SmilesToMol(smi, false, false)); setAtomAlias(mol->getAtomWithIdx(0), "foo"); @@ -4628,15 +4634,19 @@ void testMDLAtomProps() { TEST_ASSERT(0); } catch (...) { } + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } void testSupplementalSmilesLabel() { + BOOST_LOG(rdInfoLog) << "-------------------------------------" << std::endl; + BOOST_LOG(rdInfoLog) << "Testing supplemental SMILES labels" << std::endl; std::string smi = "C"; ROMOL_SPTR mol(SmilesToMol(smi, false, false)); setSupplementalSmilesLabel(mol->getAtomWithIdx(0), "xxx"); smi = MolToSmiles(*mol.get()); TEST_ASSERT(smi == "Cxxx"); TEST_ASSERT(getSupplementalSmilesLabel(mol->getAtomWithIdx(0)) == "xxx"); + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } void testGithub1034() { @@ -5248,7 +5258,6 @@ void RunTests() { testIssue3375684(); testChiralPhosphorous(); testIssue3392107(); - testIssue3432136(); testIssue3477283(); testIssue3484552(); testIssue3514824(); diff --git a/Code/GraphMol/FileParsers/testMolSupplier.cpp b/Code/GraphMol/FileParsers/testMolSupplier.cpp index 843da9290..4443f0c9c 100644 --- a/Code/GraphMol/FileParsers/testMolSupplier.cpp +++ b/Code/GraphMol/FileParsers/testMolSupplier.cpp @@ -130,7 +130,7 @@ int testMolSup() { } TEST_ASSERT(ok); } -#endif // RDK_BUILD_COORDGEN_SUPPORT +#endif // RDK_BUILD_COORDGEN_SUPPORT return 1; } @@ -320,11 +320,10 @@ void testSmilesSupFromText() { // this was a delightful boundary condition: BOOST_LOG(rdErrorLog) << "------------------------------------------------------" << std::endl; - text = - "CC\n" - "CCC\n" - "CCOC\n" - "CCCCOC"; + text = "CC\n" + "CCC\n" + "CCOC\n" + "CCCCOC"; { nSup2.setData(text, " ", 0, -1, false, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; @@ -397,12 +396,11 @@ void testSmilesSupFromText() { } // -------------- // basics: - text = - "Id SMILES Column_2\n" - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-3 CCC 9.0\n" - "mol-4 CCCC 16.0\n"; + text = "Id SMILES Column_2\n" + "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-3 CCC 9.0\n" + "mol-4 CCCC 16.0\n"; #if 1 nSup2.setData(text, " ", 1, 0, true, true); mol = nSup2[3]; @@ -415,11 +413,10 @@ void testSmilesSupFromText() { delete mol; // ensure that we can call setData a second time: - text = - "Id SMILES Column_2\n" - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-3 CCC 9.0\n"; + text = "Id SMILES Column_2\n" + "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-3 CCC 9.0\n"; nSup2.setData(text, " ", 1, 0, true, true); CHECK_INVARIANT(nSup2.length() == 3, ""); mol = nSup2[2]; @@ -430,12 +427,11 @@ void testSmilesSupFromText() { delete mol; // now test for failure handling: - text = - "Id SMILES Column_2\n" - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-3 fail 9.0\n" - "mol-4 CCCC 16.0\n"; + text = "Id SMILES Column_2\n" + "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-3 fail 9.0\n" + "mol-4 CCCC 16.0\n"; nSup2.setData(text, " ", 1, 0, true, true); mol = nSup2[3]; // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; @@ -453,11 +449,10 @@ void testSmilesSupFromText() { #endif // issue 114, no \n at EOF: - text = - "Id SMILES Column_2\n" - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-4 CCCC 16.0\n"; + text = "Id SMILES Column_2\n" + "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-4 CCCC 16.0\n"; nSup2.setData(text, " ", 1, 0, true, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; TEST_ASSERT(nSup2.length() == 3); @@ -470,11 +465,10 @@ void testSmilesSupFromText() { TEST_ASSERT(nSup2.atEnd()); delete mol; - text = - "Id SMILES Column_2\n" - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-4 CCCC 16.0"; + text = "Id SMILES Column_2\n" + "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-4 CCCC 16.0"; nSup2.setData(text, " ", 1, 0, true, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; TEST_ASSERT(nSup2.length() == 3); @@ -495,10 +489,9 @@ void testSmilesSupFromText() { } TEST_ASSERT(failed); - text = - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-4 CCCC 16.0"; + text = "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-4 CCCC 16.0"; nSup2.setData(text, " ", 1, 0, false, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; TEST_ASSERT(nSup2.length() == 3); @@ -510,10 +503,9 @@ void testSmilesSupFromText() { TEST_ASSERT(mname == "16.0"); delete mol; - text = - "C\n" - "CC\n" - "CCCC"; + text = "C\n" + "CC\n" + "CCCC"; nSup2.setData(text, " ", 0, -1, false, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; TEST_ASSERT(nSup2.length() == 3); @@ -525,11 +517,10 @@ void testSmilesSupFromText() { // this was a delightful boundary condition: BOOST_LOG(rdErrorLog) << "------------------------------------------------------" << std::endl; - text = - "CC\n" - "CCC\n" - "CCOC\n" - "CCCCOC"; + text = "CC\n" + "CCC\n" + "CCOC\n" + "CCCCOC"; nSup2.setData(text, " ", 0, -1, false, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; mol = nSup2.next(); @@ -551,11 +542,10 @@ void testSmilesSupFromText() { BOOST_LOG(rdErrorLog) << "------------------------------------------------------" << std::endl; // this was a delightful boundary condition: - text = - "CC\n" - "CCC\n" - "CCOC\n" - "CCCCOC"; + text = "CC\n" + "CCC\n" + "CCOC\n" + "CCCCOC"; nSup2.setData(text, " ", 0, -1, false, true); // BOOST_LOG(rdErrorLog) << "SIZE: " << nSup2.length() << std::endl; failed = false; @@ -582,13 +572,12 @@ void testSmilesSupFromText() { TEST_ASSERT(nDone == nSup2.length()); // ensure that we can call setData a second time: - text = - "Id SMILES Column_2\n" - "# comment, ignore\n" - "mol-1 C 1.0\n" - "mol-2 CC 4.0\n" - "mol-3 CCC 9.0\n" - "mol-4 CCCC 16.0\n"; + text = "Id SMILES Column_2\n" + "# comment, ignore\n" + "mol-1 C 1.0\n" + "mol-2 CC 4.0\n" + "mol-3 CCC 9.0\n" + "mol-4 CCCC 16.0\n"; nSup2.setData(text, " ", 1, 0, true, true); mol = nSup2[2]; mol->getProp(common_properties::_Name, mname); @@ -605,13 +594,12 @@ void testSmilesSupFromText() { delete mol; // this was a delightful boundary condition: - text = - "CC\n" - "CCC\n" - "CCOC\n" - "CCCCOC\n" - "\n" - "\n"; + text = "CC\n" + "CCC\n" + "CCOC\n" + "CCCCOC\n" + "\n" + "\n"; nSup2.setData(text, " ", 0, -1, false, true); TEST_ASSERT(nSup2.length() == 4); nSup2.reset(); @@ -850,10 +838,9 @@ void testSuppliersEmptyFile() { void testCisTrans() { std::string text; - text = - "mol-1 ClC(C)=C(Br)C\n" - "mol-2 C1=COC=CC1C(Cl)=C(Br)C\n" - "mol-3 C1=COC=CC1\\C(Cl)=C(Br)\\C"; + text = "mol-1 ClC(C)=C(Br)C\n" + "mol-2 C1=COC=CC1C(Cl)=C(Br)C\n" + "mol-3 C1=COC=CC1\\C(Cl)=C(Br)\\C"; SmilesMolSupplier smiSup; smiSup.setData(text, " ", 1, 0, false, true); @@ -1090,19 +1077,18 @@ int testTDTSupplier3() { TDTMolSupplier suppl; - data = - "$SMI\n" - "CAS<17584-12-2>\n" - "|\n" - "$SMI\n" - "CAS<~>\n" - "|\n" - "$SMI\n" - "CAS<932-53-6>\n" - "|\n" - "$SMI\n" - "CAS<~>\n" - "|\n"; + data = "$SMI\n" + "CAS<17584-12-2>\n" + "|\n" + "$SMI\n" + "CAS<~>\n" + "|\n" + "$SMI\n" + "CAS<932-53-6>\n" + "|\n" + "$SMI\n" + "CAS<~>\n" + "|\n"; suppl.setData(data, "CAS"); i = 0; @@ -1283,7 +1269,8 @@ void testSDSupplierFromTextStrLax1() { while (!reader.atEnd()) { ROMol *mol = reader.next(); TEST_ASSERT(mol->hasProp(common_properties::_Name)); - if (i == 0) TEST_ASSERT(!mol->hasProp("ID")); + if (i == 0) + TEST_ASSERT(!mol->hasProp("ID")); TEST_ASSERT(!mol->hasProp("ANOTHER_PROPERTY")); i++; delete mol; @@ -1372,11 +1359,10 @@ void testSDSupplierFromTextStrLax2() { mol->getProp("ID", s); TEST_ASSERT(s == "Lig1"); mol->getProp("ANOTHER_PROPERTY", s); - TEST_ASSERT(s == - "No blank line before dollars\n" - "$$$$\n" - "Structure1\n" - "csChFnd70/05230312262D"); + TEST_ASSERT(s == "No blank line before dollars\n" + "$$$$\n" + "Structure1\n" + "csChFnd70/05230312262D"); i++; delete mol; } @@ -1418,7 +1404,8 @@ void testSDSupplierStrLax1() { while (!reader.atEnd()) { ROMol *mol = reader.next(); TEST_ASSERT(mol->hasProp(common_properties::_Name)); - if (i == 0) TEST_ASSERT(!mol->hasProp("ID")); + if (i == 0) + TEST_ASSERT(!mol->hasProp("ID")); TEST_ASSERT(!mol->hasProp("ANOTHER_PROPERTY")); i++; delete mol; @@ -1460,11 +1447,10 @@ void testSDSupplierStrLax2() { mol->getProp("ID", s); TEST_ASSERT(s == "Lig1"); mol->getProp("ANOTHER_PROPERTY", s); - TEST_ASSERT(s == - "No blank line before dollars\n" - "$$$$\n" - "Structure1\n" - "csChFnd70/05230312262D"); + TEST_ASSERT(s == "No blank line before dollars\n" + "$$$$\n" + "Structure1\n" + "csChFnd70/05230312262D"); i++; delete mol; } @@ -1584,7 +1570,8 @@ void testIssue381() { count = 0; while (!sdsup->atEnd()) { nmol = sdsup->next(); - if (nmol) delete nmol; + if (nmol) + delete nmol; count++; } TEST_ASSERT(sdsup->atEnd()); @@ -1606,10 +1593,12 @@ void testSetStreamIndices() { std::streampos pos = 0; std::string line; while (notEof) { - if (addIndex) pos = ifs.tellg(); + if (addIndex) + pos = ifs.tellg(); notEof = (std::getline(ifs, line) ? true : false); if (notEof) { - if (addIndex) indices.push_back(pos); + if (addIndex) + indices.push_back(pos); addIndex = (line.substr(0, 4) == "$$$$"); } } @@ -1627,7 +1616,8 @@ void testSetStreamIndices() { count = 0; while (!sdsup->atEnd()) { nmol = sdsup->next(); - if (nmol) delete nmol; + if (nmol) + delete nmol; count++; } TEST_ASSERT(sdsup->atEnd()); @@ -2095,8 +2085,10 @@ int testForwardSDSupplier() { while (!strm.eof()) { std::string line; std::getline(strm, line); - if (!strm.eof()) ++i; - if (i > 1000) break; + if (!strm.eof()) + ++i; + if (i > 1000) + break; } TEST_ASSERT(i == 998); } @@ -2112,8 +2104,10 @@ int testForwardSDSupplier() { while (!strm.eof()) { std::string line; std::getline(strm, line); - if (!strm.eof()) ++i; - if (i > 1000) break; + if (!strm.eof()) + ++i; + if (i > 1000) + break; } TEST_ASSERT(i == 997); } @@ -2154,8 +2148,10 @@ int testForwardSDSupplier() { while (!strm.eof()) { std::string line; std::getline(strm, line); - if (!strm.eof()) ++i; - if (i > 1700) break; + if (!strm.eof()) + ++i; + if (i > 1700) + break; } TEST_ASSERT(i == 1663); } @@ -2171,8 +2167,10 @@ int testForwardSDSupplier() { while (!strm.eof()) { std::string line; std::getline(strm, line); - if (!strm.eof()) ++i; - if (i > 1700) break; + if (!strm.eof()) + ++i; + if (i > 1700) + break; } TEST_ASSERT(i == 1663); } @@ -2195,7 +2193,7 @@ int testForwardSDSupplier() { } TEST_ASSERT(i == 16); } -#endif // RDK_BUILD_COORDGEN_SUPPORT +#endif // RDK_BUILD_COORDGEN_SUPPORT #endif return 1; @@ -2232,7 +2230,7 @@ void testIssue3525673() { ROMol *nmol; nmol = reader.next(); - TEST_ASSERT(!nmol); + TEST_ASSERT(nmol); nmol = reader.next(); TEST_ASSERT(nmol); @@ -2240,10 +2238,10 @@ void testIssue3525673() { delete nmol; nmol = reader.next(); - TEST_ASSERT(!nmol); + TEST_ASSERT(nmol); nmol = reader.next(); - TEST_ASSERT(!nmol); + TEST_ASSERT(nmol); nmol = reader.next(); TEST_ASSERT(nmol); @@ -2251,10 +2249,10 @@ void testIssue3525673() { delete nmol; nmol = reader.next(); - TEST_ASSERT(!nmol); + TEST_ASSERT(nmol); nmol = reader.next(); - TEST_ASSERT(!nmol); + TEST_ASSERT(!nmol); // broken due to 'foo' in counts line! nmol = reader.next(); TEST_ASSERT(nmol); @@ -2262,7 +2260,7 @@ void testIssue3525673() { delete nmol; nmol = reader.next(); - TEST_ASSERT(!nmol); + TEST_ASSERT(nmol); } void testBlankLinesInProps() { diff --git a/Code/GraphMol/FileParsers/test_data/Issue3432136_2.v3k.mol b/Code/GraphMol/FileParsers/test_data/Issue3432136_2.v3k.mol index 4032b20b2..7afe060ce 100644 --- a/Code/GraphMol/FileParsers/test_data/Issue3432136_2.v3k.mol +++ b/Code/GraphMol/FileParsers/test_data/Issue3432136_2.v3k.mol @@ -32,6 +32,7 @@ M V30 10 1 7 11 M V30 11 1 11 12 M V30 END BOND M V30 BEGIN SGROUP +M V30 DEFAULT CLASS=DEMOCLASS LABEL=overwritten M V30 1 SUP 0 ATOMS=(6 6 7 8 9 11 12) XBONDS=(1 5) LABEL=abbrev ESTATE=E M V30 END SGROUP M V30 END CTAB diff --git a/Code/GraphMol/MolPickler.cpp b/Code/GraphMol/MolPickler.cpp index a087c08fd..750889c01 100644 --- a/Code/GraphMol/MolPickler.cpp +++ b/Code/GraphMol/MolPickler.cpp @@ -12,12 +12,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include @@ -823,6 +823,7 @@ void MolPickler::_pickle(const ROMol *mol, std::ostream &ss, int32_t tmpInt; bool includeAtomCoords = true; std::map atomIdxMap; + std::map bondIdxMap; tmpInt = static_cast(mol->getNumAtoms()); streamWrite(ss, tmpInt); @@ -854,7 +855,9 @@ void MolPickler::_pickle(const ROMol *mol, std::ostream &ss, // ------------------- streamWrite(ss, BEGINBOND); for (unsigned int i = 0; i < mol->getNumBonds(); i++) { - _pickleBond(ss, mol->getBondWithIdx(i), atomIdxMap); + auto bond = mol->getBondWithIdx(i); + _pickleBond(ss, bond, atomIdxMap); + bondIdxMap[bond->getIdx()] = i; } // ------------------- @@ -868,6 +871,22 @@ void MolPickler::_pickle(const ROMol *mol, std::ostream &ss, _pickleSSSR(ss, ringInfo, atomIdxMap); } + // ------------------- + // + // Write SGroups (if present) + // + // ------------------- + const auto &sgroups = getSGroups(*mol); + if (!sgroups.empty()) { + streamWrite(ss, BEGINSGROUP); + + tmpInt = static_cast(sgroups.size()); + streamWrite(ss, tmpInt); + + for (const auto &sgroup : sgroups) { + _pickleSGroup(ss, sgroup, atomIdxMap, bondIdxMap); + } + } // Write Stereo Groups { auto &stereo_groups = mol->getStereoGroups(); @@ -997,6 +1016,24 @@ void MolPickler::_depickle(std::istream &ss, ROMol *mol, int version, streamRead(ss, tag, version); } + // ------------------- + // + // Read SGroups (if needed) + // + // ------------------- + + if (tag == BEGINSGROUP) { + streamRead(ss, tmpInt, version); + + // Create SGroups + for (int i = 0; i < tmpInt; ++i) { + auto sgroup = _getSGroupFromPickle(ss, mol, version); + addSGroup(*mol, sgroup); + } + + streamRead(ss, tag, version); + } + if (tag == BEGINSTEREOGROUP) { _depickleStereo(ss, mol, version); streamRead(ss, tag, version); @@ -1727,6 +1764,178 @@ void MolPickler::_addRingInfoFromPickle(std::istream &ss, ROMol *mol, } } +//-------------------------------------- +// +// SGroups +// +//-------------------------------------- + +template +void MolPickler::_pickleSGroup(std::ostream &ss, const SGroup &sgroup, + std::map &atomIdxMap, + std::map &bondIdxMap) { + T tmpT; + + streamWriteProps(ss, sgroup); + + const auto &atoms = sgroup.getAtoms(); + streamWrite(ss, static_cast(atoms.size())); + for (const auto &atom : atoms) { + tmpT = static_cast(atomIdxMap[atom]); + streamWrite(ss, tmpT); + } + + const auto &p_atoms = sgroup.getParentAtoms(); + streamWrite(ss, static_cast(p_atoms.size())); + for (const auto &p_atom : p_atoms) { + tmpT = static_cast(atomIdxMap[p_atom]); + streamWrite(ss, tmpT); + } + + const auto &bonds = sgroup.getBonds(); + streamWrite(ss, static_cast(bonds.size())); + for (const auto &bond : bonds) { + tmpT = static_cast(bondIdxMap[bond]); + streamWrite(ss, tmpT); + } + + const auto &brackets = sgroup.getBrackets(); + streamWrite(ss, static_cast(brackets.size())); + for (const auto &bracket : brackets) { + // 3 point per bracket; 3rd point and all z are zeros, + // but this might change in the future. + for (const auto &pt : bracket) { + float tmpFloat; + tmpFloat = static_cast(pt.x); + streamWrite(ss, tmpFloat); + tmpFloat = static_cast(pt.y); + streamWrite(ss, tmpFloat); + tmpFloat = static_cast(pt.z); + streamWrite(ss, tmpFloat); + } + } + + const auto &cstates = sgroup.getCStates(); + streamWrite(ss, static_cast(cstates.size())); + for (const auto &cstate : cstates) { + // Bond + tmpT = static_cast(bondIdxMap[cstate.bondIdx]); + streamWrite(ss, tmpT); + + // Vector -- existence depends on SGroup type + if ("SUP" == sgroup.getProp("TYPE")) { + float tmpFloat; + tmpFloat = static_cast(cstate.vector.x); + streamWrite(ss, tmpFloat); + tmpFloat = static_cast(cstate.vector.y); + streamWrite(ss, tmpFloat); + tmpFloat = static_cast(cstate.vector.z); + streamWrite(ss, tmpFloat); + } + } + + const auto &attachPoints = sgroup.getAttachPoints(); + streamWrite(ss, static_cast(attachPoints.size())); + for (const auto &attachPoint : attachPoints) { + // aIdx -- always present + tmpT = static_cast(atomIdxMap[attachPoint.aIdx]); + streamWrite(ss, tmpT); + + // lvIdx -- may be -1 if not used (0 in spec) + auto tmpInt = -1; + if (attachPoint.lvIdx != -1) { + tmpInt = static_cast(atomIdxMap[attachPoint.lvIdx]); + } + streamWrite(ss, tmpInt); + + // id -- may be blank + auto tmpS = static_cast(attachPoint.id); + streamWrite(ss, tmpS); + } +} + +template +SGroup MolPickler::_getSGroupFromPickle(std::istream &ss, ROMol *mol, + int version) { + T tmpT; + T numItems; + float tmpFloat; + int tmpInt = -1; + std::string tmpS; + + // temporarily accept empty TYPE + SGroup sgroup(mol, ""); + + // Read RDProps, overriding ID, TYPE and COMPNO + streamReadProps(ss, sgroup); + + streamRead(ss, numItems, version); + for (int i = 0; i < numItems; ++i) { + streamRead(ss, tmpT, version); + sgroup.addAtomWithIdx(tmpT); + } + + streamRead(ss, numItems, version); + for (int i = 0; i < numItems; ++i) { + streamRead(ss, tmpT, version); + sgroup.addParentAtomWithIdx(tmpT); + } + + streamRead(ss, numItems, version); + for (int i = 0; i < numItems; ++i) { + streamRead(ss, tmpT, version); + sgroup.addBondWithIdx(tmpT); + } + + streamRead(ss, numItems, version); + for (int i = 0; i < numItems; ++i) { + SGroup::Bracket bracket; + for (auto j = 0; j < 3; ++j) { + streamRead(ss, tmpFloat, version); + double x = static_cast(tmpFloat); + streamRead(ss, tmpFloat, version); + double y = static_cast(tmpFloat); + streamRead(ss, tmpFloat, version); + double z = static_cast(tmpFloat); + bracket[j] = RDGeom::Point3D(x, y, z); + } + sgroup.addBracket(bracket); + } + + streamRead(ss, numItems, version); + for (int i = 0; i < numItems; ++i) { + streamRead(ss, tmpT, version); + RDGeom::Point3D vector; + + if ("SUP" == sgroup.getProp("TYPE")) { + streamRead(ss, tmpFloat, version); + vector.x = static_cast(tmpFloat); + streamRead(ss, tmpFloat, version); + vector.y = static_cast(tmpFloat); + streamRead(ss, tmpFloat, version); + vector.z = static_cast(tmpFloat); + } + + sgroup.addCState(tmpT, vector); + } + + streamRead(ss, numItems, version); + for (int i = 0; i < numItems; ++i) { + streamRead(ss, tmpT, version); + unsigned int aIdx = tmpT; + + streamRead(ss, tmpInt, version); + int lvIdx = tmpInt; + + std::string id; + streamRead(ss, id, version); + + sgroup.addAttachPoint(aIdx, lvIdx, id); + } + + return sgroup; +} + template void MolPickler::_pickleStereo(std::ostream &ss, const std::vector &groups, @@ -1735,7 +1944,7 @@ void MolPickler::_pickleStereo(std::ostream &ss, streamWrite(ss, tmpT); for (auto &&group : groups) { streamWrite(ss, static_cast(group.getGroupType())); - auto& atoms = group.getAtoms(); + auto &atoms = group.getAtoms(); streamWrite(ss, static_cast(atoms.size())); for (auto &&atom : atoms) { tmpT = static_cast(atomIdxMap[atom->getIdx()]); @@ -1915,9 +2124,8 @@ void MolPickler::_addAtomFromPickleV1(std::istream &ss, ROMol *mol) { break; case ATOM_MASS: streamRead(ss, dblVar, version); - // we don't need to set this anymore, but we do need to read it in order - // to - // maintain backwards compatibility + // we don't need to set this anymore, but we do need to read it in + // order to maintain backwards compatibility break; case ATOM_ISAROMATIC: streamRead(ss, charVar, version); diff --git a/Code/GraphMol/MolPickler.h b/Code/GraphMol/MolPickler.h index d4219c86a..349c10b07 100644 --- a/Code/GraphMol/MolPickler.h +++ b/Code/GraphMol/MolPickler.h @@ -132,6 +132,7 @@ class RDKIT_GRAPHMOL_EXPORT MolPickler { BEGINATOMPROPS, BEGINBONDPROPS, BEGINQUERYATOMDATA, + BEGINSGROUP, BEGINSTEREOGROUP, } Tags; @@ -199,6 +200,12 @@ class RDKIT_GRAPHMOL_EXPORT MolPickler { static void _pickleSSSR(std::ostream &ss, const RingInfo *ringInfo, std::map &atomIdxMap); + //! do the actual work of pickling a SGroup + template + static void _pickleSGroup(std::ostream &ss, const SGroup &sgroup, + std::map &atomIdxMap, + std::map &bondIdxMap); + //! do the actual work of pickling Stereo Group data template static void _pickleStereo(std::ostream &ss, @@ -232,6 +239,10 @@ class RDKIT_GRAPHMOL_EXPORT MolPickler { static void _addRingInfoFromPickle(std::istream &ss, ROMol *mol, int version, bool directMap = false); + //! extract a SGroup from a pickle + template + static SGroup _getSGroupFromPickle(std::istream &ss, ROMol *mol, int version); + template static void _depickleStereo(std::istream &ss, ROMol *mol, int version); diff --git a/Code/GraphMol/ROMol.cpp b/Code/GraphMol/ROMol.cpp index d95efdace..20b0abb4d 100644 --- a/Code/GraphMol/ROMol.cpp +++ b/Code/GraphMol/ROMol.cpp @@ -21,6 +21,7 @@ #include "QueryBond.h" #include "MolPickler.h" #include "Conformer.h" +#include "Sgroup.h" namespace RDKit { class QueryAtom; @@ -104,6 +105,11 @@ void ROMol::initFromOther(const ROMol &other, bool quickCopy, int confId) { } } + // Copy sgroups + for (const auto &sg : getSGroups(other)) { + addSGroup(*this, sg); + } + dp_props = other.dp_props; // Bookmarks should be copied as well: @@ -185,7 +191,7 @@ const Atom *ROMol::getAtomWithIdx(unsigned int idx) const { } // returns the first inserted atom with the given bookmark -Atom *ROMol::getAtomWithBookmark(const int mark) { +Atom *ROMol::getAtomWithBookmark(int mark) { PRECONDITION(d_atomBookmarks.count(mark) != 0, "atom bookmark not found"); PRECONDITION(d_atomBookmarks[mark].begin() != d_atomBookmarks[mark].end(), "atom bookmark not found"); @@ -194,13 +200,20 @@ Atom *ROMol::getAtomWithBookmark(const int mark) { }; // returns all atoms with the given bookmark -ROMol::ATOM_PTR_LIST &ROMol::getAllAtomsWithBookmark(const int mark) { +ROMol::ATOM_PTR_LIST &ROMol::getAllAtomsWithBookmark(int mark) { PRECONDITION(d_atomBookmarks.count(mark) != 0, "atom bookmark not found"); return d_atomBookmarks[mark]; }; +// returns the unique atom with the given bookmark +Atom *ROMol::getUniqueAtomWithBookmark(int mark) { + PRECONDITION(d_atomBookmarks.count(mark) == 1, + "multiple atoms with same bookmark"); + return getAtomWithBookmark(mark); +} + // returns the first inserted bond with the given bookmark -Bond *ROMol::getBondWithBookmark(const int mark) { +Bond *ROMol::getBondWithBookmark(int mark) { PRECONDITION(d_bondBookmarks.count(mark) != 0, "bond bookmark not found"); PRECONDITION(d_bondBookmarks[mark].begin() != d_bondBookmarks[mark].end(), "bond bookmark not found"); @@ -208,14 +221,21 @@ Bond *ROMol::getBondWithBookmark(const int mark) { }; // returns all bonds with the given bookmark -ROMol::BOND_PTR_LIST &ROMol::getAllBondsWithBookmark(const int mark) { +ROMol::BOND_PTR_LIST &ROMol::getAllBondsWithBookmark(int mark) { PRECONDITION(d_bondBookmarks.count(mark) != 0, "bond bookmark not found"); return d_bondBookmarks[mark]; }; +// returns the unique bond with the given bookmark +Bond *ROMol::getUniqueBondWithBookmark(int mark) { + PRECONDITION(d_bondBookmarks.count(mark) == 1, + "multiple bons with same bookmark"); + return getBondWithBookmark(mark); +} + void ROMol::clearAtomBookmark(const int mark) { d_atomBookmarks.erase(mark); } -void ROMol::clearAtomBookmark(const int mark, const Atom *atom) { +void ROMol::clearAtomBookmark(int mark, const Atom *atom) { if (d_atomBookmarks.count(mark) != 0) { ATOM_PTR_LIST *entry = &d_atomBookmarks[mark]; unsigned int tgtIdx = atom->getIdx(); @@ -231,8 +251,8 @@ void ROMol::clearAtomBookmark(const int mark, const Atom *atom) { } } -void ROMol::clearBondBookmark(const int mark) { d_bondBookmarks.erase(mark); } -void ROMol::clearBondBookmark(const int mark, const Bond *bond) { +void ROMol::clearBondBookmark(int mark) { d_bondBookmarks.erase(mark); } +void ROMol::clearBondBookmark(int mark, const Bond *bond) { if (d_bondBookmarks.count(mark) != 0) { BOND_PTR_LIST *entry = &d_bondBookmarks[mark]; unsigned int tgtIdx = bond->getIdx(); @@ -540,7 +560,7 @@ const Conformer &ROMol::getConformer(int id) const { return *(*ci); } } - // we did not find a coformation with the specified ID + // we did not find a conformation with the specified ID std::string mesg = "Can't find conformation with ID: "; mesg += id; throw ConformerException(mesg); @@ -561,7 +581,7 @@ Conformer &ROMol::getConformer(int id) { return *(*ci); } } - // we did not find a coformation with the specified ID + // we did not find a conformation with the specified ID std::string mesg = "Can't find conformation with ID: "; mesg += id; throw ConformerException(mesg); diff --git a/Code/GraphMol/ROMol.h b/Code/GraphMol/ROMol.h index dac9c8bdd..0cf362bc9 100644 --- a/Code/GraphMol/ROMol.h +++ b/Code/GraphMol/ROMol.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2003-2015 Greg Landrum and Rational Discovery LLC +// Copyright (C) 2003-2018 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. @@ -34,9 +34,11 @@ #include "Atom.h" #include "Bond.h" #include "Conformer.h" +#include "Sgroup.h" #include "StereoGroup.h" namespace RDKit { +class SGroup; class Atom; class Bond; //! This is the BGL type used to store the topology: @@ -367,12 +369,15 @@ class RDKIT_GRAPHMOL_EXPORT ROMol : public RDProps { }; //! returns the first Atom associated with the \c bookmark provided Atom *getAtomWithBookmark(int mark); + //! returns the Atom associated with the \c bookmark provided + //! a check is made to ensure it is the only atom with that bookmark + Atom *getUniqueAtomWithBookmark(int mark); //! returns all Atoms associated with the \c bookmark provided ATOM_PTR_LIST &getAllAtomsWithBookmark(int mark); //! removes a \c bookmark from our collection - void clearAtomBookmark(const int mark); + void clearAtomBookmark(int mark); //! removes a particular Atom from the list associated with the \c bookmark - void clearAtomBookmark(const int mark, const Atom *atom); + void clearAtomBookmark(int mark, const Atom *atom); //! blows out all atomic \c bookmarks void clearAllAtomBookmarks() { d_atomBookmarks.clear(); }; @@ -387,6 +392,9 @@ class RDKIT_GRAPHMOL_EXPORT ROMol : public RDProps { }; //! returns the first Bond associated with the \c bookmark provided Bond *getBondWithBookmark(int mark); + //! returns the Bond associated with the \c bookmark provided + //! a check is made to ensure it is the only bond with that bookmark + Bond *getUniqueBondWithBookmark(int mark); //! returns all bonds associated with the \c bookmark provided BOND_PTR_LIST &getAllBondsWithBookmark(int mark); //! removes a \c bookmark from our collection @@ -436,8 +444,6 @@ class RDKIT_GRAPHMOL_EXPORT ROMol : public RDProps { return rdcast(d_confs.size()); } - //@} - //! \name Topology //@{ @@ -663,6 +669,10 @@ class RDKIT_GRAPHMOL_EXPORT ROMol : public RDProps { BOND_BOOKMARK_MAP d_bondBookmarks; RingInfo *dp_ringInfo; CONF_SPTR_LIST d_confs; + std::vector d_sgroups; + friend std::vector &getSGroups(ROMol &); + friend const std::vector &getSGroups(const ROMol &); + void clearSGroups() { d_sgroups.clear(); } std::vector d_stereo_groups; ROMol &operator=( diff --git a/Code/GraphMol/RWMol.cpp b/Code/GraphMol/RWMol.cpp index f8051da72..7352d68c3 100644 --- a/Code/GraphMol/RWMol.cpp +++ b/Code/GraphMol/RWMol.cpp @@ -18,6 +18,7 @@ #include "Bond.h" #include "BondIterators.h" #include "RingInfo.h" +#include "Sgroup.h" namespace RDKit { void RWMol::destroy() { @@ -77,6 +78,9 @@ void RWMol::insertMol(const ROMol &other) { ++firstB; } + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + // add atom to any conformers as well, if we have any if (other.getNumConformers() && !getNumConformers()) { for (auto cfi = other.beginConformers(); cfi != other.endConformers(); @@ -119,6 +123,10 @@ unsigned int RWMol::addAtom(bool updateLabel) { clearAtomBookmark(ci_RIGHTMOST_ATOM); setAtomBookmark(atom_p, ci_RIGHTMOST_ATOM); } + + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + // add atom to any conformers as well, if we have any for (auto cfi = this->beginConformers(); cfi != this->endConformers(); ++cfi) { @@ -143,6 +151,9 @@ void RWMol::replaceAtom(unsigned int idx, Atom *atom_pin, bool updateLabel, delete d_graph[vd]; d_graph[vd] = atom_p; // FIX: do something about bookmarks + + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); }; void RWMol::replaceBond(unsigned int idx, Bond *bond_pin, bool preserveProps) { @@ -164,6 +175,9 @@ void RWMol::replaceBond(unsigned int idx, Bond *bond_pin, bool preserveProps) { delete d_graph[*(bIter.first)]; d_graph[*(bIter.first)] = bond_p; // FIX: do something about bookmarks + + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); }; Atom *RWMol::getActiveAtom() { @@ -258,6 +272,9 @@ void RWMol::removeAtom(Atom *atom) { } } + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + // Remove any stereo group which includes the atom being deleted removeGroupsWithAtom(atom, d_stereo_groups); @@ -314,6 +331,9 @@ unsigned int RWMol::addBond(unsigned int atomIdx1, unsigned int atomIdx2, dp_ringInfo->reset(); } + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + return numBonds; // res; } @@ -374,6 +394,9 @@ void RWMol::removeBond(unsigned int aid1, unsigned int aid2) { // to be wrong now: dp_ringInfo->reset(); + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + // loop over all bonds with higher indices and update their indices ROMol::EDGE_ITER firstB, lastB; boost::tie(firstB, lastB) = this->getEdges(); @@ -399,6 +422,10 @@ Bond *RWMol::createPartialBond(unsigned int atomIdx1, Bond::BondType bondType) { auto *b = new Bond(bondType); b->setOwningMol(this); b->setBeginAtomIdx(atomIdx1); + + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + return b; } @@ -411,6 +438,10 @@ unsigned int RWMol::finishPartialBond(unsigned int atomIdx2, int bondBookmark, if (bondType == Bond::UNSPECIFIED) { bondType = bsp->getBondType(); } + + // SGroups do not tolerate modification of the molecule, so drop them + clearSGroups(); + return addBond(bsp->getBeginAtomIdx(), atomIdx2, bondType); } diff --git a/Code/GraphMol/Sgroup.cpp b/Code/GraphMol/Sgroup.cpp new file mode 100644 index 000000000..1d3102452 --- /dev/null +++ b/Code/GraphMol/Sgroup.cpp @@ -0,0 +1,200 @@ +// +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +#include "Sgroup.h" +#include "ROMol.h" + +namespace RDKit { + +SGroup::SGroup(ROMol *owning_mol, const std::string &type) + : RDProps(), dp_mol(owning_mol) { + PRECONDITION(owning_mol, "supplied owning molecule is bad"); + + // TYPE is required to be set , as other properties will depend on it. + setProp("TYPE", type); +} + +void SGroup::setOwningMol(ROMol *mol) { + PRECONDITION(mol, "owning molecule is nullptr"); + + dp_mol = mol; +} + +unsigned int SGroup::getIndexInMol() const { + PRECONDITION(dp_mol, "SGroup is not owned by any molecule"); + + const auto &sgroups = getSGroups(*dp_mol); + CHECK_INVARIANT(!sgroups.empty(), "No SGroups found on owning molecule"); + + auto match_sgroup = [&](const SGroup &sg) { return this == &sg; }; + auto sgroupItr = std::find_if(sgroups.begin(), sgroups.end(), match_sgroup); + + if (sgroupItr == sgroups.end()) { + std::ostringstream errout; + errout << "Unable to find own index in owning mol SGroup collection" + << std::endl; + throw SGroupException(errout.str()); + } + + return sgroupItr - sgroups.begin(); +} + +void SGroup::addAtomWithIdx(unsigned int idx) { + PRECONDITION(dp_mol, "bad mol"); + PRECONDITION(dp_mol->getAtomWithIdx(idx), "wrong atom index"); + + d_atoms.push_back(idx); +} + +void SGroup::addAtomWithBookmark(int mark) { + PRECONDITION(dp_mol, "bad mol"); + Atom *atom = dp_mol->getUniqueAtomWithBookmark(mark); + d_atoms.push_back(atom->getIdx()); +} + +void SGroup::addParentAtomWithIdx(unsigned int idx) { + PRECONDITION(dp_mol, "bad mol"); + + if (std::find(d_atoms.begin(), d_atoms.end(), idx) == d_atoms.end()) { + std::ostringstream errout; + errout << "Atom " << idx << " is not a member of current SGroup"; + throw SGroupException(errout.str()); + } + + d_patoms.push_back(idx); +} + +void SGroup::addParentAtomWithBookmark(int mark) { + PRECONDITION(dp_mol, "bad mol"); + + Atom *atom = dp_mol->getUniqueAtomWithBookmark(mark); + unsigned int idx = atom->getIdx(); + if (std::find(d_atoms.begin(), d_atoms.end(), idx) == d_atoms.end()) { + std::ostringstream errout; + errout << "Atom with bookmark " << mark + << " is not a member of current SGroup "; + throw SGroupException(errout.str()); + } + + d_patoms.push_back(idx); +} + +void SGroup::addBondWithIdx(unsigned int idx) { + PRECONDITION(dp_mol, "bad mol"); + PRECONDITION(dp_mol->getBondWithIdx(idx), "wrong bond index"); + + d_bonds.push_back(idx); +} + +void SGroup::addBondWithBookmark(int mark) { + PRECONDITION(dp_mol, "bad mol"); + Bond *bond = dp_mol->getUniqueBondWithBookmark(mark); + d_bonds.push_back(bond->getIdx()); +} + +void SGroup::addBracket(const SGroup::Bracket &bracket) { + d_brackets.push_back(bracket); +} + +void SGroup::addCState(unsigned int bondIdx, const RDGeom::Point3D &vector) { + PRECONDITION(dp_mol, "bad mol"); + PRECONDITION(!d_bonds.empty(), "no bonds"); + + if (getBondType(bondIdx) != SGroup::BondType::XBOND) { + std::ostringstream errout; + errout << "Bond with index " << bondIdx + << " is not an XBOND for current SGroup"; + throw SGroupException(errout.str()); + } + d_cstates.push_back({bondIdx, vector}); +} + +void SGroup::addAttachPoint(unsigned int aIdx, int lvIdx, + const std::string &idStr) { + d_saps.push_back({aIdx, lvIdx, idStr}); +} + +//! check if the bond is SGroup XBOND or CBOND +SGroup::BondType SGroup::getBondType(unsigned int bondIdx) const { + PRECONDITION( + std::find(d_bonds.begin(), d_bonds.end(), bondIdx) != d_bonds.end(), + "bond is not part of the SGroup") + + auto bond = dp_mol->getBondWithIdx(bondIdx); + bool begin_atom_in_sgroup = + std::find(d_atoms.begin(), d_atoms.end(), bond->getBeginAtomIdx()) != + d_atoms.end(); + bool end_atom_in_sgroup = std::find(d_atoms.begin(), d_atoms.end(), + bond->getEndAtomIdx()) != d_atoms.end(); + + if (begin_atom_in_sgroup && end_atom_in_sgroup) { + return SGroup::BondType::CBOND; + } else if (begin_atom_in_sgroup || end_atom_in_sgroup) { + return SGroup::BondType::XBOND; + } else { + std::ostringstream errout; + errout << "Neither beginning nor ending atoms of bond " << bond->getIdx() + << " is in this SGroup."; + throw SGroupException(errout.str()); + } +} + +bool SGroupChecks::isValidType(const std::string &type) { + return std::find(SGroupChecks::sGroupTypes.begin(), + SGroupChecks::sGroupTypes.end(), + type) != SGroupChecks::sGroupTypes.end(); +} + +bool SGroupChecks::isValidSubType(const std::string &type) { + return std::find(SGroupChecks::sGroupSubtypes.begin(), + SGroupChecks::sGroupSubtypes.end(), + type) != SGroupChecks::sGroupSubtypes.end(); +} + +bool SGroupChecks::isValidConnectType(const std::string &type) { + return std::find(SGroupChecks::sGroupConnectTypes.begin(), + SGroupChecks::sGroupConnectTypes.end(), + type) != SGroupChecks::sGroupConnectTypes.end(); +} + +bool SGroupChecks::isSGroupIdFree(const ROMol &mol, unsigned int id) { + auto match_sgroup = [&](const SGroup &sg) { + unsigned int storedId; + return sg.getPropIfPresent("ID", storedId) && id == storedId; + }; + + const auto &sgroups = getSGroups(mol); + return std::find_if(sgroups.begin(), sgroups.end(), match_sgroup) == + sgroups.end(); +} + +std::vector &getSGroups(ROMol &mol) { return mol.d_sgroups; } +const std::vector &getSGroups(const ROMol &mol) { + return mol.d_sgroups; +} + +unsigned int addSGroup(ROMol &mol, SGroup sgroup) { + sgroup.setOwningMol(&mol); + + auto &&sgroups = getSGroups(mol); + unsigned int id = sgroups.size(); + + sgroups.push_back(std::move(sgroup)); + + return id; +} + +} // namespace RDKit + +std::ostream &operator<<(std::ostream &target, const RDKit::SGroup &sgroup) { + target << sgroup.getIndexInMol() << ' ' + << sgroup.getProp("TYPE"); + return target; +} diff --git a/Code/GraphMol/Sgroup.h b/Code/GraphMol/Sgroup.h new file mode 100644 index 000000000..b0f479baf --- /dev/null +++ b/Code/GraphMol/Sgroup.h @@ -0,0 +1,180 @@ +// +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +/*! \file Sgroup.h + + \brief Defines the SGroup class + +*/ +#include +#ifndef _RD_SGROUP_H +#define _RD_SGROUP_H + +#include + +#include +#include +#include +#include + +namespace RDKit { +class ROMol; +class Bond; +class Atom; + +//! used to indicate errors from incorrect sgroup access +class RDKIT_GRAPHMOL_EXPORT SGroupException : public std::runtime_error { + public: + //! construct with an error message + SGroupException(const char *msg) : std::runtime_error(msg){}; + //! construct with an error message + SGroupException(const std::string &msg) : std::runtime_error(msg){}; +}; + +//! The class for representing SGroups +/*! + Notes: + - Implementation is based on 2010 MDL SD specification: + http://infochim.u-strasbg.fr/recherche/Download/Fragmentor/MDL_SDF.pdf + - See SGroups.md for further, more comprehensive notes. + +*/ + +class RDKIT_GRAPHMOL_EXPORT SGroup : public RDProps { + public: + //! Bond type (see V3000 spec) + enum class BondType { + XBOND, // External/Crossing bond + CBOND, // Internal/Contained bond + }; + + typedef std::array Bracket; + + //! Data structure for SAP lines (see V3000 spec) + //! lvIdx may not be set; this signaled with value -1 + struct AttachPoint { + unsigned int aIdx; + int lvIdx; + std::string id; + }; + + //! See specification for V3000 CSTATE + //! vector may or not be considered, depending on TYPE + struct CState { + unsigned int bondIdx; + RDGeom::Point3D vector; + }; + + //! No default constructor + SGroup() = delete; + + //! Main Constructor. Ownsership is only set on this side of the relationship: + //! mol->addSGroup(sgroup) still needs to be called to get ownership on the + //! other side. + SGroup(ROMol *owning_mol, const std::string &type); + + SGroup(const SGroup &other) = default; + SGroup(SGroup &&other) = default; + + SGroup &operator=(const SGroup &other) = default; + SGroup &operator=(SGroup &&other) = default; + + //! Destructor + ~SGroup(){}; + + //! Get the molecule that owns this conformation + ROMol &getOwningMol() const { return *dp_mol; } + + //! get the index of this sgroup in dp_mol's sgroups vector + //! (do not mistake this by the ID!) + unsigned int getIndexInMol() const; + + /* Atom and Bond methods */ + void addAtomWithIdx(unsigned int idx); + void addParentAtomWithIdx(unsigned int idx); + void addBondWithIdx(unsigned int idx); + void addAtomWithBookmark(int mark); + void addParentAtomWithBookmark(int mark); + void addBondWithBookmark(int mark); + + void addBracket(const Bracket &bracket); + void addCState(unsigned int bondIdx, const RDGeom::Point3D &vector); + void addAttachPoint(unsigned int aIdx, int lvIdx, const std::string &idStr); + + BondType getBondType(unsigned int bondIdx) const; + + const std::vector &getAtoms() const { return d_atoms; } + const std::vector &getParentAtoms() const { return d_patoms; } + const std::vector &getBonds() const { return d_bonds; } + + const std::vector &getBrackets() const { return d_brackets; } + const std::vector &getCStates() const { return d_cstates; } + const std::vector &getAttachPoints() const { return d_saps; } + + //! Set owning moelcule + //! This only updates atoms and bonds; parent sgroup has to be updated + //! independently, since parent might not exist at the time this is called. + void setOwningMol(ROMol *mol); + + private: + ROMol *dp_mol = nullptr; // owning molecule + + std::vector d_atoms; + std::vector d_patoms; + std::vector d_bonds; + + std::vector d_brackets; + std::vector d_cstates; + std::vector d_saps; +}; + +namespace SGroupChecks { + +const std::vector sGroupTypes = { + // polymer sgroups: + "SRU", "MON", "COP", "CRO", "GRA", "MOD", "MER", "ANY", + // formulations/mixtures: + "COM", "MIX", "FOR", + // other + "SUP", "MUL", "DAT", "GEN"}; + +const std::vector sGroupSubtypes = {"ALT", "RAN", "BLO"}; +const std::vector sGroupConnectTypes = {"HH", "HT", "EU"}; + +bool isValidType(const std::string &type); + +bool isValidSubType(const std::string &type); + +bool isValidConnectType(const std::string &type); + +bool isSGroupIdFree(const ROMol &mol, unsigned int id); + +} // namespace SGroupChecks + +//! \name SGroups and molecules +//@{ + +std::vector &getSGroups(ROMol &mol); +const std::vector &getSGroups(const ROMol &mol); + +//! Add a new SGroup. A copy is added, so we can be sure that no other +//! references to the SGroup exist. +/*! + \param sgroup - SGroup to be added to the molecule. +*/ +unsigned int addSGroup(ROMol &mol, SGroup sgroup); +//@} + +} // namespace RDKit + +//! allows SGroup objects to be dumped to streams +RDKIT_GRAPHMOL_EXPORT std::ostream &operator<<(std::ostream &target, + const RDKit::SGroup &sg); +#endif diff --git a/Code/GraphMol/catch_sgroups.cpp b/Code/GraphMol/catch_sgroups.cpp new file mode 100644 index 000000000..5f4e38c8f --- /dev/null +++ b/Code/GraphMol/catch_sgroups.cpp @@ -0,0 +1,289 @@ +// +// +// Copyright (C) 2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do + // this in one cpp file +#include "catch.hpp" + +#include +#include +#include +#include +#include "GraphMol/FileParsers/FileParsers.h" + +using namespace RDKit; + +/* Auxiliary functions */ +void testIdxVector(const std::vector &groupVector, + const std::vector &reference) { + size_t vecSize = reference.size(); + CHECK(groupVector.size() == vecSize); + + auto sgItr = groupVector.begin(); + for (auto refItr = reference.begin(); refItr != reference.end(); + ++sgItr, ++refItr) { + CHECK(1 + *sgItr == *refItr); + } +} + +void testBrackets( + const std::vector &brackets, + const std::vector, 3>> &reference) { + CHECK(brackets.size() == 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + CHECK(std::abs(brackets[i][j][k] - reference[i][j][k]) < 1.e-6); + } + } + } +} + +RWMol buildSampleMolecule() { + // This builds a RDKit::RWMol with all implemented SGroup features in order to + // test them. SGroups and features probably do not make any sense. + + //// Initialize Molecule //// + RWMol mol; + + // Add some atoms and bonds + for (unsigned i = 0; i < 6; ++i) { + mol.addAtom(new Atom(6), false, true); + + if (i > 0) { + mol.addBond(i - 1, i, Bond::SINGLE); + } + } + + //// First SGroup //// + { + SGroup sg(&mol, "MUL"); + + sg.setProp("SUBTYPE", "BLO"); + sg.setProp("MULT", "n"); + sg.setProp("CONNECT", "HH"); + + // Add some atoms and bonds + for (unsigned i = 0; i < 3; ++i) { + sg.addAtomWithIdx(i); + sg.addParentAtomWithIdx(i); + sg.addBondWithIdx(i); // add 2 CBONDs + 1 XBOND + } + + sg.setProp("COMPNO", 7u); + sg.setProp("ESTATE", "E"); + + SGroup::Bracket bracket1 = {{RDGeom::Point3D(1., 3., 0.), + RDGeom::Point3D(5., 7., 0.), + RDGeom::Point3D(0., 0., 0.)}}; + sg.addBracket(bracket1); + + SGroup::Bracket bracket2 = {{RDGeom::Point3D(2., 4., 0.), + RDGeom::Point3D(6., 8., 0.), + RDGeom::Point3D(0., 0., 0.)}}; + sg.addBracket(bracket2); + + // Vector should not be parsed (not a SUP group) + sg.addCState(2, RDGeom::Point3D()); + + sg.setProp("CLASS", "TEST CLASS"); + + sg.addAttachPoint(0, 0, "XX"); + + sg.setProp("BRKTYP", "PAREN"); + + addSGroup(mol, sg); + } + //// Second SGroup //// + { + SGroup sg(&mol, "SUP"); + + // Add some atoms and bonds + for (unsigned i = 3; i < 6; ++i) { + sg.addAtomWithIdx(i); + sg.addParentAtomWithIdx(i); + sg.addBondWithIdx(i - 1); // add 1 XBOND + 2 CBONDs + } + + sg.setProp("LABEL", "TEST LABEL"); + + // V2000 has only x and y coords; z value restricted to 0. + RDGeom::Point3D vector(3., 4., 0.); + sg.addCState(2, vector); // Vector should be parsed now! + + sg.addAttachPoint(3, -1, "YY"); + + addSGroup(mol, sg); + } + //// Third SGroup //// + { + SGroup sg(&mol, "DAT"); + + sg.setProp("FIELDNAME", "SAMPLE FIELD NAME"); // 30 char max + // Field Type is ignored in V3000 + sg.setProp("FIELDINFO", "SAMPLE FIELD INFO"); // 20 char max + sg.setProp("QUERYTYPE", "PQ"); // 2 char max + sg.setProp("QUERYOP", "SAMPLE QUERY OP"); // 15 char max (rest of line) + + // This should be properly formatted, but format is not checked + sg.setProp("FIELDDISP", "SAMPLE FIELD DISP"); + + STR_VECT dataFields = {"SAMPLE DATA FIELD 1", "SAMPLE DATA FIELD 2", + "SAMPLE DATA FIELD 3"}; + sg.setProp("DATAFIELDS", dataFields); + + addSGroup(mol, sg); + } + + // Set a parent with higher index + const auto &sgroups = getSGroups(mol); + sgroups.at(0).setProp("PARENT", 2); + + return mol; +} + +/* End Auxiliary functions */ + +TEST_CASE("Basic Sgroup creation", "[Sgroups]") { + // Create two SGroups and add them to a molecule + RWMol mol; + + { + SGroup sg0(&mol, "DAT"); + SGroup sg1(&mol, "SUP"); + addSGroup(mol, sg0); + addSGroup(mol, sg1); + } + + const auto &sgroups = getSGroups(mol); + REQUIRE(sgroups.size() == 2); + CHECK(sgroups.at(0).getProp("TYPE") == "DAT"); + CHECK(sgroups.at(1).getProp("TYPE") == "SUP"); +} + +TEST_CASE("Build and test sample molecule", "[Sgroups]") { + auto mol = buildSampleMolecule(); + + const auto &sgroups = getSGroups(mol); + CHECK(sgroups.size() == 3); + + SECTION("first sgroup") { + const auto &sg = sgroups.at(0); + CHECK(sg.getProp("TYPE") == "MUL"); + + CHECK(sg.getProp("SUBTYPE") == "BLO"); + CHECK(sg.getProp("MULT") == "n"); + CHECK(sg.getProp("CONNECT") == "HH"); + + std::vector atoms_reference = {1, 2, 3}; + auto atoms = sg.getAtoms(); + testIdxVector(atoms, atoms_reference); + + std::vector patoms_reference = {1, 2, 3}; + testIdxVector(sg.getParentAtoms(), patoms_reference); + + std::vector bonds_reference = {1, 2, 3}; + auto bonds = sg.getBonds(); + + // bonds are not sorted in V3000; sort them here + std::sort(bonds.begin(), bonds.end()); + + testIdxVector(bonds, bonds_reference); + + CHECK(sg.getBondType(bonds[0]) == SGroup::BondType::CBOND); + CHECK(sg.getBondType(bonds[1]) == SGroup::BondType::CBOND); + CHECK(sg.getBondType(bonds[2]) == SGroup::BondType::XBOND); + + CHECK(sg.getProp("COMPNO") == 7u); + CHECK(sg.getProp("ESTATE") == "E"); + + std::vector, 3>> brackets_reference = {{ + {{{{1., 3., 0.}}, {{5., 7., 0.}}, {{0., 0., 0.}}}}, + {{{{2., 4., 0.}}, {{6., 8., 0.}}, {{0., 0., 0.}}}}, + }}; + testBrackets(sg.getBrackets(), brackets_reference); + + auto cstates = sg.getCStates(); + CHECK(cstates.size() == 1); + CHECK(cstates[0].bondIdx == bonds[2]); + CHECK(cstates[0].vector.x == 0.); + CHECK(cstates[0].vector.y == 0.); + CHECK(cstates[0].vector.z == 0.); + + CHECK(sg.getProp("CLASS") == "TEST CLASS"); + + auto ap = sg.getAttachPoints(); + CHECK(ap.size() == 1); + CHECK(ap[0].aIdx == atoms[0]); + CHECK(ap[0].lvIdx == static_cast(atoms[0])); + CHECK(ap[0].id == "XX"); + + CHECK(sg.getProp("BRKTYP") == "PAREN"); + + CHECK(sg.getProp("PARENT") == 2u); + } + + SECTION("second sgroup") { + const auto &sg = sgroups.at(1); + CHECK(sg.getProp("TYPE") == "SUP"); + + std::vector atoms_reference = {4, 5, 6}; + auto atoms = sg.getAtoms(); + testIdxVector(atoms, atoms_reference); + + std::vector patoms_reference = {4, 5, 6}; + testIdxVector(sg.getParentAtoms(), patoms_reference); + + std::vector bonds_reference = {3, 4, 5}; + auto bonds = sg.getBonds(); + + // bonds are not sorted in V3000; sort them here + std::sort(bonds.begin(), bonds.end()); + + testIdxVector(bonds, bonds_reference); + CHECK(sg.getBondType(bonds[0]) == SGroup::BondType::XBOND); + CHECK(sg.getBondType(bonds[1]) == SGroup::BondType::CBOND); + CHECK(sg.getBondType(bonds[2]) == SGroup::BondType::CBOND); + + CHECK(sg.getProp("LABEL") == "TEST LABEL"); + + auto cstates = sg.getCStates(); + CHECK(cstates.size() == 1); + CHECK(cstates[0].bondIdx == bonds[0]); + CHECK(cstates[0].vector.x == 3.); + CHECK(cstates[0].vector.y == 4.); + CHECK(cstates[0].vector.z == 0.); + + auto ap = sg.getAttachPoints(); + CHECK(ap.size() == 1); + CHECK(ap[0].aIdx == atoms[0]); + CHECK(ap[0].lvIdx == -1); + CHECK(ap[0].id == "YY"); + } + + SECTION("third sgroup") { + const auto &sg = sgroups.at(2); + CHECK(sg.getProp("TYPE") == "DAT"); + + CHECK(sg.getProp("FIELDNAME") == "SAMPLE FIELD NAME"); + CHECK(sg.getProp("FIELDINFO") == "SAMPLE FIELD INFO"); + CHECK(sg.getProp("QUERYTYPE") == "PQ"); + CHECK(sg.getProp("QUERYOP") == "SAMPLE QUERY OP"); + + CHECK(sg.getProp("FIELDDISP") == "SAMPLE FIELD DISP"); + + auto dataFields = sg.getProp("DATAFIELDS"); + CHECK(dataFields.size() == 3); + CHECK(dataFields[0] == "SAMPLE DATA FIELD 1"); + CHECK(dataFields[1] == "SAMPLE DATA FIELD 2"); + CHECK(dataFields[2] == "SAMPLE DATA FIELD 3"); + } +} diff --git a/Code/GraphMol/testSGroup.cpp b/Code/GraphMol/testSGroup.cpp new file mode 100644 index 000000000..94bfeb03f --- /dev/null +++ b/Code/GraphMol/testSGroup.cpp @@ -0,0 +1,567 @@ +// +// +// Copyright (C) 2002-2018 Greg Landrum and T5 Informatics GmbH +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#include +#include +#include +#include +#include +#include +#include +#include "GraphMol/FileParsers/FileParsers.h" +#include "GraphMol/FileParsers/MolSupplier.h" +#include "GraphMol/FileParsers/MolWriters.h" + +#include +#include + +using namespace RDKit; + +/* Auxiliary functions */ +void testIdxVector(const std::vector &groupVector, + const std::vector &reference) { + size_t vecSize = reference.size(); + TEST_ASSERT(groupVector.size() == vecSize); + + auto sgItr = groupVector.begin(); + for (auto refItr = reference.begin(); refItr != reference.end(); + ++sgItr, ++refItr) { + TEST_ASSERT(1 + *sgItr == *refItr); + } +} + +void testBrackets( + const std::vector &brackets, + const std::vector, 3>> &reference) { + TEST_ASSERT(brackets.size() == 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + TEST_ASSERT(std::abs(brackets[i][j][k] - reference[i][j][k]) < 1.e-6); + } + } + } +} + +RWMol buildSampleMolecule() { + // This builds a RDKit::RWMol with all implemented SGroup features in order to + // test them. SGroups and features probably do not make any sense. + + //// Initialize Molecule //// + RWMol mol; + + // Add some atoms and bonds + for (unsigned i = 0; i < 6; ++i) { + mol.addAtom(new Atom(6), false, true); + + if (i > 0) { + mol.addBond(i - 1, i, Bond::SINGLE); + } + } + + //// First SGroup //// + { + SGroup sg(&mol, "MUL"); + + sg.setProp("SUBTYPE", "BLO"); + sg.setProp("MULT", "n"); + sg.setProp("CONNECT", "HH"); + + // Add some atoms and bonds + for (unsigned i = 0; i < 3; ++i) { + sg.addAtomWithIdx(i); + sg.addParentAtomWithIdx(i); + sg.addBondWithIdx(i); // add 2 CBONDs + 1 XBOND + } + + sg.setProp("COMPNO", 7u); + sg.setProp("ESTATE", "E"); + + SGroup::Bracket bracket1 = {{RDGeom::Point3D(1., 3., 0.), + RDGeom::Point3D(5., 7., 0.), + RDGeom::Point3D(0., 0., 0.)}}; + sg.addBracket(bracket1); + + SGroup::Bracket bracket2 = {{RDGeom::Point3D(2., 4., 0.), + RDGeom::Point3D(6., 8., 0.), + RDGeom::Point3D(0., 0., 0.)}}; + sg.addBracket(bracket2); + + // Vector should not be parsed (not a SUP group) + sg.addCState(2, RDGeom::Point3D()); + + sg.setProp("CLASS", "TEST CLASS"); + + sg.addAttachPoint(0, 0, "XX"); + + sg.setProp("BRKTYP", "PAREN"); + + addSGroup(mol, sg); + } + //// Second SGroup //// + { + SGroup sg(&mol, "SUP"); + + // Add some atoms and bonds + for (unsigned i = 3; i < 6; ++i) { + sg.addAtomWithIdx(i); + sg.addParentAtomWithIdx(i); + sg.addBondWithIdx(i - 1); // add 1 XBOND + 2 CBONDs + } + + sg.setProp("LABEL", "TEST LABEL"); + + // V2000 has only x and y coords; z value restricted to 0. + RDGeom::Point3D vector(3., 4., 0.); + sg.addCState(2, vector); // Vector should be parsed now! + + sg.addAttachPoint(3, -1, "YY"); + + addSGroup(mol, sg); + } + //// Third SGroup //// + { + SGroup sg(&mol, "DAT"); + + sg.setProp("FIELDNAME", "SAMPLE FIELD NAME"); // 30 char max + // Field Type is ignored in V3000 + sg.setProp("FIELDINFO", "SAMPLE FIELD INFO"); // 20 char max + sg.setProp("QUERYTYPE", "PQ"); // 2 char max + sg.setProp("QUERYOP", "SAMPLE QUERY OP"); // 15 char max (rest of line) + + // This should be properly formatted, but format is not checked + sg.setProp("FIELDDISP", "SAMPLE FIELD DISP"); + + STR_VECT dataFields = {"SAMPLE DATA FIELD 1", "SAMPLE DATA FIELD 2", + "SAMPLE DATA FIELD 3"}; + sg.setProp("DATAFIELDS", dataFields); + + addSGroup(mol, sg); + } + + // Set a parent with higher index + const auto &sgroups = getSGroups(mol); + sgroups.at(0).setProp("PARENT", 2); + + return mol; +} + +void checkSampleMolecule(const RWMol &mol) { + // Test a molecule created by buildSampleMolecule (or a copy) + + const auto &sgroups = getSGroups(mol); + TEST_ASSERT(sgroups.size() == 3); + + { + // First SGroup + const auto &sg = sgroups.at(0); + TEST_ASSERT(sg.getProp("TYPE") == "MUL"); + + TEST_ASSERT(sg.getProp("SUBTYPE") == "BLO"); + TEST_ASSERT(sg.getProp("MULT") == "n"); + TEST_ASSERT(sg.getProp("CONNECT") == "HH"); + + std::vector atoms_reference = {1, 2, 3}; + auto atoms = sg.getAtoms(); + testIdxVector(atoms, atoms_reference); + + std::vector patoms_reference = {1, 2, 3}; + testIdxVector(sg.getParentAtoms(), patoms_reference); + + std::vector bonds_reference = {1, 2, 3}; + auto bonds = sg.getBonds(); + + // bonds are not sorted in V3000; sort them here + std::sort(bonds.begin(), bonds.end()); + + testIdxVector(bonds, bonds_reference); + + TEST_ASSERT(sg.getBondType(bonds[0]) == SGroup::BondType::CBOND); + TEST_ASSERT(sg.getBondType(bonds[1]) == SGroup::BondType::CBOND); + TEST_ASSERT(sg.getBondType(bonds[2]) == SGroup::BondType::XBOND); + + TEST_ASSERT(sg.getProp("COMPNO") == 7); + TEST_ASSERT(sg.getProp("ESTATE") == "E"); + + std::vector, 3>> brackets_reference = {{ + {{{{1., 3., 0.}}, {{5., 7., 0.}}, {{0., 0., 0.}}}}, + {{{{2., 4., 0.}}, {{6., 8., 0.}}, {{0., 0., 0.}}}}, + }}; + testBrackets(sg.getBrackets(), brackets_reference); + + auto cstates = sg.getCStates(); + TEST_ASSERT(cstates.size() == 1); + TEST_ASSERT(cstates[0].bondIdx == bonds[2]); + TEST_ASSERT(cstates[0].vector.x == 0.); + TEST_ASSERT(cstates[0].vector.y == 0.); + TEST_ASSERT(cstates[0].vector.z == 0.); + + TEST_ASSERT(sg.getProp("CLASS") == "TEST CLASS"); + + auto ap = sg.getAttachPoints(); + TEST_ASSERT(ap.size() == 1); + TEST_ASSERT(ap[0].aIdx == atoms[0]); + TEST_ASSERT(ap[0].lvIdx == static_cast(atoms[0])); + TEST_ASSERT(ap[0].id == "XX"); + + TEST_ASSERT(sg.getProp("BRKTYP") == "PAREN"); + + TEST_ASSERT(sg.getProp("PARENT") == 2u); + } + + { + // Second SGroup + const auto &sg = sgroups.at(1); + TEST_ASSERT(sg.getProp("TYPE") == "SUP"); + + std::vector atoms_reference = {4, 5, 6}; + auto atoms = sg.getAtoms(); + testIdxVector(atoms, atoms_reference); + + std::vector patoms_reference = {4, 5, 6}; + testIdxVector(sg.getParentAtoms(), patoms_reference); + + std::vector bonds_reference = {3, 4, 5}; + auto bonds = sg.getBonds(); + + // bonds are not sorted in V3000; sort them here + std::sort(bonds.begin(), bonds.end()); + + testIdxVector(bonds, bonds_reference); + TEST_ASSERT(sg.getBondType(bonds[0]) == SGroup::BondType::XBOND); + TEST_ASSERT(sg.getBondType(bonds[1]) == SGroup::BondType::CBOND); + TEST_ASSERT(sg.getBondType(bonds[2]) == SGroup::BondType::CBOND); + + TEST_ASSERT(sg.getProp("LABEL") == "TEST LABEL"); + + auto cstates = sg.getCStates(); + TEST_ASSERT(cstates.size() == 1); + TEST_ASSERT(cstates[0].bondIdx == bonds[0]); + TEST_ASSERT(cstates[0].vector.x == 3.); + TEST_ASSERT(cstates[0].vector.y == 4.); + TEST_ASSERT(cstates[0].vector.z == 0.); + + auto ap = sg.getAttachPoints(); + TEST_ASSERT(ap.size() == 1); + TEST_ASSERT(ap[0].aIdx == atoms[0]); + TEST_ASSERT(ap[0].lvIdx == -1); + TEST_ASSERT(ap[0].id == "YY"); + } + + { + // Third SGroup + const auto &sg = sgroups.at(2); + TEST_ASSERT(sg.getProp("TYPE") == "DAT"); + + TEST_ASSERT(sg.getProp("FIELDNAME") == "SAMPLE FIELD NAME"); + TEST_ASSERT(sg.getProp("FIELDINFO") == "SAMPLE FIELD INFO"); + TEST_ASSERT(sg.getProp("QUERYTYPE") == "PQ"); + TEST_ASSERT(sg.getProp("QUERYOP") == "SAMPLE QUERY OP"); + + TEST_ASSERT(sg.getProp("FIELDDISP") == "SAMPLE FIELD DISP"); + + auto dataFields = sg.getProp("DATAFIELDS"); + TEST_ASSERT(dataFields.size() == 3); + TEST_ASSERT(dataFields[0] == "SAMPLE DATA FIELD 1"); + TEST_ASSERT(dataFields[1] == "SAMPLE DATA FIELD 2"); + TEST_ASSERT(dataFields[2] == "SAMPLE DATA FIELD 3"); + } +} + +/* End Auxiliary functions */ + +void testCreateSGroups() { + BOOST_LOG(rdInfoLog) << " ----------> Testing basic SGroup creation" + << std::endl; + + // Create two SGroups and add them to a molecule + RWMol mol; + + { + SGroup sg0(&mol, "DAT"); + SGroup sg1(&mol, "SUP"); + addSGroup(mol, sg0); + addSGroup(mol, sg1); + } + + const auto &sgroups = getSGroups(mol); + TEST_ASSERT(sgroups.size() == 2); + TEST_ASSERT(sgroups.at(0).getProp("TYPE") == "DAT"); + TEST_ASSERT(sgroups.at(1).getProp("TYPE") == "SUP"); +} + +void testParseSGroups(const std::string &rdbase) { + BOOST_LOG(rdInfoLog) << " ----------> Parsing Issue3432136_1.mol (V2000)" + << std::endl; + { + std::string fName = + rdbase + "/Code/GraphMol/FileParsers/test_data/Issue3432136_1.mol"; + + std::unique_ptr mol(MolFileToMol(fName)); + TEST_ASSERT(mol); + + const auto &sgroups = getSGroups(*mol); + TEST_ASSERT(sgroups.size() == 1); + + const auto &sgroup = sgroups.at(0); + + TEST_ASSERT(sgroup.getProp("TYPE") == "MON"); + + std::vector atoms_reference = {2, 3, 4, 1, 5}; + + testIdxVector(sgroup.getAtoms(), atoms_reference); + + std::vector bonds_reference = + {}; // No bonds defined in this mol + testIdxVector(sgroup.getBonds(), bonds_reference); + + std::vector, 3>> brackets_reference = {{ + {{{{-3.9679, -0.1670, 0.}}, {{-3.9679, 2.1705, 0.}}, {{0., 0., 0.}}}}, + {{{{-0.7244, 2.1705, 0.}}, {{-0.7244, -0.1670, 0.}}, {{0., 0., 0.}}}}, + }}; + testBrackets(sgroup.getBrackets(), brackets_reference); + } + + BOOST_LOG(rdInfoLog) << " ----------> Parsing Issue3432136_1.v3k.mol (V3000) " + << std::endl; + { + std::string fName = + rdbase + "/Code/GraphMol/FileParsers/test_data/Issue3432136_1.v3k.mol"; + std::unique_ptr mol(MolFileToMol(fName)); + TEST_ASSERT(mol); + + const auto &sgroups = getSGroups(*mol); + TEST_ASSERT(sgroups.size() == 1); + + const auto sgroup = sgroups.at(0); + + TEST_ASSERT(sgroup.getProp("TYPE") == "MON"); + + std::vector atoms_reference = {2, 3, 4, 1, 5}; + testIdxVector(sgroup.getAtoms(), atoms_reference); + + std::vector bonds_reference = + {}; // No bonds defined in this mol + testIdxVector(sgroup.getBonds(), bonds_reference); + } + + BOOST_LOG(rdInfoLog) << " ----------> Parsing Issue3432136_2.v3k.mol (V3000) " + << std::endl; + { + std::string fName = + rdbase + "/Code/GraphMol/FileParsers/test_data/Issue3432136_2.v3k.mol"; + std::unique_ptr mol(MolFileToMol(fName)); + TEST_ASSERT(mol); + + const auto &sgroups = getSGroups(*mol); + TEST_ASSERT(sgroups.size() == 1); + + const auto sgroup = sgroups.at(0); + + TEST_ASSERT(sgroup.getProp("TYPE") == "SUP"); + TEST_ASSERT(sgroup.getProp("CLASS") == "DEMOCLASS"); + TEST_ASSERT(sgroup.getProp("LABEL") == "abbrev"); + + std::vector atoms_reference = {6, 7, 8, 9, 11, 12}; + testIdxVector(sgroup.getAtoms(), atoms_reference); + + std::vector bonds_reference = {5}; + testIdxVector(sgroup.getBonds(), bonds_reference); + + auto bond = sgroup.getBonds()[0]; + TEST_ASSERT(sgroup.getBondType(bond) == SGroup::BondType::XBOND); + } + + BOOST_LOG(rdInfoLog) << " ----------> Parsing Issue3432136_2.mol (V2000) " + << std::endl; + { + std::string fName = + rdbase + "/Code/GraphMol/FileParsers/test_data/Issue3432136_2.mol"; + std::unique_ptr mol(MolFileToMol(fName)); + TEST_ASSERT(mol); + + const auto &sgroups = getSGroups(*mol); + TEST_ASSERT(sgroups.size() == 1); + + const auto sgroup = sgroups.at(0); + + TEST_ASSERT(sgroup.getProp("TYPE") == "SUP"); + + std::vector atoms_reference = {6, 7, 8, 9, 11, 12}; + testIdxVector(sgroup.getAtoms(), atoms_reference); + + std::vector bonds_reference = {5}; + testIdxVector(sgroup.getBonds(), bonds_reference); + + auto bond = sgroup.getBonds()[0]; + TEST_ASSERT(sgroup.getBondType(bond) == SGroup::BondType::XBOND); + } +} + +void testSGroupsRoundTrip(const std::string &rdbase, bool forceV3000) { + BOOST_LOG(rdInfoLog) + << " ----------> Testing SGroup writing & parsing Roundtrip (" + << (forceV3000 ? "V3000" : "V2000") << ')' << std::endl; + + std::string fName = + rdbase + "/Code/GraphMol/FileParsers/test_data/testSGroupsSample_" + + (forceV3000 ? "V3000" : "V2000") + ".mol"; + + { + auto sampleMol = buildSampleMolecule(); + + const auto &sgroups = getSGroups(sampleMol); + TEST_ASSERT(sgroups.size() == 3); + + auto writer = SDWriter(fName); + writer.setForceV3000(forceV3000); + writer.write(sampleMol); + writer.close(); + } + + std::unique_ptr roundtripMol(MolFileToMol(fName)); + checkSampleMolecule(*roundtripMol); +} + +void testPickleSGroups() { + BOOST_LOG(rdInfoLog) + << " ----------> Testing SGroup pickling & unpickling Roundtrip" + << std::endl; + + std::string pkl; + + { + auto sampleMol = buildSampleMolecule(); + + MolPickler::pickleMol(sampleMol, pkl); + } + + RWMol roundtripMol(pkl); + checkSampleMolecule(roundtripMol); +} + +void testModifyMol() { + BOOST_LOG(rdInfoLog) << " ----------> Test dropping SGroups on modification" + << std::endl; + + auto mol = buildSampleMolecule(); + auto mol_copy = mol; + + const auto &sgroups = getSGroups(mol); + TEST_ASSERT(sgroups.size() == 3); + + { // insertion will drop SGroups + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + mol_copy.insertMol(mol); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // adding an atom will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + mol_copy.addAtom(); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // replacing an atom will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + auto new_atom = Atom(); + mol_copy.replaceAtom(1, &new_atom); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // replacing a new bond will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + auto new_bond = Bond(Bond::SINGLE); + mol_copy.replaceBond(1, &new_bond); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // removing an atom will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + mol_copy.removeAtom(1); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // creating a new bond between existing atoms will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + mol_copy.addBond(1, 3, Bond::SINGLE); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // removing a bond will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + mol_copy.removeBond(1, 2); + + TEST_ASSERT(sgroups.size() == 0); + } + { + // creating a partial bond will drop SGroups + mol_copy = mol; + + const auto &sgroups = getSGroups(mol_copy); + TEST_ASSERT(sgroups.size() == 3); + + mol_copy.createPartialBond(1, Bond::SINGLE); + + TEST_ASSERT(sgroups.size() == 0); + } +} + +int main() { + std::string rdbase = std::string(getenv("RDBASE")); + if (rdbase.empty()) { + std::cerr << "\n\n RDBASE has not been set, aborting.\n\n"; + return 1; + } + + RDLog::InitLogs(); + + testCreateSGroups(); + testParseSGroups(rdbase); + testSGroupsRoundTrip(rdbase, false); // test V2000 + testSGroupsRoundTrip(rdbase, true); // test V3000 + testPickleSGroups(); + testModifyMol(); + + return 0; +} diff --git a/Docs/Code/SGroups.md b/Docs/Code/SGroups.md new file mode 100644 index 000000000..6e6e0be98 --- /dev/null +++ b/Docs/Code/SGroups.md @@ -0,0 +1,43 @@ +# SGroups in RDKit +Ricardo Rodriguez-Schmidt + +September 2018 + + +## Overview + +MDL SD files support SGroups as a way to store some types of extended intra-molecular information. Some usage examples of these SGroup are the following: + +1. Identification of repeating groups in polymers, i.e. identification of monomer units and, in case of copolymers, how these monomers distribute in the polymer chain. +1. Labeling of relevant sections of a molecule. +1. Detailing depiction information for the molecule or other defined SGroups. +1. Storing information relative to parts of the molecule in the form of data fields. + +This document describes an intent to make a `SGroup` data structure available to RDKit molecules, initiating support for parsing, storing, serializing and writing back this information. The goal is to achieve conversion of MDL SD files that contain SGroups using RDkit without loss of information in these, with some limitations. At this point, no intent will be made to allow or support manipulation or depiction of the information contained in SGroups, but this might be added in the future. + +## Documentation + +As a reference for the design and implementation of the `SGroup` data class, the following document has been used: + +[http://infochim.u-strasbg.fr/recherche/Download/Fragmentor/MDL_SDF.pdf](http://infochim.u-strasbg.fr/recherche/Download/Fragmentor/MDL_SDF.pdf) + +The document describes the syntax for specifying SGroup information both in version V2000 and V3000 of MDL SD files. Both of these have been implemented for input and output, as well as interconversion between them. Best effort has been made to support as many features of SGroups as possible, as well as following specifications as close as possible. Nevertheless, the following features are not currently included in this implementation: + +- V2000 "CRS" lines. +- V3000 "XBHEAD" and "XBCORR" labels. + +## Testing + +A Test file has been added to the SGroup code to make sure that it works in the expected way. Nonetheless, the sample SGroups which are checked in these tests have been designed to check as many features as possible with as less SGroups as possible. They do not make any sense, and should not be taken as a reference on how to use SGroups. + +Besides this test file, the SGroups code has been tested on some sample molecules, which have been parsed and then written back to disk to check data consistency. + +**Remarks**: + +1. The `SGroup` class contains elements consisting of pointers to atoms, bonds and other SGroup objects. When making a copy of a molecule that has SGroups, it is required to update these pointers to objects contained in the new owning molecule. This is done by using the private method `void updateOwningMol(ROMol *other_mol);`. + + This method is to be used **only** after all atoms, bonds, and SGroups on the new molecule have been created, as the pointers will be updated with the objects that are at the same indexes as those in the original molecule. This is especially important for SGroups, as these can have a parent relationship pointing to another SGroup with a higher index (i.e. that has been added to the molecule after the current one), and therefore, might not be initialized yet). + +2. This implementation of SGroups is only meant for parsing and writing from/to SD files and serialization, and not for manipulation. Please take into account that: + - SGroups cannot be removed individually (but all of them can be cleared from a molecule). + - Adding, replacing or removing atoms/bonds from a RWMol will drop the SGroups. \ No newline at end of file