diff --git a/Code/GraphMol/MolHash/CMakeLists.txt b/Code/GraphMol/MolHash/CMakeLists.txt index 7baa47a88..92b9db0f8 100644 --- a/Code/GraphMol/MolHash/CMakeLists.txt +++ b/Code/GraphMol/MolHash/CMakeLists.txt @@ -7,7 +7,7 @@ target_compile_definitions(MolHash PRIVATE RDKIT_MOLHASH_BUILD) rdkit_headers(MolHash.h nmmolhash.h DEST GraphMol/MolHash) -rdkit_catch_test(molHashCatchTest catch_tests.cpp LINK_LIBRARIES MolHash) +rdkit_catch_test(molHashCatchTest ../catch_main.cpp catch_tests.cpp LINK_LIBRARIES MolHash) if(RDK_BUILD_PYTHON_WRAPPERS) add_subdirectory(Wrap) diff --git a/Code/GraphMol/MolHash/MolHash.h b/Code/GraphMol/MolHash/MolHash.h index 3e93d8ab2..061b7d605 100644 --- a/Code/GraphMol/MolHash/MolHash.h +++ b/Code/GraphMol/MolHash/MolHash.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2020 Greg Landrum +// Copyright (C) 2020 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. diff --git a/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp b/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp index 4162cb424..b5f90f782 100644 --- a/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp +++ b/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2020 Greg Landrum +// Copyright (C) 2020-2022 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. @@ -17,9 +17,10 @@ using namespace RDKit; namespace { -std::string MolHashHelper(const ROMol &mol, MolHash::HashFunction func) { +std::string MolHashHelper(const ROMol &mol, MolHash::HashFunction func, + bool useCXSmiles) { RWMol cpy(mol); - return MolHash::MolHash(&cpy, func); + return MolHash::MolHash(&cpy, func, useCXSmiles); } } // namespace @@ -48,7 +49,8 @@ BOOST_PYTHON_MODULE(rdMolHash) { MolHash::HashFunction::ArthorSubstructureOrder); python::def("MolHash", MolHashHelper, - (python::arg("mol"), python::arg("func")), + (python::arg("mol"), python::arg("func"), + python::arg("useCxSmiles") = false), "Generate a hash for a molecule. The func argument determines " "which hash is generated."); } diff --git a/Code/GraphMol/MolHash/Wrap/testMolHash.py b/Code/GraphMol/MolHash/Wrap/testMolHash.py index 54a2e9454..4ee20ca1d 100644 --- a/Code/GraphMol/MolHash/Wrap/testMolHash.py +++ b/Code/GraphMol/MolHash/Wrap/testMolHash.py @@ -35,6 +35,17 @@ class TestCase(unittest.TestCase): self.assertEqual(rdMolHash.MolHash(m, rdMolHash.HashFunction.ArthorSubstructureOrder), '000f001001000c000300005f000000') + def testCxSmiles(self): + m = Chem.MolFromSmiles( + 'C[C@@H](O)[C@@H](C)[C@@H](C)C[C@H](C1=CN=CN1)C1=CNC=N1 |o1:8,5,&1:1,3,r,c:11,18,t:9,15|') + + self.assertEqual(rdMolHash.MolHash(m, rdMolHash.HashFunction.HetAtomTautomer), + 'C[C@@H](CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1)[C@H](C)[C@@H](C)[O]_3_0') + + self.assertEqual( + rdMolHash.MolHash(m, rdMolHash.HashFunction.HetAtomTautomer, True), + 'C[C@@H](CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1)[C@H](C)[C@@H](C)[O]_3_0 |o1:1,&1:14,16|') + if __name__ == "__main__": unittest.main() diff --git a/Code/GraphMol/MolHash/catch_tests.cpp b/Code/GraphMol/MolHash/catch_tests.cpp index 144aa3e5f..c4cbc707d 100644 --- a/Code/GraphMol/MolHash/catch_tests.cpp +++ b/Code/GraphMol/MolHash/catch_tests.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2019 Greg Landrum +// Copyright (C) 2019-2022 Greg Landrum // // @@ All Rights Reserved @@ // This file is part of the RDKit. @@ -7,12 +7,11 @@ // which is included in the file license.txt, found at the root // of the RDKit source tree. // -#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do - // this in one cpp file #include "catch.hpp" #include #include +#include #include #include "MolHash.h" @@ -230,3 +229,179 @@ TEST_CASE("Github issues", "[molhash]") { CHECK(hsh == "C2H6Cl"); } } + +TEST_CASE("MolHash with CX extensions", "[molhash]") { + SECTION("Tautomer") { + auto mol = + "C[C@@H](O)[C@@H](C)[C@@H](C)C[C@H](C1=CN=CN1)C1=CNC=N1 " + "|o1:8,5,&1:1,3,r,c:11,18,t:9,15|"_smiles; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer); + CHECK( + hsh == + "C[C@@H](CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1)[C@H](C)[C@@H](C)" + "[O]_3_0"); + } + { + RWMol cp(*mol); + + auto hsh = + MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer, true); + CHECK( + hsh == + "C[C@@H](CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1)[C@H](C)[C@@H](C)" + "[O]_3_0 |o1:1,&1:14,16|"); + } + } + SECTION("no coordinates please") { + auto mol = R"CTAB( + Mrv2108 03032205502D + + 0 0 0 0 0 999 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 15 16 0 0 1 +M V30 BEGIN ATOM +M V30 1 C -0.4657 -3.589 0 0 CFG=1 +M V30 2 C -0.4657 -2.049 0 0 +M V30 3 C 0.8679 -4.359 0 0 +M V30 4 C 2.2016 -3.589 0 0 CFG=2 +M V30 5 C 3.5353 -4.359 0 0 +M V30 6 C 4.9422 -3.7327 0 0 +M V30 7 N 5.9726 -4.8771 0 0 +M V30 8 C 5.2026 -6.2108 0 0 +M V30 9 N 3.6963 -5.8906 0 0 +M V30 10 C 2.2016 -2.049 0 0 +M V30 11 C 0.9557 -1.1438 0 0 +M V30 12 N 1.4316 0.3208 0 0 +M V30 13 C 2.9716 0.3208 0 0 +M V30 14 N 3.4475 -1.1438 0 0 +M V30 15 F -1.7994 -4.359 0 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 CFG=1 +M V30 2 1 1 3 +M V30 3 1 4 3 CFG=1 +M V30 4 1 4 5 +M V30 5 2 5 6 +M V30 6 1 6 7 +M V30 7 2 7 8 +M V30 8 1 8 9 +M V30 9 1 5 9 +M V30 10 1 4 10 +M V30 11 2 10 11 +M V30 12 1 11 12 +M V30 13 1 12 13 +M V30 14 2 13 14 +M V30 15 1 10 14 +M V30 16 1 1 15 +M V30 END BOND +M V30 BEGIN COLLECTION +M V30 MDLV30/STEREL1 ATOMS=(1 1) +M V30 MDLV30/STERAC1 ATOMS=(1 4) +M V30 END COLLECTION +M V30 END CTAB +M END +)CTAB"_ctab; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = + MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer, true); + CHECK(hsh == + "C[C@H](F)CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1_2_0 |o1:1|"); + } + } + + SECTION("Mesomer") { + auto mol = "C[C@H](F)C[C@@](C([NH-])=O)C([O-])=N |o1:1,&1:4|"_smiles; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::Mesomer); + CHECK(hsh == "C[C@H](F)C[C]([C]([NH])[O])[C]([NH])[O]_-2"); + } + { + RWMol cp(*mol); + + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::Mesomer, true); + CHECK(hsh == "C[C@H](F)C[C]([C]([NH])[O])[C]([NH])[O]_-2 |o1:1|"); + } + } + SECTION("Extended Murcko") { + auto mol = + "CC1=CC=CC=C1[C@@H](C[C@@H](C1CC1)C1CCC1)C1=CC=CC=C1O |o1:9,&1:7|"_smiles; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ExtendedMurcko); + CHECK(hsh == "*c1ccccc1C(C[C@H](C1CCC1)C1CC1)c1ccccc1*"); + } + { + RWMol cp(*mol); + + auto hsh = + MolHash::MolHash(&cp, MolHash::HashFunction::ExtendedMurcko, true); + CHECK(hsh == "*c1ccccc1C(C[C@H](C1CCC1)C1CC1)c1ccccc1* |o1:9|"); + } + } + SECTION("Murcko") { + auto mol = + "CC1=CC=CC=C1[C@@H](C[C@@H](C1CC1)C1CCC1)C1=CC=CC=C1O |o1:9,&1:7|"_smiles; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::MurckoScaffold); + CHECK(hsh == "c1ccc(C(C[C@H](C2CCC2)C2CC2)c2ccccc2)cc1"); + } + { + RWMol cp(*mol); + + auto hsh = + MolHash::MolHash(&cp, MolHash::HashFunction::MurckoScaffold, true); + CHECK(hsh == "c1ccc(C(C[C@H](C2CCC2)C2CC2)c2ccccc2)cc1 |o1:6|"); + } + } + SECTION("Element") { + auto mol = + "C([C@@H](C1CC1)C1CCC1)[C@@H](C1CCCCC1)C1=CC=CC=C1 |o1:1,&1:9,c:21,23,t:19|"_smiles; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ElementGraph); + CHECK(hsh == "C1CCC(C(C[C@H](C2CCC2)C2CC2)C2CCCCC2)CC1"); + } + { + RWMol cp(*mol); + + auto hsh = + MolHash::MolHash(&cp, MolHash::HashFunction::ElementGraph, true); + CHECK(hsh == "C1CCC(C(C[C@H](C2CCC2)C2CC2)C2CCCCC2)CC1 |o1:6|"); + } + } + SECTION("Anonymous") { + auto mol = + "C([C@@H](C1CC1)C1CCC1)[C@@H](C1CCCCC1)C1=CC=CC=N1 |o1:1,&1:9,c:21,23,t:19|"_smiles; + REQUIRE(mol); + + { + RWMol cp(*mol); + auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::AnonymousGraph); + CHECK(hsh == "*1***(*(**(*2***2)*2**2)*2*****2)**1"); + } + { + RWMol cp(*mol); + + auto hsh = + MolHash::MolHash(&cp, MolHash::HashFunction::AnonymousGraph, true); + CHECK(hsh == "*1***(*(**(*2***2)*2**2)*2*****2)**1"); + } + } +} diff --git a/Code/GraphMol/MolHash/hashfunctions.cpp b/Code/GraphMol/MolHash/hashfunctions.cpp index 9620b8aeb..961a6b1c0 100644 --- a/Code/GraphMol/MolHash/hashfunctions.cpp +++ b/Code/GraphMol/MolHash/hashfunctions.cpp @@ -1,13 +1,11 @@ -/*==============================================*/ -/* Copyright (C) 2011-2019 NextMove Software */ -/* All rights reserved. */ -/* */ -/* This file is part of molhash. */ -/* */ -/* The contents are covered by the terms of the */ -/* BSD license, which is included in the file */ -/* license.txt. */ -/*==============================================*/ +// +// Copyright (C) 2011-2022 NextMove Software and other RDKit contributors +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. #define _CRT_SECURE_NO_WARNINGS #include @@ -23,6 +21,16 @@ #include "mf.h" namespace { + +void addCXExtensions(RDKit::RWMol *mol, std::string &result, + unsigned additionalSkips = 0) { + auto cxext = RDKit::SmilesWrite::getCXExtensions( + *mol, RDKit::SmilesWrite::CX_ALL ^ RDKit::SmilesWrite::CX_COORDS ^ + additionalSkips); + if (!cxext.empty()) { + result += " " + cxext; + } +} unsigned int NMRDKitBondGetOrder(const RDKit::Bond *bnd) { PRECONDITION(bnd, "bad bond"); switch (bnd->getBondType()) { @@ -116,8 +124,10 @@ void NMRDKitSanitizeHydrogens(RDKit::RWMol *mol) { namespace RDKit { namespace MolHash { -static unsigned int NMDetermineComponents(RWMol *mol, unsigned int *parts, - unsigned int acount) { + +namespace { +unsigned int NMDetermineComponents(RWMol *mol, unsigned int *parts, + unsigned int acount) { PRECONDITION(mol, "bad molecule"); PRECONDITION(parts, "bad parts pointer"); memset(parts, 0, acount * sizeof(unsigned int)); @@ -148,8 +158,8 @@ static unsigned int NMDetermineComponents(RWMol *mol, unsigned int *parts, return result; } -static std::string NMMolecularFormula(RWMol *mol, const unsigned int *parts, - unsigned int part) { +std::string NMMolecularFormula(RWMol *mol, const unsigned int *parts, + unsigned int part) { PRECONDITION(mol, "bad molecule"); PRECONDITION((!part || parts), "bad parts pointer"); unsigned int hist[256]; @@ -202,7 +212,7 @@ static std::string NMMolecularFormula(RWMol *mol, const unsigned int *parts, return result; } -static std::string NMMolecularFormula(RWMol *mol, bool sep = false) { +std::string NMMolecularFormula(RWMol *mol, bool sep = false) { PRECONDITION(mol, "bad molecule"); if (!sep) { return NMMolecularFormula(mol, nullptr, 0); @@ -237,7 +247,7 @@ static std::string NMMolecularFormula(RWMol *mol, bool sep = false) { return result; } -static void NormalizeHCount(Atom *aptr) { +void NormalizeHCount(Atom *aptr) { PRECONDITION(aptr, "bad atom pointer"); unsigned int hcount; @@ -280,7 +290,7 @@ static void NormalizeHCount(Atom *aptr) { aptr->setNumExplicitHs(hcount); } -static std::string AnonymousGraph(RWMol *mol, bool elem) { +std::string AnonymousGraph(RWMol *mol, bool elem, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); std::string result; int charge = 0; @@ -302,11 +312,23 @@ static std::string AnonymousGraph(RWMol *mol, bool elem) { bptr->setBondType(Bond::SINGLE); } MolOps::assignRadicals(*mol); + + // we may have just destroyed some stereocenters/bonds + // clean that up: + bool cleanIt = true; + bool force = true; + MolOps::assignStereochemistry(*mol, cleanIt, force); + result = MolToSmiles(*mol); + + if (useCXSmiles) { + addCXExtensions(mol, result, SmilesWrite::CX_RADICALS); + } + return result; } -static std::string MesomerHash(RWMol *mol, bool netq) { +std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); std::string result; char buffer[32]; @@ -323,15 +345,25 @@ static std::string MesomerHash(RWMol *mol, bool netq) { } MolOps::assignRadicals(*mol); + + // we may have just destroyed some stereocenters/bonds + // clean that up: + bool cleanIt = true; + bool force = true; + MolOps::assignStereochemistry(*mol, cleanIt, force); + result = MolToSmiles(*mol); if (netq) { sprintf(buffer, "_%d", charge); result += buffer; } + if (useCXSmiles) { + addCXExtensions(mol, result, SmilesWrite::CX_RADICALS); + } return result; } -static std::string TautomerHash(RWMol *mol, bool proto) { +std::string TautomerHash(RWMol *mol, bool proto, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); std::string result; char buffer[32]; @@ -372,10 +404,14 @@ static std::string TautomerHash(RWMol *mol, bool proto) { sprintf(buffer, "_%d", hcount - charge); } result += buffer; + if (useCXSmiles) { + addCXExtensions(mol, result, SmilesWrite::CX_RADICALS); + } + return result; } -static bool TraverseForRing(Atom *atom, unsigned char *visit) { +bool TraverseForRing(Atom *atom, unsigned char *visit) { PRECONDITION(atom, "bad atom pointer"); PRECONDITION(visit, "bad pointer"); visit[atom->getIdx()] = 1; @@ -395,8 +431,7 @@ static bool TraverseForRing(Atom *atom, unsigned char *visit) { return false; } -static bool DepthFirstSearchForRing(Atom *root, Atom *nbor, - unsigned int maxatomidx) { +bool DepthFirstSearchForRing(Atom *root, Atom *nbor, unsigned int maxatomidx) { PRECONDITION(root, "bad atom pointer"); PRECONDITION(nbor, "bad atom pointer"); @@ -425,7 +460,7 @@ bool IsInScaffold(Atom *atom, unsigned int maxatomidx) { return count > 1; } -static bool HasNbrInScaffold(Atom *aptr, unsigned char *is_in_scaffold) { +bool HasNbrInScaffold(Atom *aptr, unsigned char *is_in_scaffold) { PRECONDITION(aptr, "bad atom pointer"); PRECONDITION(is_in_scaffold, "bad pointer"); for (auto nbri : boost::make_iterator_range( @@ -438,7 +473,7 @@ static bool HasNbrInScaffold(Atom *aptr, unsigned char *is_in_scaffold) { return false; } -static std::string ExtendedMurckoScaffold(RWMol *mol) { +std::string ExtendedMurckoScaffold(RWMol *mol, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); RDKit::MolOps::fastFindRings(*mol); @@ -469,12 +504,22 @@ static std::string ExtendedMurckoScaffold(RWMol *mol) { } mol->commitBatchEdit(); MolOps::assignRadicals(*mol); + + // we may have just destroyed some stereocenters/bonds + // clean that up: + bool cleanIt = true; + bool force = true; + MolOps::assignStereochemistry(*mol, cleanIt, force); + std::string result; result = MolToSmiles(*mol); + if (useCXSmiles) { + addCXExtensions(mol, result, SmilesWrite::CX_RADICALS); + } return result; } -static std::string MurckoScaffoldHash(RWMol *mol) { +std::string MurckoScaffoldHash(RWMol *mol, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); std::vector for_deletion; do { @@ -502,12 +547,22 @@ static std::string MurckoScaffoldHash(RWMol *mol) { mol->commitBatchEdit(); } while (!for_deletion.empty()); MolOps::assignRadicals(*mol); + + // we may have just destroyed some stereocenters/bonds + // clean that up: + bool cleanIt = true; + bool force = true; + MolOps::assignStereochemistry(*mol, cleanIt, force); + std::string result; result = MolToSmiles(*mol); + if (useCXSmiles) { + addCXExtensions(mol, result, SmilesWrite::CX_RADICALS); + } return result; } -static std::string NetChargeHash(RWMol *mol) { +std::string NetChargeHash(RWMol *mol) { PRECONDITION(mol, "bad molecule"); int totalq = 0; @@ -520,7 +575,7 @@ static std::string NetChargeHash(RWMol *mol) { return buffer; } -static std::string SmallWorldHash(RWMol *mol, bool brl) { +std::string SmallWorldHash(RWMol *mol, bool brl) { PRECONDITION(mol, "bad molecule"); char buffer[64]; @@ -542,7 +597,7 @@ static std::string SmallWorldHash(RWMol *mol, bool brl) { return buffer; } -static void DegreeVector(RWMol *mol, unsigned int *v) { +void DegreeVector(RWMol *mol, unsigned int *v) { memset(v, 0, 4 * sizeof(unsigned int)); for (auto aptr : mol->atoms()) { switch (aptr->getDegree()) { @@ -562,7 +617,7 @@ static void DegreeVector(RWMol *mol, unsigned int *v) { } } -static bool HasDoubleBond(Atom *atom) { +bool HasDoubleBond(Atom *atom) { PRECONDITION(atom, "bad atom"); for (const auto &nbri : boost::make_iterator_range(atom->getOwningMol().getAtomBonds(atom))) { @@ -581,7 +636,7 @@ static bool HasDoubleBond(Atom *atom) { // 2 means break, with hydrogen on beg and asterisk on end // 3 means break, with asterisks on both beg and end -static int RegioisomerBond(Bond *bnd) { +int RegioisomerBond(Bond *bnd) { PRECONDITION(bnd, "bad bond"); if (NMRDKitBondGetOrder(bnd) != 1) { return -1; @@ -619,7 +674,7 @@ static int RegioisomerBond(Bond *bnd) { return -1; } -static void ClearEZStereo(Atom *atm) { +void ClearEZStereo(Atom *atm) { PRECONDITION(atm, "bad atom"); for (const auto &nbri : boost::make_iterator_range(atm->getOwningMol().getAtomBonds(atm))) { @@ -630,7 +685,7 @@ static void ClearEZStereo(Atom *atm) { } } -static std::string RegioisomerHash(RWMol *mol) { +std::string RegioisomerHash(RWMol *mol, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); // we need a copy of the molecule so that we can loop over the bonds of @@ -671,12 +726,21 @@ static std::string RegioisomerHash(RWMol *mol) { } } - std::string result; - result = MolToSmiles(*mol); + // we may have just destroyed some stereocenters/bonds + // clean that up: + bool cleanIt = true; + bool force = true; + MolOps::assignStereochemistry(*mol, cleanIt, force); + + std::string result = MolToSmiles(*mol); + if (useCXSmiles) { + addCXExtensions(mol, result); + } + return result; } -static std::string ArthorSubOrderHash(RWMol *mol) { +std::string ArthorSubOrderHash(RWMol *mol) { PRECONDITION(mol, "bad molecule"); char buffer[256]; @@ -793,8 +857,9 @@ static std::string ArthorSubOrderHash(RWMol *mol) { pcount, ccount, ocount, zcount, rcount, qcount, icount); return buffer; } +} // namespace -std::string MolHash(RWMol *mol, HashFunction func) { +std::string MolHash(RWMol *mol, HashFunction func, bool useCXSmiles) { PRECONDITION(mol, "bad molecule"); std::string result; char buffer[32]; @@ -803,31 +868,34 @@ std::string MolHash(RWMol *mol, HashFunction func) { switch (func) { default: case HashFunction::AnonymousGraph: - result = AnonymousGraph(mol, false); + result = AnonymousGraph(mol, false, useCXSmiles); break; case HashFunction::ElementGraph: - result = AnonymousGraph(mol, true); + result = AnonymousGraph(mol, true, useCXSmiles); break; case HashFunction::CanonicalSmiles: result = MolToSmiles(*mol); + if (useCXSmiles) { + addCXExtensions(mol, result); + } break; case HashFunction::MurckoScaffold: - result = MurckoScaffoldHash(mol); + result = MurckoScaffoldHash(mol, useCXSmiles); break; case HashFunction::ExtendedMurcko: - result = ExtendedMurckoScaffold(mol); + result = ExtendedMurckoScaffold(mol, useCXSmiles); break; case HashFunction::Mesomer: - result = MesomerHash(mol, true); + result = MesomerHash(mol, true, useCXSmiles); break; case HashFunction::RedoxPair: - result = MesomerHash(mol, false); + result = MesomerHash(mol, false, useCXSmiles); break; case HashFunction::HetAtomTautomer: - result = TautomerHash(mol, false); + result = TautomerHash(mol, false, useCXSmiles); break; case HashFunction::HetAtomProtomer: - result = TautomerHash(mol, true); + result = TautomerHash(mol, true, useCXSmiles); break; case HashFunction::MolFormula: result = NMMolecularFormula(mol); @@ -855,7 +923,7 @@ std::string MolHash(RWMol *mol, HashFunction func) { result = ArthorSubOrderHash(mol); break; case HashFunction::Regioisomer: - result = RegioisomerHash(mol); + result = RegioisomerHash(mol, useCXSmiles); break; } return result; diff --git a/Code/GraphMol/MolHash/mf.h b/Code/GraphMol/MolHash/mf.h index be868afab..08e19d808 100644 --- a/Code/GraphMol/MolHash/mf.h +++ b/Code/GraphMol/MolHash/mf.h @@ -1,13 +1,11 @@ -/*==============================================*/ -/* Copyright (C) 2016-2019 NextMove Software */ -/* All rights reserved. */ -/* */ -/* This file is part of molhash. */ -/* */ -/* The contents are covered by the terms of the */ -/* BSD license, which is included in the file */ -/* license.txt. */ -/*==============================================*/ +// +// Copyright (C) 2016-2022 NextMove Software and other RDKit contributors +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. #ifndef NMS_MOLFORMULA_H #define NMS_MOLFORMULA_H diff --git a/Code/GraphMol/MolHash/nmmolhash.h b/Code/GraphMol/MolHash/nmmolhash.h index b08fbb9c0..6abf423e0 100644 --- a/Code/GraphMol/MolHash/nmmolhash.h +++ b/Code/GraphMol/MolHash/nmmolhash.h @@ -42,7 +42,8 @@ enum class HashFunction { ArthorSubstructureOrder = 17 }; -RDKIT_MOLHASH_EXPORT std::string MolHash(RWMol *mol, HashFunction func); +RDKIT_MOLHASH_EXPORT std::string MolHash(RWMol *mol, HashFunction func, + bool useCXSmiles = false); enum class StripType { AtomStereo = 1, diff --git a/Code/GraphMol/MolHash/normalize.cpp b/Code/GraphMol/MolHash/normalize.cpp index db403bbbf..2d35e6dbc 100644 --- a/Code/GraphMol/MolHash/normalize.cpp +++ b/Code/GraphMol/MolHash/normalize.cpp @@ -1,13 +1,11 @@ -/*==============================================*/ -/* Copyright (C) 2019 NextMove Software */ -/* All rights reserved. */ -/* */ -/* This file is part of molhash. */ -/* */ -/* The contents are covered by the terms of the */ -/* BSD license, which is included in the file */ -/* license.txt. */ -/*==============================================*/ +// +// Copyright (C) 2019-2022 NextMove Software and other RDKit contributors +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. #include #include @@ -25,6 +23,10 @@ void Strip(RWMol *mol, unsigned int striptype) { for (auto aptr : mol->atoms()) { aptr->setChiralTag(RDKit::Atom::CHI_UNSPECIFIED); } + if (!mol->getStereoGroups().empty()) { + std::vector no_sgs; + mol->setStereoGroups(std::move(no_sgs)); + } } if (striptype & static_cast(StripType::BondStereo)) { for (auto bptr : mol->bonds()) { @@ -49,11 +51,9 @@ void Strip(RWMol *mol, unsigned int striptype) { } void SplitMolecule(RWMol *mol, std::vector &molv) { - RDKit::MOL_SPTR_VECT mfrags = RDKit::MolOps::getMolFrags(*mol); - RDKit::MOL_SPTR_VECT::iterator vit; - for (vit = mfrags.begin(); vit != mfrags.end(); ++vit) { - RDKit::ROMol *wrappedmol = - (*vit).get(); // reach inside the shared pointer... + auto mfrags = RDKit::MolOps::getMolFrags(*mol); + for (const auto &frag : mfrags) { + const auto *wrappedmol = frag.get(); // reach inside the shared pointer... molv.push_back(new RWMol(*wrappedmol)); // ...and make a copy } } diff --git a/Code/GraphMol/catch_chirality.cpp b/Code/GraphMol/catch_chirality.cpp index 60f2687d8..2b87fae38 100644 --- a/Code/GraphMol/catch_chirality.cpp +++ b/Code/GraphMol/catch_chirality.cpp @@ -1739,22 +1739,6 @@ TEST_CASE("StereoGroup Testing") { } } -TEST_CASE("replaceAtom and StereoGroups") { - SECTION("basics") { - auto mol = "C[C@](O)(Cl)[C@H](F)Cl |o1:1,4|"_smiles; - REQUIRE(mol); - CHECK(mol->getStereoGroups().size() == 1); - CHECK(mol->getStereoGroups()[0].getAtoms().size() == 2); - CHECK(mol->getStereoGroups()[0].getAtoms()[0] == mol->getAtomWithIdx(1)); - - Atom acp(*mol->getAtomWithIdx(1)); - mol->replaceAtom(1, &acp); - CHECK(mol->getStereoGroups().size() == 1); - CHECK(mol->getStereoGroups()[0].getAtoms().size() == 2); - CHECK(mol->getStereoGroups()[0].getAtoms()[0] == mol->getAtomWithIdx(1)); - } -} - TEST_CASE("Removing stereogroups from unspecified atoms") { SECTION("basics") { auto mol = "C[C@](O)(Cl)F |o1:1|"_smiles; @@ -1777,3 +1761,19 @@ TEST_CASE("Removing stereogroups from unspecified atoms") { CHECK(mol->getStereoGroups()[0].getAtoms()[0]->getIdx() == 4); } } + +TEST_CASE("replaceAtom and StereoGroups") { + SECTION("basics") { + auto mol = "C[C@](O)(Cl)[C@H](F)Cl |o1:1,4|"_smiles; + REQUIRE(mol); + CHECK(mol->getStereoGroups().size() == 1); + CHECK(mol->getStereoGroups()[0].getAtoms().size() == 2); + CHECK(mol->getStereoGroups()[0].getAtoms()[0] == mol->getAtomWithIdx(1)); + + Atom acp(*mol->getAtomWithIdx(1)); + mol->replaceAtom(1, &acp); + CHECK(mol->getStereoGroups().size() == 1); + CHECK(mol->getStereoGroups()[0].getAtoms().size() == 2); + CHECK(mol->getStereoGroups()[0].getAtoms()[0] == mol->getAtomWithIdx(1)); + } +} diff --git a/ReleaseNotes.md b/ReleaseNotes.md index f96a215f3..4cff4c097 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -4,6 +4,10 @@ ## Backwards incompatible changes - When running in Jupyter Notebook, logs are now sent only to Python's standard error stream, and no longer include the `RDKit LEVEL` prefix. +- The MolHash functions now reassign stereochemistry after modifying the + molecule and before calculating the hash. Previous versions would still + include information about atom/bond stereochemistry in the output hash even if + that no longer applies in the modified molecule. ## Code removed in this release: - The `useCountSimulation` keyword argument for