From 93e8a746bffa7c93a52bef0fd2155fb6477b112b Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Fri, 7 Jun 2024 05:09:57 +0200 Subject: [PATCH] Shrink the tautomeric zone for the v2 hash of things like imines (#7502) --- Code/GraphMol/MolHash/catch_tests.cpp | 21 ++++++++++++++++++--- Code/GraphMol/MolHash/hashfunctions.cpp | 15 +++++++++++++-- ReleaseNotes.md | 1 + 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/Code/GraphMol/MolHash/catch_tests.cpp b/Code/GraphMol/MolHash/catch_tests.cpp index d5fa7b472..5b31f5202 100644 --- a/Code/GraphMol/MolHash/catch_tests.cpp +++ b/Code/GraphMol/MolHash/catch_tests.cpp @@ -414,7 +414,7 @@ TEST_CASE("tautomer v2") { {{"CC(=O)CC(=O)C", "C=C(O)CC(=O)C", "CC(=O)C=C(O)C", "C=C(O)C=C(O)C", "C=C(O)CC(O)=C"}, {}}, - {{"CN=CF", "C=NCF"}, {"CNCF"}}, + {{"CN=CCF", "CNC=CF"}, {"C=NCCF", "CNCCF"}}, {{"CN=C(C)F", "CNC(=C)F"}, {"C=NC(C)F"}}, {{"Cc1n[nH]cc1", "Cc1[nH][n]cc1", "CC1=NN=CC1", "CC1N=NCC=1"}, {}}, {{"O=C1C=CC(=O)C=C1"}, {"Oc1ccc(O)cc1", "O=C1C=CC(O)C=C1"}}, @@ -501,7 +501,7 @@ TEST_CASE("tautomer v2") { "[CH3]-[C](-[CH3])(-[CH3])-[CH]=[O]_0_0"}, {"CC(C)=CO", "C[C](C)[CH][O]_1_0", "[CH3]-[C](-[CH3]):[C]:[O]_2_0"}, {"COC=O", "CO[CH][O]_0_0", "[CH3]-[O]:[C]:[O]_1_0"}, - {"CNC=O", "C[N][CH][O]_1_0", "[C]:[N]:[C]:[O]_5_0"}, + {"CNC=O", "C[N][CH][O]_1_0", "[CH3]-[N]:[C]:[O]_2_0"}, {"CN(C)C=O", "CN(C)[CH][O]_0_0", "[CH3]-[N](-[CH3]):[C]:[O]_1_0"}, {"CC(C)(C)NC=O", "CC(C)(C)[N][CH][O]_1_0", "[CH3]-[C](-[CH3])(-[CH3])-[N]:[C]:[O]_2_0"}, @@ -511,7 +511,7 @@ TEST_CASE("tautomer v2") { {"CC=CC(=O)C", "C[CH][CH][C](C)[O]_0_0", "[C]:[C](:[O]):[C]:[C]-[CH3]_5_0"}, {"N1C=CCC(F)C1", "FC1C[CH][CH][N]C1_1_0", - "[F]-[CH]1-[C]:[N]:[C]:[C]-[CH2]-1_5_0"}, + "[F]-[CH]1-[CH2]-[C]:[C]:[N]-[CH2]-1_3_0"}, {"CCC=C(O)C", "CC[CH][C](C)[O]_1_0", "[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"}, {"CCCC(=O)C", "CCC[C](C)[O]_0_0", "[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"}, @@ -860,3 +860,18 @@ TEST_CASE("HetAtomProtomerv2") { } } } + +TEST_CASE("overreach with v2 
tautomer hashes and imines") { + SECTION("basics") { + std::vector smileses = {"C[C@H](F)NC1=CCCCC1", + "C[C@H](F)N=C1CCCCC1"}; + for (const auto &smiles : smileses) { + auto m = v2::SmilesParse::MolFromSmiles(smiles); + REQUIRE(m); + auto hsh = + MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh == + "[CH3]-[C@H](-[F])-[N]:[C]1:[C]-[CH2]-[CH2]-[CH2]-[C]:1_4_0"); + } + } +} \ No newline at end of file diff --git a/Code/GraphMol/MolHash/hashfunctions.cpp b/Code/GraphMol/MolHash/hashfunctions.cpp index e2c91d7c9..a6acf248f 100644 --- a/Code/GraphMol/MolHash/hashfunctions.cpp +++ b/Code/GraphMol/MolHash/hashfunctions.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2011-2022 NextMove Software and other RDKit contributors +// Copyright (C) 2011-2024 NextMove Software and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. @@ -509,12 +509,23 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, << atm->getIdx() << "-" << oatom->getIdx() << std::endl; std::cerr << " " << bondsConsidered[nbrBond->getIdx()] << " icao " << isCandidateAtom(oatom) << " hsbo " - << hasStartBond(oatom, startBonds) << " unsat " + << hasStartBond(oatom, startBonds) << " atomunsato " + << queryAtomUnsaturated(oatom) << " atomunsat " + << queryAtomUnsaturated(atm) << " bondunsat " << isUnsaturatedBond(nbrBond) << " icaa " << isCandidateAtom(atm) << " hsba " << hasStartBond(atm, startBonds) << std::endl; #endif + // special case to prevent "overreach" with things like enamines. + // the logic here prevents the first bond in CNC=C from being included + // in the tautomeric system. 
So we get: [CH3]-[N]:[C] instead of + // [C]:[N]:[C] + if (startBonds[bptr->getIdx()] && isHeteroAtom(atm) && + !isUnsaturatedBond(nbrBond)) { + continue; + } + // if both bonds are not eligible, then we can skip this neighbor if (skipNeighborBond(atm, oatom, nbrBond, startBonds) && skipNeighborBond(oatom, atm, nbrBond, startBonds)) { continue; diff --git a/ReleaseNotes.md b/ReleaseNotes.md index be5dbf372..897643756 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -12,6 +12,7 @@ GitHub) ## Backwards incompatible changes - The SMARTS for the unbranched alkanes in the fragment descriptors has been corrected. This descriptor will now frequently return different results. - The SimilarityMap functions GetSimilarityMapFromWeights(), GetSimilarityMapForFingerprint(), and GetSimilarityMapForModel() all now require an rdMolDraw2D drawing object to be passed in. +- A bug fix in v2 of the tautomer and protomer hashes can lead to different results for these hashes. One fewer bond is now included in the tautomeric zone for systems like enamines/imines, so the v2 tautomer hash of the molecules CN=CC and CNC=C is now [C]:[C]:[N]-[CH3]_4_0 instead of [C]:[C]:[N]:[C]_7_0. ## New Features and Enhancements: