diff --git a/Code/GraphMol/Atom.cpp b/Code/GraphMol/Atom.cpp index 5c9da0bb5..c11219a79 100644 --- a/Code/GraphMol/Atom.cpp +++ b/Code/GraphMol/Atom.cpp @@ -39,17 +39,18 @@ bool isAromaticAtom(const Atom &atom) { } unsigned int getEffectiveAtomicNum(const Atom &atom, bool checkValue) { + const auto *periodicTable = PeriodicTable::getTable(); auto effectiveAtomicNum = atom.getAtomicNum() - atom.getFormalCharge(); if (checkValue && (effectiveAtomicNum < 0 || effectiveAtomicNum > - static_cast(PeriodicTable::getTable()->getMaxAtomicNumber()))) { + static_cast(periodicTable->getMaxAtomicNumber()))) { throw AtomValenceException("Effective atomic number out of range", atom.getIdx()); } effectiveAtomicNum = std::clamp( effectiveAtomicNum, 0, - static_cast(PeriodicTable::getTable()->getMaxAtomicNumber())); + static_cast(periodicTable->getMaxAtomicNumber())); return static_cast(effectiveAtomicNum); } @@ -345,6 +346,7 @@ bool canBeHypervalent(const Atom &atom, unsigned int effectiveAtomicNum) { } int calculateExplicitValence(const Atom &atom, bool strict, bool checkIt) { + const auto *periodicTable = PeriodicTable::getTable(); // FIX: contributions of bonds to valence are being done at best // approximately double accum = 0; @@ -353,18 +355,15 @@ int calculateExplicitValence(const Atom &atom, bool strict, bool checkIt) { } accum += atom.getNumExplicitHs(); - const auto &ovalens = - PeriodicTable::getTable()->getValenceList(atom.getAtomicNum()); + const auto &ovalens = periodicTable->getValenceList(atom.getAtomicNum()); // if we start with an atom that doesn't have specified valences, we stick // with that. otherwise we will use the effective valence unsigned int effectiveAtomicNum = atom.getAtomicNum(); if (ovalens.size() > 1 || ovalens[0] != -1) { effectiveAtomicNum = getEffectiveAtomicNum(atom, checkIt); } - unsigned int dv = - PeriodicTable::getTable()->getDefaultValence(effectiveAtomicNum); - const auto &valens = - PeriodicTable::getTable()->getValenceList(effectiveAtomicNum); + unsigned int dv = periodicTable->getDefaultValence(effectiveAtomicNum); + const auto &valens = periodicTable->getValenceList(effectiveAtomicNum); if (accum > dv && isAromaticAtom(atom)) { // this needs some explanation : if the atom is aromatic and // accum > dv we assume that no hydrogen can be added @@ -438,8 +437,7 @@ int calculateExplicitValence(const Atom &atom, bool strict, bool checkIt) { // raise an error std::ostringstream errout; errout << "Explicit valence for atom # " << atom.getIdx() << " " - << PeriodicTable::getTable()->getElementSymbol( - atom.getAtomicNum()) + << periodicTable->getElementSymbol(atom.getAtomicNum()) << ", " << res << ", is greater than permitted"; std::string msg = errout.str(); BOOST_LOG(rdErrorLog) << msg << std::endl; @@ -459,6 +457,7 @@ int calculateImplicitValence(const Atom &atom, bool strict, bool checkIt) { if (atom.df_noImplicit) { return 0; } + const auto *periodicTable = PeriodicTable::getTable(); auto explicitValence = atom.d_explicitValence; if (explicitValence == -1) { explicitValence = calculateExplicitValence(atom, strict, checkIt); @@ -498,8 +497,7 @@ int calculateImplicitValence(const Atom &atom, bool strict, bool checkIt) { } int explicitPlusRadV = atom.d_explicitValence + atom.d_numRadicalElectrons; - const auto &ovalens = - PeriodicTable::getTable()->getValenceList(atom.d_atomicNum); + const auto &ovalens = periodicTable->getValenceList(atom.d_atomicNum); // if we start with an atom that doesn't have specified valences, we stick // with that. otherwise we will use the effective valence for the rest of // this. @@ -518,7 +516,7 @@ int calculateImplicitValence(const Atom &atom, bool strict, bool checkIt) { // The d-block and f-block of the periodic table (i.e. transition metals, // lanthanoids and actinoids) have no default valence. - int dv = PeriodicTable::getTable()->getDefaultValence(effectiveAtomicNum); + int dv = periodicTable->getDefaultValence(effectiveAtomicNum); if (dv == -1) { return 0; } @@ -542,8 +540,7 @@ int calculateImplicitValence(const Atom &atom, bool strict, bool checkIt) { effectiveAtomicNum = atomicNum; explicitPlusRadV -= atom.d_formalCharge; } - const auto &valens = - PeriodicTable::getTable()->getValenceList(effectiveAtomicNum); + const auto &valens = periodicTable->getValenceList(effectiveAtomicNum); int res = 0; // if we have an aromatic case treat it differently diff --git a/Code/GraphMol/MolStandardize/Tautomer.cpp b/Code/GraphMol/MolStandardize/Tautomer.cpp index f1e805202..21a346a3d 100644 --- a/Code/GraphMol/MolStandardize/Tautomer.cpp +++ b/Code/GraphMol/MolStandardize/Tautomer.cpp @@ -33,11 +33,9 @@ namespace TautomerScoringFunctions { int scoreRings(const ROMol &mol) { int score = 0; auto ringInfo = mol.getRingInfo(); - std::unique_ptr cp; if (!ringInfo->isSymmSssr()) { - cp.reset(new ROMol(mol)); - MolOps::symmetrizeSSSR(*cp); - ringInfo = cp->getRingInfo(); + MolOps::symmetrizeSSSR(const_cast(mol)); + ringInfo = mol.getRingInfo(); } boost::dynamic_bitset<> isArom(mol.getNumBonds()); boost::dynamic_bitset<> bothCarbon(mol.getNumBonds()); @@ -72,48 +70,261 @@ int scoreRings(const ROMol &mol) { return score; }; -SubstructTerm::SubstructTerm(std::string aname, std::string asmarts, int ascore) - : name(std::move(aname)), smarts(std::move(asmarts)), score(ascore) { +SubstructTerm::SubstructTerm(std::string aname, std::string asmarts, int ascore, + std::vector reqElements, + std::string connSmarts) + : name(std::move(aname)), + smarts(std::move(asmarts)), + score(ascore), + requiredElements(std::move(reqElements)), + connectivitySmarts(std::move(connSmarts)) { std::unique_ptr pattern(SmartsToMol(smarts)); if (pattern) { matcher = std::move(*pattern); } + // Initialize connectivity matcher if provided + if (!connectivitySmarts.empty()) { + std::unique_ptr connPattern(SmartsToMol(connectivitySmarts)); + if (connPattern) { + connectivityMatcher = std::move(*connPattern); + } + } } const std::vector &getDefaultTautomerScoreSubstructs() { + // Each term specifies: + // - name, SMARTS, score + // - requiredElements: atomic numbers that must be present (empty = no filter) + // - connectivitySmarts: bond-order-agnostic pattern for pre-screening + // Since tautomerization only moves H and changes bond orders (never creates/ + // destroys heavy-atom bonds), we can skip patterns whose connectivity + // prerequisites aren't met by the input molecule. static std::vector substructureTerms{ - {"benzoquinone", "[#6]1([#6]=[#6][#6]([#6]=[#6]1)=,:[N,S,O])=,:[N,S,O]", - 25}, - {"oxim", "[#6]=[N][OH]", 4}, - {"C=O", "[#6]=,:[#8]", 2}, - {"N=O", "[#7]=,:[#8]", 2}, - {"P=O", "[#15]=,:[#8]", 2}, - {"C=hetero", "[C]=[!#1;!#6]", 1}, - {"C(=hetero)-hetero", "[C](=[!#1;!#6])[!#1;!#6]", 2}, - {"aromatic C = exocyclic N", "[c]=!@[N]", -1}, - {"methyl", "[CX4H3]", 1}, - {"guanidine terminal=N", "[#7]C(=[NR0])[#7H0]", 1}, - {"guanidine endocyclic=N", "[#7;R][#6;R]([N])=[#7;R]", 2}, - {"aci-nitro", "[#6]=[N+]([O-])[OH]", -4}}; + {"benzoquinone", "[#6]1([#6]=[#6][#6]([#6]=[#6]1)=,:[N,S,O])=,:[N,S,O]", 25, {6}, "[#6]1(~[#6]~[#6]~[#6](~[#6]~[#6]~1)~[N,S,O])~[N,S,O]"}, + {"oxim", "[#6]=[N][OH]", 4, {6, 7, 8}, "[#6]~[#7]~[#8]"}, + {"C=O", "[#6]=,:[#8]", 2, {6, 8}, "[#6]~[#8]"}, + {"N=O", "[#7]=,:[#8]", 2, {7, 8}, "[#7]~[#8]"}, + {"P=O", "[#15]=,:[#8]", 2, {15, 8}, "[#15]~[#8]"}, + {"C=hetero", "[C]=[!#1;!#6]", 1, {6}, "[C]~[!#1;!#6]"}, + {"C(=hetero)-hetero", "[C](=[!#1;!#6])[!#1;!#6]", 2, {6}, "[C](~[!#1;!#6])~[!#1;!#6]"}, + {"aromatic C = exocyclic N", "[c]=!@[N]", -1, {6, 7}, "[c]~[N]"}, + {"methyl", "[CX4H3]", 1, {6}, ""}, + {"guanidine terminal=N", "[#7]C(=[NR0])[#7H0]", 1, {6, 7}, "[#7]~[#6]~[#7]"}, + {"guanidine endocyclic=N", "[#7;R][#6;R]([N])=[#7;R]", 2, {6, 7}, "[#7]~[#6](~[N])~[#7]"}, + {"aci-nitro", "[#6]=[N+]([O-])[OH]", -4, {6, 7, 8}, "[#6]~[#7](~[#8])~[#8]"}}; return substructureTerms; } +std::vector getRelevantSubstructTermIndices( + const ROMol &mol, const std::vector &terms) { + // Prepare relevant SubstructTerms for this molecule by filtering in two + // stages: + // 1. Element check: skip terms requiring elements not in the molecule + // 2. Connectivity check: skip terms whose bond-order-agnostic pattern + // doesn't match (since tautomerization doesn't create/destroy bonds) + + std::unordered_set presentElements; + for (const auto atom : mol.atoms()) { + presentElements.insert(atom->getAtomicNum()); + } + + std::vector relevantIndices; + relevantIndices.reserve(terms.size()); + + SubstructMatchParameters params; + params.maxMatches = 1; + + for (size_t i = 0; i < terms.size(); ++i) { + const auto &term = terms[i]; + + bool hasAllElements = true; + for (int elem : term.requiredElements) { + if (presentElements.find(elem) == presentElements.end()) { + hasAllElements = false; + break; + } + } + if (!hasAllElements) { + continue; + } + if (term.connectivityMatcher.getNumAtoms() > 0) { + auto matches = SubstructMatch(mol, term.connectivityMatcher, params); + if (matches.empty()) { + continue; + } + } + relevantIndices.push_back(i); + } + + return relevantIndices; +} + +// --------------------------------------------------------------------------- +// Specialized matchers for simple scoring patterns. +// These avoid VF2 graph matching overhead by directly iterating atoms/bonds. +// Each function is equivalent to the corresponding SMARTS pattern but runs +// in O(n) time without the graph isomorphism overhead. +// --------------------------------------------------------------------------- + +// Count bonds between two element types with double or aromatic bond order. +// Matches: [#elem1]=,:[#elem2] +inline unsigned int countDoubleOrAromaticBonds(const ROMol &mol, int elem1, + int elem2) { + unsigned int count = 0; + for (const auto bond : mol.bonds()) { + const auto bt = bond->getBondType(); + if (bt != Bond::DOUBLE && bt != Bond::AROMATIC) { + continue; + } + const int a1 = bond->getBeginAtom()->getAtomicNum(); + const int a2 = bond->getEndAtom()->getAtomicNum(); + if ((a1 == elem1 && a2 == elem2) || (a1 == elem2 && a2 == elem1)) { + ++count; + } + } + return count; +} + +// Count methyl groups: [CX4H3] - sp3 carbon with exactly 3 total H +inline unsigned int countMethyls(const ROMol &mol) { + unsigned int count = 0; + for (const auto atom : mol.atoms()) { + if (atom->getAtomicNum() != 6) { + continue; + } + // X4 means total degree 4 (including implicit H) + if (atom->getTotalDegree() != 4) { + continue; + } + // H3 means exactly 3 total hydrogens + if (atom->getTotalNumHs() != 3) { + continue; + } + ++count; + } + return count; +} + +// Count [C]=[!#1;!#6] - aliphatic carbon double-bonded to heteroatom +inline unsigned int countCarbonDoubleHetero(const ROMol &mol) { + unsigned int count = 0; + for (const auto bond : mol.bonds()) { + if (bond->getBondType() != Bond::DOUBLE) { + continue; + } + const auto *a1 = bond->getBeginAtom(); + const auto *a2 = bond->getEndAtom(); + // [C] is aliphatic carbon (not aromatic) + // Check C double-bonded to heteroatom (not H, not C) + auto isAliphaticCarbonToHetero = [](const Atom *c, const Atom *het) { + return c->getAtomicNum() == 6 && !c->getIsAromatic() && + het->getAtomicNum() != 1 && het->getAtomicNum() != 6; + }; + if (isAliphaticCarbonToHetero(a1, a2) || + isAliphaticCarbonToHetero(a2, a1)) { + ++count; + } + } + return count; +} + +// Count [c]=!@[N] - aromatic carbon double-bonded to nitrogen (exocyclic) +inline unsigned int countAromaticCarbonExocyclicN(const ROMol &mol) { + unsigned int count = 0; + const auto *ringInfo = mol.getRingInfo(); + for (const auto bond : mol.bonds()) { + if (bond->getBondType() != Bond::DOUBLE) { + continue; + } + // !@ means bond not in ring + if (ringInfo->numBondRings(bond->getIdx()) > 0) { + continue; + } + const auto *a1 = bond->getBeginAtom(); + const auto *a2 = bond->getEndAtom(); + // [c] is aromatic carbon, [N] is any nitrogen + auto isAromaticCarbonToN = [](const Atom *c, const Atom *n) { + return c->getAtomicNum() == 6 && c->getIsAromatic() && + n->getAtomicNum() == 7; + }; + if (isAromaticCarbonToN(a1, a2) || isAromaticCarbonToN(a2, a1)) { + ++count; + } + } + return count; +} + +// Indices of the scoring patterns with dedicated matchers. +// These must match the order of patterns in getDefaultTautomerScoreSubstructs(). +namespace { +enum class PatternIdx { + CarbonylO = 2, + NO = 3, + PO = 4, + CHetero = 5, + AromaticCN = 7, + Methyl = 8, +}; +} // namespace + +int scoreSubstructsFiltered(const ROMol &mol, + const std::vector &terms, + const std::vector &relevantIndices) { + int score = 0; + SubstructMatchParameters params; + + for (const size_t idx : relevantIndices) { + const auto &term = terms[idx]; + if (!term.matcher.getNumAtoms()) { + continue; + } + + unsigned int nMatches = 0; + + // Use specialized matchers for simple patterns, VF2 for complex ones + switch (static_cast(idx)) { + case PatternIdx::CarbonylO: + nMatches = countDoubleOrAromaticBonds(mol, 6, 8); // C=O + break; + case PatternIdx::NO: + nMatches = countDoubleOrAromaticBonds(mol, 7, 8); // N=O + break; + case PatternIdx::PO: + nMatches = countDoubleOrAromaticBonds(mol, 15, 8); // P=O + break; + case PatternIdx::CHetero: + nMatches = countCarbonDoubleHetero(mol); + break; + case PatternIdx::AromaticCN: + nMatches = countAromaticCarbonExocyclicN(mol); + break; + case PatternIdx::Methyl: + nMatches = countMethyls(mol); + break; + default: + // Fall back to VF2 for complex patterns (benzoquinone, oxim, + // C(=hetero)-hetero, guanidine, aci-nitro) + nMatches = SubstructMatchCount(mol, term.matcher, params); + break; + } + + score += static_cast(nMatches) * term.score; + } + return score; +} + int scoreSubstructs(const ROMol &mol, const std::vector &substructureTerms) { int score = 0; + SubstructMatchParameters params; for (const auto &term : substructureTerms) { if (!term.matcher.getNumAtoms()) { BOOST_LOG(rdErrorLog) << " matcher for term " << term.name << " is invalid, ignoring it." << std::endl; continue; } - SubstructMatchParameters params; - const auto matches = SubstructMatch(mol, term.matcher, params); - // if (!matches.empty()) { - // std::cerr << " " << matches.size() << " matches to " << term.name - // << std::endl; - // } - score += static_cast(matches.size()) * term.score; + const auto nMatches = SubstructMatchCount(mol, term.matcher, params); + score += static_cast(nMatches) * term.score; } return score; } @@ -149,11 +360,11 @@ TautomerEnumerator::TautomerEnumerator(const CleanupParameters ¶ms) bool TautomerEnumerator::setTautomerStereoAndIsoHs( const ROMol &mol, ROMol &taut, const TautomerEnumeratorResult &res) const { bool modified = false; - for (auto atom : mol.atoms()) { - auto atomIdx = atom->getIdx(); - if (!res.d_modifiedAtoms.test(atomIdx)) { - continue; - } + // Iterate only the atoms/bonds actually modified by transforms. + for (auto atomIdx = res.d_modifiedAtoms.find_first(); + atomIdx != boost::dynamic_bitset<>::npos; + atomIdx = res.d_modifiedAtoms.find_next(atomIdx)) { + const auto atom = mol.getAtomWithIdx(static_cast(atomIdx)); auto tautAtom = taut.getAtomWithIdx(atomIdx); // clear chiral tag on sp2 atoms (also sp3 if d_removeSp3Stereo is true) if (tautAtom->getHybridization() == Atom::SP2 || d_removeSp3Stereo) { @@ -178,11 +389,26 @@ bool TautomerEnumerator::setTautomerStereoAndIsoHs( } } // remove stereochemistry on bonds that are part of a tautomeric path - for (auto bond : mol.bonds()) { - auto bondIdx = bond->getIdx(); - if (!res.d_modifiedBonds.test(bondIdx)) { - continue; + // Build bond lookup caches to avoid O(n) getBondWithIdx calls. + // getBondWithIdx iterates through all bonds to find the one with the given + // index, which is expensive when called multiple times in a loop. + std::vector molBonds; + std::vector tautBonds; + if (res.d_modifiedBonds.any()) { + const auto numBonds = mol.getNumBonds(); + molBonds.resize(numBonds); + tautBonds.resize(numBonds); + for (auto bond : mol.bonds()) { + molBonds[bond->getIdx()] = bond; } + for (auto bond : taut.bonds()) { + tautBonds[bond->getIdx()] = bond; + } + } + for (auto bondIdx = res.d_modifiedBonds.find_first(); + bondIdx != boost::dynamic_bitset<>::npos; + bondIdx = res.d_modifiedBonds.find_next(bondIdx)) { + const auto bond = molBonds[bondIdx]; std::vector bondsToClearDirs; if (bond->getBondType() == Bond::DOUBLE && bond->getStereo() > Bond::STEREOANY) { @@ -199,7 +425,7 @@ bool TautomerEnumerator::setTautomerStereoAndIsoHs( } } } - auto tautBond = taut.getBondWithIdx(bondIdx); + auto tautBond = tautBonds[bondIdx]; if (tautBond->getBondType() != Bond::DOUBLE || d_removeBondStereo) { // When bond stereo is being removed for bonds involved in tautomerism, // use STEREOANY (for *non-ring* double bonds) instead of STEREONONE. @@ -236,7 +462,7 @@ bool TautomerEnumerator::setTautomerStereoAndIsoHs( tautBond->setStereo(targetStereo); tautBond->getStereoAtoms().clear(); for (auto bi : bondsToClearDirs) { - taut.getBondWithIdx(bi)->setBondDir(Bond::NONE); + tautBonds[bi]->setBondDir(Bond::NONE); } } else { const INT_VECT &sa = bond->getStereoAtoms(); @@ -247,8 +473,7 @@ bool TautomerEnumerator::setTautomerStereoAndIsoHs( } tautBond->setStereo(bond->getStereo()); for (auto bi : bondsToClearDirs) { - taut.getBondWithIdx(bi)->setBondDir( - mol.getBondWithIdx(bi)->getBondDir()); + tautBonds[bi]->setBondDir(molBonds[bi]->getBondDir()); } } } @@ -335,14 +560,30 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { MolOps::symmetrizeSSSR(*taut); } - // Create a kekulized form of the molecule to match the SMARTS against. - // canonical=true is required so that tautomer deduplication is independent - // of atom ordering in the molecule. - RWMOL_SPTR kekulized(new RWMol(*taut)); - MolOps::Kekulize(*kekulized, false, true); - res.d_tautomers = {{smi, Tautomer(taut, kekulized, 0, 0)}}; + // Kekulized form will be created lazily when needed for transform matching. + // canonical=true is used on demand so that tautomer deduplication is + // independent of atom ordering in the molecule. + res.d_tautomers = {{smi, Tautomer(taut, 0, 0)}}; res.d_modifiedAtoms.resize(mol.getNumAtoms()); res.d_modifiedBonds.resize(mol.getNumBonds()); + + // Keep running counts of modified atoms/bonds. + // `boost::dynamic_bitset<>::count()` is O(n) in the number of blocks, and we + // were previously calling it once per new tautomer, which is avoidable. + size_t numModifiedAtoms = 0; + size_t numModifiedBonds = 0; + const auto markAtomModified = [&res, &numModifiedAtoms](unsigned int idx) { + if (!res.d_modifiedAtoms.test(idx)) { + res.d_modifiedAtoms.set(idx); + ++numModifiedAtoms; + } + }; + const auto markBondModified = [&res, &numModifiedBonds](unsigned int idx) { + if (!res.d_modifiedBonds.test(idx)) { + res.d_modifiedBonds.set(idx); + ++numModifiedBonds; + } + }; bool completed = false; bool bailOut = false; unsigned int nTransforms = 0; @@ -378,8 +619,8 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { if (bailOut) { break; } - // kmol is the kekulized version of the tautomer - const auto &kmol = smilesTautomerPair.second.kekulized; + // kmol is the kekulized version of the tautomer (created lazily) + const auto &kmol = smilesTautomerPair.second.getKekulized(); std::vector matches; unsigned int matched = SubstructMatch(*kmol, *(transform.Mol), matches); @@ -417,15 +658,15 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { // Create a copy of in the input molecule so we can modify it // Use kekule form so bonds are explicitly single/double instead of // aromatic - RWMOL_SPTR product(new RWMol(*kmol)); + RWMOL_SPTR product(new RWMol(*kmol, true)); // Remove a hydrogen from the first matched atom and add one to the // last int firstIdx = match.front().second; int lastIdx = match.back().second; Atom *first = product->getAtomWithIdx(firstIdx); Atom *last = product->getAtomWithIdx(lastIdx); - res.d_modifiedAtoms.set(firstIdx); - res.d_modifiedAtoms.set(lastIdx); + markAtomModified(static_cast(firstIdx)); + markAtomModified(static_cast(lastIdx)); first->setNumExplicitHs( std::max(0, static_cast(first->getTotalNumHs()) - 1)); last->setNumExplicitHs(last->getTotalNumHs() + 1); @@ -465,7 +706,7 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { #endif } } - res.d_modifiedBonds.set(bond->getIdx()); + markBondModified(bond->getIdx()); } // TODO adjust charges if (!transform.Charges.empty()) { @@ -485,14 +726,20 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { } #endif - unsigned int failedOp; try { - MolOps::sanitizeMol(*product, failedOp, - MolOps::SANITIZE_KEKULIZE | - MolOps::SANITIZE_SETAROMATICITY | - MolOps::SANITIZE_SETCONJUGATION | - MolOps::SANITIZE_SETHYBRIDIZATION | - MolOps::SANITIZE_ADJUSTHS); + // We only change bond orders/H counts/charges; the molecular graph + // (and therefore ring topology) is unchanged. + // `sanitizeMol()` always calls `clearComputedProps()` which resets + // ring info and forces ring-finding for each generated tautomer. + // Avoid that by clearing computed props without touching rings, + // then running the specific sanitize steps we need. + product->clearComputedProps(false); + product->updatePropertyCache(false); + MolOps::Kekulize(*product); + MolOps::setAromaticity(*product); + MolOps::setConjugation(*product); + MolOps::setHybridization(*product); + MolOps::adjustHs(*product); } catch (const KekulizeException &) { continue; } @@ -518,19 +765,25 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { } // in addition to the above transformations, sanitization may modify // bonds, e.g. Cc1nc2ccccc2[nH]1 - for (size_t i = 0; i < mol.getNumBonds(); i++) { - auto molBondType = mol.getBondWithIdx(i)->getBondType(); - auto tautBondType = product->getBondWithIdx(i)->getBondType(); - if (molBondType != tautBondType && !res.d_modifiedBonds.test(i)) { + // Use parallel iteration to avoid O(n) getBondWithIdx lookups + { + auto productBondIt = product->bonds().begin(); + for (const auto molBond : mol.bonds()) { + const auto productBond = *productBondIt; + ++productBondIt; + auto i = molBond->getIdx(); + if (molBond->getBondType() != productBond->getBondType() && + !res.d_modifiedBonds.test(i)) { #ifdef VERBOSE_ENUMERATION - std::cout << "Sanitization has modified bond " << i << std::endl; + std::cout << "Sanitization has modified bond " << i + << std::endl; #endif - res.d_modifiedBonds.set(i); + markBondModified(static_cast(i)); + } } } - RWMOL_SPTR kekulized_product(new RWMol(*product)); - // canonical=true for order-independent tautomer deduplication - MolOps::Kekulize(*kekulized_product, false, true); + // Kekulized form will be created lazily when needed; + // canonical=true is used on demand for order-independent deduplication. #ifdef VERBOSE_ENUMERATION auto it = res.d_tautomers.find(tsmiles); if (it == res.d_tautomers.end()) { @@ -539,8 +792,6 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { std::cout << "New tautomer replaced for "; } std::cout << tsmiles << ", taut: " << MolToSmiles(*product) - << ", kek: " - << MolToSmiles(*kekulized_product, smilesWriteParams) << std::endl; #endif // BOOST_LOG(rdInfoLog) @@ -549,15 +800,15 @@ TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { // transform.Mol->getProp(common_properties::_Name) // << " produced tautomer " << tsmiles << std::endl; res.d_tautomers[tsmiles] = Tautomer( - std::move(product), std::move(kekulized_product), - res.d_modifiedAtoms.count(), res.d_modifiedBonds.count()); + std::move(product), + numModifiedAtoms, numModifiedBonds); } } smilesTautomerPair.second.d_done = true; } completed = true; - size_t maxNumModifiedAtoms = res.d_modifiedAtoms.count(); - size_t maxNumModifiedBonds = res.d_modifiedBonds.count(); + size_t maxNumModifiedAtoms = numModifiedAtoms; + size_t maxNumModifiedBonds = numModifiedBonds; for (auto it = res.d_tautomers.begin(); it != res.d_tautomers.end();) { auto &taut = it->second; if (!taut.d_done) { @@ -642,7 +893,25 @@ ROMol *TautomerEnumerator::canonicalize( << "no tautomers found, returning input molecule" << std::endl; return new ROMol(mol); } - return pickCanonical(res, scoreFunc); + // When no custom scorer provided, use optimized scoring that pre-filters + // SubstructTerm patterns once for the input molecule rather than evaluating + // all 12 substructure matches per tautomer. This is safe because + // tautomerization only moves H and changes bond orders, never creates or + // destroys heavy-atom bonds. + if (!scoreFunc) { + scoreFunc = TautomerScoringFunctions::makeOptimizedScorer(mol); + } + ROMol *canonical = pickCanonical(res, scoreFunc); + // quickCopy during enumeration doesn't copy molecule properties or + // conformers. Restore both from the original molecule so that + // downstream code (e.g. InChI generation) that relies on 2D/3D + // coordinates or mol-level properties works correctly. + canonical->updateProps(mol); + for (auto confIt = mol.beginConformers(); confIt != mol.endConformers(); + ++confIt) { + canonical->addConformer(new Conformer(**confIt), true); + } + return canonical; } void TautomerEnumerator::canonicalizeInPlace( @@ -655,34 +924,48 @@ void TautomerEnumerator::canonicalizeInPlace( << "no tautomers found, molecule unchanged" << std::endl; return; } + // When no custom scorer provided, use optimized scoring that pre-filters + // SubstructTerm patterns once for the input molecule. + if (!scoreFunc) { + scoreFunc = TautomerScoringFunctions::makeOptimizedScorer(mol); + } std::unique_ptr tmp{pickCanonical(res, scoreFunc)}; TEST_ASSERT(tmp->getNumAtoms() == mol.getNumAtoms()); TEST_ASSERT(tmp->getNumBonds() == mol.getNumBonds()); // now copy the info from the canonical tautomer over to the input molecule - for (const auto tmpAtom : tmp->atoms()) { - auto atom = mol.getAtomWithIdx(tmpAtom->getIdx()); - TEST_ASSERT(tmpAtom->getAtomicNum() == atom->getAtomicNum()); - atom->setFormalCharge(tmpAtom->getFormalCharge()); - atom->setNoImplicit(tmpAtom->getNoImplicit()); - atom->setIsAromatic(tmpAtom->getIsAromatic()); - atom->setNumExplicitHs(tmpAtom->getNumExplicitHs()); - atom->setNumRadicalElectrons(tmpAtom->getNumRadicalElectrons()); - atom->setChiralTag(tmpAtom->getChiralTag()); - } - for (const auto tmpBond : tmp->bonds()) { - auto bond = mol.getBondWithIdx(tmpBond->getIdx()); - TEST_ASSERT(tmpBond->getBeginAtomIdx() == bond->getBeginAtomIdx()); - TEST_ASSERT(tmpBond->getEndAtomIdx() == bond->getEndAtomIdx()); - bond->setBondType(tmpBond->getBondType()); - bond->setBondDir(tmpBond->getBondDir()); - bond->setIsAromatic(tmpBond->getIsAromatic()); - bond->setIsConjugated(tmpBond->getIsConjugated()); - if (tmpBond->getStereoAtoms().size() == 2) { - bond->setStereoAtoms(tmpBond->getStereoAtoms()[0], - tmpBond->getStereoAtoms()[1]); + // iterate both molecules' atoms and bonds in parallel - they have matching indices + { + auto molAtomIt = mol.atoms().begin(); + for (const auto tmpAtom : tmp->atoms()) { + auto atom = *molAtomIt; + ++molAtomIt; + TEST_ASSERT(tmpAtom->getAtomicNum() == atom->getAtomicNum()); + atom->setFormalCharge(tmpAtom->getFormalCharge()); + atom->setNoImplicit(tmpAtom->getNoImplicit()); + atom->setIsAromatic(tmpAtom->getIsAromatic()); + atom->setNumExplicitHs(tmpAtom->getNumExplicitHs()); + atom->setNumRadicalElectrons(tmpAtom->getNumRadicalElectrons()); + atom->setChiralTag(tmpAtom->getChiralTag()); + } + } + { + auto molBondIt = mol.bonds().begin(); + for (const auto tmpBond : tmp->bonds()) { + auto bond = *molBondIt; + ++molBondIt; + TEST_ASSERT(tmpBond->getBeginAtomIdx() == bond->getBeginAtomIdx()); + TEST_ASSERT(tmpBond->getEndAtomIdx() == bond->getEndAtomIdx()); + bond->setBondType(tmpBond->getBondType()); + bond->setBondDir(tmpBond->getBondDir()); + bond->setIsAromatic(tmpBond->getIsAromatic()); + bond->setIsConjugated(tmpBond->getIsConjugated()); + if (tmpBond->getStereoAtoms().size() == 2) { + bond->setStereoAtoms(tmpBond->getStereoAtoms()[0], + tmpBond->getStereoAtoms()[1]); + } + bond->setStereo(tmpBond->getStereo()); } - bond->setStereo(tmpBond->getStereo()); } mol.updatePropertyCache(false); } diff --git a/Code/GraphMol/MolStandardize/Tautomer.h b/Code/GraphMol/MolStandardize/Tautomer.h index 2aafb8c40..3fd60deca 100644 --- a/Code/GraphMol/MolStandardize/Tautomer.h +++ b/Code/GraphMol/MolStandardize/Tautomer.h @@ -48,7 +48,15 @@ struct RDKIT_MOLSTANDARDIZE_EXPORT SubstructTerm { int score; RWMol matcher; // requires assignment - SubstructTerm(std::string aname, std::string asmarts, int ascore); + // Pre-screening support: elements that must be present (empty = no filter) + std::vector requiredElements; + // Bond-order-agnostic connectivity pattern for pre-screening (may be empty) + std::string connectivitySmarts; + RWMol connectivityMatcher; + + SubstructTerm(std::string aname, std::string asmarts, int ascore, + std::vector reqElements = {}, + std::string connSmarts = ""); SubstructTerm(const SubstructTerm &rhs) = default; bool operator==(const SubstructTerm &rhs) const { @@ -78,6 +86,25 @@ RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol); RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs( const ROMol &mol, const std::vector &terms = getDefaultTautomerScoreSubstructs()); + +//! Determine which SubstructTerms are potentially relevant for a molecule. +/// This pre-filters terms in two stages: +/// 1. Element check: skip terms requiring elements not in the molecule +/// 2. Connectivity check: skip terms whose bond-order-agnostic pattern +/// doesn't match (since tautomerization doesn't create/destroy bonds) +/// Returns indices into the terms vector for relevant terms. +RDKIT_MOLSTANDARDIZE_EXPORT std::vector getRelevantSubstructTermIndices( + const ROMol &mol, + const std::vector &terms = getDefaultTautomerScoreSubstructs()); + +//! Score substructures using only the terms at the specified indices. +/// Uses specialized matchers for simple patterns (C=O, N=O, P=O, methyl, etc.) +/// and falls back to VF2 for complex patterns. Use with +/// getRelevantSubstructTermIndices for efficient scoring of many tautomers +/// from the same parent molecule. +RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructsFiltered( + const ROMol &mol, const std::vector &terms, + const std::vector &relevantIndices); //! scoreHeteroHs score the molecules hydrogens /// This gives a negative penalty to hydrogens attached to S,P, Se and Te /*! @@ -89,6 +116,25 @@ RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol); inline int scoreTautomer(const ROMol &mol) { return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol); } + +//! Create an optimized scoring function for tautomers of a specific molecule. +/// This pre-filters SubstructTerms based on elements and connectivity present +/// in the input molecule, avoiding unnecessary substructure searches for +/// patterns that can never match any tautomer of this molecule. +/// The returned function captures the filtered term indices and can be passed +/// to pickCanonical or canonicalize. +inline boost::function makeOptimizedScorer( + const ROMol &mol) { + auto relevantIndices = getRelevantSubstructTermIndices(mol); + // Capture by value since the indices are small and we want the lambda to + // outlive this function. + return [relevantIndices](const ROMol &taut) { + const auto &terms = getDefaultTautomerScoreSubstructs(); + return scoreRings(taut) + + scoreSubstructsFiltered(taut, terms, relevantIndices) + + scoreHeteroHs(taut); + }; +} } // namespace TautomerScoringFunctions enum class TautomerEnumeratorStatus { @@ -103,6 +149,13 @@ class Tautomer { public: Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0), d_done(false) {} + // Constructor with just the tautomer - kekulized form will be created lazily + Tautomer(ROMOL_SPTR t, size_t a = 0, size_t b = 0) + : tautomer(std::move(t)), + d_numModifiedAtoms(a), + d_numModifiedBonds(b), + d_done(false) {} + // Legacy constructor for compatibility Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a = 0, size_t b = 0) : tautomer(std::move(t)), kekulized(std::move(k)), @@ -110,7 +163,16 @@ class Tautomer { d_numModifiedBonds(b), d_done(false) {} ROMOL_SPTR tautomer; - ROMOL_SPTR kekulized; + mutable ROMOL_SPTR kekulized; // Lazily initialized + + // Get the kekulized form, creating it lazily if needed + const ROMOL_SPTR &getKekulized() const { + if (!kekulized && tautomer) { + kekulized.reset(new RWMol(*tautomer)); + MolOps::Kekulize(static_cast(*kekulized), false, true); + } + return kekulized; + } private: size_t d_numModifiedAtoms; @@ -440,12 +502,13 @@ class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumerator { https://doi.org/10.1007/s10822-010-9346-4 */ - ROMol *canonicalize(const ROMol &mol, - boost::function scoreFunc = - TautomerScoringFunctions::scoreTautomer) const; - void canonicalizeInPlace(RWMol &mol, - boost::function scoreFunc = - TautomerScoringFunctions::scoreTautomer) const; + /// When \p scoreFunc is empty (default), an optimized scorer is created + /// that pre-filters substructure patterns once for the input molecule. + ROMol *canonicalize( + const ROMol &mol, + boost::function scoreFunc = {}) const; + void canonicalizeInPlace( + RWMol &mol, boost::function scoreFunc = {}) const; private: bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut, diff --git a/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp b/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp index dc790bf2a..533e85fac 100644 --- a/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp +++ b/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp @@ -116,7 +116,7 @@ ROMol *getTautomerHelper(const TAUT_SPTR &self) { } ROMol *getKekulizedHelper(const TAUT_SPTR &self) { - return new ROMol(*self->kekulized); + return new ROMol(*self->getKekulized()); } python::tuple smilesTautomerMapKeysHelper( diff --git a/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py b/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py index 09a642996..c3d1764f5 100644 --- a/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py +++ b/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py @@ -1971,5 +1971,53 @@ M END self.assertNotIn("/b", before_inchi) self.assertNotIn("/b", after_inchi) + def testCanonicalizeExocyclicDoubleBondRegression(self): + """Regression: canonicalize() picks the wrong canonical tautomer for a + molecule whose exocyclic C=C stereo was set via the API (SetStereo + + SetStereoAtoms) without corresponding bond directions. + + assignStereochemistry(force=true) inside enumerate() clears stereo + that lacks bond directions, so the resulting tautomer set differs + from what you get with SMILES-encoded E/Z (which carries bond + directions that survive re-perception). Among the no-stereo + tautomers the enumerator incorrectly chooses the endocyclic form + over the exocyclic one.""" + mol = Chem.MolFromSmiles("O=C(CC1=CC2=CC=COC2)NC1=O") + mol = Chem.RemoveHs(mol) + + # Pin unspecified exocyclic C=C bonds to STEREOTRANS via API + # (no bond directions set — stereo will be cleared by + # assignStereochemistry(force=true) during enumeration) + ranks = list(Chem.CanonicalRankAtoms(mol, breakTies=False)) + for bond in mol.GetBonds(): + if (bond.GetBondType() != Chem.rdchem.BondType.DOUBLE + or bond.GetStereo() != Chem.rdchem.BondStereo.STEREONONE + or bond.IsInRing() + or bond.GetBeginAtom().GetAtomicNum() != 6 + or bond.GetEndAtom().GetAtomicNum() != 6): + continue + bgn, end = bond.GetBeginAtom(), bond.GetEndAtom() + bgnNbrs = [n.GetIdx() for n in bgn.GetNeighbors() if n.GetIdx() != end.GetIdx()] + endNbrs = [n.GetIdx() for n in end.GetNeighbors() if n.GetIdx() != bgn.GetIdx()] + if not bgnNbrs or not endNbrs: + continue + if (len(set(ranks[i] for i in bgnNbrs)) + bgn.GetNumImplicitHs() < 2 + or len(set(ranks[i] for i in endNbrs)) + end.GetNumImplicitHs() < 2): + continue + bond.SetStereoAtoms( + min(bgnNbrs, key=lambda i: ranks[i]), + min(endNbrs, key=lambda i: ranks[i]), + ) + bond.SetStereo(Chem.rdchem.BondStereo.STEREOTRANS) + + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveBondStereo = False + params.tautomerRemoveSp3Stereo = False + enumerator = rdMolStandardize.TautomerEnumerator(params) + canon = enumerator.Canonicalize(mol) + smi = Chem.MolToSmiles(canon) + self.assertEqual(smi, "O=C1CC(=CC2=CC=COC2)C(=O)N1", + f"Expected exocyclic form, got: {smi}") + if __name__ == "__main__": unittest.main() diff --git a/Code/GraphMol/MolStandardize/catch_tests.cpp b/Code/GraphMol/MolStandardize/catch_tests.cpp index bed9e2023..868b84cdb 100644 --- a/Code/GraphMol/MolStandardize/catch_tests.cpp +++ b/Code/GraphMol/MolStandardize/catch_tests.cpp @@ -1754,3 +1754,83 @@ TEST_CASE("Custom Scoring Functions") { *mol, terms) == 1000); } } + +TEST_CASE("tautomer canonicalize preserves conformers") { + // Regression test: quickCopy during tautomer enumeration drops + // conformers. canonicalize() must restore them from the original + // molecule so that downstream code (e.g. InChI generation) that + // relies on 2D/3D coordinates works correctly. + std::string molblock = R"CTAB( + ChemDraw02102613032D + + 0 0 0 0 0 0 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 17 18 0 0 0 +M V30 BEGIN ATOM +M V30 1 C -1.382449 2.541459 0.000000 0 +M V30 2 N -1.391616 1.716459 0.000000 0 +M V30 3 C -2.139845 1.368125 0.000000 0 +M V30 4 C -2.333490 0.560313 0.000000 0 +M V30 5 C -1.822449 -0.093386 0.000000 0 +M V30 6 C -0.992865 -0.099688 0.000000 0 +M V30 7 C -0.468073 0.540833 0.000000 0 +M V30 8 C -0.646824 1.348646 0.000000 0 +M V30 9 O 0.002291 1.857397 0.000000 0 +M V30 10 N 0.334583 0.349479 0.000000 0 +M V30 11 C 0.570052 -0.441719 0.000000 0 +M V30 12 O 0.003437 -1.040990 0.000000 0 +M V30 13 C 1.372708 -0.633073 0.000000 0 +M V30 14 C 1.608177 -1.423699 0.000000 0 +M V30 15 C 2.333490 -1.816147 0.000000 0 +M V30 16 C 1.941043 -2.541459 0.000000 0 +M V30 17 C 1.215730 -2.149012 0.000000 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 1 2 8 +M V30 3 1 2 3 +M V30 4 1 3 4 +M V30 5 1 4 5 +M V30 6 1 5 6 +M V30 7 1 6 7 +M V30 8 1 7 8 +M V30 9 2 8 9 +M V30 10 1 7 10 +M V30 11 1 10 11 +M V30 12 2 11 12 +M V30 13 1 11 13 +M V30 14 2 13 14 +M V30 15 1 14 17 +M V30 16 1 14 15 +M V30 17 1 15 16 +M V30 18 1 16 17 +M V30 END BOND +M V30 END CTAB +M END +)CTAB"; + std::unique_ptr mol(MolBlockToMol(molblock)); + REQUIRE(mol); + REQUIRE(mol->getNumConformers() == 1); + + MolStandardize::CleanupParameters params; + params.tautomerRemoveBondStereo = false; + params.tautomerRemoveSp3Stereo = false; + MolStandardize::TautomerEnumerator te(params); + std::unique_ptr canon{te.canonicalize(*mol)}; + REQUIRE(canon); + + // Conformer must be preserved (quickCopy regression) + CHECK(canon->getNumConformers() == 1); + if (canon->getNumConformers() > 0) { + const auto &origConf = mol->getConformer(0); + const auto &canonConf = canon->getConformer(0); + CHECK(origConf.getNumAtoms() == canonConf.getNumAtoms()); + for (unsigned int i = 0; i < origConf.getNumAtoms(); ++i) { + auto origPos = origConf.getAtomPos(i); + auto canonPos = canonConf.getAtomPos(i); + CHECK(origPos.x == Catch::Approx(canonPos.x).epsilon(1e-4)); + CHECK(origPos.y == Catch::Approx(canonPos.y).epsilon(1e-4)); + } + } +} + diff --git a/Code/GraphMol/MolStandardize/testTautomer.cpp b/Code/GraphMol/MolStandardize/testTautomer.cpp index e37132a66..970d76d7e 100644 --- a/Code/GraphMol/MolStandardize/testTautomer.cpp +++ b/Code/GraphMol/MolStandardize/testTautomer.cpp @@ -993,6 +993,61 @@ void testCanonicalize() { BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } +void testCanonicalizeInvariantAcrossInputTautomers() { + BOOST_LOG(rdInfoLog) + << "-----------------------\n Testing canonicalize() invariance across input tautomers" + << std::endl; + + auto tautparams = + std::unique_ptr(new TautomerCatalogParams("")); + TautomerEnumerator te(new TautomerCatalog(tautparams.get())); + + // The core behavior guarantee we care about for perf work: regardless of which + // tautomer form is provided as input, canonicalize() selects the same + // canonical tautomer. + // + // Keep this bounded so the unit test stays fast. + constexpr size_t maxMoleculesToCheck = 25; + constexpr size_t maxTautomersToCheck = 25; + + size_t moleculesChecked = 0; + for (const auto &itm : canonTautomerData) { + if (moleculesChecked >= maxMoleculesToCheck) { + break; + } + std::unique_ptr mol{SmilesToMol(itm.first)}; + TEST_ASSERT(mol); + + auto tautRes = te.enumerate(*mol); + if (tautRes.status() != TautomerEnumeratorStatus::Completed) { + continue; + } + if (tautRes.size() > maxTautomersToCheck) { + continue; + } + + bool expectedPresent = false; + for (const auto &taut : tautRes) { + if (MolToSmiles(*taut) == itm.second) { + expectedPresent = true; + break; + } + } + TEST_ASSERT(expectedPresent); + + for (const auto &taut : tautRes) { + std::unique_ptr canon{te.canonicalize(*taut)}; + TEST_ASSERT(canon); + TEST_ASSERT(MolToSmiles(*canon) == itm.second); + } + ++moleculesChecked; + } + + // Make sure we actually exercised the logic. + TEST_ASSERT(moleculesChecked >= 10); + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; +} + void testPickCanonical() { BOOST_LOG(rdInfoLog) << "-----------------------\n Testing pickCanonical" << std::endl; @@ -1654,6 +1709,63 @@ void testGithub3755() { } } +void testCanonicalizePreservesNonTautomericBondStereo() { + BOOST_LOG(rdInfoLog) + << "-----------------------\n " + "testCanonicalizePreservesNonTautomericBondStereo" + << std::endl; + // Molecule with E double-bond stereo on a C=C that is NOT part of + // any tautomeric path, plus a tautomerizable keto group. + // The E/Z must survive canonicalize() and canonicalizeInPlace(). + { + std::unique_ptr mol(SmilesToMol("O=CC/C=C/c1ccccc1")); + TEST_ASSERT(mol); + bool foundStereo = false; + for (const auto bond : mol->bonds()) { + if (bond->getBondType() == Bond::DOUBLE && + bond->getStereo() > Bond::STEREOANY) { + foundStereo = true; + break; + } + } + TEST_ASSERT(foundStereo); + + CleanupParameters params; + params.tautomerRemoveBondStereo = false; + params.tautomerRemoveSp3Stereo = false; + TautomerEnumerator te(params); + + // canonicalize() + { + ROMOL_SPTR canon(te.canonicalize(*mol)); + TEST_ASSERT(canon); + bool hasStereo = false; + for (const auto bond : canon->bonds()) { + if (bond->getBondType() == Bond::DOUBLE && + bond->getStereo() > Bond::STEREOANY) { + hasStereo = true; + break; + } + } + TEST_ASSERT(hasStereo); + } + // canonicalizeInPlace() + { + RWMol molCopy(*mol); + te.canonicalizeInPlace(molCopy); + bool hasStereo = false; + for (const auto bond : molCopy.bonds()) { + if (bond->getBondType() == Bond::DOUBLE && + bond->getStereo() > Bond::STEREOANY) { + hasStereo = true; + break; + } + } + TEST_ASSERT(hasStereo); + } + } +} + int main() { RDLog::InitLogs(); #if 1 @@ -1661,6 +1773,7 @@ int main() { testEnumeratorParams(); testEnumeratorCallback(); testCanonicalize(); + testCanonicalizeInvariantAcrossInputTautomers(); testPickCanonical(); testCustomScoreFunc(); testEnumerationProblems(); @@ -1672,5 +1785,6 @@ int main() { testTautomerEnumeratorResult_const_iterator(); testGithub3430(); testGithub3755(); + testCanonicalizePreservesNonTautomericBondStereo(); return 0; } diff --git a/Code/GraphMol/PeriodicTable.cpp b/Code/GraphMol/PeriodicTable.cpp index c3e1ef5d0..11d4d900a 100644 --- a/Code/GraphMol/PeriodicTable.cpp +++ b/Code/GraphMol/PeriodicTable.cpp @@ -10,6 +10,7 @@ // #include "PeriodicTable.h" #include +#include #include typedef boost::tokenizer> tokenizer; #include @@ -21,6 +22,10 @@ typedef boost::tokenizer> tokenizer; namespace RDKit { +namespace { +std::atomic ds_rawInstance{nullptr}; +} // namespace + class std::unique_ptr PeriodicTable::ds_instance = nullptr; PeriodicTable::PeriodicTable() { @@ -101,10 +106,15 @@ PeriodicTable::PeriodicTable() { void PeriodicTable::initInstance() { ds_instance = std::unique_ptr(new PeriodicTable()); + ds_rawInstance.store(ds_instance.get(), std::memory_order_release); } PeriodicTable *PeriodicTable::getTable() { #ifdef RDK_BUILD_THREADSAFE_SSS + auto *raw = ds_rawInstance.load(std::memory_order_acquire); + if (raw) { + return raw; + } static std::once_flag pt_init_once; std::call_once(pt_init_once, initInstance); #else @@ -112,7 +122,7 @@ PeriodicTable *PeriodicTable::getTable() { initInstance(); } #endif - return ds_instance.get(); + return ds_rawInstance.load(std::memory_order_acquire); } } // namespace RDKit diff --git a/Code/GraphMol/ROMol.cpp b/Code/GraphMol/ROMol.cpp index ed25a631c..fad7e95e8 100644 --- a/Code/GraphMol/ROMol.cpp +++ b/Code/GraphMol/ROMol.cpp @@ -79,6 +79,9 @@ void ROMol::initFromOther(const ROMol &other, bool quickCopy, int confId) { numBonds = 0; // std::cerr<<" init from other: "< atoms; for (auto &otherAtom : otherGroup.getAtoms()) { diff --git a/Code/GraphMol/Substruct/SubstructMatch.cpp b/Code/GraphMol/Substruct/SubstructMatch.cpp index 0ca62a83d..a58d7437d 100644 --- a/Code/GraphMol/Substruct/SubstructMatch.cpp +++ b/Code/GraphMol/Substruct/SubstructMatch.cpp @@ -473,6 +473,24 @@ struct RecursiveLocker { } } }; + +// A minimal container which satisfies the vf2_all() output-sequence interface +// but only counts matches instead of storing them. +struct MatchCounter { + using value_type = ssPairType; + + void clear() { d_count = 0; } + void resize(size_t) { d_count = 0; } + void reserve(size_t) {} + + bool empty() const { return d_count == 0; } + size_t size() const { return d_count; } + + void push_back(const value_type &) { ++d_count; } + + private: + size_t d_count = 0; +}; } // namespace detail // ---------------------------------------------- @@ -524,6 +542,34 @@ std::vector SubstructMatch( return matches; } +unsigned int SubstructMatchCount(const ROMol &mol, const ROMol &query, + const SubstructMatchParameters ¶ms) { + if (!mol.getNumAtoms() || !query.getNumAtoms()) { + return 0; + } + + detail::RecursiveLocker locker(query, params.recursionPossible); + + if (params.recursionPossible) { + detail::SUBQUERY_MAP subqueryMap; + for (const auto atom : query.atoms()) { + if (atom->hasQuery()) { + detail::MatchSubqueries(mol, atom->getQuery(), params, subqueryMap, + locker.locked); + } + } + } + + detail::AtomLabelFunctor atomLabeler(query, mol, params); + detail::BondLabelFunctor bondLabeler(query, mol, params); + MolMatchFinalCheckFunctor matchChecker(query, mol, params); + + detail::MatchCounter counter; + boost::vf2_all(query.getTopology(), mol.getTopology(), atomLabeler, + bondLabeler, matchChecker, counter, params.maxMatches); + return static_cast(counter.size()); +} + std::vector SubstructMatch( const MolBundle &bundle, const ROMol &query, const SubstructMatchParameters ¶ms) { diff --git a/Code/GraphMol/Substruct/SubstructMatch.h b/Code/GraphMol/Substruct/SubstructMatch.h index 096806474..d48e661a6 100644 --- a/Code/GraphMol/Substruct/SubstructMatch.h +++ b/Code/GraphMol/Substruct/SubstructMatch.h @@ -107,6 +107,23 @@ RDKIT_SUBSTRUCTMATCH_EXPORT std::vector SubstructMatch( const ROMol &mol, const ROMol &query, const SubstructMatchParameters ¶ms = SubstructMatchParameters()); +//! Count substructure matches for a query in a molecule without materializing +//! the full match vectors. +/*! + + + + \param mol The ROMol to be searched + \param query The query ROMol + \param matchParams Parameters controlling the matching + + \return The number of matches found (capped by params.maxMatches) + +*/ +RDKIT_SUBSTRUCTMATCH_EXPORT unsigned int SubstructMatchCount( + const ROMol &mol, const ROMol &query, + const SubstructMatchParameters ¶ms = SubstructMatchParameters()); + //! Find all substructure matches for a query in a ResonanceMolSupplier object /*! \param resMolSuppl The ResonanceMolSupplier object to be searched diff --git a/Code/GraphMol/Substruct/testSubstructMatch.cpp b/Code/GraphMol/Substruct/testSubstructMatch.cpp index 8ff0bf453..6ac2f1e8f 100644 --- a/Code/GraphMol/Substruct/testSubstructMatch.cpp +++ b/Code/GraphMol/Substruct/testSubstructMatch.cpp @@ -1703,4 +1703,54 @@ M END const auto rAtom = mol->getAtomWithIdx(mol->getNumAtoms() - 1); CHECK(!isAtomTerminalRGroupOrQueryHydrogen(rAtom)); } -} \ No newline at end of file +} + +TEST_CASE("SubstructMatchCount regression", "[substruct]") { + { + auto mol = "c1ccccc1"_smiles; + REQUIRE(mol); + std::unique_ptr query{SmartsToMol("c:c")}; + REQUIRE(query); + + SubstructMatchParameters params; + const auto matches = SubstructMatch(*mol, *query, params); + const auto count = SubstructMatchCount(*mol, *query, params); + CHECK(count == matches.size()); + + params.maxMatches = 1; + const auto matchesCapped = SubstructMatch(*mol, *query, params); + const auto countCapped = SubstructMatchCount(*mol, *query, params); + CHECK(matchesCapped.size() == 1); + CHECK(countCapped == 1); + } + + { + // Uniquify=false should still match counts with the full match materializer. + // (We don't assert an exact number here because it depends on automorphisms.) + auto mol = "c1ccccc1"_smiles; + REQUIRE(mol); + std::unique_ptr query{SmartsToMol("c:c")}; + REQUIRE(query); + + SubstructMatchParameters params; + params.uniquify = false; + const auto matches = SubstructMatch(*mol, *query, params); + const auto count = SubstructMatchCount(*mol, *query, params); + CHECK(count == matches.size()); + } + + { + // Simple stereochem case to make sure chirality-related final checking is + // consistent. + auto mol = "C[C@H](F)Cl"_smiles; + REQUIRE(mol); + std::unique_ptr query{SmartsToMol("[C@H](F)Cl")}; + REQUIRE(query); + + SubstructMatchParameters params; + params.useChirality = true; + const auto matches = SubstructMatch(*mol, *query, params); + const auto count = SubstructMatchCount(*mol, *query, params); + CHECK(count == matches.size()); + } +} diff --git a/Code/GraphMol/TautomerQuery/TautomerQuery.cpp b/Code/GraphMol/TautomerQuery/TautomerQuery.cpp index d4a78efa5..fe003547d 100644 --- a/Code/GraphMol/TautomerQuery/TautomerQuery.cpp +++ b/Code/GraphMol/TautomerQuery/TautomerQuery.cpp @@ -187,7 +187,16 @@ TautomerQuery *TautomerQuery::fromMol( delete queryBond; } - return new TautomerQuery(res.tautomers(), + // Copy molecule properties from the original query to all tautomers. + // Tautomer enumeration uses quickCopy for performance, which doesn't copy + // molecule properties. We need to preserve them here for proper CXSMILES + // output (e.g. link nodes). + auto tautomers = res.tautomers(); + for (auto &tautomer : tautomers) { + tautomer->updateProps(query); + } + + return new TautomerQuery(std::move(tautomers), static_cast(templateMolecule), modifiedAtoms, modifiedBonds); } diff --git a/rdkit/Chem/MolStandardize/UnitTestMolStandardize.py b/rdkit/Chem/MolStandardize/UnitTestMolStandardize.py index cf87e9200..9da12f8b7 100644 --- a/rdkit/Chem/MolStandardize/UnitTestMolStandardize.py +++ b/rdkit/Chem/MolStandardize/UnitTestMolStandardize.py @@ -28,3 +28,62 @@ class TestCase(unittest.TestCase): tauts = enumerator.Enumerate(m) reordtauts = MolStandardize.ReorderTautomers(m) self.assertEqual(len(reordtauts), len(tauts)) + + def testNoUnassignedStereoAfterCanonicalize(self): + # tautomer canonicalization must not produce an unassigned stereocenter ('?') + # due to atom/bond reordering + stereochem reassignment. + # Enhanced StereoGroup over the stereocenters to indicate the `syn` nature of mol + molblock = r""" + ChemDraw01232415492D + + 0 0 0 0 0 0 V3000 +M V30 BEGIN CTAB +M V30 COUNTS 12 13 0 0 0 +M V30 BEGIN ATOM +M V30 1 O 1.690391 0.714714 0.000000 0 +M V30 2 C 1.278465 0.000859 0.000000 0 +M V30 3 O 1.690391 -0.712995 0.000000 0 +M V30 4 C 0.454037 0.000286 0.000000 0 +M V30 5 C -0.260390 0.412786 0.000000 0 +M V30 6 H 0.153828 1.125495 0.000000 0 +M V30 7 C -0.975391 0.831588 0.000000 0 +M V30 8 C -1.690391 0.412786 0.000000 0 +M V30 9 C -1.690391 -0.415651 0.000000 0 +M V30 10 C -0.975391 -0.825860 0.000000 0 +M V30 11 C -0.260390 -0.412214 0.000000 0 +M V30 12 H 0.152110 -1.125495 0.000000 0 +M V30 END ATOM +M V30 BEGIN BOND +M V30 1 1 1 2 +M V30 2 2 2 3 +M V30 3 1 4 2 CFG=3 +M V30 4 1 4 11 +M V30 5 1 4 5 +M V30 6 1 5 6 CFG=3 +M V30 7 1 5 11 +M V30 8 1 5 7 +M V30 9 1 7 8 +M V30 10 1 8 9 +M V30 11 1 9 10 +M V30 12 1 10 11 +M V30 13 1 11 12 CFG=3 +M V30 END BOND +M V30 END CTAB +M END +""" + + base = Chem.MolFromMolBlock(molblock, sanitize=True, removeHs=False) + enumerator = rdMolStandardize.TautomerEnumerator() + enumerator.SetRemoveSp3Stereo(False) + canon = enumerator.Canonicalize(base) + centers = Chem.FindMolChiralCenters( + canon, + includeUnassigned=True, + useLegacyImplementation=False, + ) + unassigned = [c for c in centers if c[1] == '?'] + self.assertFalse( + unassigned, + msg=f"produced unassigned stereocenters: {unassigned} (centers={centers})", + ) +