// // Copyright (C) 2011-2024 NextMove Software and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include // #define VERBOSE_HASH 1 #include #include #include #include #include #include #include "nmmolhash.h" #include "mf.h" namespace { void addCXExtensions(RDKit::RWMol *mol, std::string &result, unsigned additionalSkips = 0) { unsigned int cxflagsToSkip = additionalSkips | RDKit::SmilesWrite::CX_COORDS; auto cxext = RDKit::SmilesWrite::getCXExtensions( *mol, RDKit::SmilesWrite::CX_ALL ^ cxflagsToSkip); if (!cxext.empty()) { result += " " + cxext; } } unsigned int NMRDKitBondGetOrder(const RDKit::Bond *bnd) { PRECONDITION(bnd, "bad bond"); switch (bnd->getBondType()) { case RDKit::Bond::AROMATIC: case RDKit::Bond::SINGLE: return 1; case RDKit::Bond::DOUBLE: return 2; case RDKit::Bond::TRIPLE: return 3; case RDKit::Bond::QUADRUPLE: return 4; case RDKit::Bond::QUINTUPLE: return 5; case RDKit::Bond::HEXTUPLE: return 6; default: return 0; } } RDKit::Bond *NMRDKitMolNewBond(RDKit::RWMol *mol, RDKit::Atom *src, RDKit::Atom *dst, unsigned int order, bool arom) { PRECONDITION(mol, "bad molecule"); PRECONDITION(src, "bad src atom"); PRECONDITION(dst, "bad dest atom"); RDKit::Bond *result; result = mol->getBondBetweenAtoms(src->getIdx(), dst->getIdx()); if (result) { if (order == 1) { switch (result->getBondType()) { case RDKit::Bond::SINGLE: result->setBondType(RDKit::Bond::DOUBLE); break; case RDKit::Bond::DOUBLE: result->setBondType(RDKit::Bond::TRIPLE); break; default: break; } } return result; } RDKit::Bond::BondType type = RDKit::Bond::UNSPECIFIED; if (!arom) { switch (order) { case 1: type = RDKit::Bond::SINGLE; break; case 2: type = RDKit::Bond::DOUBLE; break; case 3: type = RDKit::Bond::TRIPLE; break; case 4: type = RDKit::Bond::QUADRUPLE; break; } } else { type = RDKit::Bond::AROMATIC; } result = new RDKit::Bond(type); result->setOwningMol(mol); result->setBeginAtom(src); result->setEndAtom(dst); mol->addBond(result, true); if (arom) { result->setIsAromatic(true); } return result; } void NMRDKitSanitizeHydrogens(RDKit::RWMol *mol) { PRECONDITION(mol, "bad molecule"); // Move all of the implicit Hs into one box for (auto aptr : mol->atoms()) { unsigned int hcount = aptr->getTotalNumHs(); aptr->setNoImplicit(true); aptr->setNumExplicitHs(hcount); bool strict = false; aptr->updatePropertyCache( strict); // or else the valence is reported incorrectly } } } // namespace namespace RDKit { namespace MolHash { namespace { unsigned int NMDetermineComponents(RWMol *mol, unsigned int *parts, unsigned int acount) { PRECONDITION(mol, "bad molecule"); PRECONDITION(parts, "bad parts pointer"); memset(parts, 0, acount * sizeof(unsigned int)); std::vector todo; unsigned int result = 0; for (auto aptr : mol->atoms()) { unsigned int idx = aptr->getIdx(); if (parts[idx] == 0) { parts[idx] = ++result; todo.push_back(aptr); while (!todo.empty()) { aptr = todo.back(); todo.pop_back(); for (auto nbri : boost::make_iterator_range(mol->getAtomNeighbors(aptr))) { auto nptr = (*mol)[nbri]; idx = nptr->getIdx(); if (parts[idx] == 0) { parts[idx] = result; todo.push_back(nptr); } } } } } return result; } std::string NMMolecularFormula(RWMol *mol, const unsigned int *parts, unsigned int part) { PRECONDITION(mol, "bad molecule"); PRECONDITION((!part || parts), "bad parts pointer"); unsigned int hist[256]; int charge = 0; memset(hist, 0, sizeof(hist)); for (auto aptr : mol->atoms()) { unsigned int idx = aptr->getIdx(); if (part == 0 || parts[idx] == part) { unsigned int elem = aptr->getAtomicNum(); if (elem < 256) { hist[elem]++; } else { hist[0]++; } hist[1] += aptr->getTotalNumHs(false); charge += aptr->getFormalCharge(); } } char buffer[16]; std::string result; const unsigned char *perm = hist[6] ? OrganicHillOrder : InorganicHillOrder; for (unsigned int i = 0; i < 119; i++) { unsigned int elem = perm[i]; if (hist[elem] > 0) { result += symbol[elem]; if (hist[elem] > 1) { sprintf(buffer, "%u", hist[elem]); result += buffer; } } } if (charge != 0) { if (charge > 0) { result += '+'; if (charge > 1) { sprintf(buffer, "%d", charge); result += buffer; } } else { // charge < 0 result += '-'; if (charge < -1) { sprintf(buffer, "%d", -charge); result += buffer; } } } return result; } std::string NMMolecularFormula(RWMol *mol, bool sep = false) { PRECONDITION(mol, "bad molecule"); if (!sep) { return NMMolecularFormula(mol, nullptr, 0); } unsigned int acount = mol->getNumAtoms(); if (acount == 0) { return ""; } auto size = (unsigned int)(acount * sizeof(int)); auto *parts = (unsigned int *)malloc(size); unsigned int pcount = NMDetermineComponents(mol, parts, acount); std::string result; if (pcount > 1) { std::vector vmf; for (unsigned int i = 1; i <= pcount; i++) { vmf.push_back(NMMolecularFormula(mol, parts, i)); } // sort result = vmf[0]; for (unsigned int i = 1; i < pcount; i++) { result += "."; result += vmf[i]; } } else { // pcount == 1 result = NMMolecularFormula(mol, parts, 1); } free(parts); return result; } void NormalizeHCount(Atom *aptr) { PRECONDITION(aptr, "bad atom pointer"); unsigned int hcount; switch (aptr->getAtomicNum()) { case 9: // Fluorine case 17: // Chlorine case 35: // Bromine case 53: // Iodine hcount = aptr->getDegree(); hcount = hcount < 1 ? 1 - hcount : 0; break; case 8: // Oxygen case 16: // Sulfur hcount = aptr->getDegree(); hcount = hcount < 2 ? 2 - hcount : 0; break; case 5: // Boron hcount = aptr->getDegree(); hcount = hcount < 3 ? 3 - hcount : 0; break; case 7: // Nitogen case 15: // Phosphorus hcount = aptr->getDegree(); if (hcount < 3) { hcount = 3 - hcount; } else if (hcount == 4) { hcount = 1; } else { hcount = 0; } break; case 6: // Carbon hcount = aptr->getDegree(); hcount = hcount < 4 ? 4 - hcount : 0; break; default: hcount = 0; } aptr->setNoImplicit(true); aptr->setNumExplicitHs(hcount); } namespace { std::string convertToSmilesWithCXFlags( const RWMol &mol, bool doingCXSmiles, unsigned cxFlagsToSkip, SmilesWriteParams ps = SmilesWriteParams()) { bool skipStereoGroups = cxFlagsToSkip & SmilesWrite::CXSmilesFields::CX_ENHANCEDSTEREO; return SmilesWrite::detail::MolToSmiles(mol, ps, doingCXSmiles, !skipStereoGroups); } } // namespace std::string AnonymousGraph(RWMol *mol, bool elem, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); std::string result; for (auto aptr : mol->atoms()) { aptr->setIsAromatic(false); aptr->setFormalCharge(0); if (!elem) { aptr->setNumExplicitHs(0); aptr->setNoImplicit(true); aptr->setAtomicNum(0); aptr->setIsotope(0); } else { NormalizeHCount(aptr); } } for (auto bptr : mol->bonds()) { bptr->setBondType(Bond::SINGLE); bptr->setIsAromatic(false); // clear aromatic flags } MolOps::assignRadicals(*mol); // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip | SmilesWrite::CX_RADICALS); } return result; } std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); std::string result; char buffer[32]; int charge = 0; for (auto aptr : mol->atoms()) { charge += aptr->getFormalCharge(); aptr->setIsAromatic(false); aptr->setFormalCharge(0); } for (auto bptr : mol->bonds()) { bptr->setBondType(Bond::SINGLE); } MolOps::assignRadicals(*mol); // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (netq) { sprintf(buffer, "_%d", charge); result += buffer; } if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip | SmilesWrite::CX_RADICALS); } return result; } namespace details { constexpr std::uint64_t atomFlagProperty = 1; //*< atom has the exclude property constexpr std::uint64_t bondFlagCarboxyl = 1; //*< bond involved in in carboxyl, amide, etc. constexpr std::uint64_t bondFlagProperty = 2; //*< bond has the exclude property std::vector getBondFlags(const ROMol &mol) { // FIX: oversimplified, but should work for now static std::vector patterns{ "[C;!$(C-C(=[NH])-[NH2])]-[C;!$(C(-C)(=[NH])-[NH2])](=[O,N,S])-[O,N,S]", //< one side of the "amide", with an ugly exclusion for amidine "[A;!$(C=[O,N,S])]-[O,N,S]-C=[O,N,S]", //< the other side "[OH0,SH0]-C=[O,N,S]", //< "esters" and "carboxyls" "C-[N;$([N+]=C(N)(N)),$(N-C(N)=N)]", //< quanidine like "[C]-[c](:[o,n,s]):[o,n,s]", //< a limited version of handling this for // aromatic systems "*[SD4](=*)=*", // sulfates, sulfonyls, etc. "*=[SD4;$(S(=*)=*)](*)*", // the other side "*-[H0]=,#[C,N]=,#*", // isocyanates, azides, et al "*[#7+]-[O-]", // nitro groups and aromatic n-oxides "[O,S;H]-P=O", // phosphoric acid, etc. "[#6]-P=O", // phosphoric acid, etc. "[#6]-N=[SD4]=*" // don't know what this one is called }; static std::vector> queries; if (queries.empty()) { for (const auto &pattern : patterns) { queries.emplace_back(SmartsToMol(pattern)); } } std::vector bondFlags; bondFlags.reserve(mol.getNumBonds()); std::ranges::transform( mol.bonds(), std::back_inserter(bondFlags), [](const auto &bnd) { return bnd->hasProp(MolHash::excludeFromTautomerismProp) ? details::bondFlagProperty : 0; }); for (const auto &qry : queries) { auto matches = SubstructMatch(mol, *qry); for (const auto &match : matches) { const auto bnd = mol.getBondBetweenAtoms(match[0].second, match[1].second); bondFlags[bnd->getIdx()] |= bondFlagCarboxyl; } } return bondFlags; } } // namespace details namespace { // candidate atoms are either unsaturated or have implicit Hs // NOTE that being aromatic is not sufficient. The molecule // Cn1cccc1 is a good example of this: // - it should not be a tautomeric system // - the N is not unsaturated, but it is aromatic bool isCandidateAtom(const Atom *aptr, const std::vector &atomFlags) { return !atomFlags[aptr->getIdx()] && (aptr->getTotalNumHs() || queryAtomUnsaturated(aptr)); } // atomic number > 1, not carbon bool isHeteroAtom(const Atom *aptr) { auto atNum = aptr->getAtomicNum(); return atNum != 6 && atNum > 1; } // aromatic (flagged or bond order), double, or triple bool isUnsaturatedBond(const Bond *bptr) { return bptr->getIsAromatic() || bptr->getBondType() == Bond::BondType::AROMATIC || bptr->getBondType() == Bond::BondType::DOUBLE || bptr->getBondType() == Bond::BondType::TRIPLE; } // potential tautomeric bonds are unsaturated and both atoms are candidates bool isPossibleTautomericBond(const Bond *bptr, const std::vector &atomFlags, const std::vector &bondFlags) { return !bondFlags[bptr->getIdx()] && isUnsaturatedBond(bptr) && isCandidateAtom(bptr->getBeginAtom(), atomFlags) && isCandidateAtom(bptr->getEndAtom(), atomFlags); } // a bond is a possible starting bond if it involves a candidate heteroatom // (definition above) and an unsaturated atom bool isPossibleStartingBond(const Bond *bptr, const std::vector &atomFlags, const std::vector &bondFlags) { if (bondFlags[bptr->getIdx()] || atomFlags[bptr->getBeginAtom()->getIdx()] || atomFlags[bptr->getEndAtom()->getIdx()]) { return false; } auto heteroBeg = isHeteroAtom(bptr->getBeginAtom()) && isCandidateAtom(bptr->getBeginAtom(), atomFlags); auto heteroEnd = isHeteroAtom(bptr->getEndAtom()) && isCandidateAtom(bptr->getEndAtom(), atomFlags); // at least one atom has to be an eligible heteroatom: if (!heteroBeg && !heteroEnd) { return false; } // Do not include a check for aromaticity here. See comment for // isCandidateAtom() above. auto unsatBeg = queryAtomUnsaturated(bptr->getBeginAtom()); auto unsatEnd = queryAtomUnsaturated(bptr->getEndAtom()); // both we need a heteroatom on one side and an unsaturated atom on the // other: if (!((heteroBeg && unsatEnd) || (heteroEnd && unsatBeg))) { return false; } return true; } // is one of the atom's bonds in the startBonds set? bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) { for (const auto nbr : aptr->getOwningMol().atomBonds(aptr)) { if (startBonds[nbr->getIdx()]) { return true; } } return false; } // Don't extend to a stereocenter via single non-conjugated bonds, // unless the source atom has a non-aromatic unsaturated bond // (indicating the stereocenter could become sp2 in a tautomer). // Aromatic atoms shouldn't pull in substituent stereocenters. bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom, const Bond *connectingBond) { if (targetAtom->getHybridization() != Atom::SP3 || targetAtom->getTotalNumHs() != 1) { return false; } if (isUnsaturatedBond(connectingBond) || connectingBond->getIsConjugated()) { return false; } // Check if source has a non-aromatic unsaturated bond for (const auto &srcBnd : sourceAtom->getOwningMol().atomBonds(sourceAtom)) { if (!srcBnd->getIsAromatic() && isUnsaturatedBond(srcBnd)) { return false; // Source could participate in tautomerism } } return true; // Skip this chiral neighbor } // skip the neighbor bond if otherAtom isn't a candidate and doesn't have a // start bond OR if the bond is neither unsaturated nor conjugated and atom // doesn't have a start bond bool skipNeighborBond(const Atom *atom, const Atom *otherAtom, const Bond *nbrBond, const boost::dynamic_bitset<> &startBonds, const std::vector &atomFlags, const std::vector &bondFlags) { if (bondFlags[nbrBond->getIdx()] || atomFlags[otherAtom->getIdx()] || atomFlags[atom->getIdx()]) { return true; } // Special case: prevent extending from an aromatic carbon (with no H) // to an exocyclic C=C system. This handles cases like stilbene-pyridyl // (c1ccccc1/C=C/c1ncccc1) where the aromatic C has no mobile H but the // algorithm would otherwise extend to the exocyclic C=C and destroy E/Z. // Check BOTH directions: either the aromatic atom extending to vinyl, // or the vinyl atom being connected from aromatic. if (!nbrBond->getIsAromatic()) { const Atom *aromaticAtom = nullptr; const Atom *nonAromaticAtom = nullptr; if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) { aromaticAtom = atom; nonAromaticAtom = otherAtom; } else if (!atom->getIsAromatic() && otherAtom->getIsAromatic()) { aromaticAtom = otherAtom; nonAromaticAtom = atom; } // If we have an aromatic->non-aromatic transition if (aromaticAtom && nonAromaticAtom) { // Aromatic atom must have no H and be carbon if (aromaticAtom->getTotalNumHs() == 0 && aromaticAtom->getAtomicNum() == 6 && nonAromaticAtom->getAtomicNum() == 6) { // Check if nonAromaticAtom is part of a C=C system for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) { if (bnd->getBondType() == Bond::BondType::DOUBLE && !bnd->getIsAromatic()) { auto otherEnd = bnd->getOtherAtom(nonAromaticAtom); if (otherEnd->getAtomicNum() == 6) { // This is a C=C double bond - skip extending return true; } } } } } } return ((!isCandidateAtom(otherAtom, atomFlags) && !hasStartBond(otherAtom, startBonds)) || (!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() && !hasStartBond(atom, startBonds))); } // counts the number of neighboring atoms that have conjugated bonds. unsigned int getNumConjugatedNeighbors( const Atom *atom, const boost::dynamic_bitset<> &startBonds) { unsigned int res = 0; for (auto nbr : atom->getOwningMol().atomNeighbors(atom)) { for (auto nbrBond : atom->getOwningMol().atomBonds(nbr)) { if (nbrBond->getIsConjugated() || startBonds[nbrBond->getIdx()]) { ++res; break; } } } return res; } // special case to prevent "overreach" with things like enamines. // the logic here prevents the first bond in CNC=C from being // included in the tautomeric system. So we get: [CH3]-[N]:[C]:[C] // instead of [C]:[N]:[C]:[C] // returns true if the bond is a candidate for overreach. // The arguments are, with atom indices corresponding to the example above: // atom: atom 1 // oatom: atom 0 // bond: bond between atom 1 and atom 2 // nbrBond: bond between atom 1 and atom 0 bool checkForOverreach( const Atom *atom, const Atom *oatom, const Bond *bond, const Bond *nbrBond, const boost::dynamic_bitset<> &startBonds, const std::vector &numConjugatedNeighbors) { return (startBonds[bond->getIdx()] || hasStartBond(atom, startBonds)) && !startBonds[nbrBond->getIdx()] && isHeteroAtom(atom) && !isUnsaturatedBond(nbrBond) && numConjugatedNeighbors[oatom->getIdx()] < 2; } } // namespace std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); if (!mol->getRingInfo()->isFindFastOrBetter()) { MolOps::fastFindRings(*mol); } std::string result; unsigned int hcount = 0; int charge = 0; std::vector atomFlags; atomFlags.reserve(mol->getNumAtoms()); std::ranges::transform(mol->atoms(), std::back_inserter(atomFlags), [](const auto &atom) { return atom->hasProp(excludeFromTautomerismProp) ? details::atomFlagProperty : 0; }); auto bondFlags = details::getBondFlags(*mol); boost::dynamic_bitset<> bondsToModify(mol->getNumBonds()); boost::dynamic_bitset<> bondsConsidered(mol->getNumBonds()); boost::dynamic_bitset<> startBonds(mol->getNumBonds()); for (const auto bnd : mol->bonds()) { auto isStartBond = isPossibleStartingBond(bnd, atomFlags, bondFlags); startBonds.set(bnd->getIdx(), isStartBond); } std::vector numConjugatedNeighbors(mol->getNumAtoms(), 0); for (const auto atm : mol->atoms()) { numConjugatedNeighbors[atm->getIdx()] = getNumConjugatedNeighbors(atm, startBonds); } #ifdef VERBOSE_HASH std::cerr << " START BONDS: " << startBonds << std::endl; #endif for (auto bptr : mol->bonds()) { // If this has already been considered or is not a possible starting bond, // then skip it if (bondsToModify[bptr->getIdx()] || bondsConsidered[bptr->getIdx()] || !startBonds[bptr->getIdx()]) { continue; } #ifdef VERBOSE_HASH std::cerr << "START BOND: " << bptr->getIdx() << std::endl; #endif boost::dynamic_bitset<> conjSystem(mol->getNumBonds()); boost::dynamic_bitset<> conjAtoms(mol->getNumAtoms()); boost::dynamic_bitset<> atomsInSystem(mol->getNumAtoms()); unsigned int numDonorCs = 0; unsigned int activeHeteroHs = 0; std::deque bq; // also include eligible neighbor bonds: for (const auto atm : std::vector{bptr->getBeginAtom(), bptr->getEndAtom()}) { if (atm->getAtomicNum() == 6) { if (atm->getTotalNumHs()) { ++numDonorCs; } } else if (isHeteroAtom(atm)) { activeHeteroHs += atm->getTotalNumHs(); } for (const auto nbrBond : mol->atomBonds(atm)) { if (nbrBond == bptr || bondsConsidered[nbrBond->getIdx()]) { continue; } auto oatom = nbrBond->getOtherAtom(atm); #ifdef VERBOSE_HASH std::cerr << " check neighbor1 " << nbrBond->getIdx() << " from " << atm->getIdx() << "-" << oatom->getIdx() << std::endl; std::cerr << " " << bondsConsidered[nbrBond->getIdx()] << " icao " << isCandidateAtom(oatom, atomFlags) << " hsbo " << hasStartBond(oatom, startBonds) << " unsat " << isUnsaturatedBond(nbrBond) << " icaa " << isCandidateAtom(atm, atomFlags) << " hsba " << hasStartBond(atm, startBonds) << " ncno: " << numConjugatedNeighbors[oatom->getIdx()] << std::endl; #endif if (checkForOverreach(atm, oatom, bptr, nbrBond, startBonds, numConjugatedNeighbors)) { continue; } if (shouldSkipChiralNeighbor(atm, oatom, nbrBond)) { continue; } // if the bond is not eligible, then we can skip this neighbor if (skipNeighborBond(atm, oatom, nbrBond, startBonds, atomFlags, bondFlags) && skipNeighborBond(oatom, atm, nbrBond, startBonds, atomFlags, bondFlags)) { continue; } if (std::find(bq.begin(), bq.end(), nbrBond) == bq.end()) { #ifdef VERBOSE_HASH std::cerr << " push " << nbrBond->getIdx() << " " << nbrBond->getBeginAtomIdx() << "-" << nbrBond->getEndAtomIdx() << std::endl; std::cerr << " #### SET1 " << bptr->getIdx() << std::endl; #endif bq.push_back(nbrBond); } // now we know that we should consider this bond bondsConsidered.set(bptr->getIdx()); conjSystem.set(bptr->getIdx()); conjAtoms.set(bptr->getBeginAtomIdx()); conjAtoms.set(bptr->getEndAtomIdx()); } } while (!bq.empty()) { auto bnd = bq.front(); bq.pop_front(); if (bondsConsidered[bnd->getIdx()]) { continue; } #ifdef VERBOSE_HASH std::cerr << "BQ: " << bnd->getIdx() << ": " << bnd->getBeginAtomIdx() << "-" << bnd->getEndAtomIdx() << std::endl; std::cerr << " #### SET2 " << bnd->getIdx() << std::endl; #endif bondsConsidered.set(bnd->getIdx()); conjSystem.set(bnd->getIdx()); conjAtoms.set(bnd->getBeginAtomIdx()); conjAtoms.set(bnd->getEndAtomIdx()); for (const auto atm : std::vector{bnd->getBeginAtom(), bnd->getEndAtom()}) { if (atomsInSystem[atm->getIdx()]) { continue; } if (atm->getAtomicNum() == 6) { if (atm->getTotalNumHs()) { ++numDonorCs; atomsInSystem.set(atm->getIdx()); } } else if (atm->getAtomicNum() > 1) { activeHeteroHs += atm->getTotalNumHs(); atomsInSystem.set(atm->getIdx()); } for (auto nbrBnd : mol->atomBonds(atm)) { if (bondsConsidered[nbrBnd->getIdx()] || std::find(bq.begin(), bq.end(), nbrBnd) != bq.end()) { continue; } auto oatom = nbrBnd->getOtherAtom(atm); #ifdef VERBOSE_HASH std::cerr << " check neighbor " << nbrBnd->getIdx() << " from " << atm->getIdx() << "-" << oatom->getIdx() << std::endl; std::cerr << " " << bondsConsidered[nbrBnd->getIdx()] << " icao " << isCandidateAtom(oatom, atomFlags) << " hsbo " << hasStartBond(oatom, startBonds) << " unsat " << isUnsaturatedBond(nbrBnd) << " icaa " << isCandidateAtom(atm, atomFlags) << " hsba " << hasStartBond(atm, startBonds) << " ncno: " << numConjugatedNeighbors[oatom->getIdx()] << std::endl; #endif if (checkForOverreach(atm, oatom, bnd, nbrBnd, startBonds, numConjugatedNeighbors)) { continue; } if (shouldSkipChiralNeighbor(atm, oatom, nbrBnd)) { continue; } if ((skipNeighborBond(atm, oatom, nbrBnd, startBonds, atomFlags, bondFlags) && skipNeighborBond(oatom, atm, nbrBnd, startBonds, atomFlags, bondFlags))) { continue; } bq.push_back(nbrBnd); #ifdef VERBOSE_HASH std::cerr << " added!" << std::endl; #endif } } } // we need to have at least two bonds and include at least one active H if (conjSystem.count() > 1 && (activeHeteroHs || numDonorCs)) { #ifdef VERBOSE_HASH std::cerr << "CONJ: " << conjSystem << " hetero " << activeHeteroHs << " donor " << std::endl; #endif // all bonds between conjugated atoms in the system are considered to be // in the system. There are situations where the traverse doesn't find the // closure bond, so we explicitly check: for (auto i = 0U; i < conjAtoms.size(); i++) { if (conjAtoms[i]) { for (const auto bnd : mol->atomBonds(mol->getAtomWithIdx(i))) { if (conjAtoms[bnd->getOtherAtomIdx(i)]) { #ifdef VERBOSE_HASH if (!conjSystem[bnd->getIdx()]) { std::cerr << " BACKFILL BOND: " << bnd->getIdx() << std::endl; } #endif bondsToModify.set(bnd->getIdx()); } } } } } else { // undo all the bonds we marked as considered in this conjugated system for (unsigned int i = 0; i < mol->getNumBonds(); ++i) { if (conjSystem[i]) { bondsConsidered.reset(i); } } #ifdef VERBOSE_HASH std::cerr << "REJECT CONJ: " << conjSystem << " hetero " << activeHeteroHs << " donor " << std::endl; #endif } } /* Another situation we need to correct for is the following: These two tautomers should be recognized as equivalent: O8 O9 || | C1-C2-C3-C4-N5=C6-C7 O8 O9 | | C1-C2=C3-C4-N5=C6-C7 In the first case neither the N5-C4 nor the C3-C4 bond is recognized as being tautomeric since C4 does not have two conjugated neighbors, in the second case the N5-C4 bond is recognized since C4 does have two conjugated neighbors. In order to fix this and other similar cases we need to check the neighboring atoms of each start bond and check to see they are connected to other atoms which are involved in bonds which are either in a tautomeric system already (i.e. bondsToModify is set for them) or are start bonds. So in the case of the first tautomer above, here we're going to check N5-C4, and recognize that C4 has a neighbor (C3) with an adjacent tautomeric bond (C3-C2). So we'll flag N5-C4 as being part of the tautomeric system. */ for (const auto bptr : mol->bonds()) { // If this is not a possible starting bond, // then skip it if (!startBonds[bptr->getIdx()]) { continue; } for (const auto atm : std::vector{bptr->getBeginAtom(), bptr->getEndAtom()}) { for (const auto nbrBond : mol->atomBonds(atm)) { if (nbrBond == bptr || bondsToModify[nbrBond->getIdx()]) { continue; } const auto oatom = nbrBond->getOtherAtom(atm); if (!oatom->getTotalNumHs()) { continue; } // don't extend to reach a potential stereocenter — doing so would // incorrectly pull the stereocenter into the tautomeric system and // destroy its chirality. Use structural criteria (SP3 + 1H) for // chain atoms so behavior is consistent regardless of annotation. // For ring atoms, only skip if chirality is actually annotated, // since ring SP3+1H atoms often genuinely participate in ring // tautomerism (e.g. purine NH, glutarimide). if (oatom->getHybridization() == Atom::SP3 && oatom->getTotalNumHs() == 1 && (!queryIsAtomInRing(oatom) || oatom->getChiralTag() != Atom::CHI_UNSPECIFIED)) { continue; } unsigned int numModifiedNeighbors = 0; for (const auto nbr : mol->atomNeighbors(oatom)) { if (nbr == atm) { continue; } for (const auto nbnd : mol->atomBonds(nbr)) { if (bondsToModify[nbnd->getIdx()] || startBonds[nbnd->getIdx()]) { ++numModifiedNeighbors; break; } } if (numModifiedNeighbors) { break; } } if (numModifiedNeighbors) { bondsToModify.set(nbrBond->getIdx()); } } } } #ifdef VERBOSE_HASH std::cerr << "FINAL: " << bondsToModify << std::endl; #endif boost::dynamic_bitset<> atomsToModify(mol->getNumAtoms()); if (bondsToModify.any()) { for (auto bptr : mol->bonds()) { if (!bondsToModify[bptr->getIdx()]) { continue; } // Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one // not) to avoid merging distinct geometric isomers (e.g., E/Z // hydrazones). For these bonds, keep them as DOUBLE bonds (not AROMATIC) // so that E/Z stereo is preserved in the SMILES output. bool isExocyclicWithStereo = false; if (bptr->getStereo() != Bond::BondStereo::STEREONONE) { bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom()); bool endInRing = queryIsAtomInRing(bptr->getEndAtom()); isExocyclicWithStereo = (beginInRing != endInRing); } if (!isExocyclicWithStereo) { bptr->setBondType(Bond::AROMATIC); bptr->setIsAromatic(true); // Must be consistent with bond type bptr->setStereo(Bond::BondStereo::STEREONONE); } else { bptr->setIsAromatic(false); } atomsToModify.set(bptr->getBeginAtomIdx()); atomsToModify.set(bptr->getEndAtomIdx()); } } if (atomsToModify.any()) { for (auto aptr : mol->atoms()) { if (!atomsToModify[aptr->getIdx()]) { continue; } charge += aptr->getFormalCharge(); hcount += aptr->getTotalNumHs(); aptr->setIsAromatic(false); aptr->setFormalCharge(0); aptr->setNoImplicit(true); aptr->setNumExplicitHs(0); } } if (!bondsToModify.empty() || !atomsToModify.empty()) { MolOps::assignRadicals(*mol); // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); } SmilesWriteParams ps; ps.allBondsExplicit = true; ps.allHsExplicit = true; result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip, ps); char buffer[32]; if (!proto) { sprintf(buffer, "_%d_%d", hcount, charge); } else { sprintf(buffer, "_%d", hcount - charge); } result += buffer; if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip | SmilesWrite::CX_RADICALS); } return result; } std::string TautomerHash(RWMol *mol, bool proto, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); std::string result; char buffer[32]; int hcount = 0; int charge = 0; for (auto aptr : mol->atoms()) { charge += aptr->getFormalCharge(); aptr->setIsAromatic(false); aptr->setFormalCharge(0); if (aptr->getAtomicNum() != 6) { hcount += aptr->getTotalNumHs(false); aptr->setNoImplicit(true); aptr->setNumExplicitHs(0); } } for (auto bptr : mol->bonds()) { if (bptr->getBondType() != Bond::SINGLE && (bptr->getIsConjugated() || bptr->getBeginAtom()->getAtomicNum() != 6 || bptr->getEndAtom()->getAtomicNum() != 6)) { bptr->setIsAromatic(false); bptr->setBondType(Bond::SINGLE); bptr->setStereo(Bond::BondStereo::STEREONONE); } } MolOps::assignRadicals(*mol); // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (!proto) { sprintf(buffer, "_%d_%d", hcount, charge); } else { sprintf(buffer, "_%d", hcount - charge); } result += buffer; if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip | SmilesWrite::CX_RADICALS); } return result; } bool TraverseForRing(Atom *atom, unsigned char *visit) { PRECONDITION(atom, "bad atom pointer"); PRECONDITION(visit, "bad pointer"); visit[atom->getIdx()] = 1; for (auto nbri : boost::make_iterator_range( atom->getOwningMol().getAtomNeighbors(atom))) { auto nptr = atom->getOwningMol()[nbri]; if (visit[nptr->getIdx()] == 0) { if (RDKit::queryIsAtomInRing(nptr)) { return true; } if (TraverseForRing(nptr, visit)) { return true; } } } return false; } bool DepthFirstSearchForRing(Atom *root, Atom *nbor, unsigned int maxatomidx) { PRECONDITION(root, "bad atom pointer"); PRECONDITION(nbor, "bad atom pointer"); unsigned int natoms = maxatomidx; auto *visit = (unsigned char *)alloca(natoms); memset(visit, 0, natoms); visit[root->getIdx()] = true; return TraverseForRing(nbor, visit); } bool IsInScaffold(Atom *atom, unsigned int maxatomidx) { PRECONDITION(atom, "bad atom pointer"); if (RDKit::queryIsAtomInRing(atom)) { return true; } unsigned int count = 0; for (auto nbri : boost::make_iterator_range( atom->getOwningMol().getAtomNeighbors(atom))) { auto nptr = atom->getOwningMol()[nbri]; if (DepthFirstSearchForRing(atom, nptr, maxatomidx)) { ++count; } } return count > 1; } bool HasNbrInScaffold(Atom *aptr, unsigned char *is_in_scaffold) { PRECONDITION(aptr, "bad atom pointer"); PRECONDITION(is_in_scaffold, "bad pointer"); for (auto nbri : boost::make_iterator_range( aptr->getOwningMol().getAtomNeighbors(aptr))) { auto nptr = aptr->getOwningMol()[nbri]; if (is_in_scaffold[nptr->getIdx()]) { return true; } } return false; } std::string ExtendedMurckoScaffold(RWMol *mol, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); if (!mol->getRingInfo()->isFindFastOrBetter()) { MolOps::fastFindRings(*mol); } unsigned int maxatomidx = mol->getNumAtoms(); auto *is_in_scaffold = (unsigned char *)alloca(maxatomidx); for (auto aptr : mol->atoms()) { is_in_scaffold[aptr->getIdx()] = IsInScaffold(aptr, maxatomidx); } std::vector for_deletion; for (auto aptr : mol->atoms()) { unsigned int aidx = aptr->getIdx(); if (is_in_scaffold[aidx]) { continue; } if (HasNbrInScaffold(aptr, is_in_scaffold)) { aptr->setAtomicNum(0); aptr->setFormalCharge(0); aptr->setNoImplicit(true); aptr->setNumExplicitHs(0); aptr->setIsotope(0); } else { for_deletion.push_back(aptr); } } mol->beginBatchEdit(); for (auto &i : for_deletion) { mol->removeAtom(i); } mol->commitBatchEdit(); MolOps::assignRadicals(*mol); // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); std::string result; result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip | SmilesWrite::CX_RADICALS); } return result; } std::string MurckoScaffoldHash(RWMol *mol, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); std::vector for_deletion; do { for_deletion.clear(); for (auto aptr : mol->atoms()) { unsigned int deg = aptr->getDegree(); if (deg < 2) { if (deg == 1) { // i.e. not 0 and the last atom in the molecule for (const auto &nbri : boost::make_iterator_range( aptr->getOwningMol().getAtomBonds(aptr))) { auto bptr = (aptr->getOwningMol())[nbri]; Atom *nbr = bptr->getOtherAtom(aptr); unsigned int hcount = nbr->getTotalNumHs(false); nbr->setNumExplicitHs(hcount + NMRDKitBondGetOrder(bptr)); nbr->setNoImplicit(true); } } for_deletion.push_back(aptr); } } mol->beginBatchEdit(); for (auto &i : for_deletion) { mol->removeAtom(i); } mol->commitBatchEdit(); } while (!for_deletion.empty()); MolOps::assignRadicals(*mol); // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); std::string result; result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip | SmilesWrite::CX_RADICALS); } return result; } std::string NetChargeHash(RWMol *mol) { PRECONDITION(mol, "bad molecule"); int totalq = 0; for (auto aptr : mol->atoms()) { totalq += aptr->getFormalCharge(); } char buffer[16]; sprintf(buffer, "%d", totalq); return buffer; } std::string SmallWorldHash(RWMol *mol, bool brl) { PRECONDITION(mol, "bad molecule"); char buffer[64]; unsigned int acount = mol->getNumAtoms(); unsigned int bcount = mol->getNumBonds(); unsigned int rcount = (bcount + 1) - acount; if (brl) { unsigned int lcount = 0; for (auto aptr : mol->atoms()) { if (aptr->getDegree() == 2) { lcount++; } } sprintf(buffer, "B%uR%uL%u", bcount, rcount, lcount); } else { sprintf(buffer, "B%uR%u", bcount, rcount); } return buffer; } void DegreeVector(RWMol *mol, unsigned int *v) { memset(v, 0, 4 * sizeof(unsigned int)); for (auto aptr : mol->atoms()) { switch (aptr->getDegree()) { case 4: v[0]++; break; case 3: v[1]++; break; case 2: v[2]++; break; case 1: v[3]++; break; } } } bool HasDoubleBond(Atom *atom) { PRECONDITION(atom, "bad atom"); for (const auto &nbri : boost::make_iterator_range(atom->getOwningMol().getAtomBonds(atom))) { auto bptr = (atom->getOwningMol())[nbri]; if (NMRDKitBondGetOrder(bptr) == 2) { return true; } } return false; } // Determine whether/how to fragment bond // -1 means don't fragment bond // 0 means break, with hydrogens on both beg and end // 1 means break, with asterisk on beg and hydrogen on end // 2 means break, with hydrogen on beg and asterisk on end // 3 means break, with asterisks on both beg and end int RegioisomerBond(Bond *bnd) { PRECONDITION(bnd, "bad bond"); if (NMRDKitBondGetOrder(bnd) != 1) { return -1; } if (RDKit::queryIsBondInRing(bnd)) { return -1; } Atom *beg = bnd->getBeginAtom(); Atom *end = bnd->getEndAtom(); unsigned int beg_elem = beg->getAtomicNum(); unsigned int end_elem = end->getAtomicNum(); if (beg_elem == 0 || end_elem == 0) { return -1; } if (RDKit::queryIsAtomInRing(beg)) { if (RDKit::queryIsAtomInRing(end)) { return 0; } return 2; } if (RDKit::queryIsAtomInRing(end)) { return 1; } if (beg_elem != 6 && end_elem == 6 && !HasDoubleBond(end)) { return 1; } if (beg_elem == 6 && end_elem != 6 && !HasDoubleBond(beg)) { return 2; } return -1; } void ClearEZStereo(Atom *atm) { PRECONDITION(atm, "bad atom"); for (const auto &nbri : boost::make_iterator_range(atm->getOwningMol().getAtomBonds(atm))) { auto bptr = (atm->getOwningMol())[nbri]; if (bptr->getStereo() > RDKit::Bond::STEREOANY) { bptr->setStereo(RDKit::Bond::STEREOANY); } } } std::string RegioisomerHash(RWMol *mol, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); // we need a copy of the molecule so that we can loop over the bonds of // something while modifying something else RDKit::ROMol molcpy(*mol); if (!molcpy.getRingInfo()->isFindFastOrBetter()) { MolOps::fastFindRings(molcpy); } for (int i = molcpy.getNumBonds() - 1; i >= 0; --i) { auto bptr = molcpy.getBondWithIdx(i); int split = RegioisomerBond(bptr); if (split >= 0) { bptr = mol->getBondWithIdx(i); Atom *beg = bptr->getBeginAtom(); Atom *end = bptr->getEndAtom(); mol->removeBond(bptr->getBeginAtomIdx(), bptr->getEndAtomIdx()); ClearEZStereo(beg); ClearEZStereo(end); if (split & 1) { Atom *star = new RDKit::Atom(0); mol->addAtom(star, true, true); star->setNoImplicit(true); NMRDKitMolNewBond(mol, beg, star, 1, false); } else { unsigned int hcount = beg->getTotalNumHs(false); beg->setNumExplicitHs(hcount + 1); beg->setNoImplicit(true); } if (split & 2) { Atom *star = new RDKit::Atom(0); mol->addAtom(star, true, true); star->setNoImplicit(true); NMRDKitMolNewBond(mol, end, star, 1, false); } else { unsigned int hcount = end->getTotalNumHs(false); end->setNumExplicitHs(hcount + 1); end->setNoImplicit(true); } } } // we may have just destroyed some stereocenters/bonds // clean that up: bool cleanIt = true; bool force = true; MolOps::assignStereochemistry(*mol, cleanIt, force); auto result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip); } return result; } std::string ArthorSubOrderHash(RWMol *mol) { PRECONDITION(mol, "bad molecule"); char buffer[256]; unsigned int acount = mol->getNumAtoms(); unsigned int bcount = mol->getNumBonds(); unsigned int pcount = 1; unsigned int size = 4 * mol->getNumAtoms() + 4; auto *parts = (unsigned int *)malloc(size); if (parts) { memset(parts, 0, size); pcount = NMDetermineComponents(mol, parts, acount); free(parts); } unsigned int ccount = 0; unsigned int ocount = 0; unsigned int zcount = 0; unsigned int icount = 0; unsigned int qcount = 0; unsigned int rcount = 0; for (auto aptr : mol->atoms()) { unsigned int elem = aptr->getAtomicNum(); int charge = aptr->getFormalCharge(); switch (elem) { case 6: // Carbon ccount++; if (charge == 0 && aptr->getTotalValence() != 4) { rcount++; } break; case 7: // Nitrogen case 15: // Phosphorus ocount++; if (charge == 0) { unsigned int valence = aptr->getTotalValence(); if (valence != 3 && valence != 5) { rcount++; } } break; case 8: // Oxygen ocount++; if (charge && aptr->getTotalValence() != 2) { rcount++; } break; case 9: // Fluorine ocount++; if (charge && aptr->getTotalValence() != 1) { rcount++; } break; case 17: // Chlorine case 35: // Bromine case 53: // Iodine ocount++; if (charge == 0) { unsigned int valence = aptr->getTotalValence(); if (valence != 1 && valence != 3 && valence != 5 && valence != 7) { rcount++; } } break; case 16: // Sulfur ocount++; if (charge == 0) { unsigned int valence = aptr->getTotalValence(); if (valence != 2 && valence != 4 && valence != 6) { rcount++; } } break; } zcount += elem; if (aptr->getIsotope() != 0) { icount++; } if (charge != 0) { qcount++; } } if (acount > 0xffff) { acount = 0xffff; } if (bcount > 0xffff) { bcount = 0xffff; } if (pcount > 0xff) { pcount = 0xff; } if (ccount > 0xffff) { ccount = 0xffff; } if (ocount > 0xffff) { ocount = 0xffff; } if (zcount > 0xffffff) { zcount = 0xffffff; } if (rcount > 0xff) { rcount = 0xff; } if (qcount > 0xff) { qcount = 0xff; } if (icount > 0xff) { icount = 0xff; } sprintf(buffer, "%04x%04x%02x%04x%04x%06x%02x%02x%02x", acount, bcount, pcount, ccount, ocount, zcount, rcount, qcount, icount); return buffer; } } // namespace std::string MolHash(RWMol *mol, HashFunction func, bool useCXSmiles, unsigned cxFlagsToSkip) { PRECONDITION(mol, "bad molecule"); std::string result; char buffer[32]; NMRDKitSanitizeHydrogens(mol); switch (func) { default: case HashFunction::AnonymousGraph: result = AnonymousGraph(mol, false, useCXSmiles, cxFlagsToSkip); break; case HashFunction::ElementGraph: result = AnonymousGraph(mol, true, useCXSmiles, cxFlagsToSkip); break; case HashFunction::CanonicalSmiles: result = convertToSmilesWithCXFlags(*mol, useCXSmiles, cxFlagsToSkip); if (useCXSmiles) { addCXExtensions(mol, result, cxFlagsToSkip); } break; case HashFunction::MurckoScaffold: result = MurckoScaffoldHash(mol, useCXSmiles, cxFlagsToSkip); break; case HashFunction::ExtendedMurcko: result = ExtendedMurckoScaffold(mol, useCXSmiles, cxFlagsToSkip); break; case HashFunction::Mesomer: result = MesomerHash(mol, true, useCXSmiles, cxFlagsToSkip); break; case HashFunction::RedoxPair: result = MesomerHash(mol, false, useCXSmiles, cxFlagsToSkip); break; case HashFunction::HetAtomTautomer: result = TautomerHash(mol, false, useCXSmiles, cxFlagsToSkip); break; case HashFunction::HetAtomTautomerv2: result = TautomerHashv2(mol, false, useCXSmiles, cxFlagsToSkip); break; case HashFunction::HetAtomProtomer: result = TautomerHash(mol, true, useCXSmiles, cxFlagsToSkip); break; case HashFunction::HetAtomProtomerv2: result = TautomerHashv2(mol, true, useCXSmiles, cxFlagsToSkip); break; case HashFunction::MolFormula: result = NMMolecularFormula(mol); break; case HashFunction::AtomBondCounts: sprintf(buffer, "%u,%u", mol->getNumAtoms(), mol->getNumBonds()); result = buffer; break; case HashFunction::NetCharge: result = NetChargeHash(mol); break; case HashFunction::SmallWorldIndexBR: result = SmallWorldHash(mol, false); break; case HashFunction::SmallWorldIndexBRL: result = SmallWorldHash(mol, true); break; case HashFunction::DegreeVector: { unsigned int dv[4]; DegreeVector(mol, dv); sprintf(buffer, "%u,%u,%u,%u", dv[0], dv[1], dv[2], dv[3]); result = buffer; } break; case HashFunction::ArthorSubstructureOrder: result = ArthorSubOrderHash(mol); break; case HashFunction::Regioisomer: result = RegioisomerHash(mol, useCXSmiles, cxFlagsToSkip); break; } return result; } } // namespace MolHash } // namespace RDKit