// $Id$ // // Copyright (C) 2014 Novartis Institutes for BioMedical Research // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include #include #include "../Descriptors/MolDescriptors.h" #include "MolHash.h" namespace RDKit { namespace MolHash { struct MolFragment // Reference to a fragment of source molecule { std::vector Atoms; std::vector Bonds; std::vector AtomsIdx; std::vector BondsIdx; std::map MolAtomIdxMap; // Full Molecule to // fragment indeces // backward // conversion map public: boost::uint32_t getNumAtoms() const { return AtomsIdx.size(); } boost::uint32_t getNumBonds() const { return BondsIdx.size(); } }; // INTERNAL FUNCTIONS: static HashCodeType computeMorganCodeHash( const MolFragment &mol, const std::vector &atomLabels, const std::vector &bondLabels); static void prepareMolFragment(MolFragment &m, const ROMol &mol, const std::vector *atomsToUse, const std::vector *bondsToUse); static void prepareLabels(std::vector &atomLabels, std::vector &bondLabels, const ROMol &mol, const MolFragment &m, const std::vector *atomCodes, const std::vector *bondCodes); static boost::uint32_t computeCRC32(const void *data, size_t size) { boost::crc_32_type crc; crc.process_bytes(data, size); return crc.checksum(); } //============================================================================= // MolHash Module API implementation: //============================================================================= void fillAtomBondCodes( const ROMol &mol, boost::uint64_t flags // CodeFlags constants combination , std::vector *atomCodes // NULL is allowed , std::vector *bondCodes) // NULL is allowed { if (atomCodes) { unsigned n = mol.getNumAtoms(); atomCodes->resize(n); for (unsigned i = 0; i < n; i++) { if (0 == (CF_ATOM_ALL & flags)) // NO LABELS { (*atomCodes)[i] = 1; continue; } const Atom *atom = mol.getAtomWithIdx(i); (*atomCodes)[i] = 0; if (0 != (CF_ELEMENT & flags)) (*atomCodes)[i] |= atom->getAtomicNum(); if (0 != (CF_CHARGE & flags)) (*atomCodes)[i] |= (atom->getFormalCharge() + 8) << 8; // allowed range [-8, +8] if (0 != (CF_VALENCE & flags)) (*atomCodes)[i] |= (atom->getExplicitValence()) << 13; // getTotalValence() if (0 != (CF_ATOM_CHIRALITY & flags)) { char v = 0; if (atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW || atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW) { if (atom->hasProp("_CIPCode")) { std::string code = atom->getProp("_CIPCode"); if (code == "R") v = 1; else if (code == "S") v = 2; } else if (atom->hasProp("_ringStereoAtoms")) { const INT_VECT &ringStereoAtoms = atom->getProp("_ringStereoAtoms"); if (ringStereoAtoms.size()) { if (ringStereoAtoms[0] < 0) { v = 1; } else { v = 2; } if (ringStereoAtoms.size() > 1) { BOOST_LOG(rdWarningLog) << "Warning: atom with more than 1 ring-stereo atoms found." << std::endl; } } } } else { v = atom->getChiralTag(); } (*atomCodes)[i] |= v << 18; // 2 bits } if (0 != (CF_ATOM_AROMATIC & flags)) (*atomCodes)[i] |= (atom->getIsAromatic() ? 1 : 0) << 20; // 1 bit // if(0!=( & flags)) // (*atomCodes)[i] |= (atom-()) << 21; // 3 bits reserved if (0 != (CF_ISOTOPE & flags)) (*atomCodes)[i] |= (atom->getIsotope()) << 24; } } if (bondCodes) { std::map bondsInRing; const RingInfo::VECT_INT_VECT &rings = mol.getRingInfo()->bondRings(); for (RingInfo::VECT_INT_VECT::const_iterator r = rings.begin(); r != rings.end(); r++) for (INT_VECT::const_iterator b = r->begin(); b != r->end(); b++) if (bondsInRing.end() == bondsInRing.find(*b)) bondsInRing[(unsigned)*b] = true; unsigned n = mol.getNumBonds(); bondCodes->resize(n); for (unsigned i = 0; i < n; i++) { if (0 == (CF_BOND_ALL & flags)) // NO LABELS { (*bondCodes)[i] = 1; continue; } const Bond *bond = mol.getBondWithIdx(i); (*bondCodes)[i] = 0; if (0 != (CF_BOND_ORDER & flags)) { unsigned order = bond->getBondType(); if (0 == (CF_BOND_AROMATIZATION & flags)) // ignore aromatization { static const unsigned orderMatch[Bond::ZERO + 1] = { Bond::UNSPECIFIED, Bond::SINGLE, Bond::DOUBLE, Bond::TRIPLE, Bond::QUADRUPLE, Bond::QUINTUPLE, Bond::HEXTUPLE, Bond::SINGLE, // ONEANDAHALF, Bond::DOUBLE, // TWOANDAHALF, Bond::TRIPLE, // THREEANDAHALF, Bond::QUADRUPLE, // FOURANDAHALF, Bond::QUINTUPLE, // FIVEANDAHALF, Bond::SINGLE, // AROMATIC, Bond::IONIC, Bond::HYDROGEN, Bond::THREECENTER, Bond::DATIVEONE, Bond::DATIVE, Bond::DATIVEL, Bond::DATIVER, Bond::OTHER, Bond::ZERO}; order = orderMatch[order]; } (*bondCodes)[i] |= order; } if (0 != (CF_BOND_AROMATIZATION & flags)) (*bondCodes)[i] |= ((bond->getIsAromatic() ? 1 : 0)) << 8; if (0 != (CF_BOND_CHIRALITY & flags)) { (*bondCodes)[i] |= bond->getStereo() << 9; } if (0 != (CF_BOND_IN_RING & flags)) (*bondCodes)[i] |= (bondsInRing.end() != bondsInRing.find(bond->getIdx()) ? 1 : 0) << 11; // 1 bit } } } //============================================================================= HashCodeType generateMoleculeHashCode( const ROMol &mol, const std::vector *atomsToUse, const std::vector *bondsToUse, const std::vector *atomCodes, const std::vector *bondCodes) { MolFragment m; prepareMolFragment(m, mol, atomsToUse, bondsToUse); if (0 == m.getNumAtoms() || 0 == m.getNumBonds()) return 0; std::vector atomLabels; std::vector bondLabels; prepareLabels(atomLabels, bondLabels, mol, m, atomCodes, bondCodes); return computeMorganCodeHash(m, atomLabels, bondLabels); } void generateMoleculeHashSet(const ROMol &mol, HashSet &res, const std::vector *atomsToUse, const std::vector *bondsToUse) { res.Version = 100; // v. 1.0 res.Reserved = 0; MolFragment m; prepareMolFragment(m, mol, atomsToUse, bondsToUse); res.NumAtoms = m.getNumAtoms(); res.NumBonds = m.getNumBonds(); if (0 == m.getNumAtoms() || 0 == m.getNumBonds()) return; std::string formula = RDKit::Descriptors::calcMolFormula(mol); res.FormulaCRC32 = computeCRC32(formula.c_str(), formula.length()); boost::uint64_t flags = 0; // CodeFlags constants combination std::vector atomCodes; std::vector bondCodes; std::vector atomLabels; std::vector bondLabels; // flags = CF_ATOM_ALL &(~(CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | // CF_ISOTOPE)); flags = CF_ELEMENT | CF_CHARGE | CF_ATOM_AROMATIC; /// | CF_VALENCE fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes); prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes); res.NonChiralAtomsHash = computeMorganCodeHash(m, atomLabels, bondLabels); flags = CF_BOND_ALL & (~CF_BOND_CHIRALITY); fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes); prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes); res.NonChiralBondsHash = computeMorganCodeHash(m, atomLabels, bondLabels); flags = CF_ATOM_CHIRALITY | CF_ISOTOPE; fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes); prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes); res.ChiralAtomsHash = computeMorganCodeHash(m, atomLabels, bondLabels); flags = CF_BOND_CHIRALITY; fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes); prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes); res.ChiralBondsHash = computeMorganCodeHash(m, atomLabels, bondLabels); flags = CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE; fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes); prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes); res.ChiralityHash = computeMorganCodeHash(m, atomLabels, bondLabels); } //============================================================================= std::string generateMoleculeHashSet(const ROMol &mol, const std::vector *atomsToUse, const std::vector *bondsToUse) { std::string str; HashSet res; generateMoleculeHashSet(mol, res, atomsToUse, bondsToUse); // char buf[64]; // snprintf(buf, sizeof(buf),"%u-%u-%u-", res.Version, // res.NumAtoms,res.NumBonds); str = (boost::format("%u-%u-%u-") % (res.Version) % (res.NumAtoms) % (res.NumBonds)).str(); str += encode(&res.FormulaCRC32, sizeof(res.FormulaCRC32)); str += "-"; str += encode(&res.NonChiralAtomsHash, sizeof(res.NonChiralAtomsHash)); str += "-"; str += encode(&res.NonChiralBondsHash, sizeof(res.NonChiralBondsHash)); str += "-"; str += encode(&res.ChiralAtomsHash, sizeof(res.ChiralAtomsHash)); str += "-"; str += encode(&res.ChiralBondsHash, sizeof(res.ChiralBondsHash)); str += "-"; str += encode(&res.ChiralityHash, sizeof(res.ChiralityHash)); return str.c_str(); } //============================================================================= // INTERNAL FUNCTIONS: //============================================================================= static HashCodeType computeMorganCodeHash( const MolFragment &mol, const std::vector &atomLabels, const std::vector &bondLabels) { size_t nv = mol.getNumAtoms(); size_t ne = mol.getNumBonds(); std::vector currCodes(nv); std::vector prevCodes(nv); size_t nIterations = mol.getNumBonds(); if (nIterations > 5) nIterations = 5; for (unsigned molAtomIdx = 0; molAtomIdx < mol.getNumAtoms(); molAtomIdx++) currCodes[molAtomIdx] = atomLabels[mol.AtomsIdx[molAtomIdx]]; for (size_t iter = 0; iter < nIterations; iter++) { for (size_t i = 0; i < nv; i++) prevCodes[i] = currCodes[i]; for (size_t molBondIdx = 0; molBondIdx < ne; molBondIdx++) { const Bond *bond = mol.Bonds[molBondIdx]; unsigned order = bondLabels[mol.BondsIdx[molBondIdx]]; unsigned atom1 = mol.MolAtomIdxMap.find(bond->getBeginAtomIdx())->second; unsigned atom2 = mol.MolAtomIdxMap.find(bond->getEndAtomIdx())->second; boost::uint32_t v1 = prevCodes[atom1]; boost::uint32_t v2 = prevCodes[atom2]; currCodes[atom1] += v2 * v2 + (v2 + 23) * (order + 1721); currCodes[atom2] += v1 * v1 + (v1 + 23) * (order + 1721); } } HashCodeType result = 0; for (unsigned molAtomIdx = 0; molAtomIdx < nv; molAtomIdx++) { HashCodeType code = currCodes[molAtomIdx]; result += code * (code + 6849) + 29; } return result; } //============================================================================= static void prepareMolFragment(MolFragment &m, const ROMol &mol, const std::vector *atomsToUse, const std::vector *bondsToUse) { if (0 != atomsToUse && atomsToUse->empty()) atomsToUse = 0; if (0 != bondsToUse && bondsToUse->empty()) bondsToUse = 0; if (0 == atomsToUse && 0 == bondsToUse) // whole molecule { unsigned n = mol.getNumAtoms(); m.AtomsIdx.resize(n); for (unsigned i = 0; i < n; i++) m.AtomsIdx[i] = i; n = mol.getNumBonds(); m.BondsIdx.resize(n); for (unsigned i = 0; i < n; i++) m.BondsIdx[i] = i; } else if (0 != atomsToUse) // selected atoms only and all/selected bonds // between them { std::map addedBonds; unsigned n = atomsToUse->size(); m.AtomsIdx.resize(n); for (unsigned i = 0; i < n; i++) // add all selected atoms at first m.AtomsIdx[i] = (*atomsToUse)[i]; for (unsigned i = 0; i < n; i++) // add bonds between all selected atoms { ROMol::OEDGE_ITER beg, end; for (boost::tie(beg, end) = mol.getAtomBonds(mol.getAtomWithIdx(m.AtomsIdx[i])); beg != end; beg++) { const Bond *bond = &*((mol)[*beg]); if (addedBonds.end() != addedBonds.find(bond->getIdx())) continue; // the bond has been already added if (0 != bondsToUse && bondsToUse->end() == find(bondsToUse->begin(), bondsToUse->end(), bond->getIdx())) continue; // skip unselected bond unsigned endAtoms[2]; endAtoms[0] = bond->getBeginAtomIdx(); endAtoms[1] = bond->getEndAtomIdx(); for (unsigned ai = 0; ai < 2 && 0 != bond; ai++) // both ending bonds of the atom { if (0 != atomsToUse && atomsToUse->end() == find(atomsToUse->begin(), atomsToUse->end(), endAtoms[ai])) bond = 0; // check if both ending atoms of the bond are selected by // atoms filter } if (0 != bond) { addedBonds[bond->getIdx()] = m.BondsIdx.size(); m.BondsIdx.push_back(bond->getIdx()); } } } } else if (0 != bondsToUse) // note that 0==atomsToUse in this case { std::map addedAtoms; unsigned n = bondsToUse->size(); m.BondsIdx.resize(n); for (unsigned i = 0; i < n; i++) { const Bond *bond = mol.getBondWithIdx(i); m.BondsIdx[i] = bond->getIdx(); unsigned endAtoms[2]; endAtoms[0] = bond->getBeginAtomIdx(); endAtoms[1] = bond->getEndAtomIdx(); for (unsigned ai = 0; ai < 2 && 0 != bond; ai++) // both ending bonds of the atom { if (addedAtoms.end() == addedAtoms.find(endAtoms[ai])) { addedAtoms[endAtoms[ai]] = addedAtoms.size(); m.AtomsIdx.push_back( endAtoms[ai]); // the atom has NOT been already added } } } } unsigned n; n = m.getNumAtoms(); m.Atoms.resize(n); for (unsigned i = 0; i < n; i++) { m.Atoms[i] = mol.getAtomWithIdx(m.AtomsIdx[i]); m.MolAtomIdxMap[m.AtomsIdx[i]] = i; } n = m.getNumBonds(); m.Bonds.resize(n); for (unsigned i = 0; i < n; i++) { m.Bonds[i] = mol.getBondWithIdx(m.BondsIdx[i]); } } //============================================================================= static void prepareLabels(std::vector &atomLabels, std::vector &bondLabels, const ROMol &mol, const MolFragment &m, const std::vector *atomCodes, const std::vector *bondCodes) { RDUNUSED_PARAM(mol); unsigned n; n = m.getNumAtoms(); atomLabels.resize(n); for (unsigned i = 0; i < n; i++) { atomLabels[i] = atomCodes ? (*atomCodes)[m.AtomsIdx[i]] : 1; } n = m.getNumBonds(); bondLabels.resize(n); for (unsigned i = 0; i < n; i++) { bondLabels[i] = bondCodes ? (*bondCodes)[m.BondsIdx[i]] : 1; } } //============================================================================= } }