mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* Removes ATOM/BOND_SPTR in boost::graph in favor of raw pointers * Actually delete atoms and bonds... * RWMol::clear now calls destroy to handle atom/bond deletion * Changes broken Atom lookup for windows/gcc * Adds tests for running with valgrind * Adds test designed for valgrind and molecule deletions * Removes RNG, actually tests bond deletions * update swig wrappers * deal with most recent changes on the main branch
424 lines
16 KiB
C++
424 lines
16 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (c) 2009-2010, Novartis Institutes for BioMedical Research Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this software without specific prior written
|
|
// permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Created by Greg Landrum, July 2008
|
|
//
|
|
//
|
|
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/Fingerprints/MorganFingerprints.h>
|
|
#include <RDGeneral/hash/hash.hpp>
|
|
#include <GraphMol/SmilesParse/SmilesParse.h>
|
|
#include <GraphMol/Substruct/SubstructMatch.h>
|
|
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include <boost/tuple/tuple.hpp>
|
|
#include <boost/tuple/tuple_comparison.hpp>
|
|
#include <boost/foreach.hpp>
|
|
#include <algorithm>
|
|
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/flyweight.hpp>
|
|
#include <boost/flyweight/key_value.hpp>
|
|
#include <boost/flyweight/no_tracking.hpp>
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
|
|
namespace {
|
|
class ss_matcher {
|
|
public:
|
|
ss_matcher(){};
|
|
ss_matcher(const std::string &pattern) {
|
|
RDKit::RWMol *p = RDKit::SmartsToMol(pattern);
|
|
TEST_ASSERT(p);
|
|
m_matcher.reset(p);
|
|
};
|
|
|
|
// const RDKit::ROMOL_SPTR &getMatcher() const { return m_matcher; };
|
|
const RDKit::ROMol *getMatcher() const { return m_matcher.get(); };
|
|
|
|
private:
|
|
RDKit::ROMOL_SPTR m_matcher;
|
|
};
|
|
}
|
|
|
|
namespace RDKit {
|
|
namespace MorganFingerprints {
|
|
using boost::uint32_t;
|
|
using boost::int32_t;
|
|
typedef boost::tuple<boost::dynamic_bitset<>, uint32_t, unsigned int>
|
|
AccumTuple;
|
|
|
|
// Definitions for feature points adapted from:
|
|
// Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
|
|
const char *smartsPatterns[6] = {
|
|
"[$([N;!H0;v3,v4&+1]),\
|
|
$([O,S;H1;+0]),\
|
|
n&H1&+0]", // Donor
|
|
"[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),\
|
|
$([O,S;H0;v2]),\
|
|
$([O,S;-]),\
|
|
$([N;v3;!$(N-*=[O,N,P,S])]),\
|
|
n&H0&+0,\
|
|
$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]", // Acceptor
|
|
"[a]", // Aromatic
|
|
"[F,Cl,Br,I]", // Halogen
|
|
"[#7;+,\
|
|
$([N;H2&+0][$([C,a]);!$([C,a](=O))]),\
|
|
$([N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]),\
|
|
$([N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))])]", // Basic
|
|
"[$([C,S](=[O,S,P])-[O;H1,-1])]" // Acidic
|
|
};
|
|
std::vector<std::string> defaultFeatureSmarts(smartsPatterns,
|
|
smartsPatterns + 6);
|
|
typedef boost::flyweight<boost::flyweights::key_value<std::string, ss_matcher>,
|
|
boost::flyweights::no_tracking> pattern_flyweight;
|
|
void getFeatureInvariants(const ROMol &mol, std::vector<uint32_t> &invars,
|
|
std::vector<const ROMol *> *patterns) {
|
|
unsigned int nAtoms = mol.getNumAtoms();
|
|
PRECONDITION(invars.size() >= nAtoms, "vector too small");
|
|
|
|
std::vector<const ROMol *> featureMatchers;
|
|
if (!patterns) {
|
|
featureMatchers.reserve(defaultFeatureSmarts.size());
|
|
for (std::vector<std::string>::const_iterator smaIt =
|
|
defaultFeatureSmarts.begin();
|
|
smaIt != defaultFeatureSmarts.end(); ++smaIt) {
|
|
const ROMol *matcher = pattern_flyweight(*smaIt).get().getMatcher();
|
|
CHECK_INVARIANT(matcher, "bad smarts");
|
|
featureMatchers.push_back(matcher);
|
|
}
|
|
patterns = &featureMatchers;
|
|
}
|
|
std::fill(invars.begin(), invars.end(), 0);
|
|
for (unsigned int i = 0; i < patterns->size(); ++i) {
|
|
unsigned int mask = 1 << i;
|
|
std::vector<MatchVectType> matchVect;
|
|
// to maintain thread safety, we have to copy the pattern
|
|
// molecules:
|
|
SubstructMatch(mol, ROMol(*(*patterns)[i], true), matchVect);
|
|
for (std::vector<MatchVectType>::const_iterator mvIt = matchVect.begin();
|
|
mvIt != matchVect.end(); ++mvIt) {
|
|
for (const auto &mIt : *mvIt) {
|
|
invars[mIt.second] |= mask;
|
|
}
|
|
}
|
|
}
|
|
} // end of getFeatureInvariants()
|
|
|
|
void getConnectivityInvariants(const ROMol &mol, std::vector<uint32_t> &invars,
|
|
bool includeRingMembership) {
|
|
unsigned int nAtoms = mol.getNumAtoms();
|
|
PRECONDITION(invars.size() >= nAtoms, "vector too small");
|
|
gboost::hash<std::vector<uint32_t> > vectHasher;
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
Atom const *atom = mol.getAtomWithIdx(i);
|
|
std::vector<uint32_t> components;
|
|
components.push_back(atom->getAtomicNum());
|
|
components.push_back(atom->getTotalDegree());
|
|
components.push_back(atom->getTotalNumHs());
|
|
components.push_back(atom->getFormalCharge());
|
|
int deltaMass = static_cast<int>(
|
|
atom->getMass() -
|
|
PeriodicTable::getTable()->getAtomicWeight(atom->getAtomicNum()));
|
|
components.push_back(deltaMass);
|
|
|
|
if (includeRingMembership &&
|
|
atom->getOwningMol().getRingInfo()->numAtomRings(atom->getIdx())) {
|
|
components.push_back(1);
|
|
}
|
|
invars[i] = vectHasher(components);
|
|
}
|
|
} // end of getConnectivityInvariants()
|
|
|
|
uint32_t updateElement(SparseIntVect<uint32_t> &v, unsigned int elem,
|
|
bool counting) {
|
|
uint32_t bit = elem % v.getLength();
|
|
if (counting) {
|
|
v.setVal(bit, v.getVal(bit) + 1);
|
|
} else {
|
|
v.setVal(bit, 1);
|
|
}
|
|
return bit;
|
|
}
|
|
uint32_t updateElement(ExplicitBitVect &v, unsigned int elem,
|
|
bool counting = false) {
|
|
RDUNUSED_PARAM(counting);
|
|
uint32_t bit = elem % v.getNumBits();
|
|
v.setBit(bit);
|
|
return bit;
|
|
}
|
|
|
|
template <typename T>
|
|
void calcFingerprint(const ROMol &mol, unsigned int radius,
|
|
std::vector<uint32_t> *invariants,
|
|
const std::vector<uint32_t> *fromAtoms, bool useChirality,
|
|
bool useBondTypes, bool useCounts,
|
|
bool onlyNonzeroInvariants, BitInfoMap *atomsSettingBits,
|
|
T &res) {
|
|
unsigned int nAtoms = mol.getNumAtoms();
|
|
bool owner = false;
|
|
if (!invariants) {
|
|
invariants = new std::vector<uint32_t>(nAtoms);
|
|
owner = true;
|
|
getConnectivityInvariants(mol, *invariants);
|
|
}
|
|
// Make a copy of the invariants:
|
|
std::vector<uint32_t> invariantCpy(nAtoms);
|
|
std::copy(invariants->begin(), invariants->end(), invariantCpy.begin());
|
|
|
|
// add the round 0 invariants to the result:
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
if (!fromAtoms ||
|
|
std::find(fromAtoms->begin(), fromAtoms->end(), i) !=
|
|
fromAtoms->end()) {
|
|
if (!onlyNonzeroInvariants || (*invariants)[i]) {
|
|
uint32_t bit = updateElement(res, (*invariants)[i], useCounts);
|
|
if (atomsSettingBits)
|
|
(*atomsSettingBits)[bit].push_back(std::make_pair(i, 0));
|
|
}
|
|
}
|
|
}
|
|
|
|
boost::dynamic_bitset<> chiralAtoms(nAtoms);
|
|
|
|
// these are the neighborhoods that have already been added
|
|
// to the fingerprint
|
|
std::vector<boost::dynamic_bitset<> > neighborhoods;
|
|
// these are the environments around each atom:
|
|
std::vector<boost::dynamic_bitset<> > atomNeighborhoods(
|
|
nAtoms, boost::dynamic_bitset<>(mol.getNumBonds()));
|
|
boost::dynamic_bitset<> deadAtoms(nAtoms);
|
|
|
|
boost::dynamic_bitset<> includeAtoms(nAtoms);
|
|
if (fromAtoms) {
|
|
BOOST_FOREACH (uint32_t idx, *fromAtoms) { includeAtoms.set(idx, 1); }
|
|
} else {
|
|
includeAtoms.set();
|
|
}
|
|
|
|
std::vector<unsigned int> atomOrder(nAtoms);
|
|
if (onlyNonzeroInvariants) {
|
|
std::vector<std::pair<int32_t, uint32_t> > ordering;
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
if (!(*invariants)[i])
|
|
ordering.push_back(std::make_pair(1, i));
|
|
else
|
|
ordering.push_back(std::make_pair(0, i));
|
|
}
|
|
std::sort(ordering.begin(), ordering.end());
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
atomOrder[i] = ordering[i].second;
|
|
}
|
|
} else {
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
atomOrder[i] = i;
|
|
}
|
|
}
|
|
// now do our subsequent rounds:
|
|
for (unsigned int layer = 0; layer < radius; ++layer) {
|
|
std::vector<uint32_t> roundInvariants(nAtoms);
|
|
std::vector<boost::dynamic_bitset<> > roundAtomNeighborhoods =
|
|
atomNeighborhoods;
|
|
std::vector<AccumTuple> neighborhoodsThisRound;
|
|
|
|
BOOST_FOREACH (unsigned int atomIdx, atomOrder) {
|
|
if (!deadAtoms[atomIdx]) {
|
|
const Atom *tAtom = mol.getAtomWithIdx(atomIdx);
|
|
if (!tAtom->getDegree()) {
|
|
deadAtoms.set(atomIdx, 1);
|
|
continue;
|
|
}
|
|
std::vector<std::pair<int32_t, uint32_t> > nbrs;
|
|
ROMol::OEDGE_ITER beg, end;
|
|
boost::tie(beg, end) = mol.getAtomBonds(tAtom);
|
|
while (beg != end) {
|
|
const Bond* bond = mol[*beg];
|
|
roundAtomNeighborhoods[atomIdx][bond->getIdx()] = 1;
|
|
|
|
unsigned int oIdx = bond->getOtherAtomIdx(atomIdx);
|
|
roundAtomNeighborhoods[atomIdx] |= atomNeighborhoods[oIdx];
|
|
|
|
int32_t bt = 1;
|
|
if (useBondTypes) {
|
|
if (!useChirality || bond->getBondType() != Bond::DOUBLE ||
|
|
bond->getStereo() == Bond::STEREONONE) {
|
|
bt = static_cast<int32_t>(bond->getBondType());
|
|
} else {
|
|
const int32_t stereoOffset = 100;
|
|
const int32_t bondTypeOffset = 10;
|
|
bt = stereoOffset +
|
|
bondTypeOffset * static_cast<int32_t>(bond->getBondType()) +
|
|
static_cast<int32_t>(bond->getStereo());
|
|
}
|
|
}
|
|
nbrs.push_back(std::make_pair(bt, (*invariants)[oIdx]));
|
|
|
|
++beg;
|
|
}
|
|
|
|
// sort the neighbor list:
|
|
std::sort(nbrs.begin(), nbrs.end());
|
|
// and now calculate the new invariant and test if the atom is newly
|
|
// "chiral"
|
|
boost::uint32_t invar = layer;
|
|
gboost::hash_combine(invar, (*invariants)[atomIdx]);
|
|
bool looksChiral = (tAtom->getChiralTag() != Atom::CHI_UNSPECIFIED);
|
|
for (std::vector<std::pair<int32_t, uint32_t> >::const_iterator it =
|
|
nbrs.begin();
|
|
it != nbrs.end(); ++it) {
|
|
// add the contribution to the new invariant:
|
|
gboost::hash_combine(invar, *it);
|
|
|
|
// std::cerr<<" "<<atomIdx<<": "<<it->first<<" "<<it->second<<" ->
|
|
// "<<invar<<std::endl;
|
|
|
|
// update our "chirality":
|
|
if (useChirality && looksChiral && chiralAtoms[atomIdx]) {
|
|
if (it->first != static_cast<int32_t>(Bond::SINGLE)) {
|
|
looksChiral = false;
|
|
} else if (it != nbrs.begin() && it->second == (it - 1)->second) {
|
|
looksChiral = false;
|
|
}
|
|
}
|
|
}
|
|
if (useChirality && looksChiral) {
|
|
chiralAtoms[atomIdx] = 1;
|
|
// add an extra value to the invariant to reflect chirality:
|
|
std::string cip = "";
|
|
tAtom->getPropIfPresent(common_properties::_CIPCode, cip);
|
|
if (cip == "R") {
|
|
gboost::hash_combine(invar, 3);
|
|
} else if (cip == "S") {
|
|
gboost::hash_combine(invar, 2);
|
|
} else {
|
|
gboost::hash_combine(invar, 1);
|
|
}
|
|
}
|
|
roundInvariants[atomIdx] = static_cast<uint32_t>(invar);
|
|
neighborhoodsThisRound.push_back(
|
|
boost::make_tuple(roundAtomNeighborhoods[atomIdx],
|
|
static_cast<uint32_t>(invar), atomIdx));
|
|
if (std::find(neighborhoods.begin(), neighborhoods.end(),
|
|
roundAtomNeighborhoods[atomIdx]) != neighborhoods.end()) {
|
|
// we have seen this exact environment before, this atom
|
|
// is now out of consideration:
|
|
deadAtoms[atomIdx] = 1;
|
|
// std::cerr<<" atom: "<< atomIdx <<" is dead."<<std::endl;
|
|
}
|
|
}
|
|
}
|
|
std::sort(neighborhoodsThisRound.begin(), neighborhoodsThisRound.end());
|
|
for (std::vector<AccumTuple>::const_iterator iter =
|
|
neighborhoodsThisRound.begin();
|
|
iter != neighborhoodsThisRound.end(); ++iter) {
|
|
// if we haven't seen this exact environment before, update the
|
|
// fingerprint:
|
|
if (std::find(neighborhoods.begin(), neighborhoods.end(),
|
|
iter->get<0>()) == neighborhoods.end()) {
|
|
if (!onlyNonzeroInvariants || invariantCpy[iter->get<2>()]) {
|
|
if (includeAtoms[iter->get<2>()]) {
|
|
uint32_t bit = updateElement(res, iter->get<1>(), useCounts);
|
|
if (atomsSettingBits)
|
|
(*atomsSettingBits)[bit].push_back(
|
|
std::make_pair(iter->get<2>(), layer + 1));
|
|
}
|
|
if (!fromAtoms ||
|
|
std::find(fromAtoms->begin(), fromAtoms->end(), iter->get<2>()) !=
|
|
fromAtoms->end()) {
|
|
neighborhoods.push_back(iter->get<0>());
|
|
}
|
|
}
|
|
// std::cerr<<" layer: "<<layer<<" atom: "<<iter->get<2>()<<" "
|
|
// <<iter->get<0>()<< " " << iter->get<1>() << " " <<
|
|
// deadAtoms[iter->get<2>()]<<std::endl;
|
|
} else {
|
|
// we have seen this exact environment before, this atom
|
|
// is now out of consideration:
|
|
// std::cerr<<" atom: "<< iter->get<2>()<<" is dead."<<std::endl;
|
|
deadAtoms[iter->get<2>()] = 1;
|
|
}
|
|
}
|
|
|
|
// the invariants from this round become the global invariants:
|
|
std::copy(roundInvariants.begin(), roundInvariants.end(),
|
|
invariants->begin());
|
|
|
|
atomNeighborhoods = roundAtomNeighborhoods;
|
|
}
|
|
|
|
if (owner) delete invariants;
|
|
}
|
|
|
|
SparseIntVect<uint32_t> *getFingerprint(const ROMol &mol, unsigned int radius,
|
|
std::vector<uint32_t> *invariants,
|
|
const std::vector<uint32_t> *fromAtoms,
|
|
bool useChirality, bool useBondTypes,
|
|
bool useCounts,
|
|
bool onlyNonzeroInvariants,
|
|
BitInfoMap *atomsSettingBits) {
|
|
SparseIntVect<uint32_t> *res;
|
|
res = new SparseIntVect<uint32_t>(std::numeric_limits<uint32_t>::max());
|
|
calcFingerprint(mol, radius, invariants, fromAtoms, useChirality,
|
|
useBondTypes, useCounts, onlyNonzeroInvariants,
|
|
atomsSettingBits, *res);
|
|
return res;
|
|
}
|
|
SparseIntVect<uint32_t> *getHashedFingerprint(
|
|
const ROMol &mol, unsigned int radius, unsigned int nBits,
|
|
std::vector<uint32_t> *invariants, const std::vector<uint32_t> *fromAtoms,
|
|
bool useChirality, bool useBondTypes, bool onlyNonzeroInvariants,
|
|
BitInfoMap *atomsSettingBits) {
|
|
SparseIntVect<uint32_t> *res;
|
|
res = new SparseIntVect<uint32_t>(nBits);
|
|
calcFingerprint(mol, radius, invariants, fromAtoms, useChirality,
|
|
useBondTypes, true, onlyNonzeroInvariants, atomsSettingBits,
|
|
*res);
|
|
return res;
|
|
}
|
|
|
|
ExplicitBitVect *getFingerprintAsBitVect(const ROMol &mol, unsigned int radius,
|
|
unsigned int nBits,
|
|
std::vector<uint32_t> *invariants,
|
|
const std::vector<uint32_t> *fromAtoms,
|
|
bool useChirality, bool useBondTypes,
|
|
bool onlyNonzeroInvariants,
|
|
BitInfoMap *atomsSettingBits) {
|
|
auto *res = new ExplicitBitVect(nBits);
|
|
calcFingerprint(mol, radius, invariants, fromAtoms, useChirality,
|
|
useBondTypes, false, onlyNonzeroInvariants, atomsSettingBits,
|
|
*res);
|
|
return res;
|
|
}
|
|
|
|
} // end of namespace MorganFingerprints
|
|
} // end of namespace RDKit
|