Files
rdkit/Code/GraphMol/GaussianShape/ShapeInput.cpp
David Cosgrove 9f551aedbe Multi conf gaussian shape (#9265)
* First import of GaussianShape.

* Tidying.

* Custom features.

* Optimise.

* Optimise.

* Return 3 scores rather than 2 including combo score.

* Rename useFeatures to useColors.

* Python wrappers.

* Python tests.

* Take out big test.

* Add new start mode, as PubChem does it.

* Doh!

* Fix MolTransforms eigenvalue return.

* Two cycle optimisation, mostly working.

* Take out bestSoFar score from SCA.

* Take out DTYPE.

* Tidy out redundant variables.

* Optimisation in 2 parts.

* More fiddling in pursuit of speed.

* Update Python wrapper.

* Tweak.

* Atom subsets and different radii.

* Fix test.

* Revert pubchem_shape's test.cpp.

* Serialize ShapeInput.

* Trigger build

* Remove pointers to std::arrays in ShapeInput.

* ShapeInput virtual d'tor.

* Precondition - ShapeInput needs a molecule with at least 1 conformer.

* Rename ShapeInput::d_centroid to ShapeInput::d_canonTrans.

* Fix normalization bugs.

* Select start mode using moments of inertia rather than eigenvalues of canonical transformation.

* Include color features in moments of inertia.

* Smidge faster.

* Tversky similarity.

* Tidy tests.

* Tests working on Linux.

* Revert force of right handed axes in MolTransforms::computePrincipalAxesAndMomentsFromGyrationMatrix replacing with a comment in the code.

* Response to review.

* Sneaky allCarbon bug.

* add multithreaded test

* Response to review.

* Doh! Don't recalculate normalization after every transformation.

* Re-instate d_normalizationOK.

* Re-name functions for fetching canonical transformations.

* Separate alpha from coords.

* MultiConf works with single conf extraction.

* Extract all conformations.
Max and best similarities.

* Renames d_currConformer to d_activeShape.

* Update shapeToMol.

* Update shapeToMol.

* Changes from synthon shape searching.

* Fix normalization of multiple confs.

* Update Python wrappers.

* Fix shape merge.

* Improve bestSimilarity.

* Fix python wrapper.

* Pull in changes from SynthonShapeSearch:
make pruneShapes public.
function to negate Alpha values.

* clang-tidy suggestions.

* clang-tidy suggestions.

* Bug in quaternion gradients - we now have only 3 coordinates.

* Tidy tests.

* Mac result slightly different.

* Multi conformer molecule alignment.

* Optionally return raw overlap volumes in score functions.

* Python wrappers for raw overlap volumes.

* Update Python wrapper ShapeInputOptions.

* Tidy for PR.

* Extra include file.

* Extra library

* Tidy forward declarations.

* Don't prune if threshold < 0.0.

* Windows exporty thing.

* Check SMILES on merge of ShapeInputs.

* PRECONDITION of SMILES on merge of ShapeInputs.

* Response to review - rename some functions.

* change how overlapVols is passed
add a test for it

* API suggestions

* Response to review.

* Remove debugging writes.

* Fix Python wrappers.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
Co-authored-by: greg landrum <greg.landrum@gmail.com>
2026-06-03 06:09:09 +02:00

1048 lines
38 KiB
C++

//
// Copyright (C) 2026 David Cosgrove and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
// Original author: David Cosgrove (CozChemIx Limited)
//
#include <Geometry/point.h>
#include <Geometry/Transform3D.h>
#include <GraphMol/MolOps.h>
#include <GraphMol/GaussianShape/GaussianShape.h>
#include <GraphMol/GaussianShape/ShapeInput.h>
#include <GraphMol/GaussianShape/SingleConformerAlignment.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <SimDivPickers/DistPicker.h>
#include <SimDivPickers/LeaderPicker.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/flyweight.hpp>
#include <boost/flyweight/key_value.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <RDGeneral/BoostEndInclude.h>
#ifdef RDK_HAS_EIGEN3
#include <Eigen/Dense>
#endif
namespace RDKit {
namespace GaussianShape {
// Bondi radii
// You can find more of these in Table 12 of this publication:
// https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3658832/
const std::map<unsigned int, double> vdw_radii = {
{0, DUMMY_RAD}, // Dummy, same as Xe.
{1, 1.10}, // H
{2, 1.40}, // He
{3, 1.81}, // Li
{4, 1.53}, // Be
{5, 1.92}, // B
{6, CARBON_RAD}, // C
{7, 1.55}, // N
{8, 1.52}, // O
{9, 1.47}, // F
{10, 1.54}, // Ne
{11, 2.27}, // Na
{12, 1.73}, // Mg
{13, 1.84}, // Al
{14, 2.10}, // Si
{15, 1.80}, // P
{16, 1.80}, // S
{17, 1.75}, // Cl
{18, 1.88}, // Ar
{19, 2.75}, // K
{20, 2.31}, // Ca
{31, 1.87}, // Ga
{32, 2.11}, // Ge
{33, 1.85}, // As
{34, 1.90}, // Se
{35, 1.83}, // Br
{36, 2.02}, // Kr
{37, 3.03}, // Rb
{38, 2.49}, // Sr
{49, 1.93}, // In
{50, 2.17}, // Sn
{51, 2.06}, // Sb
{52, 2.06}, // Te
{53, 1.98}, // I
{54, 2.16}, // Xe
{55, 3.43}, // Cs
{56, 2.68}, // Ba
{81, 1.96}, // Tl
{82, 2.02}, // Pb
{83, 2.07}, // Bi
{84, 1.97}, // Po
{85, 2.02}, // At
{86, 2.20}, // Rn
{87, 3.48}, // Fr
{88, 2.83}, // Ra
};
constexpr double radius_color =
1.08265; // same radius for all feature/color "atoms", as used by the
// PubChem code.
namespace {
// Throw out any atoms we don't want. copySubsetMol does more work than
// is necessary for this and seemed to leave the molecule in an odd state
// that didn't play well with extractAtoms.
std::unique_ptr<RWMol> extractSubset(
const RWMol &mol, const std::vector<unsigned int> &atomsToKeep) {
auto retMol = std::make_unique<RWMol>(mol);
boost::dynamic_bitset<> keepAtoms(mol.getNumAtoms());
for (const auto atk : atomsToKeep) {
keepAtoms[atk] = true;
}
retMol->beginBatchEdit();
for (const auto atom : retMol->atoms()) {
if (!keepAtoms[atom->getIdx()]) {
retMol->removeAtom(atom->getIdx());
}
}
retMol->commitBatchEdit();
// Fix the aromatic flags on anything that is no longer in a ring.
MolOps::findSSSR(*retMol);
const auto rings = retMol->getRingInfo();
for (const auto a : retMol->atoms()) {
if (a->getIsAromatic() && !rings->numAtomRings(a->getIdx())) {
a->setIsAromatic(false);
}
}
return retMol;
}
} // namespace
ShapeInput::ShapeInput(const ROMol &mol, const int confId,
const ShapeInputOptions &opts,
const ShapeOverlayOptions &overlayOpts) {
PRECONDITION(mol.getNumConformers() > 0,
"ShapeInput object needs the molecule to have conformers. " +
mol.getProp<std::string>("_Name") + " " + MolToSmiles(mol));
// std::cout << "making shape from " << MolToSmiles(mol) << " with "
// << mol.getNumConformers() << " confs" << std::endl;
std::unique_ptr<RWMol> tmpMol;
// Subsetting the molecule makes any bespoke atom radii, identified
// by atom index, incorrect so stash them as atom properties.
for (const auto &[idx, radius] : opts.atomRadii) {
const auto atom = mol.getAtomWithIdx(idx);
atom->setProp<double>("BespokeRadius", radius);
}
if (!opts.atomSubset.empty()) {
tmpMol = extractSubset(mol, opts.atomSubset);
} else {
tmpMol.reset(new RWMol(mol));
}
d_smiles = MolToSmiles(*tmpMol);
// std::cout << "shape smiles : " << d_smiles << std::endl;
std::vector<unsigned int> atOrder;
tmpMol->getProp(common_properties::_smilesAtomOutputOrder, atOrder);
tmpMol.reset(dynamic_cast<RWMol *>(MolOps::renumberAtoms(*tmpMol, atOrder)));
if (!tmpMol->getRingInfo()->isInitialized()) {
// Query molecules don't seem to have the ring info generated on creation.
MolOps::findSSSR(*tmpMol);
}
d_normalizeds = boost::dynamic_bitset<>(tmpMol->getNumConformers());
d_normalizationOKs = boost::dynamic_bitset<>(tmpMol->getNumConformers());
// Molecules don't necessarily have consecutive conformer IDs from 0
// upwards. confNum is just the sequential number of the iterated
// conformers.
auto processConf = [&](const Conformer &conf, const unsigned int confNum,
const bool fa) {
extractAtoms(conf, opts, fa);
if (opts.useColors) {
extractFeatures(conf, confNum, opts, fa);
}
};
if (confId >= 0) {
processConf(tmpMol->getConformer(confId), 0, true);
} else {
unsigned int i = 0;
for (auto cfi = tmpMol->beginConformers(); cfi != tmpMol->endConformers();
++cfi, ++i) {
processConf(*(*cfi), i, i == 0);
}
}
d_activeShape = 0;
calcNormalization();
calcExtremes();
calculateSelfOverlaps(overlayOpts);
if (opts.shapePruneThreshold > 0.0 && tmpMol->getNumConformers() > 1) {
pruneShapes(opts.shapePruneThreshold);
}
if (opts.sortShapes) {
sortShapesByVolumes();
}
d_activeShape = 0;
}
ShapeInput::ShapeInput(const ShapeInput &other, const unsigned int shapeNum) {
PRECONDITION(
shapeNum < other.getNumShapes(),
"Invalid shape number in makeSingleShape : " + std::to_string(shapeNum) +
" vs " + std::to_string(other.getNumShapes()) + ".");
d_activeShape = 0;
d_coords.emplace_back(other.d_coords[shapeNum]);
d_alphas = other.d_alphas;
d_types = other.d_types;
d_numAtoms = other.d_numAtoms;
d_numFeats = other.d_numFeats;
d_selfOverlapShapeVols.emplace_back(other.d_selfOverlapShapeVols[shapeNum]);
d_selfOverlapColorVols.emplace_back(other.d_selfOverlapColorVols[shapeNum]);
d_extremePointss.emplace_back(other.d_extremePointss[shapeNum]);
if (other.d_carbonRadii) {
d_carbonRadii.reset(new boost::dynamic_bitset<>(*other.d_carbonRadii));
}
d_smiles = other.d_smiles;
d_normalizeds = boost::dynamic_bitset<>(1);
d_normalizeds[0] = other.d_normalizeds[shapeNum];
d_normalizationOKs = boost::dynamic_bitset<>(1);
d_normalizationOKs[0] = other.d_normalizationOKs[shapeNum];
d_canonRots.emplace_back(other.d_canonRots[shapeNum]);
d_canonTranss.emplace_back(other.d_canonTranss[shapeNum]);
d_eigenValuess.emplace_back(other.d_eigenValuess[shapeNum]);
}
ShapeInput::ShapeInput(const ShapeInput &other)
: d_activeShape(other.d_activeShape),
d_coords(other.d_coords),
d_alphas(other.d_alphas),
d_types(other.d_types),
d_numAtoms(other.d_numAtoms),
d_numFeats(other.d_numFeats),
d_selfOverlapShapeVols(other.d_selfOverlapShapeVols),
d_selfOverlapColorVols(other.d_selfOverlapColorVols),
d_extremePointss(other.d_extremePointss),
d_smiles(other.d_smiles),
d_normalizeds(other.d_normalizeds),
d_normalizationOKs(other.d_normalizationOKs),
d_canonRots(other.d_canonRots),
d_canonTranss(other.d_canonTranss),
d_eigenValuess(other.d_eigenValuess) {
if (other.d_carbonRadii) {
d_carbonRadii.reset(new boost::dynamic_bitset<>(*other.d_carbonRadii));
}
}
ShapeInput &ShapeInput::operator=(const ShapeInput &other) {
if (this == &other) {
return *this;
}
d_activeShape = other.d_activeShape;
d_coords = other.d_coords;
d_alphas = other.d_alphas;
d_types = other.d_types;
d_numAtoms = other.d_numAtoms;
d_numFeats = other.d_numFeats;
d_selfOverlapShapeVols = other.d_selfOverlapShapeVols;
d_selfOverlapColorVols = other.d_selfOverlapColorVols;
d_extremePointss = other.d_extremePointss;
d_smiles = other.d_smiles;
d_normalizeds = other.d_normalizeds;
d_normalizationOKs = other.d_normalizationOKs;
d_canonRots = other.d_canonRots;
d_canonTranss = other.d_canonTranss;
d_eigenValuess = other.d_eigenValuess;
if (other.d_carbonRadii) {
d_carbonRadii.reset(new boost::dynamic_bitset<>(*other.d_carbonRadii));
} else {
d_carbonRadii.reset();
}
return *this;
}
void ShapeInput::merge(ShapeInput &other) {
PRECONDITION(d_smiles == other.d_smiles,
"Shapes have different SMILES strings.");
if (!d_coords.empty() &&
d_coords.front().size() != other.d_coords.front().size()) {
BOOST_LOG(rdWarningLog) << "Can't merge shapes as different sizes.\n";
return;
}
if (other.d_coords.empty()) {
return;
}
d_coords.reserve(d_coords.size() + other.d_coords.size());
d_coords.insert(d_coords.end(),
std::make_move_iterator(other.d_coords.begin()),
std::make_move_iterator(other.d_coords.end()));
d_selfOverlapShapeVols.reserve(d_selfOverlapShapeVols.size() +
other.d_selfOverlapShapeVols.size());
d_selfOverlapShapeVols.insert(
d_selfOverlapShapeVols.end(),
std::make_move_iterator(other.d_selfOverlapShapeVols.begin()),
std::make_move_iterator(other.d_selfOverlapShapeVols.end()));
d_selfOverlapColorVols.reserve(d_selfOverlapColorVols.size() +
other.d_selfOverlapColorVols.size());
d_selfOverlapColorVols.insert(
d_selfOverlapColorVols.end(),
std::make_move_iterator(other.d_selfOverlapColorVols.begin()),
std::make_move_iterator(other.d_selfOverlapColorVols.end()));
d_extremePointss.reserve(d_extremePointss.size() +
other.d_extremePointss.size());
d_extremePointss.insert(
d_extremePointss.end(),
std::make_move_iterator(other.d_extremePointss.begin()),
std::make_move_iterator(other.d_extremePointss.end()));
d_canonRots.reserve(d_canonRots.size() + other.d_canonRots.size());
d_canonRots.insert(d_canonRots.end(),
std::make_move_iterator(other.d_canonRots.begin()),
std::make_move_iterator(other.d_canonRots.end()));
d_canonTranss.reserve(d_canonTranss.size() + other.d_canonTranss.size());
d_canonTranss.insert(d_canonTranss.end(),
std::make_move_iterator(other.d_canonTranss.begin()),
std::make_move_iterator(other.d_canonTranss.end()));
d_eigenValuess.reserve(d_eigenValuess.size() + other.d_eigenValuess.size());
d_eigenValuess.insert(d_eigenValuess.end(),
std::make_move_iterator(other.d_eigenValuess.begin()),
std::make_move_iterator(other.d_eigenValuess.end()));
auto joinBS =
[](const boost::dynamic_bitset<> &bs1,
const boost::dynamic_bitset<> &bs2) -> boost::dynamic_bitset<> {
boost::dynamic_bitset<> bs1Copy(bs1);
boost::dynamic_bitset<> bs2Copy(bs2);
const size_t totalSize = bs1.size() + bs2.size();
bs1Copy.resize(totalSize);
bs2Copy.resize(totalSize);
bs1Copy <<= bs2.size();
bs1Copy |= bs2Copy;
return bs1Copy;
};
d_normalizeds = joinBS(other.d_normalizeds, d_normalizeds);
other.d_normalizeds.clear();
d_normalizationOKs = joinBS(other.d_normalizationOKs, d_normalizationOKs);
other.d_normalizationOKs.clear();
other.d_coords.clear();
other.d_selfOverlapShapeVols.clear();
other.d_selfOverlapColorVols.clear();
other.d_extremePointss.clear();
other.d_canonRots.clear();
other.d_canonTranss.clear();
other.d_eigenValuess.clear();
}
void ShapeInput::setActiveShape(const unsigned int newShape) {
PRECONDITION(newShape < d_coords.size(),
"Invalid shape number (" + std::to_string(newShape) + " vs " +
std::to_string(d_coords.size()) + ").");
if (d_activeShape != newShape) {
d_activeShape = newShape;
}
}
void ShapeInput::negateAlpha(const unsigned int alphaNum) {
PRECONDITION(alphaNum < d_alphas.size(), "Bad number for alpha negation");
d_alphas[alphaNum] *= -1.0;
}
std::vector<RDGeom::Point3D> ShapeInput::getAtomPoints(
const bool includeColors) const {
std::vector<RDGeom::Point3D> atomPoints;
unsigned int numPoints = getNumAtoms();
if (includeColors) {
numPoints += getNumFeatures();
}
atomPoints.reserve(numPoints);
for (unsigned int i = 0; i < 3 * numPoints; i += 3) {
atomPoints.emplace_back(RDGeom::Point3D(d_coords[d_activeShape][i],
d_coords[d_activeShape][i + 1],
d_coords[d_activeShape][i + 2]));
}
return atomPoints;
}
double ShapeInput::getShapeVolume(const unsigned int shapeNum) const {
PRECONDITION(shapeNum < d_coords.size(),
"Invalid shape number (" + std::to_string(shapeNum) + " vs " +
std::to_string(d_coords.size()) + ").");
return d_selfOverlapShapeVols[shapeNum];
}
double ShapeInput::getColorVolume(const unsigned int shapeNum) const {
PRECONDITION(shapeNum < d_coords.size(),
"Invalid shape number (" + std::to_string(shapeNum) + " vs " +
std::to_string(d_coords.size()) + ").");
return d_selfOverlapColorVols[shapeNum];
}
const std::array<double, 9> &ShapeInput::calcCanonicalRotation() {
if (!d_normalizationOKs[d_activeShape]) {
calcNormalization();
}
return d_canonRots[d_activeShape];
}
const std::array<double, 3> &ShapeInput::calcCanonicalTranslation() {
if (!d_normalizationOKs[d_activeShape]) {
calcNormalization();
}
return d_canonTranss[d_activeShape];
}
const std::array<double, 3> &ShapeInput::calcEigenValues() {
if (!d_normalizationOKs[d_activeShape]) {
calcNormalization();
}
return d_eigenValuess[d_activeShape];
}
const std::array<size_t, 6> &ShapeInput::calcExtremes() {
if (!d_normalizationOKs[d_activeShape]) {
calculateExtremes();
}
return d_extremePointss[d_activeShape];
}
std::array<double, 3> ShapeInput::calcMomentsOfInertia(
const bool includeColors) const {
const auto tmpMol = shapeToMol(includeColors);
std::array<double, 3> eVals;
#if RDK_HAS_EIGEN3
Eigen::Matrix3d axes;
Eigen::Vector3d moments;
MolTransforms::computePrincipalAxesAndMoments(tmpMol->getConformer(), axes,
moments);
eVals[0] = moments[0];
eVals[1] = moments[1];
eVals[2] = moments[2];
#else
std::unique_ptr<RDGeom::Transform3D> canonXform(
MolTransforms::computeCanonicalTransform(tmpMol->getConformer(), nullptr,
false, true, eVals.data()));
#endif
return eVals;
}
void ShapeInput::normalizeCoords() {
if (d_normalizeds[d_activeShape]) {
return;
}
if (!d_normalizationOKs[d_activeShape]) {
calcNormalization();
}
RDGeom::Transform3D canonRot;
for (unsigned int i = 0, k = 0; i < 3; ++i) {
for (unsigned int j = 0; j < 3; ++j, ++k) {
canonRot.setValUnchecked(i, j, d_canonRots[d_activeShape][k]);
}
}
RDGeom::Point3D trans{d_canonTranss[d_activeShape][0],
d_canonTranss[d_activeShape][1],
d_canonTranss[d_activeShape][2]};
canonRot.TransformPoint(trans);
canonRot.SetTranslation(trans);
transformCoords(canonRot);
d_normalizeds[d_activeShape] = true;
// Recalculate the extremes now we've changed the coordinates.
calcExtremes();
}
void ShapeInput::transformCoords(RDGeom::Transform3D &xform) {
applyTransformToShape(d_coords[d_activeShape], xform);
d_normalizeds[d_activeShape] = false;
d_normalizationOKs[d_activeShape] = false;
}
std::unique_ptr<RWMol> ShapeInput::shapeToMol(const bool includeColors,
const bool withBonds) const {
std::unique_ptr<RWMol> mol;
if (withBonds) {
// The SMILES string and the atom coordinates should be in the same
// order.
v2::SmilesParse::SmilesParserParams params;
params.sanitize = false;
mol = v2::SmilesParse::MolFromSmiles(d_smiles, params);
} else {
mol.reset(new RWMol());
for (unsigned int i = 0; i < getNumAtoms(); i++) {
const auto atom = new Atom(6);
mol->addAtom(atom, true, true);
}
}
if (includeColors) {
for (unsigned int i = 0; i < getNumFeatures(); i++) {
const auto atom = new Atom(54);
mol->addAtom(atom, true, true);
}
}
unsigned int num = getNumAtoms();
if (includeColors) {
num += getNumFeatures();
}
const auto conf = new Conformer(num);
const auto &shapeCds = getCoords();
for (unsigned int i = 0; i < num; i++) {
auto &pos = conf->getAtomPos(i);
pos.x = shapeCds[3 * i];
pos.y = shapeCds[3 * i + 1];
pos.z = shapeCds[3 * i + 2];
}
mol->addConformer(conf, true);
return mol;
}
std::array<double, 3> ShapeInput::bestSimilarity(
const ShapeInput &fitShape, unsigned int &bestThisShape,
unsigned int &bestFitShape, RDGeom::Transform3D &bestXform,
const double threshold, const ShapeOverlayOptions &overlayOpts) {
bestThisShape = -1;
bestFitShape = -1;
std::array<double, 3> bestSim{-1.0, -1.0, -1.0};
if (maxPossibleSimilarity(fitShape) < threshold) {
return bestSim;
}
RDGeom::Transform3D xform;
const auto currActiveShape = getActiveShape();
for (size_t i = 0; i < getNumShapes(); i++) {
setActiveShape(i);
for (size_t j = 0; j < fitShape.getNumShapes(); j++) {
const auto maxSim =
maxScore(getShapeVolume(), fitShape.getShapeVolume(j),
getColorVolume(), fitShape.getColorVolume(j), overlayOpts);
if (maxSim > threshold) {
ShapeInput singleFitShape(fitShape, j);
auto scores = AlignShape(*this, singleFitShape, &xform, overlayOpts);
if (scores[0] > bestSim[0]) {
bestSim = scores;
bestThisShape = i;
bestFitShape = j;
bestXform = xform;
}
}
// Floating point cruft means we sometimes get a similarity slightly
// above 1.0. 1.0 is the maximum possible, so stop if we hit it.
if (bestSim[0] > 1.0 || fabs(bestSim[0] - 1.0) < 1.0e-6) {
setActiveShape(currActiveShape);
return bestSim;
}
}
}
setActiveShape(currActiveShape);
return bestSim;
}
double ShapeInput::maxPossibleSimilarity(
const ShapeInput &fitShape, const ShapeOverlayOptions &overlayOpts) const {
// The best score achievable is when the smaller volume is entirely inside
// the larger volume.
double maxSim = 0.0;
for (unsigned int i = 0; i < getNumShapes(); i++) {
for (unsigned int j = 0; j < fitShape.getNumShapes(); j++) {
const auto sim = maxScore(fitShape.d_selfOverlapShapeVols[j],
d_selfOverlapShapeVols[i],
fitShape.d_selfOverlapColorVols[j],
d_selfOverlapColorVols[i], overlayOpts);
if (sim > maxSim) {
maxSim = sim;
}
}
}
return maxSim;
}
void ShapeInput::pruneShapes(const double simThreshold) {
if (d_coords.size() < 2 || simThreshold < 0.0) {
return;
}
class DistFunctor {
public:
explicit DistFunctor(const ShapeInput &shapes)
: d_shapes(shapes), d_shapesCp(shapes) {}
~DistFunctor() = default;
double operator()(const unsigned int i, const unsigned int j) {
d_shapes.setActiveShape(i);
d_shapesCp.setActiveShape(j);
const auto scores = AlignShape(d_shapes, d_shapesCp);
// The picker works on distances.
return 1.0 - scores[0];
}
ShapeInput d_shapes;
ShapeInput d_shapesCp;
};
const RDPickers::LeaderPicker leaderPicker;
DistFunctor distFunctor(*this);
// The picker works on distances.
auto picks = leaderPicker.lazyPick(distFunctor, d_coords.size(), 0,
1.0 - simThreshold);
// Allow for mysterious LeaderPicker behaviour where it returns a
// vector of 0s when it should only pick 1 shape.
std::ranges::sort(picks);
auto [first, last] = std::ranges::unique(picks);
picks.erase(first, last);
selectConformations(picks);
d_activeShape = 0;
}
namespace {
double getStandardAtomRadius(const unsigned int atomicNum) {
// Mostly they will be carbons, so just return that without lookup.
if (atomicNum == 6) {
return CARBON_RAD;
}
if (const auto rad = vdw_radii.find(static_cast<unsigned int>(atomicNum));
rad != vdw_radii.end()) {
return rad->second;
}
throw ValueErrorException("No VdW radius for atom with Z=" +
std::to_string(atomicNum));
}
} // namespace
void ShapeInput::extractAtoms(const Conformer &conf,
const ShapeInputOptions &shapeOpts,
const bool fillAlphas) {
if (!shapeOpts.allCarbonRadii) {
d_carbonRadii.reset(new boost::dynamic_bitset<>(
!shapeOpts.atomSubset.empty() ? shapeOpts.atomSubset.size()
: conf.getNumAtoms()));
}
std::vector<double> theseCoords;
theseCoords.reserve(conf.getNumAtoms() * 3);
for (const auto atom : conf.getOwningMol().atoms()) {
// Ignore H atoms except deuterium and tritium which are treated elsewhere
// as explicit atoms, and do use dummies if requested. The latter two
// are set to be ignored in the volume calculation, however. This is
// just to keep the atom numberings correct. Dummy atoms can also have
// isotope number 1.
if (atom->getAtomicNum() != 1 ||
(atom->getAtomicNum() == 1 && atom->getIsotope() > 1)) {
if (!shapeOpts.includeDummies && !atom->getAtomicNum()) {
continue;
}
const auto atIdx = atom->getIdx();
auto &pos = conf.getAtomPos(atIdx);
theseCoords.push_back(pos.x);
theseCoords.push_back(pos.y);
theseCoords.push_back(pos.z);
if (fillAlphas) {
if (shapeOpts.allCarbonRadii && !d_carbonRadii) {
d_alphas.push_back(KAPPA / (CARBON_RAD * CARBON_RAD));
} else {
double rad = 0.0;
if (shapeOpts.atomRadii.empty()) {
if ((shapeOpts.allCarbonRadii && atom->getAtomicNum()) ||
atom->getAtomicNum() == 6) {
(*d_carbonRadii)[atIdx] = true;
}
rad = getStandardAtomRadius(atom->getAtomicNum());
} else {
if (atom->hasProp("BespokeRadius")) {
rad = atom->getProp<double>("BespokeRadius");
} else {
rad = getStandardAtomRadius(atom->getAtomicNum());
if ((shapeOpts.allCarbonRadii && atom->getAtomicNum()) ||
atom->getAtomicNum() == 6) {
(*d_carbonRadii)[atIdx] = true;
}
}
}
// TODO: For now, check there hasn't been a logic error leaving a rad
// of 0.0. Take it out later when it's been tried enough we can be
// confident it's ok.
if (rad == 0.0) {
throw ValueErrorException("Atom has radius 0.0.");
}
d_alphas.push_back(KAPPA / (rad * rad));
}
if (atom->getAtomicNum() == 1) {
// So it's not included in the volume calcs.
d_alphas.back() = -1.0;
}
}
}
}
d_activeShape = d_coords.size();
d_coords.push_back(theseCoords);
if (fillAlphas) {
d_numAtoms = d_coords.front().size() / 3;
d_types.resize(d_numAtoms);
d_numFeats = 0;
}
}
namespace {
class ss_matcher {
public:
explicit ss_matcher(const std::string &pattern) : m_pattern(pattern) {
m_needCopies = pattern.find_first_of("$") != std::string::npos;
m_matcher = SmartsToMol(pattern);
POSTCONDITION(m_matcher, "no matcher");
};
const ROMol *getMatcher() const { return m_matcher; };
~ss_matcher() { delete m_matcher; };
private:
ss_matcher() = default;
std::string m_pattern;
bool m_needCopies{false};
const ROMol *m_matcher{nullptr};
};
} // namespace
// This came from the original PubChemShape.cpp
typedef boost::flyweight<boost::flyweights::key_value<std::string, ss_matcher>,
boost::flyweights::no_tracking>
pattern_flyweight;
// Definitions for feature points adapted from:
// Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
const std::vector<std::vector<std::string>> smartsPatterns = {
{"[$([N;!H0;v3,v4&+1]),\
$([O,S;H1;+0]),\
n&H1&+0]"}, // Donor
{"[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),\
$([O,S;H0;v2]),\
$([O,S;-]),\
$([N;v3;!$(N-*=[O,N,P,S])]),\
n&H0&+0,\
$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]"}, // Acceptor
{
"[r]1[r][r]1",
"[r]1[r][r][r]1",
"[r]1[r][r][r][r]1",
"[r]1[r][r][r][r][r]1",
"[r]1[r][r][r][r][r][r]1",
}, // rings
// "[a]", //
// Aromatic
// "[F,Cl,Br,I]", // Halogen
{"[#7;+,\
$([N;H2&+0][$([C,a]);!$([C,a](=O))]),\
$([N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]),\
$([N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))])]"}, // Basic
{"[$([C,S](=[O,S,P])-[O;H1,-1])]"} // Acidic
};
std::vector<std::vector<const ROMol *>> *getPh4Patterns() {
static std::unique_ptr<std::vector<std::vector<const ROMol *>>> patterns;
if (!patterns) {
patterns.reset(new std::vector<std::vector<const ROMol *>>());
for (const auto &smartsV : smartsPatterns) {
std::vector<const ROMol *> v;
for (const auto &smarts : smartsV) {
const ROMol *matcher = pattern_flyweight(smarts).get().getMatcher();
CHECK_INVARIANT(matcher, "bad smarts");
v.push_back(matcher);
}
patterns->push_back(std::move(v));
}
}
return patterns.get();
}
// Extract the features for the color scores, using RDKit pphore features
// for now. Other options to be added later. This should be working on
// the parent molecule, but only extracting features that are due to
// atoms entirely within any atom subset.
void ShapeInput::extractFeatures(const Conformer &conf, unsigned int confNum,
const ShapeInputOptions &shapeOpts,
const bool fillAlphas) {
std::vector<CustomFeature> feats;
if (shapeOpts.customFeatures.empty()) {
findFeatures(conf, feats);
} else {
PRECONDITION(confNum < shapeOpts.customFeatures.size(),
"Conformer number " + std::to_string(confNum) +
" too large for custom features size " +
std::to_string(shapeOpts.customFeatures.size()));
feats = shapeOpts.customFeatures[confNum];
}
// Just copy them directly except that the last is a radius, so convert
// to alpha.
for (const auto &f : feats) {
const auto &pos = f.pos;
d_coords[d_activeShape].push_back(pos.x);
d_coords[d_activeShape].push_back(pos.y);
d_coords[d_activeShape].push_back(pos.z);
if (fillAlphas) {
const auto rad = f.rad;
d_alphas.push_back(KAPPA / (rad * rad));
d_types.push_back(f.type);
d_numFeats++;
}
}
}
void ShapeInput::calcNormalization() {
if (d_eigenValuess.size() < d_coords.size()) {
d_eigenValuess.resize(d_coords.size());
d_canonRots.resize(d_coords.size());
d_canonTranss.resize(d_coords.size());
d_eigenValuess.resize(d_coords.size());
d_extremePointss.resize(d_coords.size());
}
d_eigenValuess[d_activeShape] = std::array<double, 3>{0.0, 0.0, 0.0};
// Build a "molecule" just of the shape points, not the color features
// with which to calculate the canonical transformation. Doesn't ever
// use the input molecule in case the shape was built from a subset of
// atoms in that molecule.
const auto tmpMol = shapeToMol(false, false);
std::unique_ptr<RDGeom::Transform3D> canonXform(
MolTransforms::computeCanonicalTransform(
tmpMol->getConformer(), nullptr, false, true,
d_eigenValuess[d_activeShape].data()));
d_canonRots[d_activeShape] =
std::array<double, 9>{1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
for (unsigned int i = 0, k = 0; i < 3; ++i) {
for (unsigned int j = 0; j < 3; ++j, ++k) {
d_canonRots[d_activeShape][k] = canonXform->getValUnchecked(i, j);
}
}
d_canonTranss[d_activeShape] = std::array<double, 3>{0.0, 0.0, 0.0};
for (unsigned int i = 0; i < 3 * d_numAtoms; i += 3) {
d_canonTranss[d_activeShape][0] -= d_coords[d_activeShape][i];
d_canonTranss[d_activeShape][1] -= d_coords[d_activeShape][i + 1];
d_canonTranss[d_activeShape][2] -= d_coords[d_activeShape][i + 2];
}
d_canonTranss[d_activeShape][0] /= d_numAtoms;
d_canonTranss[d_activeShape][1] /= d_numAtoms;
d_canonTranss[d_activeShape][2] /= d_numAtoms;
d_normalizationOKs[d_activeShape] = true;
}
void ShapeInput::calculateExtremes() {
if (d_extremePointss.size() < d_coords.size()) {
d_extremePointss.resize(d_coords.size());
}
d_extremePointss[d_activeShape] = std::array<size_t, 6>{0, 0, 0, 0, 0, 0};
const std::vector<double> &theseCoords = d_coords[d_activeShape];
for (size_t i = 0, j = 0; i < theseCoords.size(); i += 3, ++j) {
if (theseCoords[i] < theseCoords[3 * d_extremePointss[d_activeShape][0]]) {
d_extremePointss[d_activeShape][0] = j;
}
if (theseCoords[i] > theseCoords[3 * d_extremePointss[d_activeShape][3]]) {
d_extremePointss[d_activeShape][3] = j;
}
if (theseCoords[i + 1] <
theseCoords[3 * d_extremePointss[d_activeShape][1] + 1]) {
d_extremePointss[d_activeShape][1] = j;
}
if (theseCoords[i + 1] >
theseCoords[3 * d_extremePointss[d_activeShape][4] + 1]) {
d_extremePointss[d_activeShape][4] = j;
}
if (theseCoords[i + 2] <
theseCoords[3 * d_extremePointss[d_activeShape][2] + 2]) {
d_extremePointss[d_activeShape][2] = j;
}
if (theseCoords[i + 2] >
theseCoords[3 * d_extremePointss[d_activeShape][5] + 2]) {
d_extremePointss[d_activeShape][5] = j;
}
}
}
void ShapeInput::selectConformations(const std::vector<int> &picks) {
std::vector<std::vector<double>> newCoords;
newCoords.reserve(picks.size());
std::vector<unsigned int> newMolConfs;
newMolConfs.reserve(picks.size());
std::vector<double> newDummyVols;
newDummyVols.reserve(picks.size());
std::vector<double> newShapeVolumes;
newShapeVolumes.reserve(picks.size());
std::vector<double> newColorVolumes;
newColorVolumes.reserve(picks.size());
std::vector<std::array<size_t, 6>> newExtremePointss;
newExtremePointss.reserve(picks.size());
boost::dynamic_bitset<> newNormalizeds(picks.size());
boost::dynamic_bitset<> newNormalizationOKs(picks.size());
std::vector<std::array<double, 9>> newCanRots;
newCanRots.reserve(picks.size());
std::vector<std::array<double, 3>> newCentroids;
newCentroids.reserve(picks.size());
std::vector<std::array<double, 3>> newEigenValuess;
newEigenValuess.reserve(picks.size());
unsigned int i = 0;
for (const auto p : picks) {
newCoords.push_back(std::move(d_coords[p]));
newShapeVolumes.push_back(d_selfOverlapShapeVols[p]);
newColorVolumes.push_back(d_selfOverlapColorVols[p]);
newExtremePointss.push_back(std::move(d_extremePointss[p]));
newNormalizeds[i] = d_normalizeds[p];
newNormalizationOKs[i] = d_normalizationOKs[p];
newCanRots.push_back(std::move(d_canonRots[p]));
newCentroids.push_back(std::move(d_canonTranss[p]));
newEigenValuess.push_back(std::move(d_eigenValuess[p]));
++i;
}
d_coords = std::move(newCoords);
d_selfOverlapShapeVols = std::move(newShapeVolumes);
d_selfOverlapColorVols = std::move(newColorVolumes);
d_extremePointss = std::move(newExtremePointss);
d_normalizeds = newNormalizeds;
d_normalizationOKs = newNormalizationOKs;
d_canonRots = std::move(newCanRots);
d_canonTranss = std::move(newCentroids);
d_eigenValuess = std::move(newEigenValuess);
}
void ShapeInput::calculateSelfOverlaps(const ShapeOverlayOptions &overlayOpts) {
d_selfOverlapShapeVols.reserve(getNumShapes());
d_selfOverlapColorVols.reserve(getNumShapes());
std::vector<double> gradConverters(12 * (d_numAtoms + d_numFeats));
for (unsigned int i = 0; i < getNumShapes(); i++) {
d_selfOverlapShapeVols.push_back(calcVolAndGrads(
d_coords[i].data(), d_alphas.data(), d_numAtoms, d_carbonRadii.get(),
d_coords[i].data(), d_alphas.data(), d_numAtoms, d_carbonRadii.get(),
gradConverters, overlayOpts.useDistCutoff,
overlayOpts.distCutoff * overlayOpts.distCutoff));
d_selfOverlapColorVols.push_back(calcVolAndGrads(
d_coords[i].data() + 3 * d_numAtoms, d_alphas.data() + d_numAtoms,
d_numFeats, d_types.data() + d_numAtoms,
d_coords[i].data() + 3 * d_numAtoms, d_alphas.data() + d_numAtoms,
d_numFeats, d_types.data() + d_numAtoms, d_numAtoms, gradConverters,
overlayOpts.useDistCutoff,
overlayOpts.distCutoff * overlayOpts.distCutoff, nullptr, nullptr));
}
}
void ShapeInput::sortShapesByVolumes() {
std::vector<std::pair<double, size_t>> vals;
vals.reserve(d_coords.size());
for (size_t i = 0; i < d_coords.size(); i++) {
vals.push_back(std::make_pair(
d_selfOverlapShapeVols[i] + d_selfOverlapColorVols[i], i));
}
std::ranges::sort(vals,
[](const std::pair<double, size_t> &a,
const std::pair<double, size_t> &b) -> bool {
return a.first > b.first;
});
std::vector<int> picks(vals.size());
std::ranges::transform(vals, begin(picks),
[](const std::pair<double, size_t> &a) -> int {
return static_cast<int>(a.second);
});
selectConformations(picks);
}
void findFeatures(const Conformer &conf, std::vector<CustomFeature> &features,
const std::optional<std::vector<unsigned int>> &atomSubset) {
unsigned pattIdx = 1;
for (const auto &patts : *getPh4Patterns()) {
for (const auto &patt : patts) {
std::vector<MatchVectType> matches;
// Once, recursive queries weren't thread safe but Greg assures
// me that they now are.
matches = SubstructMatch(conf.getOwningMol(), *patt);
for (const auto &match : matches) {
std::vector<unsigned int> ats;
bool featOk = true;
for (const auto &pr : match) {
// make sure all the atoms are in the subset, if there is one
if (atomSubset.has_value() && !atomSubset->empty()) {
if (std::ranges::find_if(*atomSubset, [pr](const auto &p) -> bool {
return p == static_cast<unsigned int>(pr.second);
}) == atomSubset->end()) {
featOk = false;
break;
}
}
ats.push_back(pr.second);
}
if (!featOk) {
continue;
}
auto featPos = computeFeaturePos(conf, ats);
features.emplace_back(
CustomFeature{pattIdx, featPos, radius_color, ats});
}
}
++pattIdx;
}
}
RDGeom::Point3D computeFeaturePos(const Conformer &conf,
const std::vector<unsigned int> &ats) {
RDGeom::Point3D featPos;
for (const auto at : ats) {
featPos += conf.getAtomPos(at);
}
featPos /= ats.size();
return featPos;
}
void applyTransformToShape(std::vector<double> &shape,
const RDGeom::Transform3D &xform) {
for (size_t i = 0; i < shape.size(); i += 3) {
RDGeom::Point3D pos{shape[i], shape[i + 1], shape[i + 2]};
xform.TransformPoint(pos);
shape[i] = pos.x;
shape[i + 1] = pos.y;
shape[i + 2] = pos.z;
}
}
void applyTransformToShape(const double *inShape, double *outShape,
const size_t numPoints,
const RDGeom::Transform3D &xform) {
for (size_t i = 0; i < 3 * numPoints; i += 3) {
RDGeom::Point3D pos{inShape[i], inShape[i + 1], inShape[i + 2]};
xform.TransformPoint(pos);
outShape[i] = pos.x;
outShape[i + 1] = pos.y;
outShape[i + 2] = pos.z;
}
}
void translateShape(std::vector<double> &shape,
const RDGeom::Point3D &translation) {
for (size_t i = 0; i < shape.size(); i += 3) {
shape[i] += translation.x;
shape[i + 1] += translation.y;
shape[i + 2] += translation.z;
}
}
void translateShape(const double *inShape, double *outShape,
const size_t numPoints,
const RDGeom::Point3D &translation) {
for (size_t i = 0; i < 3 * numPoints; i += 3) {
outShape[i] = inShape[i] + translation.x;
outShape[i + 1] = inShape[i + 1] + translation.y;
outShape[i + 2] = inShape[i + 2] + translation.z;
}
}
// Maximum possible score of the 2 shape (v[12]) and color (c[12]) volumes
double maxScore(const double v1, const double v2, const double c1,
const double c2, const ShapeOverlayOptions &overlayOpts) {
// We're dealing with a Tversky score
// s = O / (A * (R - O) + B * (F - O) + O)
// There are 2 cases to handle, where v1 < v2 in which case the max overlap
// is v1, and the opposite.
auto maxPart = [](const double p1, const double p2,
const ShapeOverlayOptions &ovlyOpts) -> double {
if (p1 < p2) {
return p1 / (ovlyOpts.simBeta * (p2 - p1) + p1);
}
return p2 / (ovlyOpts.simAlpha * (p1 - p2) + p2);
};
const auto maxSt = maxPart(v1, v2, overlayOpts);
const auto maxCt = maxPart(c1, c2, overlayOpts);
return maxSt * (1.0 - overlayOpts.optParam) + maxCt * overlayOpts.optParam;
}
} // namespace GaussianShape
} // namespace RDKit