mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
* First import of GaussianShape. * Tidying. * Custom features. * Optimise. * Optimise. * Return 3 scores rather than 2 including combo score. * Rename useFeatures to useColors. * Python wrappers. * Python tests. * Take out big test. * Add new start mode, as PubChem does it. * Doh! * Fix MolTransforms eigenvalue return. * Two cycle optimisation, mostly working. * Take out bestSoFar score from SCA. * Take out DTYPE. * Tidy out redundant variables. * Optimisation in 2 parts. * More fiddling in pursuit of speed. * Update Python wrapper. * Tweak. * Atom subsets and different radii. * Fix test. * Revert pubchem_shape's test.cpp. * Serialize ShapeInput. * Trigger build * Remove pointers to std::arrays in ShapeInput. * ShapeInput virtual d'tor. * Precondition - ShapeInput needs a molecule with at least 1 conformer. * Rename ShapeInput::d_centroid to ShapeInput::d_canonTrans. * Fix normalization bugs. * Select start mode using moments of inertia rather than eigenvalues of canonical transformation. * Include color features in moments of inertia. * Smidge faster. * Tversky similarity. * Tidy tests. * Tests working on Linux. * Revert force of right handed axes in MolTransforms::computePrincipalAxesAndMomentsFromGyrationMatrix replacing with a comment in the code. * Response to review. * Sneaky allCarbon bug. * add multithreaded test * Response to review. * Doh! Don't recalculate normalization after every transformation. * Re-instate d_normalizationOK. * Re-name functions for fetching canonical transformations. * Separate alpha from coords. * MultiConf works with single conf extraction. * Extract all conformations. Max and best similarities. * Renames d_currConformer to d_activeShape. * Update shapeToMol. * Update shapeToMol. * Changes from synthon shape searching. * Fix normalization of multiple confs. * Update Python wrappers. * Fix shape merge. * Improve bestSimilarity. * Fix python wrapper. * Pull in changes from SynthonShapeSearch: make pruneShapes public. 
function to negate Alpha values. * clang-tidy suggestions. * clang-tidy suggestions. * Bug in quaternion gradients - we now have only 3 coordinates. * Tidy tests. * Mac result slightly different. * Multi conformer molecule alignment. * Optionally return raw overlap volumes in score functions. * Python wrappers for raw overlap volumes. * Update Python wrapper ShapeInputOptions. * Tidy for PR. * Extra include file. * Extra library * Tidy forward declarations. * Don't prune if threshold < 0.0. * Windows exporty thing. * Check SMILES on merge of ShapeInputs. * PRECONDITION of SMILES on merge of ShapeInputs. * Response to review - rename some functions. * change how overlapVols is passed add a test for it * API suggestions * Response to review. * Remove debugging writes. * Fix Python wrappers. --------- Co-authored-by: David Cosgrove <david@cozchemix.co.uk> Co-authored-by: greg landrum <greg.landrum@gmail.com>
403 lines
16 KiB
C++
403 lines
16 KiB
C++
//
|
|
// Copyright (C) 2026 David Cosgrove and other RDKit contributors
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
// Original author: David Cosgrove (CozChemIx Limited)
|
|
//
|
|
|
|
#ifndef RDKIT_SHAPEINPUT_GUARD
|
|
#define RDKIT_SHAPEINPUT_GUARD
|
|
|
|
#include <array>
|
|
#include <vector>
|
|
|
|
#include <GraphMol/RWMol.h>
|
|
#include <RDGeneral/export.h>
|
|
#include <Geometry/Transform3D.h>
|
|
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#ifdef RDK_USE_BOOST_SERIALIZATION
|
|
#include <boost/archive/text_oarchive.hpp>
|
|
#include <boost/archive/text_iarchive.hpp>
|
|
#include <boost/serialization/vector.hpp>
|
|
#include <boost/serialization/array.hpp>
|
|
#include <boost/serialization/unique_ptr.hpp>
|
|
#endif
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
|
|
#include <GraphMol/GaussianShape/ShapeOverlayOptions.h>
|
|
|
|
// The code below was provided by Claude (Sonnet 4.6).
|
|
// It first tried to get me to use boost/serialization/dynamic_bitset.hpp
|
|
// and then admitted that it had made that up.
|
|
namespace boost {
|
|
namespace serialization {
|
|
|
|
template <class Archive, typename Block, typename Allocator>
|
|
void serialize(Archive &ar, dynamic_bitset<Block, Allocator> &bs,
|
|
const unsigned int /*version*/) {
|
|
size_t num_bits = bs.size();
|
|
ar & num_bits;
|
|
|
|
std::vector<Block> blocks;
|
|
|
|
if (Archive::is_saving::value) {
|
|
to_block_range(bs, std::back_inserter(blocks));
|
|
}
|
|
|
|
ar & blocks;
|
|
|
|
if (Archive::is_loading::value) {
|
|
bs.resize(num_bits);
|
|
from_block_range(blocks.begin(), blocks.end(), bs);
|
|
bs.resize(num_bits); // trim any excess bits
|
|
}
|
|
}
|
|
|
|
} // namespace serialization
|
|
} // namespace boost
|
|
|
|
namespace RDKit {
|
|
class ROMol;
|
|
class Conformer;
|
|
namespace GaussianShape {
|
|
|
|
// Declared inline so that every translation unit including this header
// shares a single definition of each constant.

// Radius used for carbon atoms.
inline constexpr double CARBON_RAD = 1.70;
// Radius used for dummy atoms - same as Xe.
inline constexpr double DUMMY_RAD = 2.16;
// From Grant et al.
inline constexpr double P = 2.7;
// The alpha value of an atom or feature of radius r is KAPPA / (r * r).
inline constexpr double KAPPA = 2.41798793102;
|
|
|
|
struct CustomFeature {
|
|
CustomFeature(
|
|
const unsigned int t, const RDGeom::Point3D &p, const double r,
|
|
const std::vector<unsigned int> &a = std::vector<unsigned int>())
|
|
: type(t), pos(p), rad(r), atoms(a) {}
|
|
unsigned int type;
|
|
RDGeom::Point3D pos;
|
|
double rad;
|
|
std::vector<unsigned int>
|
|
atoms; // That the feature was derived from. May be left empty.
|
|
};
|
|
|
|
struct ShapeInputOptions {
|
|
ShapeInputOptions() = default;
|
|
ShapeInputOptions(const ShapeInputOptions &) = default;
|
|
ShapeInputOptions(ShapeInputOptions &&) = default;
|
|
ShapeInputOptions &operator=(const ShapeInputOptions &) = default;
|
|
ShapeInputOptions &operator=(ShapeInputOptions &&) = default;
|
|
|
|
~ShapeInputOptions() = default;
|
|
|
|
bool useColors{
|
|
true}; //! Whether to build the color features. By default, it will
|
|
//! create features using the RDKit pharmacophore definitions.
|
|
|
|
std::vector<std::vector<CustomFeature>>
|
|
customFeatures; //! Custom color features used verbatim. One outer
|
|
//! vector for each conformation in the molecule.
|
|
std::vector<unsigned int>
|
|
atomSubset; //! If not empty, use just these atoms in the molecule to
|
|
//! form the ShapeInput object.
|
|
std::vector<std::pair<unsigned int, double>>
|
|
atomRadii; //! Use these non-standard radii for these atoms. The int is
|
|
//! for the atom index in the molecule, not the atomic
|
|
//! number. Not all atoms need be specified; some radii
|
|
//! can be over-ridden, with the rest left as standard.
|
|
bool allCarbonRadii{
|
|
true}; //! Whether to use carbon radii for all atoms (which is quicker
|
|
//! but less accurate) or vdw radii appropriate for the elements.
|
|
double shapePruneThreshold{-1.0}; //! If there is more than 1 conformer for
|
|
//! the input molecule, prune the shapes so
|
|
//! that none of them are more similar to
|
|
//! each other than the threshold. Default
|
|
//! -1.0 means no pruning.
|
|
bool sortShapes{true}; //! If true, the shapes are sorted in descending order
|
|
//! of total volume.
|
|
bool includeDummies{true}; //! Whether to include dummy atoms in the shape
|
|
//! or not.
|
|
};
|
|
|
|
// Data for shape alignment code
|
|
class RDKIT_GAUSSIANSHAPE_EXPORT ShapeInput {
|
|
public:
|
|
//! Create the ShapeInput object.
|
|
//! @param mol: The molecule of interest
|
|
//! @param confId: The conformer to use. If -1, uses all conformers.
|
|
//! @param opts: Options for setting up the shape
|
|
//! @param overlayOpts: Options for controlling overlays. The distance cutoff
|
|
//! elements are used in the self-overlap calculations.
|
|
explicit ShapeInput(
|
|
const ROMol &mol, int confId = -1,
|
|
const ShapeInputOptions &opts = ShapeInputOptions(),
|
|
const ShapeOverlayOptions &overlayOpts = ShapeOverlayOptions());
|
|
//! Create a ShapeInput object with a single shape copied from
|
|
//! other.
|
|
//! @param other: the ShapeInput that supplies the shape
|
|
//! @param shapeNum: the number of the shape of interest.
|
|
ShapeInput(const ShapeInput &other, unsigned int shapeNum);
|
|
explicit ShapeInput(const std::string &str) {
|
|
#ifndef RDK_USE_BOOST_SERIALIZATION
|
|
PRECONDITION(0, "Boost SERIALIZATION is not enabled")
|
|
#else
|
|
std::stringstream ss(str);
|
|
boost::archive::text_iarchive ia(ss);
|
|
ia &*this;
|
|
#endif
|
|
}
|
|
ShapeInput(const ShapeInput &other);
|
|
ShapeInput(ShapeInput &&other) = default;
|
|
ShapeInput &operator=(const ShapeInput &other);
|
|
ShapeInput &operator=(ShapeInput &&other) = default;
|
|
~ShapeInput() = default;
|
|
|
|
//! Merge the other ShapeInput, assuming it has the correct number
|
|
//! of atoms etc. Empties other, unless they can't be merged in which case
|
|
//! it returns unscathed.
|
|
void merge(ShapeInput &other);
|
|
|
|
std::string toString() const {
|
|
#ifndef RDK_USE_BOOST_SERIALIZATION
|
|
PRECONDITION(0, "Boost SERIALIZATION is not enabled")
|
|
#else
|
|
std::stringstream ss;
|
|
boost::archive::text_oarchive oa(ss);
|
|
oa &*this;
|
|
return ss.str();
|
|
#endif
|
|
}
|
|
|
|
const std::string getSmiles() const { return d_smiles; }
|
|
unsigned int getActiveShape() const { return d_activeShape; }
|
|
//! Set the currently active conformation to the new value.
|
|
//! @param newShape: the number of the conformation to be used
|
|
//! for future calculations. Counts from 0,
|
|
//! obviously. If invalid, throws a runtime
|
|
//! error.
|
|
void setActiveShape(unsigned int newShape);
|
|
//! Return the coordinates of the currently active shape.
|
|
//! Note that the coords are returned as a vector size 3*getNumAtoms()
|
|
const std::vector<double> &getCoords() const {
|
|
return d_coords[d_activeShape];
|
|
}
|
|
//! Get the alpha values for the atoms and color features in the shape.
|
|
const std::vector<double> &getAlphas() const { return d_alphas; }
|
|
//! Multiply the alpha value for the given atom/feature by -1.0
|
|
//! which will toggle whether the atom/feature is used in the volume
|
|
//! calculation or not. For temporarily "turning off" an atom or feature.
|
|
void negateAlpha(unsigned int alphaNum);
|
|
|
|
//! Fetch the coordinates of the atoms and optionally features.
|
|
std::vector<RDGeom::Point3D> getAtomPoints(bool includeColors = false) const;
|
|
//! Return whether the coordinates for the current active shape are
|
|
//! normalized.
|
|
bool getIsNormalized() const { return d_normalizeds[d_activeShape]; }
|
|
//! Return the feature types of all atoms/features in the shape. Atoms
|
|
//! have type 0.
|
|
const std::vector<int> &getFeatureTypes() const { return d_types; }
|
|
//! Get the number of atoms in the shape.
|
|
unsigned int getNumAtoms() const { return d_numAtoms; }
|
|
//! Get the number of color features in the shape.
|
|
unsigned int getNumFeatures() const { return d_numFeats; }
|
|
//! Get the number of shapes/conformations in the shape object. This may
|
|
//! be smaller than the number of conformations in the input molecule if
|
|
//! shape pruning was performed.
|
|
unsigned int getNumShapes() const { return d_coords.size(); }
|
|
//! Get the volume of the atoms in the current active shape.
|
|
double getShapeVolume() const {
|
|
return d_selfOverlapShapeVols[d_activeShape];
|
|
}
|
|
//! Get the volume for the atoms for the given shape number.
|
|
double getShapeVolume(unsigned int shapeNum) const;
|
|
//! Get the volume of the color features in the current active shape.
|
|
double getColorVolume() const {
|
|
return d_selfOverlapColorVols[d_activeShape];
|
|
}
|
|
//! Get the volume of the color features for the given shape number.
|
|
double getColorVolume(unsigned int shapeNum) const;
|
|
//! Get the flags for which atoms have a carbon radius.
|
|
const boost::dynamic_bitset<> *getCarbonRadii() const {
|
|
return d_carbonRadii.get();
|
|
}
|
|
// These functions use cached values if available.
|
|
//! Get the canonical rotation for the current active shape.
|
|
const std::array<double, 9> &calcCanonicalRotation();
|
|
//! Get the canonical translation for the current active shape.
|
|
const std::array<double, 3> &calcCanonicalTranslation();
|
|
//! Get the eigen values for the coordinates matrix.
|
|
const std::array<double, 3> &calcEigenValues();
|
|
//! Get the numbers of the points at the extremes of x, y and z for the
|
|
//! current active shape. In the order minimum x, minimum y, minimum z,
|
|
//! then the maxima.
|
|
const std::array<size_t, 6> &calcExtremes();
|
|
//! Return the principal moments of inertia, if Eigen3 is available, and the
|
|
//! eigenvalues of the canonical transformation if not, for the current
|
|
//! active shape.
|
|
std::array<double, 3> calcMomentsOfInertia(bool includeColors = false) const;
|
|
|
|
//! Align the principal axes to the cartesian axes and centre on the origin
|
|
//! for the current active shape.
|
|
//! Doesn't require that the shape was created from a molecule. Creates
|
|
//! the necessary transformation if not already done.
|
|
void normalizeCoords();
|
|
|
|
//! Applies the given transformation to the current active shape.
|
|
void transformCoords(RDGeom::Transform3D &xform);
|
|
|
|
//! Make a molecule from the current active shape. If required, features
|
|
//! are added as xenon atoms. If withBonds is false, just makes a molecule
|
|
//! from the atoms, otherwise builds a full molecule.
|
|
std::unique_ptr<RWMol> shapeToMol(bool includeColors = false,
|
|
bool withBonds = true) const;
|
|
|
|
//! Find the best similarity score between all shapes in this shape and the
|
|
//! other one. Stops as soon as it gets something above the threshold.
|
|
//! The score runs between 0.0 and 1.0, so the default threshold of -1.0
|
|
//! means no threshold. Fills in the shape numbers of the two that were
|
|
//! responsible if there is something above the threshold, and the
|
|
//! transformation that did it. Returns -1.0 for the similarity if there was
|
|
//! nothing above the threshold. Note that the shape numbers are not
|
|
//! necessarily the same as the original molecule conformation numbers.
|
|
std::array<double, 3> bestSimilarity(
|
|
const ShapeInput &fitShape, unsigned int &bestThisShape,
|
|
unsigned int &bestFitShape, RDGeom::Transform3D &bestXform,
|
|
double threshold = -1.0,
|
|
const ShapeOverlayOptions &overlayOpts = ShapeOverlayOptions());
|
|
|
|
//! Return the maximum similarity achievable between the 2 shapes. The
|
|
//! maximum similarity is when one shape is entirely inside the other. This
|
|
//! returns the similarity in that case, which is the upper bound on what
|
|
//! is achievable between these 2 shapes.
|
|
double maxPossibleSimilarity(
|
|
const ShapeInput &fitShape,
|
|
const ShapeOverlayOptions &overlayOpts = ShapeOverlayOptions()) const;
|
|
|
|
//! Prune the shapes so none a more similar to each other than
|
|
//! the threshold.
|
|
void pruneShapes(double simThreshold);
|
|
|
|
#ifdef RDK_USE_BOOST_SERIALIZATION
|
|
template <class Archive>
|
|
void serialize(Archive &ar, unsigned int);
|
|
#endif
|
|
|
|
private:
|
|
void extractAtoms(const Conformer &conf, const ShapeInputOptions &shapeOpts,
|
|
bool fillAlphas);
|
|
// Extract the features for the color scores, using RDKit pphore features
|
|
// for now. Other options to be added later.
|
|
void extractFeatures(const Conformer &conf, unsigned int confNum,
|
|
const ShapeInputOptions &shapeOpts, bool fillAlphas);
|
|
// Calculate the rotation and translation that will align the principal axes
|
|
// to the cartesian axes and centre on the origin.
|
|
void calcNormalization();
|
|
|
|
void calculateExtremes();
|
|
|
|
unsigned int d_activeShape;
|
|
|
|
std::vector<std::vector<double>>
|
|
d_coords; // The coordinates for the atoms and features,
|
|
// packed as 3 floats per item - x, y, z
|
|
std::vector<double> d_alphas; // The alpha values for the atoms and features.
|
|
// alpha is KAPPA / (r * r) where r is the radius
|
|
// of the atom. This is not used if using all_atoms_carbon mode.
|
|
std::vector<int> d_types; // The feature types. The size is the same
|
|
// as the number of coordinates, padded with 0
|
|
// for the atoms.
|
|
unsigned int d_numAtoms; // The number of atoms
|
|
unsigned int d_numFeats; // The number of features
|
|
std::vector<double> d_selfOverlapShapeVols; // Shape volume
|
|
std::vector<double> d_selfOverlapColorVols; // Color volume
|
|
// These are the points at the extremes of the x, y and z axes.
|
|
// they are min_x, min_y, min_z and max_x, max_y, max_z.
|
|
std::vector<std::array<size_t, 6>> d_extremePointss;
|
|
std::unique_ptr<boost::dynamic_bitset<>>
|
|
d_carbonRadii; // Flags those atoms with a carbon radius, for faster
|
|
// calculation later.
|
|
std::string d_smiles; // The SMILES string of the input molecule
|
|
|
|
// These are the rotation and translation matrices to align the principal
|
|
// axes of the shape with cartesian axes. If d_normalized is true, it has
|
|
// been applied to the coordinates.
|
|
boost::dynamic_bitset<> d_normalizeds;
|
|
// If the shape is moved, the normalization matrices are no longer valid.
|
|
// This flags that so it is re-computed as required.
|
|
boost::dynamic_bitset<> d_normalizationOKs;
|
|
|
|
std::vector<std::array<double, 9>> d_canonRots;
|
|
std::vector<std::array<double, 3>> d_canonTranss;
|
|
// The sorted eigenvalues of the principal axes.
|
|
std::vector<std::array<double, 3>> d_eigenValuess;
|
|
|
|
void selectConformations(const std::vector<int> &picks);
|
|
void calculateSelfOverlaps(const ShapeOverlayOptions &overlayOpts);
|
|
// Sort the shapes in descending order of the sum of the shape
|
|
// and color volumes.
|
|
void sortShapesByVolumes();
|
|
};
|
|
|
|
#ifdef RDK_USE_BOOST_SERIALIZATION
|
|
template <class Archive>
|
|
void ShapeInput::serialize(Archive &ar, const unsigned int) {
|
|
ar & d_activeShape;
|
|
ar & d_coords;
|
|
ar & d_alphas;
|
|
ar & d_types;
|
|
ar & d_numAtoms;
|
|
ar & d_numFeats;
|
|
ar & d_selfOverlapShapeVols;
|
|
ar & d_selfOverlapColorVols;
|
|
ar & d_extremePointss;
|
|
ar & d_carbonRadii;
|
|
ar & d_smiles;
|
|
ar & d_normalizeds;
|
|
ar & d_normalizationOKs;
|
|
ar & d_canonRots;
|
|
ar & d_canonTranss;
|
|
ar & d_eigenValuess;
|
|
}
|
|
#endif
|
|
|
|
// Extract the features from the molecule, optionally just for the subset
|
|
// of atoms.
|
|
RDKIT_GAUSSIANSHAPE_EXPORT void findFeatures(
|
|
const Conformer &conf, std::vector<CustomFeature> &features,
|
|
const std::optional<std::vector<unsigned int>> &atomSubset = std::nullopt);
|
|
|
|
// Calculate the mean position of the given atoms.
|
|
RDKIT_GAUSSIANSHAPE_EXPORT RDGeom::Point3D computeFeaturePos(
|
|
const Conformer &conf, const std::vector<unsigned int> &ats);
|
|
|
|
RDKIT_GAUSSIANSHAPE_EXPORT RDGeom::Transform3D quatTransToTransform(
|
|
const double *quat, const double *trans);
|
|
|
|
// Apply the transformation to the coordinates assumed to be in
|
|
// ShapeInput.d_coords form.
|
|
RDKIT_GAUSSIANSHAPE_EXPORT void applyTransformToShape(
|
|
std::vector<double> &shape, const RDGeom::Transform3D &xform);
|
|
RDKIT_GAUSSIANSHAPE_EXPORT void applyTransformToShape(
|
|
const double *inShape, double *outShape, size_t numPoints,
|
|
const RDGeom::Transform3D &xform);
|
|
RDKIT_GAUSSIANSHAPE_EXPORT void translateShape(
|
|
std::vector<double> &shape, const RDGeom::Point3D &translation);
|
|
RDKIT_GAUSSIANSHAPE_EXPORT void translateShape(
|
|
const double *inShape, double *outShape, size_t numPoints,
|
|
const RDGeom::Point3D &translation);
|
|
|
|
// Maximum possible score of the 2 shape (v[12]) and color (c[12]) volumes
|
|
RDKIT_GAUSSIANSHAPE_EXPORT double maxScore(
|
|
double v1, double v2, double c1, double c2,
|
|
const ShapeOverlayOptions &overlayOpts);
|
|
|
|
} // namespace GaussianShape
|
|
} // namespace RDKit
|
|
|
|
#endif // RDKIT_SHAPEINPUT_GUARD
|