Provide an RDKit wrapper around pubchem-align3d (#7798)

* initial support for pubchem-align3d

* works, passes the tests

* bump commit we are using

* update readme

* disable pubchem shape for limited depedencies build

* comment from review

* get windows dll builds working

* stupid oversight

* changes in response to review
get rid of the "using namespace std"
test that the alignment is re-entrant
This commit is contained in:
Greg Landrum
2024-09-14 06:08:24 +02:00
committed by GitHub
parent 4c37b03071
commit da2a9242bf
15 changed files with 23250 additions and 0 deletions

View File

@@ -37,6 +37,7 @@ steps:
-DRDK_BUILD_CAIRO_SUPPORT=OFF \
-DRDK_BUILD_QT_SUPPORT=OFF \
-DRDK_BUILD_XYZ2MOL_SUPPORT=ON \
-DRDK_BUILD_PUBCHEMSHAPE_SUPPORT=OFF \
-DRDK_BUILD_SWIG_WRAPPERS=OFF \
-DRDK_SWIG_STATIC=OFF \
-DRDK_BUILD_THREADSAFE_SSS=OFF \

View File

@@ -59,6 +59,7 @@ option(RDK_BUILD_MOLINTERCHANGE_SUPPORT "build in support for CommonChem molecul
option(RDK_BUILD_YAEHMOP_SUPPORT "build support for the YAeHMOP wrapper" OFF)
option(RDK_BUILD_XYZ2MOL_SUPPORT "build in support for the RDKit's implementation of xyz2mol (in the DetermineBonds library)" OFF )
option(RDK_BUILD_STRUCTCHECKER_SUPPORT "build in support for the StructChecker alpha (not recommended, use the MolVS integration instead)" OFF )
option(RDK_BUILD_PUBCHEMSHAPE_SUPPORT "build the rdkit wrapper around pubchem-align3d" ON )
option(RDK_USE_URF "Build support for Florian Flachsenberg's URF library" ON)
option(RDK_INSTALL_DEV_COMPONENT "install libraries and headers" ON)
option(RDK_USE_BOOST_IOSTREAMS "use boost::iostreams" ON)

View File

@@ -5,3 +5,6 @@ add_subdirectory(CoordGen)
add_subdirectory(YAeHMOP)
add_subdirectory(RingFamilies)
add_subdirectory(GA)
if(RDK_BUILD_PUBCHEMSHAPE_SUPPORT)
add_subdirectory(pubchem_shape)
endif()

56
External/pubchem_shape/CMakeLists.txt vendored Normal file
View File

@@ -0,0 +1,56 @@
if(NOT RDK_BUILD_PUBCHEMSHAPE_SUPPORT)
return()
endif()
if(NOT DEFINED PUBCHEMSHAPE_DIR)
set(PUBCHEMSHAPE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pubchem-align3d")
set(fileToCheck "${PUBCHEMSHAPE_DIR}/shape_functions1.cpp")
set(needDownload "TRUE")
if(EXISTS "${fileToCheck}")
set(needDownload "FALSE")
endif()
else()
set(needDownload "FALSE")
endif()
if(needDownload)
set(PUBCHEM_COMMIT_SHA 27f063f)
if(NOT DEFINED PUBCHEMSHAPE_URL)
set(PUBCHEMSHAPE_URL "https://github.com/ncbi/pubchem-align3d/archive/${PUBCHEM_COMMIT_SHA}.tar.gz")
endif()
if(NOT DEFINED PUBCHEMSHAPE_MD5SUM)
set(PUBCHEMSHAPE_MD5SUM "4f763a1cdff13a1dd2bd4c4612e06462")
endif()
if(NOT DEFINED PUBCHEMSHAPE_BASE)
string(REGEX REPLACE "^.*/" "" PUBCHEMSHAPE_BASE "${PUBCHEMSHAPE_URL}")
endif()
downloadAndCheckMD5(${PUBCHEMSHAPE_URL} "${CMAKE_CURRENT_SOURCE_DIR}/${PUBCHEMSHAPE_BASE}" ${PUBCHEMSHAPE_MD5SUM})
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf
${CMAKE_CURRENT_SOURCE_DIR}/${PUBCHEM_COMMIT_SHA}.tar.gz
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
file(GLOB tar_dirname ${CMAKE_CURRENT_SOURCE_DIR}/pubchem-align3d-${PUBCHEM_COMMIT_SHA}*)
execute_process(COMMAND ${CMAKE_COMMAND} -E rename ${tar_dirname}
${PUBCHEMSHAPE_DIR}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
rdkit_library(pubchem_align3d ./pubchem-align3d/shape_functions1.cpp
./pubchem-align3d/shape_functions2.cpp ./pubchem-align3d/shape_neighbor.cpp SHARED)
if((MSVC AND RDK_INSTALL_DLLS_MSVC) OR ((NOT MSVC) AND WIN32))
set_target_properties(pubchem_align3d PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
endif()
rdkit_library(PubChemShape PubChemShape.cpp SHARED
LINK_LIBRARIES pubchem_align3d SmilesParse SubstructMatch)
target_compile_definitions(PubChemShape PRIVATE RDKIT_PUBCHEMSHAPE_BUILD)
rdkit_headers(PubChemShape.hpp DEST GraphMol)
rdkit_catch_test(shape_test test.cpp LINK_LIBRARIES PubChemShape FileParsers)
if(RDK_BUILD_PYTHON_WRAPPERS)
add_subdirectory(Wrap)
endif()

426
External/pubchem_shape/PubChemShape.cpp vendored Normal file
View File

@@ -0,0 +1,426 @@
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <GraphMol/RWMol.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/flyweight.hpp>
#include <boost/flyweight/key_value.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include "pubchem-align3d/shape_functions.hpp"
#include "PubChemShape.hpp"
constexpr auto pubchemFeatureName = "PUBCHEM_PHARMACOPHORE_FEATURES";
// #define DEBUG_MSG(msg_stream) cout << msg_stream << '\n'
#define DEBUG_MSG(msg_stream)
using namespace RDKit;
// Bondi radii
// can find more of these in Table 12 of this publication:
// https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3658832/
const std::map<unsigned int, double> vdw_radii = {
{0, 1.10}, // dummy atom (value copied from H)
{1, 1.10}, // H
{2, 1.40}, // He
{3, 1.81}, // Li
{4, 1.53}, // Be
{5, 1.92}, // B
{6, 1.70}, // C
{7, 1.55}, // N
{8, 1.52}, // O
{9, 1.47}, // F
{10, 1.54}, // Ne
{11, 2.27}, // Na
{12, 1.73}, // Mg
{13, 1.84}, // Al
{14, 2.10}, // Si
{15, 1.80}, // P
{16, 1.80}, // S
{17, 1.75}, // Cl
{18, 1.88}, // Ar
{19, 2.75}, // K
{20, 2.31}, // Ca
{31, 1.87}, // Ga
{32, 2.11}, // Ge
{33, 1.85}, // As
{34, 1.90}, // Se
{35, 1.83}, // Br
{36, 2.02}, // Kr
{37, 3.03}, // Rb
{38, 2.49}, // Sr
{49, 1.93}, // In
{50, 2.17}, // Sn
{51, 2.06}, // Sb
{52, 2.06}, // Te
{53, 1.98}, // I
{54, 2.16}, // Xe
{55, 3.43}, // Cs
{56, 2.68}, // Ba
{81, 1.96}, // Tl
{82, 2.02}, // Pb
{83, 2.07}, // Bi
{84, 1.97}, // Po
{85, 2.02}, // At
{86, 2.20}, // Rn
{87, 3.48}, // Fr
{88, 2.83}, // Ra
};
constexpr double radius_color =
1.08265; // same radius for all feature/color "atoms"
namespace {
class ss_matcher {
public:
ss_matcher(const std::string &pattern) : m_pattern(pattern) {
m_needCopies = (pattern.find_first_of("$") != std::string::npos);
RDKit::RWMol *p = RDKit::SmartsToMol(pattern);
m_matcher = p;
POSTCONDITION(m_matcher, "no matcher");
};
const RDKit::ROMol *getMatcher() const { return m_matcher; };
unsigned int countMatches(const RDKit::ROMol &mol) const {
PRECONDITION(m_matcher, "no matcher");
std::vector<RDKit::MatchVectType> matches;
// This is an ugly one. Recursive queries aren't thread safe.
// Unfortunately we have to take a performance hit here in order
// to guarantee thread safety
if (m_needCopies) {
const RDKit::ROMol nm(*(m_matcher), true);
RDKit::SubstructMatch(mol, nm, matches);
} else {
const RDKit::ROMol &nm = *m_matcher;
RDKit::SubstructMatch(mol, nm, matches);
}
return matches.size();
}
~ss_matcher() { delete m_matcher; };
private:
ss_matcher() : m_pattern("") {};
std::string m_pattern;
bool m_needCopies{false};
const RDKit::ROMol *m_matcher{nullptr};
};
} // namespace
typedef boost::flyweight<boost::flyweights::key_value<std::string, ss_matcher>,
boost::flyweights::no_tracking>
pattern_flyweight;
// Definitions for feature points adapted from:
// Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
const std::vector<std::vector<std::string>> smartsPatterns = {
{"[$([N;!H0;v3,v4&+1]),\
$([O,S;H1;+0]),\
n&H1&+0]"}, // Donor
{"[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),\
$([O,S;H0;v2]),\
$([O,S;-]),\
$([N;v3;!$(N-*=[O,N,P,S])]),\
n&H0&+0,\
$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]"}, // Acceptor
{
"[r]1[r][r]1",
"[r]1[r][r][r]1",
"[r]1[r][r][r][r]1",
"[r]1[r][r][r][r][r]1",
"[r]1[r][r][r][r][r][r]1",
}, // rings
// "[a]", //
// Aromatic
// "[F,Cl,Br,I]", // Halogen
{"[#7;+,\
$([N;H2&+0][$([C,a]);!$([C,a](=O))]),\
$([N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]),\
$([N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))])]"}, // Basic
{"[$([C,S](=[O,S,P])-[O;H1,-1])]"} // Acidic
};
std::vector<std::vector<const ROMol *>> *getPh4Patterns() {
static std::unique_ptr<std::vector<std::vector<const ROMol *>>> patterns;
if (!patterns) {
patterns.reset(new std::vector<std::vector<const ROMol *>>());
for (const auto &smartsV : smartsPatterns) {
std::vector<const ROMol *> v;
for (const auto &smarts : smartsV) {
const ROMol *matcher = pattern_flyweight(smarts).get().getMatcher();
CHECK_INVARIANT(matcher, "bad smarts");
v.push_back(matcher);
}
patterns->push_back(std::move(v));
}
}
return patterns.get();
}
// the conformer is translated to the origin
ShapeInput PrepareConformer(const ROMol &mol, int confId, bool useColors) {
ShapeInput res;
// unpack features (PubChem-specific property from SDF)
// NOTE: this unpacking assumes that RWMol-atom-index = SDF-atom-number - 1
// e.g. RWMol uses [0..N-1] and SDF uses [1..N], with atoms in the same
// order
std::vector<std::pair<std::vector<unsigned int>, unsigned int>>
feature_idx_type;
if (useColors) {
std::string features;
if (mol.getPropIfPresent(pubchemFeatureName, features)) {
// regular atoms have type 0; feature "atoms" (features represented by a
// single point+radius) must have type > 0
static const std::map<std::string, unsigned int> atomTypes = {
{"acceptor", 1}, {"anion", 2}, {"cation", 3},
{"donor", 4}, {"hydrophobe", 5}, {"rings", 6},
};
std::istringstream iss(features);
std::string line;
unsigned int n = 0;
while (std::getline(iss, line)) {
if (n == 0) {
feature_idx_type.resize(stoul(line));
}
else {
unsigned int f = n - 1;
if (f >= feature_idx_type.size()) {
throw ValueErrorException("Too many features");
}
std::istringstream iss2(line);
std::string token;
unsigned int t = 0;
while (std::getline(iss2, token, ' ')) {
if (t == 0) {
feature_idx_type[f].first.resize(stoul(token));
} else if (t <= feature_idx_type[f].first.size()) {
feature_idx_type[f].first[t - 1] = stoul(token) - 1;
} else {
auto type = atomTypes.find(token);
if (type == atomTypes.end()) {
throw ValueErrorException("Invalid feature type: " + token);
}
feature_idx_type[f].second = type->second;
}
++t;
}
if (t != (feature_idx_type[f].first.size() + 2)) {
throw ValueErrorException("Wrong number of tokens in feature");
}
}
++n;
}
if (n != (feature_idx_type.size() + 1)) {
throw ValueErrorException("Wrong number of features");
}
DEBUG_MSG("# features: " << feature_idx_type.size());
} else {
const auto pattVects = getPh4Patterns();
feature_idx_type.clear();
unsigned pattIdx = 1;
for (const auto &patts : *pattVects) {
for (const auto patt : patts) {
auto matches = SubstructMatch(mol, *patt);
for (auto match : matches) {
std::vector<unsigned int> ats;
for (const auto &pr : match) {
ats.push_back(pr.second);
}
feature_idx_type.emplace_back(ats, pattIdx);
}
}
++pattIdx;
}
}
}
// unpack atoms
auto &conformer = mol.getConformer(confId);
if (!conformer.is3D()) {
throw ValueErrorException("Conformer must be 3D");
}
unsigned int nAtoms = mol.getNumAtoms();
// DEBUG_MSG("num atoms: " << nAtoms);
unsigned int nHeavyAtoms = mol.getNumHeavyAtoms();
DEBUG_MSG("num heavy atoms: " << nHeavyAtoms);
unsigned int nAlignmentAtoms = nHeavyAtoms + feature_idx_type.size();
std::vector<double> rad_vector(nAlignmentAtoms);
res.atom_type_vector.resize(nAlignmentAtoms, 0);
RDGeom::Point3D ave;
for (unsigned i = 0; i < nAtoms; ++i) {
unsigned int Z = mol.getAtomWithIdx(i)->getAtomicNum();
if (Z > 1) {
ave += conformer.getAtomPos(i);
if (vdw_radii.find(Z) == vdw_radii.end()) {
throw ValueErrorException("No VdW radius for atom with Z=" +
std::to_string(Z));
}
rad_vector[i] = vdw_radii.at(Z);
}
}
// translate steric center to origin
ave /= nHeavyAtoms;
DEBUG_MSG("steric center: (" << ave << ")");
res.shift = {-ave.x, -ave.y, -ave.z};
res.coord.resize(nAlignmentAtoms * 3);
for (unsigned i = 0; i < nAtoms; ++i) {
// translate all atoms
RDGeom::Point3D pos = conformer.getAtomPos(i);
pos -= ave;
// but use only non-H for alignment
if (mol.getAtomWithIdx(i)->getAtomicNum() > 1) {
res.coord[i * 3] = pos.x;
res.coord[(i * 3) + 1] = pos.y;
res.coord[(i * 3) + 2] = pos.z;
}
}
// get feature coordinates - simply the average of coords of all atoms in the
// feature
for (unsigned i = 0; i < feature_idx_type.size(); ++i) {
RDGeom::Point3D floc;
for (unsigned int j = 0; j < feature_idx_type[i].first.size(); ++j) {
unsigned int idx = feature_idx_type[i].first[j];
if (idx >= nAtoms || mol.getAtomWithIdx(idx)->getAtomicNum() <= 1) {
throw ValueErrorException("Invalid feature atom index");
}
floc += conformer.getAtomPos(idx);
}
floc /= feature_idx_type[i].first.size();
floc -= ave;
DEBUG_MSG("feature type " << feature_idx_type[i].second << " (" << floc
<< ")");
auto array_idx = nHeavyAtoms + i;
res.coord[array_idx * 3] = floc.x;
res.coord[(array_idx * 3) + 1] = floc.y;
res.coord[(array_idx * 3) + 2] = floc.z;
rad_vector[array_idx] = radius_color;
res.atom_type_vector[array_idx] = feature_idx_type[i].second;
}
Align3D::setAlpha(rad_vector.data(), rad_vector.size(), res.alpha_vector);
// regular atom self overlap
Align3D::getVolumeAtomIndexVector(res.atom_type_vector.data(),
res.atom_type_vector.size(),
res.volumeAtomIndexVector);
res.sov = Align3D::ComputeShapeOverlap(
res.coord.data(), res.alpha_vector, res.volumeAtomIndexVector,
res.coord.data(), res.alpha_vector, res.volumeAtomIndexVector);
DEBUG_MSG("sov: " << res.sov);
// feature self overlap
if (feature_idx_type.size() > 0) {
Align3D::getColorAtomType2IndexVectorMap(res.atom_type_vector.data(),
res.atom_type_vector.size(),
res.colorAtomType2IndexVectorMap);
res.sof = Align3D::ComputeFeatureOverlap(
res.coord.data(), res.alpha_vector, res.colorAtomType2IndexVectorMap,
res.coord.data(), res.alpha_vector, res.colorAtomType2IndexVectorMap);
DEBUG_MSG("sof: " << res.sof);
}
return res;
}
std::pair<double, double> AlignMolecule(const ShapeInput &refShape, ROMol &fit,
std::vector<float> &matrix,
int fitConfId, bool useColors,
double opt_param,
unsigned int max_preiters,
unsigned int max_postiters) {
PRECONDITION(matrix.size() == 12, "bad matrix size");
Align3D::setUseCutOff(true);
DEBUG_MSG("Fit details:");
auto fitShape = PrepareConformer(fit, fitConfId, useColors);
std::set<unsigned int> jointColorAtomTypeSet;
Align3D::getJointColorTypeSet(
refShape.atom_type_vector.data(), refShape.atom_type_vector.size(),
fitShape.atom_type_vector.data(), fitShape.atom_type_vector.size(),
jointColorAtomTypeSet);
auto mapCp = refShape.colorAtomType2IndexVectorMap;
Align3D::restrictColorAtomType2IndexVectorMap(mapCp, jointColorAtomTypeSet);
Align3D::restrictColorAtomType2IndexVectorMap(
fitShape.colorAtomType2IndexVectorMap, jointColorAtomTypeSet);
DEBUG_MSG("Running alignment...");
double nbr_st = 0.0;
double nbr_ct = 0.0;
Align3D::Neighbor_Conformers(
refShape.coord.data(), refShape.alpha_vector,
refShape.volumeAtomIndexVector, mapCp, refShape.sov, refShape.sof,
fitShape.coord.data(), fitShape.alpha_vector,
fitShape.volumeAtomIndexVector, fitShape.colorAtomType2IndexVectorMap,
fitShape.sov, fitShape.sof, !jointColorAtomTypeSet.empty(), max_preiters,
max_postiters, opt_param, matrix.data(), nbr_st, nbr_ct);
DEBUG_MSG("Done!");
DEBUG_MSG("nbr_st: " << nbr_st);
DEBUG_MSG("nbr_ct: " << nbr_ct);
// transform fit coords
Conformer &fit_conformer = fit.getConformer(fitConfId);
if (3 * fit.getNumAtoms() != fitShape.coord.size()) {
// Hs were removed
fitShape.coord.resize(3 * fit.getNumAtoms());
for (unsigned int i = 0; i < fit.getNumAtoms(); ++i) {
const auto &pos = fit_conformer.getAtomPos(i);
fitShape.coord[i * 3] = pos.x;
fitShape.coord[i * 3 + 1] = pos.y;
fitShape.coord[i * 3 + 2] = pos.z;
}
}
std::vector<float> transformed(fit.getNumAtoms() * 3);
Align3D::VApplyRotTransMatrix(transformed.data(), fitShape.coord.data(),
fit.getNumAtoms(), matrix.data());
for (unsigned i = 0; i < fit.getNumAtoms(); ++i) {
RDGeom::Point3D &pos = fit_conformer.getAtomPos(i);
// both conformers have been translated to the origin, translate the fit
// conformer back to the steric center of the reference.
pos.x = transformed[i * 3] - refShape.shift[0] + fitShape.shift[0];
pos.y = transformed[(i * 3) + 1] - refShape.shift[1] + fitShape.shift[1];
pos.z = transformed[(i * 3) + 2] - refShape.shift[2] + fitShape.shift[2];
}
fit.setProp("shape_align_shape_tanimoto", nbr_st);
fit.setProp("shape_align_color_tanimoto", nbr_ct);
return std::make_pair(nbr_st, nbr_ct);
}
std::pair<double, double> AlignMolecule(const ROMol &ref, ROMol &fit,
std::vector<float> &matrix,
int refConfId, int fitConfId,
bool useColors, double opt_param,
unsigned int max_preiters,
unsigned int max_postiters) {
Align3D::setUseCutOff(true);
DEBUG_MSG("Reference details:");
auto refShape = PrepareConformer(ref, refConfId, useColors);
return AlignMolecule(refShape, fit, matrix, fitConfId, useColors, opt_param,
max_preiters, max_postiters);
}

30
External/pubchem_shape/PubChemShape.hpp vendored Normal file
View File

@@ -0,0 +1,30 @@
#include <GraphMol/ROMol.h>
#include <map>
#include <vector>
struct RDKIT_PUBCHEMSHAPE_EXPORT ShapeInput {
std::vector<float> coord;
std::vector<double> alpha_vector;
std::vector<unsigned int> atom_type_vector;
std::vector<unsigned int> volumeAtomIndexVector;
std::map<unsigned int, std::vector<unsigned int>>
colorAtomType2IndexVectorMap;
std::vector<double> shift;
double sov{0.0};
double sof{0.0};
};
RDKIT_PUBCHEMSHAPE_EXPORT ShapeInput PrepareConformer(const RDKit::ROMol &mol,
int confId = -1,
bool useColors = true);
RDKIT_PUBCHEMSHAPE_EXPORT std::pair<double, double> AlignMolecule(
const ShapeInput &refShape, RDKit::ROMol &fit, std::vector<float> &matrix,
int fitConfId = -1, bool useColors = true, double opt_param = 0.5,
unsigned int max_preiters = 3u, unsigned int max_postiters = 16u);
RDKIT_PUBCHEMSHAPE_EXPORT std::pair<double, double> AlignMolecule(
const RDKit::ROMol &ref, RDKit::ROMol &fit, std::vector<float> &matrix,
int refConfId = -1, int fitConfId = -1, bool useColors = true,
double opt_param = 0.5, unsigned int max_preiters = 3u,
unsigned int max_postiters = 16u);

3
External/pubchem_shape/README.md vendored Normal file
View File

@@ -0,0 +1,3 @@
This is a thin RDKit wrapper around pubchem-align3d, a shape-based alignment code written by the PubChem team.
The repo for pubchem-align3d is here: https://github.com/ncbi/pubchem-align3d/

View File

@@ -0,0 +1,13 @@
find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED)
find_package(Python3 COMPONENTS Interpreter REQUIRED)
set(py3lib "python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR}")
find_package(Boost COMPONENTS ${py3lib} REQUIRED CONFIG)
rdkit_python_extension(rdShapeAlign
cshapealign.cpp
DEST Chem
LINK_LIBRARIES
PubChemShape )
add_pytest(pyShapeAlign
${CMAKE_CURRENT_SOURCE_DIR}/test_rdshapealign.py)

View File

@@ -0,0 +1,94 @@
/*******************************************************************************
Copyright 2024 by Greg Landrum and the pubchem_shape contributors
This file is part of pubchem_shape
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
***********************************************************************/
#include <boost/python.hpp>
#include <vector>
#include "../PubChemShape.hpp"
#include <GraphMol/RDKitBase.h>
#include <RDBoost/Wrap.h>
namespace python = boost::python;
namespace helpers {
python::tuple alignMol(const RDKit::ROMol &ref, RDKit::ROMol &probe,
int refConfId, int probeConfId, bool useColors,
double opt_param, unsigned int max_preiters,
unsigned int max_postiters) {
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] =
AlignMolecule(ref, probe, matrix, refConfId, probeConfId, useColors,
opt_param, max_preiters, max_postiters);
return python::make_tuple(nbr_st, nbr_ct);
}
python::tuple alignMol2(const ShapeInput &ref, RDKit::ROMol &probe,
int probeConfId, bool useColors, double opt_param,
unsigned int max_preiters, unsigned int max_postiters) {
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] =
AlignMolecule(ref, probe, matrix, probeConfId, useColors, opt_param,
max_preiters, max_postiters);
return python::make_tuple(nbr_st, nbr_ct);
}
ShapeInput *prepConf(const RDKit::ROMol &mol, int confId, bool useColors) {
return new ShapeInput(PrepareConformer(mol, confId, useColors));
}
} // namespace helpers
void wrap_pubchemshape() {
RegisterVectorConverter<float>("FloatVector");
RegisterVectorConverter<double>("DoubleVector");
RegisterVectorConverter<unsigned int>("UnsignedIntVector");
python::def(
"AlignMol", &helpers::alignMol,
(python::arg("ref"), python::arg("probe"), python::arg("refConfId") = -1,
python::arg("probeConfId") = -1, python::arg("useColors") = true,
python::arg("opt_param") = 0.5, python::arg("max_preiters") = 3,
python::arg("max_postiters") = 16),
"aligns probe to ref, probe is modified");
python::def("AlignMol", &helpers::alignMol2,
(python::arg("refShape"), python::arg("probe"),
python::arg("probeConfId") = -1, python::arg("useColors") = true,
python::arg("opt_param") = 0.5, python::arg("max_preiters") = 3,
python::arg("max_postiters") = 16),
"aligns probe to reference shape, probe is modified");
python::def("PrepareConformer", &helpers::prepConf,
(python::arg("mol"), python::arg("confId") = -1,
python::arg("useColors") = true),
"returns a shape object for a molecule",
python::return_value_policy<python::manage_new_object>());
python::class_<ShapeInput, boost::noncopyable>("ShapeInput", python::no_init)
.def_readwrite("coord", &ShapeInput::coord)
.def_readwrite("alpha_vector", &ShapeInput::alpha_vector)
.def_readwrite("atom_type_vector", &ShapeInput::atom_type_vector)
.def_readwrite("volumeAtomIndexVector",
&ShapeInput::volumeAtomIndexVector)
.def_readwrite("shift", &ShapeInput::shift)
.def_readwrite("sov", &ShapeInput::sov)
.def_readwrite("sof", &ShapeInput::sof);
}
BOOST_PYTHON_MODULE(rdShapeAlign) { wrap_pubchemshape(); }

View File

@@ -0,0 +1,35 @@
import unittest
from rdkit import Chem
from rdkit.Chem import rdShapeAlign
from rdkit import RDConfig
datadir = RDConfig.RDBaseDir + '/External/pubchem_shape/test_data';
class TestCase(unittest.TestCase):
def setUp(self):
suppl = Chem.SDMolSupplier(datadir + '/test1.sdf')
self.ref = suppl[0]
self.probe = suppl[1]
def test1_Defaults(self):
tpl = rdShapeAlign.AlignMol(self.ref, self.probe)
self.assertAlmostEqual(tpl[0], 0.773, places=3)
self.assertAlmostEqual(tpl[1], 0.303, places=3)
def test2_NoColor(self):
tpl = rdShapeAlign.AlignMol(self.ref, self.probe, useColors=False)
self.assertAlmostEqual(tpl[0], 0.773, places=3)
self.assertAlmostEqual(tpl[1], 0.0, places=3)
def test3_FromShape(self):
shp = rdShapeAlign.PrepareConformer(self.ref)
self.assertTrue(type(shp) == rdShapeAlign.ShapeInput)
tpl = rdShapeAlign.AlignMol(shp, self.probe)
self.assertAlmostEqual(tpl[0], 0.773, places=3)
self.assertAlmostEqual(tpl[1], 0.303, places=3)
if __name__ == '__main__':
unittest.main()

80
External/pubchem_shape/sdf_align.cpp vendored Normal file
View File

@@ -0,0 +1,80 @@
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/FileParsers/MolWriters.h>
#include <GraphMol/Fingerprints/FingerprintUtil.h>
#include <GraphMol/RWMol.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <RDGeneral/BoostEndInclude.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/flyweight.hpp>
#include <boost/flyweight/key_value.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include "PubChemShape.hpp"
using namespace std;
#define ERRORTHROW(msg_stream) \
do { \
ostringstream os; \
os << msg_stream; \
throw runtime_error(os.str()); \
} while (0)
// #define DEBUG_MSG(msg_stream) cout << msg_stream << '\n'
#define DEBUG_MSG(msg_stream)
using namespace RDKit;
int main(int argc, char **argv) {
int status = -1;
try {
if (argc != 2)
ERRORTHROW(
"Usage: sdf_align <input.sdf> The first molecule in the SDF "
"is the reference"); //"opt_param
// max_preiters
// max_postiters");
bool useColors = true;
bool sanitize = true;
bool removeHs = false;
SDMolSupplier suppl(argv[1], sanitize, removeHs);
SDWriter writer("sdf_align.out.sdf");
unique_ptr<ROMol> ref{suppl[0]};
if (!ref.get()) {
ERRORTHROW("Failed to read ref conformer");
}
for (auto i = 1u; i < suppl.length(); ++i) {
std::unique_ptr<ROMol> fit{suppl[i]};
if (!fit) {
continue;
}
vector<float> matrix(12, 0.0);
int refConfId = -1;
int fitConfId = -1;
auto [nbr_st, nbr_ct] =
AlignMolecule(*ref, *fit, matrix, refConfId, fitConfId, useColors);
if (i == 1) {
writer.write(*ref, refConfId);
}
writer.write(*fit, fitConfId);
}
status = 0;
} catch (std::exception &e) {
cerr << "Caught std::exception: " << e.what() << '\n';
} catch (...) {
cerr << "Caught unknown exception\n";
}
return status;
}

121
External/pubchem_shape/test.cpp vendored Normal file
View File

@@ -0,0 +1,121 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/RWMol.h>
#include "PubChemShape.hpp"
using namespace RDKit;
TEST_CASE("basic alignment") {
std::string dirName = getenv("RDBASE");
dirName += "/External/pubchem_shape/test_data";
auto suppl = v2::FileParsers::SDMolSupplier(dirName + "/test1.sdf");
auto ref = suppl[0];
REQUIRE(ref);
auto probe = suppl[1];
REQUIRE(probe);
SECTION("basics") {
RWMol cp(*probe);
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] = AlignMolecule(*ref, cp, matrix);
CHECK_THAT(nbr_st, Catch::Matchers::WithinAbs(0.773, 0.005));
CHECK_THAT(nbr_ct, Catch::Matchers::WithinAbs(0.303, 0.005));
for (auto i = 0u; i < probe->getNumAtoms(); ++i) {
CHECK(probe->getConformer().getAtomPos(i).x !=
cp.getConformer().getAtomPos(i).x);
}
}
SECTION("from shape") {
auto ref_shape = PrepareConformer(*ref);
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] = AlignMolecule(ref_shape, *probe, matrix);
CHECK_THAT(nbr_st, Catch::Matchers::WithinAbs(0.773, 0.005));
CHECK_THAT(nbr_ct, Catch::Matchers::WithinAbs(0.303, 0.005));
}
SECTION("RDKit features") {
ref->clearProp("PUBCHEM_PHARMACOPHORE_FEATURES");
probe->clearProp("PUBCHEM_PHARMACOPHORE_FEATURES");
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] = AlignMolecule(*ref, *probe, matrix);
CHECK_THAT(nbr_st, Catch::Matchers::WithinAbs(0.773, 0.005));
CHECK_THAT(nbr_ct, Catch::Matchers::WithinAbs(0.231, 0.005));
}
SECTION("no colors") {
std::vector<float> matrix(12, 0.0);
int refConfId = -1;
int prbConfId = -1;
bool useColors = false;
auto [nbr_st, nbr_ct] =
AlignMolecule(*ref, *probe, matrix, refConfId, prbConfId, useColors);
CHECK_THAT(nbr_st, Catch::Matchers::WithinAbs(0.773, 0.005));
CHECK_THAT(nbr_ct, Catch::Matchers::WithinAbs(0.000, 0.005));
}
SECTION("we are re-entrant") {
RWMol cp(*probe);
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] = AlignMolecule(*ref, cp, matrix);
RWMol cp2(cp);
auto [nbr_st2, nbr_ct2] = AlignMolecule(*ref, cp2, matrix);
CHECK_THAT(nbr_st, Catch::Matchers::WithinAbs(nbr_st2, 0.005));
CHECK_THAT(nbr_ct, Catch::Matchers::WithinAbs(nbr_ct2, 0.005));
for (auto i = 0u; i < probe->getNumAtoms(); ++i) {
CHECK_THAT(cp.getConformer().getAtomPos(i).x,
Catch::Matchers::WithinAbs(cp2.getConformer().getAtomPos(i).x,
0.005));
}
}
}
TEST_CASE("bulk") {
std::string dirName = getenv("RDBASE");
dirName += "/External/pubchem_shape/test_data";
auto suppl = v2::FileParsers::SDMolSupplier(dirName + "/bulk.pubchem.sdf");
auto ref = suppl[0];
REQUIRE(ref);
for (auto i = 1u; i < suppl.length(); ++i) {
auto probe = suppl[1];
REQUIRE(probe);
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] = AlignMolecule(*ref, *probe, matrix);
CHECK_THAT(nbr_st,
Catch::Matchers::WithinAbs(
probe->getProp<float>("shape_align_shape_tanimoto"), 0.005));
CHECK_THAT(nbr_ct,
Catch::Matchers::WithinAbs(
probe->getProp<float>("shape_align_color_tanimoto"), 0.005));
}
}
TEST_CASE("handling molecules with Hs") {
std::string dirName = getenv("RDBASE");
dirName += "/External/pubchem_shape/test_data";
v2::FileParsers::MolFileParserParams params;
params.removeHs = false;
auto suppl =
v2::FileParsers::SDMolSupplier(dirName + "/align_with_hs.sdf", params);
auto ref = suppl[0];
REQUIRE(ref);
auto probe = suppl[1];
REQUIRE(probe);
SECTION("basics") {
RWMol cp(*probe);
std::vector<float> matrix(12, 0.0);
auto [nbr_st, nbr_ct] = AlignMolecule(*ref, cp, matrix);
CHECK_THAT(nbr_st, Catch::Matchers::WithinAbs(0.837, 0.005));
CHECK_THAT(nbr_ct, Catch::Matchers::WithinAbs(0.694, 0.005));
for (auto i = 0u; i < cp.getNumAtoms(); ++i) {
// the failure mode here was that Hs had HUGE coordinates
auto pos = cp.getConformer().getAtomPos(i);
CHECK((pos.x > -10 && pos.x < 10));
}
}
}

View File

@@ -0,0 +1,106 @@
ref
RDKit 3D
0 0 0 0 0 0 0 0 0 0999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 20 20 0 0 0
M V30 BEGIN ATOM
M V30 1 C -1.243631 -1.056161 0.012000 0
M V30 2 C -2.484202 -0.539894 -0.306757 0
M V30 3 C -2.794438 0.793168 -0.108146 0
M V30 4 C -1.787925 1.596812 0.433091 0
M V30 5 N -0.571003 1.079472 0.743255 0
M V30 6 C -0.270575 -0.217935 0.549419 0
M V30 7 C 1.052234 -0.806392 0.880565 0
M V30 8 C 2.005463 -0.723537 -0.301576 0
M V30 9 C 2.215991 0.705977 -0.696919 0
M V30 10 H -1.040984 -2.099314 -0.157577 0
M V30 11 H -3.278069 -1.167168 -0.732104 0
M V30 12 H -3.767439 1.211397 -0.355050 0
M V30 13 H -2.013491 2.654386 0.597951 0
M V30 14 H 0.946668 -1.847615 1.199110 0
M V30 15 H 1.527663 -0.268265 1.740520 0
M V30 16 H 2.965394 -1.162120 0.010096 0
M V30 17 H 1.579697 -1.299175 -1.152527 0
M V30 18 H 1.276250 1.227295 -0.932445 0
M V30 19 H 2.773214 1.193331 0.150198 0
M V30 20 H 2.909182 0.725741 -1.573103 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 2 1 2
M V30 2 1 2 3
M V30 3 2 3 4
M V30 4 1 4 5
M V30 5 2 5 6
M V30 6 1 6 7
M V30 7 1 7 8
M V30 8 1 8 9
M V30 9 1 6 1
M V30 10 1 1 10
M V30 11 1 2 11
M V30 12 1 3 12
M V30 13 1 4 13
M V30 14 1 7 14
M V30 15 1 7 15
M V30 16 1 8 16
M V30 17 1 8 17
M V30 18 1 9 18
M V30 19 1 9 19
M V30 20 1 9 20
M V30 END BOND
M V30 END CTAB
M END
$$$$
prb
RDKit 3D
0 0 0 0 0 0 0 0 0 0999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 20 20 0 0 0
M V30 BEGIN ATOM
M V30 1 C -0.309632 -0.572843 -0.290631 0
M V30 2 C -1.554814 -0.141033 0.148348 0
M V30 3 C -2.633169 -1.122608 0.369224 0
M V30 4 C -1.738823 1.207246 0.364168 0
M V30 5 C -0.717149 2.120087 0.152894 0
M V30 6 N 0.488305 1.679942 -0.274679 0
M V30 7 C 0.699665 0.360415 -0.495647 0
M V30 8 C 2.027105 -0.141403 -0.968505 0
M V30 9 C 2.806988 -0.435730 0.319076 0
M V30 10 H -0.121517 -1.623149 -0.472715 0
M V30 11 H -2.575226 -1.932547 -0.406057 0
M V30 12 H -3.623083 -0.655015 0.416635 0
M V30 13 H -2.433528 -1.639935 1.352035 0
M V30 14 H -2.708923 1.581110 0.710578 0
M V30 15 H -0.886527 3.191467 0.331607 0
M V30 16 H 1.899404 -1.033381 -1.581815 0
M V30 17 H 2.568267 0.699915 -1.463488 0
M V30 18 H 2.092539 -1.007350 0.945729 0
M V30 19 H 3.060811 0.528814 0.798600 0
M V30 20 H 3.659309 -1.064002 0.044643 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 2 1 2
M V30 2 1 2 3
M V30 3 1 2 4
M V30 4 2 4 5
M V30 5 1 5 6
M V30 6 2 6 7
M V30 7 1 7 8
M V30 8 1 8 9
M V30 9 1 7 1
M V30 10 1 1 10
M V30 11 1 3 11
M V30 12 1 3 12
M V30 13 1 3 13
M V30 14 1 4 14
M V30 15 1 5 15
M V30 16 1 8 16
M V30 17 1 8 17
M V30 18 1 9 18
M V30 19 1 9 19
M V30 20 1 9 20
M V30 END BOND
M V30 END CTAB
M END
$$$$

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,91 @@
2244
RDKit 3D
13 13 0 0 0 0 0 0 0 0999 V2000
-17.2334 -5.4951 2.3053 O 0 0 0 0 0 0 0 0 0 0 0 0
-14.0938 -3.8235 0.2824 O 0 0 0 0 0 0 0 0 0 0 0 0
-15.9621 -3.2015 1.3992 O 0 0 0 0 0 0 0 0 0 0 0 0
-17.3818 -6.7019 0.3049 O 0 0 0 0 0 0 0 0 0 0 0 0
-15.9632 -5.9632 2.4646 C 0 0 0 0 0 0 0 0 0 0 0 0
-14.8724 -5.2643 1.9474 C 0 0 0 0 0 0 0 0 0 0 0 0
-15.7617 -7.1573 3.1570 C 0 0 0 0 0 0 0 0 0 0 0 0
-13.5802 -5.7595 2.1226 C 0 0 0 0 0 0 0 0 0 0 0 0
-14.4695 -7.6525 3.3323 C 0 0 0 0 0 0 0 0 0 0 0 0
-13.3789 -6.9536 2.8151 C 0 0 0 0 0 0 0 0 0 0 0 0
-15.0605 -4.0152 1.2226 C 0 0 0 0 0 0 0 0 0 0 0 0
-17.8696 -5.9525 1.1405 C 0 0 0 0 0 0 0 0 0 0 0 0
-19.2553 -5.3841 1.0571 C 0 0 0 0 0 0 0 0 0 0 0 0
1 5 1 0
1 12 1 0
2 11 1 0
3 11 2 0
4 12 2 0
5 6 2 0
5 7 1 0
6 8 1 0
6 11 1 0
7 9 2 0
8 10 2 0
9 10 1 0
12 13 1 0
M END
> <PUBCHEM_PHARMACOPHORE_FEATURES>
5
1 2 acceptor
1 3 acceptor
1 4 acceptor
3 2 3 11 anion
6 5 6 7 8 9 10 rings
$$$$
166295140
RDKit 3D
16 17 0 0 1 0 0 0 0 0999 V2000
-2.8293 0.2150 1.4768 F 0 0 0 0 0 0 0 0 0 0 0 0
-4.0626 0.7806 -0.2297 F 0 0 0 0 0 0 0 0 0 0 0 0
0.1990 -2.7308 -0.6547 O 0 0 0 0 0 0 0 0 0 0 0 0
2.1655 0.9098 0.9439 O 0 0 0 0 0 0 0 0 0 0 0 0
0.2386 -1.4812 1.2511 O 0 0 0 0 0 0 0 0 0 0 0 0
-0.5537 0.6911 -0.1687 N 0 0 0 0 0 0 0 0 0 0 0 0
3.9396 -0.0288 -0.0987 N 0 0 0 0 0 0 0 0 0 0 0 0
-0.8484 -0.6331 -0.7303 C 0 0 2 0 0 0 0 0 0 0 0 0
-2.3589 -0.8260 -0.5838 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.8195 0.4211 0.1395 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.7941 1.4679 -0.2225 C 0 0 0 0 0 0 0 0 0 0 0 0
0.5493 1.3657 -0.8388 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.0875 -1.6304 0.0829 C 0 0 0 0 0 0 0 0 0 0 0 0
1.8856 0.8393 -0.3872 C 0 0 0 0 0 0 0 0 0 0 0 0
2.9644 0.2690 -1.0161 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4121 0.3708 1.0364 C 0 0 0 0 0 0 0 0 0 0 0 0
1 10 1 0
2 10 1 0
3 13 1 0
4 14 1 0
4 16 1 0
5 13 2 0
6 8 1 0
6 11 1 0
6 12 1 0
7 15 1 0
7 16 2 0
8 9 1 0
8 13 1 1
9 10 1 0
10 11 1 0
12 14 1 0
14 15 2 0
M END
> <PUBCHEM_PHARMACOPHORE_FEATURES>
6
1 3 acceptor
1 5 acceptor
1 6 cation
3 3 5 13 anion
5 4 7 14 15 16 rings
5 6 8 9 10 11 rings
$$$$