Files
rdkit/Code/GraphMol/MolAlign/AlignMolecules.cpp
Greg Landrum 63f514455f Allow Hs to be ignored in the RMSD alignment functions (#8976)
* first pass at allowing Hs to be ignored in getBestRMS()

* add support in getAllConformersBestRMS()

* do getBestAlignmentTransform()

* cleanup, error checking, and a fix

* this is more painful than it seems like it should be

* add python wrappers and tests

* response to review
2025-12-30 07:07:04 +01:00

479 lines
17 KiB
C++

//
// Copyright (C) 2001-2025 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "AlignMolecules.h"
#include <Geometry/Transform3D.h>
#include <Numerics/Vector.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/Conformer.h>
#include <GraphMol/ROMol.h>
#include <GraphMol/QueryOps.h>
#include <GraphMol/QueryBond.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <Numerics/Alignment/AlignPoints.h>
#include <GraphMol/MolTransforms/MolTransforms.h>
#include <RDGeneral/RDThreads.h>
#include <algorithm>
#include <boost/format.hpp>
namespace RDKit {
namespace MolAlign {
namespace details {
void symmetrizeTerminalAtoms(RWMol &mol) {
// clang-format off
static const std::string qsmarts =
"[{atomPattern};$([{atomPattern}]-[*]=[{atomPattern}]),$([{atomPattern}]=[*]-[{atomPattern}])]~[*]";
static std::map<std::string, std::string> replacements = {
{"{atomPattern}", "O,N;D1"}};
// clang-format on
static SmartsParserParams ps;
ps.replacements = &replacements;
static const std::unique_ptr<RWMol> qry{SmartsToMol(qsmarts, ps)};
CHECK_INVARIANT(qry, "bad query pattern");
auto matches = SubstructMatch(mol, *qry);
if (matches.empty()) {
return;
}
QueryBond qb;
qb.setQuery(makeSingleOrDoubleBondQuery());
for (const auto &match : matches) {
mol.getAtomWithIdx(match[0].second)->setFormalCharge(0);
auto obond = mol.getBondBetweenAtoms(match[0].second, match[1].second);
CHECK_INVARIANT(obond, "could not find expected bond");
mol.replaceBond(obond->getIdx(), &qb);
}
}
} // namespace details
namespace {
double alignConfsOnAtomMap(const Conformer &prbCnf, const Conformer &refCnf,
const MatchVectType &atomMap,
RDGeom::Transform3D &trans,
const RDNumeric::DoubleVector *weights, bool reflect,
unsigned int maxIterations) {
RDGeom::Point3DConstPtrVect refPoints, prbPoints;
for (const auto &mi : atomMap) {
prbPoints.push_back(&prbCnf.getAtomPos(mi.first));
refPoints.push_back(&refCnf.getAtomPos(mi.second));
}
double ssr = RDNumeric::Alignments::AlignPoints(
refPoints, prbPoints, trans, weights, reflect, maxIterations);
return ssr / static_cast<double>(prbPoints.size());
}
void getAllMatchesPrbRef(const ROMol &prbMol, const ROMol &refMol,
std::vector<MatchVectType> &matches, int maxMatches,
bool symmetrizeConjugatedTerminalGroups,
bool ignoreHs = false) {
bool uniquify = false;
bool recursionPossible = true;
bool useChirality = false;
bool useQueryQueryMatches = false;
std::unique_ptr<RWMol> prbMolSymm;
if (symmetrizeConjugatedTerminalGroups) {
prbMolSymm.reset(new RWMol(prbMol));
details::symmetrizeTerminalAtoms(*prbMolSymm);
}
const auto &prbMolForMatch = prbMolSymm ? *prbMolSymm : prbMol;
SubstructMatch(refMol, prbMolForMatch, matches, uniquify, recursionPossible,
useChirality, useQueryQueryMatches, maxMatches);
if (matches.empty()) {
throw MolAlignException(
"No sub-structure match found between the reference and probe mol");
}
if (matches.size() > 1e6) {
std::string name;
prbMol.getPropIfPresent(common_properties::_Name, name);
BOOST_LOG(rdWarningLog)
<< "Warning in " << __FUNCTION__ << ": " << matches.size()
<< " matches detected for molecule " << name << ", this may "
<< "lead to a performance slowdown." << std::endl;
}
if (ignoreHs) {
// filter Hs out of the matches
for (auto &match : matches) {
std::erase_if(match, [&prbMolForMatch](const auto &mi) {
return prbMolForMatch.getAtomWithIdx(mi.first)->getAtomicNum() == 1;
});
}
}
}
double calcMSDInternal(const Conformer &prbCnf, const Conformer &refCnf,
const MatchVectType &atomMap,
const RDNumeric::DoubleVector *weights) {
unsigned int npt = atomMap.size();
std::unique_ptr<RDNumeric::DoubleVector> unitWeights;
if (!weights) {
unitWeights.reset(new RDNumeric::DoubleVector(npt, 1.0));
weights = unitWeights.get();
} else {
PRECONDITION(npt == weights->size(), "Mismatch in number of weights");
}
RDGeom::Point3DConstPtrVect refPoints, prbPoints;
for (const auto &mi : atomMap) {
prbPoints.push_back(&prbCnf.getAtomPos(mi.first));
refPoints.push_back(&refCnf.getAtomPos(mi.second));
}
double ssr = 0.;
const RDGeom::Point3D *rpt;
const RDGeom::Point3D *ppt;
for (unsigned int i = 0; i < npt; ++i) {
rpt = refPoints[i];
ppt = prbPoints[i];
ssr += (*weights)[i] * (*ppt - *rpt).lengthSq();
}
return ssr / static_cast<double>(npt);
}
double getBestRMSInternal(const ROMol &prbMol, const ROMol &refMol, int prbCid,
int refCid, const std::vector<MatchVectType> &matches,
RDGeom::Transform3D *trans, MatchVectType *bestMatch,
const RDNumeric::DoubleVector *weights, bool reflect,
unsigned int maxIters, unsigned int numThreads) {
PRECONDITION(!matches.empty(), "matches must not be empty");
#ifndef RDK_BUILD_THREADSAFE_SSS
numThreads = 1;
#endif
double msdBest = std::numeric_limits<double>::max();
const Conformer &prbCnf = prbMol.getConformer(prbCid);
const Conformer &refCnf = refMol.getConformer(refCid);
const MatchVectType *bestMatchPtr = &matches[0];
if (numThreads == 1) {
for (const auto &matche : matches) {
RDGeom::Transform3D tmpTrans;
double msd = trans ? alignConfsOnAtomMap(prbCnf, refCnf, matche, tmpTrans,
weights, reflect, maxIters)
: calcMSDInternal(prbCnf, refCnf, matche, weights);
if (msd < msdBest) {
msdBest = msd;
bestMatchPtr = &matche;
if (trans) {
trans->assign(tmpTrans);
}
}
}
}
#ifdef RDK_BUILD_THREADSAFE_SSS
else {
std::vector<std::thread> tg;
std::vector<
std::vector<std::tuple<double, unsigned int, RDGeom::Transform3D>>>
rmsds(numThreads);
for (auto ti = 0u; ti < numThreads; ++ti) {
auto func = [&](unsigned int tidx) {
for (auto midx = tidx; midx < matches.size(); midx += numThreads) {
auto matche = matches[midx];
RDGeom::Transform3D tmpTrans;
auto msd = trans
? alignConfsOnAtomMap(prbCnf, refCnf, matche, tmpTrans,
weights, reflect, maxIters)
: calcMSDInternal(prbCnf, refCnf, matche, weights);
rmsds[tidx].emplace_back(msd, midx, tmpTrans);
}
};
tg.emplace_back(std::thread(func, ti));
}
for (auto &thread : tg) {
if (thread.joinable()) {
thread.join();
}
}
for (const auto &rv : rmsds) {
for (const auto &res : rv) {
const auto &[msd, midx, tf] = res;
if (msd < msdBest) {
msdBest = msd;
bestMatchPtr = &matches[midx];
if (trans) {
trans->assign(tf);
}
}
}
}
}
#endif
if (bestMatch) {
*bestMatch = *bestMatchPtr;
}
return sqrt(msdBest);
}
} // namespace
double getAlignmentTransform(const ROMol &prbMol, const ROMol &refMol,
RDGeom::Transform3D &trans, int prbCid, int refCid,
const MatchVectType *atomMap,
const RDNumeric::DoubleVector *weights,
bool reflect, unsigned int maxIterations) {
const Conformer &prbCnf = prbMol.getConformer(prbCid);
const Conformer &refCnf = refMol.getConformer(refCid);
MatchVectType match;
if (!atomMap) {
// we have to figure out the mapping between the two molecule
const bool recursionPossible = true;
const bool useChirality = false;
const bool useQueryQueryMatches = true;
if (SubstructMatch(refMol, prbMol, match, recursionPossible, useChirality,
useQueryQueryMatches)) {
atomMap = &match;
} else {
throw MolAlignException(
"No sub-structure match found between the probe and query mol");
}
}
double msd = alignConfsOnAtomMap(prbCnf, refCnf, *atomMap, trans, weights,
reflect, maxIterations);
return sqrt(msd);
}
double alignMol(ROMol &prbMol, const ROMol &refMol, int prbCid, int refCid,
const MatchVectType *atomMap,
const RDNumeric::DoubleVector *weights, bool reflect,
unsigned int maxIterations) {
RDGeom::Transform3D trans;
double res = getAlignmentTransform(prbMol, refMol, trans, prbCid, refCid,
atomMap, weights, reflect, maxIterations);
// now transform the relevant conformation on prbMol
Conformer &conf = prbMol.getConformer(prbCid);
MolTransforms::transformConformer(conf, trans);
return res;
}
double getBestAlignmentTransform(const ROMol &prbMol, const ROMol &refMol,
RDGeom::Transform3D &bestTrans,
MatchVectType &bestMatch,
const BestAlignmentParams &params, int prbCid,
int refCid, bool reflect,
unsigned int maxIters) {
std::vector<MatchVectType> allMatches;
if (params.map.empty()) {
getAllMatchesPrbRef(prbMol, refMol, allMatches, params.maxMatches,
params.symmetrizeConjugatedTerminalGroups,
params.ignoreHs);
}
const auto &matches = params.map.empty() ? allMatches : params.map;
auto bestRMS = getBestRMSInternal(
prbMol, refMol, prbCid, refCid, matches, &bestTrans, &bestMatch,
params.weights, reflect, maxIters, getNumThreadsToUse(params.numThreads));
return bestRMS;
}
double getBestRMS(ROMol &prbMol, const ROMol &refMol,
const BestAlignmentParams &params, int prbCid, int refCid) {
std::vector<MatchVectType> allMatches;
if (params.map.empty()) {
getAllMatchesPrbRef(prbMol, refMol, allMatches, params.maxMatches,
params.symmetrizeConjugatedTerminalGroups,
params.ignoreHs);
}
const auto &matches = params.map.empty() ? allMatches : params.map;
// TODO: check mismatch between weights size and match size.
if (!params.map.empty() && params.weights) {
auto nAtms = params.map[0].size();
if (params.weights->size() != nAtms) {
throw ValueErrorException(
(boost::format(
"Mismatch between number of weights (%d) and number of atoms in the alignment map (%d).") %
params.weights->size() % nAtms)
.str());
}
}
RDGeom::Transform3D trans;
bool reflect = false;
unsigned int maxIters = 50;
auto bestRMS = getBestRMSInternal(
prbMol, refMol, prbCid, refCid, matches, &trans, nullptr, params.weights,
reflect, maxIters, getNumThreadsToUse(params.numThreads));
// Perform a final alignment to the best alignment...
MolTransforms::transformConformer(prbMol.getConformer(prbCid), trans);
return bestRMS;
}
std::vector<double> getAllConformerBestRMS(const ROMol &mol,
const BestAlignmentParams &params) {
auto numThreads = getNumThreadsToUse(params.numThreads);
std::vector<MatchVectType> allMatches;
if (params.map.empty()) {
getAllMatchesPrbRef(mol, mol, allMatches, params.maxMatches,
params.symmetrizeConjugatedTerminalGroups,
params.ignoreHs);
}
const auto &matches = params.map.empty() ? allMatches : params.map;
std::vector<double> res;
RDGeom::Transform3D trans;
bool reflect = false;
unsigned int maxIters = 50;
std::vector<int> cids;
for (auto cit = mol.beginConformers(); cit != mol.endConformers(); ++cit) {
cids.push_back((*cit)->getId());
}
if (numThreads == 1) {
for (auto ci = 0u; ci < mol.getNumConformers(); ++ci) {
for (auto cj = 0u; cj < ci; ++cj) {
res.push_back(getBestRMSInternal(mol, mol, cids[ci], cids[cj], matches,
&trans, nullptr, params.weights,
reflect, maxIters, 1));
}
}
}
#ifdef RDK_BUILD_THREADSAFE_SSS
else {
std::vector<std::pair<unsigned int, unsigned int>> pairs;
for (auto ci = 0u; ci < mol.getNumConformers(); ++ci) {
for (auto cj = 0u; cj < ci; ++cj) {
pairs.emplace_back(cids[ci], cids[cj]);
}
}
std::vector<std::vector<std::pair<unsigned int, double>>> rmsds(numThreads);
auto func = [&](unsigned int tidx) {
RDGeom::Transform3D trans;
bool reflect = false;
unsigned int maxIters = 50;
for (auto i = tidx; i < pairs.size(); i += numThreads) {
auto rms = getBestRMSInternal(mol, mol, pairs[i].first, pairs[i].second,
matches, &trans, nullptr, params.weights,
reflect, maxIters, 1);
rmsds[tidx].emplace_back(i, rms);
}
};
std::vector<std::thread> tg;
for (auto ti = 0u; ti < numThreads; ++ti) {
tg.emplace_back(std::thread(func, ti));
}
for (auto &thread : tg) {
if (thread.joinable()) {
thread.join();
}
}
res.resize(pairs.size());
for (const auto &tres : rmsds) {
for (const auto &v : tres) {
res[v.first] = v.second;
}
}
}
#endif
return res;
}
double CalcRMS(ROMol &prbMol, const ROMol &refMol, int prbCid, int refCid,
const std::vector<MatchVectType> &map, int maxMatches,
bool symmetrizeConjugatedTerminalGroups,
const RDNumeric::DoubleVector *weights) {
std::vector<MatchVectType> allMatches;
if (map.empty()) {
getAllMatchesPrbRef(prbMol, refMol, allMatches, maxMatches,
symmetrizeConjugatedTerminalGroups);
}
const auto &matches = map.empty() ? allMatches : map;
bool reflect = false;
unsigned int maxIters = 50;
unsigned int numThreads = 1;
return getBestRMSInternal(prbMol, refMol, prbCid, refCid, matches, nullptr,
nullptr, weights, reflect, maxIters, numThreads);
}
double CalcRMS(ROMol &prbMol, const ROMol &refMol, int prbCid, int refCid,
const std::vector<MatchVectType> &map, int maxMatches,
const RDNumeric::DoubleVector *weights) {
return CalcRMS(prbMol, refMol, prbCid, refCid, map, maxMatches, false,
weights);
}
void _fillAtomPositions(RDGeom::Point3DConstPtrVect &pts, const Conformer &conf,
const std::vector<unsigned int> *atomIds = nullptr) {
unsigned int na = conf.getNumAtoms();
pts.clear();
if (atomIds == nullptr) {
unsigned int ai;
pts.reserve(na);
for (ai = 0; ai < na; ++ai) {
pts.push_back(&conf.getAtomPos(ai));
}
} else {
pts.reserve(atomIds->size());
std::vector<unsigned int>::const_iterator cai;
for (cai = atomIds->begin(); cai != atomIds->end(); cai++) {
pts.push_back(&conf.getAtomPos(*cai));
}
}
}
void alignMolConformers(ROMol &mol, const std::vector<unsigned int> *atomIds,
const std::vector<unsigned int> *confIds,
const RDNumeric::DoubleVector *weights, bool reflect,
unsigned int maxIters, std::vector<double> *RMSlist) {
if (mol.getNumConformers() == 0) {
// nothing to be done ;
return;
}
RDGeom::Point3DConstPtrVect refPoints, prbPoints;
int cid = -1;
if ((confIds != nullptr) && (confIds->size() > 0)) {
cid = confIds->front();
}
const Conformer &refCnf = mol.getConformer(cid);
_fillAtomPositions(refPoints, refCnf, atomIds);
// now loop throught the remaininf conformations and transform them
RDGeom::Transform3D trans;
double ssd;
if (confIds == nullptr) {
unsigned int i = 0;
ROMol::ConformerIterator cnfi;
// Conformer *conf;
for (cnfi = mol.beginConformers(); cnfi != mol.endConformers(); cnfi++) {
// conf = (*cnfi);
i += 1;
if (i == 1) {
continue;
}
_fillAtomPositions(prbPoints, *(*cnfi), atomIds);
ssd = RDNumeric::Alignments::AlignPoints(refPoints, prbPoints, trans,
weights, reflect, maxIters);
if (RMSlist) {
ssd /= (prbPoints.size());
RMSlist->push_back(sqrt(ssd));
}
MolTransforms::transformConformer(*(*cnfi), trans);
}
} else {
std::vector<unsigned int>::const_iterator cai;
unsigned int i = 0;
for (cai = confIds->begin(); cai != confIds->end(); cai++) {
i += 1;
if (i == 1) {
continue;
}
Conformer &conf = mol.getConformer(*cai);
_fillAtomPositions(prbPoints, conf, atomIds);
ssd = RDNumeric::Alignments::AlignPoints(refPoints, prbPoints, trans,
weights, reflect, maxIters);
if (RMSlist) {
ssd /= (prbPoints.size());
RMSlist->push_back(sqrt(ssd));
}
MolTransforms::transformConformer(conf, trans);
}
}
}
} // namespace MolAlign
} // namespace RDKit