Files
rdkit/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.cpp
Gareth Jones 81d3705358 R group match any issue (#3767)
* RGD modifications for any atom and index labels

* Continued development

* All tests working

* Added comment

* CR changes suggested by PTosco

* Fix catch_rgd for autocrlf

* Core dummy matches on output. RGroups on heavy atom. Dummy atoms User rgroups only when they are degree 1.

* Start work on test fixes

* testRGroupDecomp test working

* CPP and Python tests working

* Removed options for matching core query atoms on sidechains

* Windows build fix

* R groups off ring. User group matches single heavy substituent. Remove extraneous hydrogens

* Updated fingerprint variance score and tie selection

* Refactor fingerprint variance score functions to class

* Removed fingerprint distance score

* Boost::trim fix

* Updated RGD test notebook

* Fixed AddHs.cpp

* - fixes the kekulization issue
- avoids that empty R-group labels are included in cores
- makes sure that SMILES cores are always canonical
- adds a few missing const declarations and avoids unintentional copying

* Support for allowNonTerminalRGroups parameter. Remove R groups that contain H or Nothing.  Ignore R group labels on non-dummy atoms

* Fixed tests for Paolo's changes. Rebuilt test notebook.  Increased weighting of rgroup penalty in fingerprint variance score

* remove some debug output

Co-authored-by: Brian Kelley <fustigator@gmail.com>
Co-authored-by: greg landrum <greg.landrum@gmail.com>
2021-03-10 12:56:42 +01:00

291 lines
9.5 KiB
C++

//
// Copyright (C) 2017 Novartis Institutes for BioMedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "RGroupDecomp.h"
#include "RGroupUtils.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/FMCS/FMCS.h>
#include <set>
namespace RDKit {
namespace {
bool hasLabel(const Atom *atom, unsigned int autoLabels) {
bool atomHasLabel = false;
if (autoLabels & MDLRGroupLabels) {
atomHasLabel |= atom->hasProp(common_properties::_MolFileRLabel);
}
if (autoLabels & IsotopeLabels) {
atomHasLabel |= (atom->getIsotope() > 0);
}
if (autoLabels & AtomMapLabels) {
atomHasLabel |= (atom->getAtomMapNum() > 0);
}
if (autoLabels & DummyAtomLabels) {
atomHasLabel |= (atom->getAtomicNum() == 0);
}
// don't match negative rgroups as these are used by AtomIndexRLabels which
// are set for the template core before the MCS and after the MCS for the
// other core
if (atom->hasProp(RLABEL)) {
auto label = atom->getProp<int>(RLABEL);
atomHasLabel |= label > 0;
}
return atomHasLabel;
}
/* When comparing atoms for the MCS overlay, we don't want to overlay a core
* atom that has a labelled rgroup on top of one that hasn't a labelled rgroup.
* The labelled rgroup may be on a terminal dummy atom bonded to the core atom.
*
* This function checks to see if there is a terminal dummy labelled rgroup
* attached to the atom.
*/
bool hasAttachedLabels(const ROMol &mol, const Atom *atom,
unsigned int autoLabels) {
RWMol::ADJ_ITER nbrIdx, endNbrs;
boost::tie(nbrIdx, endNbrs) = mol.getAtomNeighbors(atom);
while (nbrIdx != endNbrs) {
const auto neighborAtom = mol.getAtomWithIdx(*nbrIdx);
if (neighborAtom->getAtomicNum() == 0 && neighborAtom->getDegree() == 1 &&
hasLabel(neighborAtom, autoLabels)) {
return true;
}
++nbrIdx;
}
return false;
}
} // namespace
unsigned int RGroupDecompositionParameters::autoGetLabels(const RWMol &core) {
unsigned int autoLabels = 0;
if (!onlyMatchAtRGroups) {
autoLabels = AtomIndexLabels;
}
bool hasMDLRGroup = false;
bool hasAtomMapNum = false;
bool hasIsotopes = false;
bool hasDummies = false;
for (auto atm : core.atoms()) {
if (atm->getIsotope()) {
hasIsotopes = true;
}
if (atm->getAtomMapNum()) {
hasAtomMapNum = true;
}
if (atm->hasProp(common_properties::_MolFileRLabel)) {
hasMDLRGroup = true;
}
if (atm->getAtomicNum() == 0 && atm->getDegree() == 1) {
hasDummies = true;
}
}
if (hasMDLRGroup) {
return autoLabels | MDLRGroupLabels;
} else if (hasAtomMapNum) {
return autoLabels | AtomMapLabels;
} else if (hasIsotopes) {
return autoLabels | IsotopeLabels;
} else if (hasDummies) {
return autoLabels | DummyAtomLabels;
}
return autoLabels;
}
bool rgdAtomCompare(const MCSAtomCompareParameters &p, const ROMol &mol1,
unsigned int atom1, const ROMol &mol2, unsigned int atom2,
void *userData) {
if (!MCSAtomCompareElements(p, mol1, atom1, mol2, atom2, nullptr)) {
return false;
}
unsigned int autoLabels = *reinterpret_cast<unsigned int *>(userData);
const auto a1 = mol1.getAtomWithIdx(atom1);
const auto a2 = mol2.getAtomWithIdx(atom2);
bool atom1HasLabel = hasLabel(a1, autoLabels);
bool atom2HasLabel = hasLabel(a2, autoLabels);
// check for the presence of rgroup labels on adjacent terminal dummy atoms
atom1HasLabel |= hasAttachedLabels(mol1, a1, autoLabels);
atom2HasLabel |= hasAttachedLabels(mol2, a2, autoLabels);
return !(atom1HasLabel != atom2HasLabel);
}
bool RGroupDecompositionParameters::prepareCore(RWMol &core,
const RWMol *alignCore) {
const bool relabel = labels & RelabelDuplicateLabels;
unsigned int autoLabels = labels;
if (labels == AutoDetect) {
autoLabels = autoGetLabels(core);
if (!autoLabels) {
BOOST_LOG(rdWarningLog) << "RGroupDecomposition auto detect found no "
"rgroups and onlyMatchAtRgroups is set to true"
<< std::endl;
return false;
}
} else if (!onlyMatchAtRGroups) {
autoLabels |= AtomIndexLabels;
}
int maxLabel = 1;
if (alignCore && (alignment & MCS)) {
std::vector<ROMOL_SPTR> mols;
mols.push_back(ROMOL_SPTR(new ROMol(core)));
mols.push_back(ROMOL_SPTR(new ROMol(*alignCore)));
MCSParameters mcsParams;
if (autoLabels != AutoDetect) {
mcsParams.AtomTyper = rgdAtomCompare;
mcsParams.CompareFunctionsUserData = &autoLabels;
}
MCSResult res = findMCS(mols, &mcsParams);
if (res.isCompleted()) {
auto m = res.QueryMol;
if (m) {
MatchVectType match1;
MatchVectType match2;
bool target_matched1 = SubstructMatch(core, *m, match1);
bool target_matched2 = SubstructMatch(*alignCore, *m, match2);
CHECK_INVARIANT(match1.size() == match2.size(),
"Matches should be the same size in prepareCore");
if (target_matched1 && target_matched2) {
for (size_t i = 0; i < match1.size(); ++i) {
int queryAtomIdx1 = match1[i].first;
int coreAtomIdx = match1[i].second;
int queryAtomIdx2 = match2[i].first;
int alignCoreAtomIdx = match2[i].second;
CHECK_INVARIANT(queryAtomIdx1 == queryAtomIdx2,
"query atoms aren't the same");
Atom *coreAtm = core.getAtomWithIdx(coreAtomIdx);
const Atom *alignCoreAtm =
alignCore->getAtomWithIdx(alignCoreAtomIdx);
// clear up input rlabels
coreAtm->setAtomMapNum(0);
if (coreAtm->hasProp(common_properties::_MolFileRLabel)) {
coreAtm->clearProp(common_properties::_MolFileRLabel);
coreAtm->setIsotope(0);
}
if (alignCoreAtm->hasProp(RLABEL)) {
int rlabel = alignCoreAtm->getProp<int>(RLABEL);
maxLabel = (std::max)(maxLabel, rlabel + 1);
coreAtm->setProp(RLABEL, rlabel);
}
}
}
}
}
}
std::set<int> foundLabels;
int nextOffset = 0;
std::map<int, int> atomToLabel;
for (auto atom : core.atoms()) {
bool found = false;
if (atom->hasProp(RLABEL)) {
// set from MCS match
if (setLabel(atom, atom->getProp<int>(RLABEL), foundLabels, maxLabel,
relabel, Labelling::INTERNAL_LABELS)) {
found = true;
}
}
if (atom->getAtomicNum() == 0) {
if (!found && (autoLabels & MDLRGroupLabels)) {
unsigned int rgroup;
if (atom->getPropIfPresent<unsigned int>(
common_properties::_MolFileRLabel, rgroup)) {
if (setLabel(atom, rdcast<int>(rgroup), foundLabels, maxLabel,
relabel, Labelling::RGROUP_LABELS)) {
found = true;
checkNonTerminal(*atom);
}
}
}
if (!found && (autoLabels & IsotopeLabels) && atom->getIsotope() > 0) {
if (setLabel(atom, rdcast<int>(atom->getIsotope()), foundLabels,
maxLabel, relabel, Labelling::ISOTOPE_LABELS)) {
checkNonTerminal(*atom);
found = true;
}
}
if (!found && (autoLabels & AtomMapLabels) && atom->getAtomMapNum() > 0) {
if (setLabel(atom, rdcast<int>(atom->getAtomMapNum()), foundLabels,
maxLabel, relabel, Labelling::ATOMMAP_LABELS)) {
checkNonTerminal(*atom);
found = true;
}
}
if (!found && (autoLabels & DummyAtomLabels) &&
atom->getAtomicNum() == 0 && atom->getDegree() == 1) {
const bool forceRelabellingWithDummies = true;
int defaultDummyStartLabel = maxLabel;
if (setLabel(atom, defaultDummyStartLabel, foundLabels, maxLabel,
forceRelabellingWithDummies, Labelling::DUMMY_LABELS)) {
found = true;
}
}
}
// Unless there is an MCS match from above, we need to give different
// RLABELS to each core so keep track of which labels
// we have used (note that these are negative since they are
// potential rgroups and haven't been assigned yet)
if (!found && (autoLabels & AtomIndexLabels)) {
if (setLabel(atom, indexOffset - atom->getIdx(), foundLabels, maxLabel,
relabel, Labelling::INDEX_LABELS)) {
nextOffset++;
}
found = true;
}
clearInputLabels(atom);
int rlabel;
if (atom->getPropIfPresent(RLABEL, rlabel)) {
atomToLabel[atom->getIdx()] = rlabel;
}
}
indexOffset -= core.getNumAtoms();
MolOps::AdjustQueryParameters adjustParams;
adjustParams.makeDummiesQueries = true;
adjustParams.adjustDegree = false;
adjustQueryProperties(core, &adjustParams);
for (auto &it : atomToLabel) {
core.getAtomWithIdx(it.first)->setProp(RLABEL, it.second);
}
return true;
}
void RGroupDecompositionParameters::checkNonTerminal(const Atom &atom) const {
if (allowNonTerminalRGroups || atom.getDegree() == 1) {
return;
}
BOOST_LOG(rdWarningLog)
<< "Non terminal R group defined. To allow set allowNonTerminalRGroups "
"in RGroupDecompositionParameters"
<< std::endl;
throw ValueErrorException("Non terminal R group defined.");
}
} // namespace RDKit