- allowRGroups now also includes terminal query atoms matching hydrogen in additional to terminal dummy atoms (#6280)

- added relevant unit tests

Co-authored-by: Tosco, Paolo <paolo.tosco@novartis.com>
This commit is contained in:
Paolo Tosco
2023-04-12 06:26:54 +02:00
committed by GitHub
parent 36c4ec9e2b
commit 2aa4fe743d
7 changed files with 287 additions and 14 deletions

View File

@@ -22,6 +22,7 @@
#include <cmath>
#include <GraphMol/MolOps.h>
#include <GraphMol/Rings.h>
#include <GraphMol/QueryAtom.h>
#include <Geometry/point.h>
#include <Geometry/Transform2D.h>
#include <Geometry/Transform3D.h>
@@ -751,7 +752,7 @@ RDKit::MatchVectType generateDepictionMatching2DStructure(
// terminal dummy atoms
allowOptionalAttachments = false;
for (const auto queryAtom : query.atoms()) {
if (queryAtom->getAtomicNum() == 0 && queryAtom->getDegree() == 1) {
if (RDKit::isAtomTerminalRGroupOrQueryHydrogen(queryAtom)) {
allowOptionalAttachments = true;
break;
}

View File

@@ -1288,6 +1288,79 @@ M END)RES"_ctab;
msd /= static_cast<double>(matchVect.size());
TEST_ASSERT(msd < 1.0e-4);
}
// test that using a reference with query atoms including H works
auto scaffold = R"CTAB(
MJ201100
12 13 0 0 0 0 0 0 0 0999 V2000
-0.5398 0.0400 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.3648 0.0400 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.7773 -0.6745 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.3649 -1.3889 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.5399 -1.3889 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.1273 -0.6744 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.6976 -0.6744 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0
-1.9167 0.6531 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.6704 0.3176 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.5842 -0.5028 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
-3.3849 0.7302 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0
-1.7451 1.4600 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0 0 0 0
2 3 1 0 0 0 0
3 4 2 0 0 0 0
4 5 1 0 0 0 0
5 6 2 0 0 0 0
6 1 1 0 0 0 0
6 7 1 0 0 0 0
8 9 2 0 0 0 0
2 8 1 0 0 0 0
9 10 1 0 0 0 0
3 10 1 0 0 0 0
9 11 1 0 0 0 0
8 12 1 0 0 0 0
M ALS 7 10 F H C N O F P S Cl Br I
M ALS 11 10 F H C N O F P S Cl Br I
M ALS 12 10 F H C N O F P S Cl Br I
M END
)CTAB"_ctab;
auto mol = R"CTAB(
MJ201100
13 14 0 0 0 0 0 0 0 0999 V2000
-0.6112 0.3665 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.3648 0.0310 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.4510 -0.7895 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.7836 -1.2744 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.0299 -0.9389 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.0562 -0.1183 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.8099 0.2172 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
-2.1184 0.3666 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.6705 -0.2464 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.2580 -0.9608 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
0.6374 -1.4238 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
0.8961 1.0377 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.5512 -2.2443 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0 0 0 0
2 3 1 0 0 0 0
3 4 2 0 0 0 0
4 5 1 0 0 0 0
5 6 2 0 0 0 0
6 1 1 0 0 0 0
8 9 2 0 0 0 0
2 8 1 0 0 0 0
9 10 1 0 0 0 0
3 10 1 0 0 0 0
6 7 1 0 0 0 0
5 11 1 0 0 0 0
7 12 1 0 0 0 0
11 13 1 0 0 0 0
M END
)CTAB"_ctab;
auto matchVect = RDDepict::generateDepictionMatching2DStructure(
*mol, *scaffold, -1, nullptr, false, false, true);
TEST_ASSERT(mol->getNumConformers() == 1);
TEST_ASSERT(matchVect.size() == 10);
BOOST_LOG(rdInfoLog) << "Finished" << std::endl;
}

View File

@@ -79,8 +79,8 @@ class ScoreMatchesByDegreeOfCoreSubstitution {
bool doesRGroupMatchHydrogen(const std::pair<int, int> &pair) const {
const auto queryAtom = d_query.getAtomWithIdx(pair.first);
const auto molAtom = d_mol.getAtomWithIdx(pair.second);
return (queryAtom->getAtomicNum() == 0 && queryAtom->getDegree() == 1 &&
molAtom->getAtomicNum() == 1);
return (molAtom->getAtomicNum() == 1 &&
isAtomTerminalRGroupOrQueryHydrogen(queryAtom));
}
double computeScore(const RDKit::MatchVectType &match) const {
double penalty = 0.0;
@@ -217,6 +217,14 @@ std::vector<MatchVectType> sortMatchesByDegreeOfCoreSubstitution(
return matchScorer.sortMatchesByDegreeOfCoreSubstitution();
}
bool isAtomTerminalRGroupOrQueryHydrogen(const Atom *atom) {
return atom->getDegree() == 1 &&
(atom->getAtomicNum() == 0 ||
(atom->hasQuery() &&
describeQuery(atom).find("AtomAtomicNum 1 = val") !=
std::string::npos));
}
#define PT_OPT_GET(opt) params.opt = pt.get(#opt, params.opt)
#define PT_OPT_PUT(opt) pt.put(#opt, params.opt);

View File

@@ -39,6 +39,9 @@ RDKIT_SUBSTRUCTMATCH_EXPORT std::vector<MatchVectType>
sortMatchesByDegreeOfCoreSubstitution(
const ROMol& mol, const ROMol& core,
const std::vector<MatchVectType>& matches);
RDKIT_SUBSTRUCTMATCH_EXPORT bool isAtomTerminalRGroupOrQueryHydrogen(
const Atom* atom);
} // namespace RDKit
#endif

View File

@@ -1947,6 +1947,110 @@ void testLongRing() {
delete mol;
}
void testIsAtomTerminalRGroupOrQueryHydrogen() {
BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl;
BOOST_LOG(rdErrorLog) << "Test isAtomTerminalRGroupOrQueryHydrogen"
<< std::endl;
{
auto mol = R"CTAB(
MJ201100
7 7 0 0 0 0 0 0 0 0999 V2000
-0.3795 1.5839 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.0939 1.1714 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.0939 0.3463 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.3795 -0.0661 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.3349 0.3463 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.3349 1.1714 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.0494 1.5839 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0
1 2 4 0 0 0 0
2 3 4 0 0 0 0
3 4 4 0 0 0 0
4 5 4 0 0 0 0
5 6 4 0 0 0 0
6 1 4 0 0 0 0
6 7 1 0 0 0 0
M RGP 1 7 1
M END
)CTAB"_ctab;
const auto rAtom = mol->getAtomWithIdx(mol->getNumAtoms() - 1);
TEST_ASSERT(isAtomTerminalRGroupOrQueryHydrogen(rAtom));
}
{
auto mol = R"CTAB(
MJ201100
6 6 0 0 0 0 0 0 0 0999 V2000
-0.7589 1.4277 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.4733 1.0152 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.4733 0.1901 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.7589 -0.2223 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.0444 0.1901 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.0444 1.0152 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0 0 0 0
2 3 1 0 0 0 0
3 4 2 0 0 0 0
4 5 1 0 0 0 0
5 6 2 0 0 0 0
6 1 1 0 0 0 0
M RGP 1 6 1
M END
)CTAB"_ctab;
const auto rAtom = mol->getAtomWithIdx(mol->getNumAtoms() - 1);
TEST_ASSERT(!isAtomTerminalRGroupOrQueryHydrogen(rAtom));
}
{
auto mol = R"CTAB(
MJ201100
7 7 0 0 0 0 0 0 0 0999 V2000
-0.9152 0.2893 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6296 -0.1231 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6296 -0.9482 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9152 -1.3607 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.2007 -0.9482 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.2007 -0.1231 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.5137 0.2893 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0
1 2 4 0 0 0 0
2 3 4 0 0 0 0
3 4 4 0 0 0 0
4 5 4 0 0 0 0
5 6 4 0 0 0 0
6 1 4 0 0 0 0
6 7 1 0 0 0 0
M ALS 7 10 F H C N O F P S Cl Br I
M END
)CTAB"_ctab;
const auto rAtom = mol->getAtomWithIdx(mol->getNumAtoms() - 1);
TEST_ASSERT(isAtomTerminalRGroupOrQueryHydrogen(rAtom));
}
{
auto mol = R"CTAB(
MJ201100
7 7 0 0 0 0 0 0 0 0999 V2000
-0.9152 0.2893 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6296 -0.1231 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6296 -0.9482 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9152 -1.3607 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.2007 -0.9482 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.2007 -0.1231 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.5137 0.2893 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0
1 2 4 0 0 0 0
2 3 4 0 0 0 0
3 4 4 0 0 0 0
4 5 4 0 0 0 0
5 6 4 0 0 0 0
6 1 4 0 0 0 0
6 7 1 0 0 0 0
M ALS 7 9 F C N O F P S Cl Br I
M END
)CTAB"_ctab;
const auto rAtom = mol->getAtomWithIdx(mol->getNumAtoms() - 1);
TEST_ASSERT(!isAtomTerminalRGroupOrQueryHydrogen(rAtom));
}
}
int main(int argc, char *argv[]) {
RDLog::InitLogs();
test1();
@@ -1974,6 +2078,7 @@ int main(int argc, char *argv[]) {
testEZVsCisTransMatch();
testMostSubstitutedCoreMatch();
testLongRing();
testIsAtomTerminalRGroupOrQueryHydrogen();
return 0;
}

View File

@@ -844,6 +844,10 @@ void test_coords() {
printf(" test_coords\n");
char *mpkl;
size_t mpkl_size;
char *mpkl2;
size_t mpkl2_size;
char *mpkl3;
size_t mpkl3_size;
mpkl = get_mol("C1CNC1CC", &mpkl_size, "");
char *cxsmi = get_cxsmiles(mpkl, mpkl_size, NULL);
@@ -870,6 +874,10 @@ void test_coords() {
// aligned
char *tpkl;
size_t tpkl_size;
char *tpkl2;
size_t tpkl2_size;
char *tpkl3;
size_t tpkl3_size;
tpkl = get_mol(
"\n\
Mrv2102 04062106432D\n\
@@ -1004,6 +1012,75 @@ M END\n",
8 7 1 0\n\
M END\n",
&mpkl_size, "");
tpkl3 = get_mol(
"\n\
MJ201100 \n\
\n\
12 13 0 0 0 0 0 0 0 0999 V2000\n\
-0.5398 0.0400 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.3648 0.0400 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.7773 -0.6745 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.3649 -1.3889 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-0.5399 -1.3889 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-0.1273 -0.6744 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
0.6976 -0.6744 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.9167 0.6531 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-2.6704 0.3176 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-2.5842 -0.5028 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n\
-3.3849 0.7302 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.7451 1.4600 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0\n\
1 2 2 0 0 0 0\n\
2 3 1 0 0 0 0\n\
3 4 2 0 0 0 0\n\
4 5 1 0 0 0 0\n\
5 6 2 0 0 0 0\n\
6 1 1 0 0 0 0\n\
6 7 1 0 0 0 0\n\
8 9 2 0 0 0 0\n\
2 8 1 0 0 0 0\n\
9 10 1 0 0 0 0\n\
3 10 1 0 0 0 0\n\
9 11 1 0 0 0 0\n\
8 12 1 0 0 0 0\n\
M ALS 7 10 F H C N O F P S Cl Br I \n\
M ALS 11 10 F H C N O F P S Cl Br I \n\
M ALS 12 10 F H C N O F P S Cl Br I \n\
M END\n",
&tpkl3_size, "");
mpkl3 = get_mol(
"\n\
MJ201100 \n\
\n\
13 14 0 0 0 0 0 0 0 0999 V2000\n\
-0.6112 0.3665 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.3648 0.0310 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-1.4510 -0.7895 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-0.7836 -1.2744 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-0.0299 -0.9389 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
0.0562 -0.1183 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
0.8099 0.2172 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n\
-2.1184 0.3666 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-2.6705 -0.2464 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
-2.2580 -0.9608 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n\
0.6374 -1.4238 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n\
0.8961 1.0377 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
0.5512 -2.2443 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n\
1 2 2 0 0 0 0\n\
2 3 1 0 0 0 0\n\
3 4 2 0 0 0 0\n\
4 5 1 0 0 0 0\n\
5 6 2 0 0 0 0\n\
6 1 1 0 0 0 0\n\
8 9 2 0 0 0 0\n\
2 8 1 0 0 0 0\n\
9 10 1 0 0 0 0\n\
3 10 1 0 0 0 0\n\
6 7 1 0 0 0 0\n\
5 11 1 0 0 0 0\n\
7 12 1 0 0 0 0\n\
11 13 1 0 0 0 0\n\
M END\n",
&mpkl3_size, "");
float **mol_coords = _get_coord_array(mpkl, mpkl_size);
assert(mol_coords);
float bond_length11_12 = sqrt(_sq_dist(mol_coords[11], mol_coords[12]));
@@ -1025,8 +1102,6 @@ M END\n",
char *mpkl2_molblock_after = NULL;
char *mpkl_smi = NULL;
size_t mpkl_smi_size;
char *mpkl2 = NULL;
size_t mpkl2_size;
for (i = 0; i < 2; ++i) {
// this has no initial coordinates and matches the template
@@ -1124,12 +1199,25 @@ M END\n",
// coordinates should be present since coordinate generation has taken place anyway using CoordGen
assert(has_coords(mpkl_smi, mpkl_smi_size));
free(mpkl_smi);
memset(details, 0, 200);
sprintf(details,
"{\"acceptFailure\":false,\"allowRGroups\":true,\"alignOnly\":%s}",
align_only_choices[i]);
assert(set_2d_coords_aligned(&mpkl3, &mpkl3_size, tpkl3, tpkl3_size,
details, &match_json));
assert(!strcmp(
match_json,
"{\"atoms\":[0,1,2,3,4,5,6,7,8,9],\"bonds\":[0,1,2,3,4,5,10,6,7,8,9]}"));
free(match_json);
}
_free_coord_array(mol_coords);
_free_coord_array(tpl_coords);
free(mpkl);
free(tpkl);
free(mpkl3);
free(tpkl3);
printf(" done\n");
printf("--------------------------\n");

View File

@@ -881,13 +881,9 @@ std::string generate_aligned_coords(ROMol &mol, const ROMol &templateMol,
std::unique_ptr<ROMol> molHs;
ROMol *prbMol = &mol;
if (allowRGroups) {
allowRGroups = false;
for (const auto templateAtom : templateMol.atoms()) {
if (templateAtom->getAtomicNum() == 0 && templateAtom->getDegree() == 1) {
allowRGroups = true;
break;
}
}
auto atoms = templateMol.atoms();
allowRGroups = std::any_of(atoms.begin(), atoms.end(),
isAtomTerminalRGroupOrQueryHydrogen);
}
if (allowRGroups) {
molHs.reset(MolOps::addHs(mol));
@@ -906,8 +902,7 @@ std::string generate_aligned_coords(ROMol &mol, const ROMol &templateMol,
for (const auto &pair : match) {
const auto templateAtom = templateMol.getAtomWithIdx(pair.first);
const auto prbAtom = prbMol->getAtomWithIdx(pair.second);
bool isRGroup = templateAtom->getAtomicNum() == 0 && templateAtom->getDegree() == 1;
if (isRGroup) {
if (isAtomTerminalRGroupOrQueryHydrogen(templateAtom)) {
if (prbAtom->getAtomicNum() == 1) {
continue;
}