Files
rdkit/Code/GraphMol/FileParsers/macromols_catch.cpp
tadhurst-cdd 2ae0a75a25 Handle chiral atoms with 2 hydrogens in SCSR parser (#8939)
* added flag to allow chiral settings on Atoms with 2 hydrogens for SCSR treatment

* added parameter to allow chiral 2Hs on template mols in SCSR parser

* fixed testShapeHelpers for arm64 build

* Remove checking for two Hs in chiral atom in atomChiralTypeFromBondDirPseudo3D

* removal of flag for allow2Hs

* remove setup.bat file
2026-01-29 08:08:14 +01:00

409 lines
17 KiB
C++

//
// Copyright (C) 2025 Tad Hurst and other RDKit contributors
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <algorithm>
#include <fstream>
#include <string>
#include <sstream>
#include <streambuf>
#include "RDGeneral/test.h"
#include <catch2/catch_all.hpp>
#include <RDGeneral/Invariant.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/FileParsers/FileParsers.h>
#include <GraphMol/Atropisomers.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Atom.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/lexical_cast.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <filesystem>
using namespace RDKit;
using namespace v2::FileParsers;
using namespace RDKit;
class ScsiMolTest {
public:
public:
bool generateExpectedFiles = false;
ScsiMolTest() {}
class ScsiTest {
public:
std::string fileName;
unsigned int totalAtomCount;
unsigned int totalBondCount;
unsigned int sgroupCount;
unsigned int totalQueryAtomCount;
unsigned int totalQueryBondCount;
unsigned int querySgroupCount;
bool scsrExpandResult;
SCSRBaseHbondOptions scsrBaseHbondOptions;
std::vector<std::pair<unsigned int, Atom::ChiralType>> chiralChecks;
std::vector<std::pair<unsigned int, Atom::ChiralType>> chiralChecksQuery;
ScsiTest(std::string fileNameInit, bool scsrExpandResult,
SCSRBaseHbondOptions scsrBaseHbondOptions,
unsigned int totalAtomCountInit, unsigned int totalBondCountInit,
unsigned int sgroupCountInit, unsigned int totalQueryAtomCountInit,
unsigned int totalQueryBondCountInit,
unsigned int querySgroupCountInit,
std::vector<std::pair<unsigned int, Atom::ChiralType>>
chiralChecksInit = {},
std::vector<std::pair<unsigned int, Atom::ChiralType>>
chiralChecksQueryInit = {})
: fileName(fileNameInit),
totalAtomCount(totalAtomCountInit),
totalBondCount(totalBondCountInit),
sgroupCount(sgroupCountInit),
totalQueryAtomCount(totalQueryAtomCountInit),
totalQueryBondCount(totalQueryBondCountInit),
querySgroupCount(querySgroupCountInit),
scsrExpandResult(scsrExpandResult),
scsrBaseHbondOptions(scsrBaseHbondOptions),
chiralChecks(chiralChecksInit),
chiralChecksQuery(chiralChecksQueryInit) {};
};
void testScsiFiles(const ScsiTest *scsiTest) {
BOOST_LOG(rdInfoLog) << "testing scsr files" << std::endl;
INFO(scsiTest->fileName);
std::string rdbase = getenv("RDBASE");
std::string fName = rdbase +
"/Code/GraphMol/FileParsers/test_data/macromols/" +
scsiTest->fileName;
RDKit::v2::FileParsers::MolFileParserParams pp;
pp.sanitize = true;
pp.removeHs = false;
pp.strictParsing = true;
RDKit::v2::FileParsers::MolFromSCSRParams molFromSCSRParams;
molFromSCSRParams.includeLeavingGroups = true;
molFromSCSRParams.scsrBaseHbondOptions = scsiTest->scsrBaseHbondOptions;
std::unique_ptr<RDKit::RWMol> mol;
if (scsiTest->scsrExpandResult) {
REQUIRE_NOTHROW(mol = MolFromSCSRFile(fName, pp, molFromSCSRParams));
} else {
REQUIRE_THROWS(mol = MolFromSCSRFile(fName, pp, molFromSCSRParams));
return;
}
RDKit::Chirality::removeNonExplicit3DChirality(*(mol.get()));
CHECK(mol != nullptr);
CHECK(mol->getNumAtoms() == scsiTest->totalAtomCount);
CHECK(mol->getNumBonds() == scsiTest->totalBondCount);
CHECK(getSubstanceGroups(*mol).size() == scsiTest->sgroupCount);
for (auto chiralCheck : scsiTest->chiralChecks) {
CHECK(mol->getAtomWithIdx(chiralCheck.first)->getChiralTag() ==
chiralCheck.second);
}
// now make the expanded mol in "query" mode - not including any leaving
// groups
molFromSCSRParams.includeLeavingGroups = false;
std::unique_ptr<RDKit::RWMol> molNoLeavingGroups;
if (scsiTest->scsrExpandResult) {
REQUIRE_NOTHROW(molNoLeavingGroups =
MolFromSCSRFile(fName, pp, molFromSCSRParams));
} else {
REQUIRE_THROWS(molNoLeavingGroups =
MolFromSCSRFile(fName, pp, molFromSCSRParams));
return;
}
CHECK(molNoLeavingGroups != nullptr);
CHECK(molNoLeavingGroups->getNumAtoms() == scsiTest->totalQueryAtomCount);
CHECK(molNoLeavingGroups->getNumBonds() == scsiTest->totalQueryBondCount);
CHECK(getSubstanceGroups(*molNoLeavingGroups).size() ==
scsiTest->querySgroupCount);
for (auto chiralCheck : scsiTest->chiralChecksQuery) {
CHECK(mol->getAtomWithIdx(chiralCheck.first)->getChiralTag() ==
chiralCheck.second);
}
}
void threeLetterCodeTest(const ScsiTest *scsiTest) {
BOOST_LOG(rdInfoLog) << "testing scsr files with three letter codes"
<< std::endl;
std::string rdbase = getenv("RDBASE");
std::string fName = rdbase +
"/Code/GraphMol/FileParsers/test_data/macromols/" +
scsiTest->fileName;
RDKit::v2::FileParsers::MolFileParserParams pp;
pp.sanitize = false;
pp.removeHs = false;
pp.strictParsing = false;
RDKit::v2::FileParsers::MolFromSCSRParams molFromSCSRParams;
molFromSCSRParams.includeLeavingGroups = true;
molFromSCSRParams.scsrTemplateNames =
RDKit::v2::FileParsers::SCSRTemplateNames::AsEntered;
molFromSCSRParams.scsrBaseHbondOptions = scsiTest->scsrBaseHbondOptions;
std::unique_ptr<RDKit::RWMol> mol;
if (scsiTest->scsrExpandResult) {
REQUIRE_NOTHROW(mol = MolFromSCSRFile(fName, pp, molFromSCSRParams));
} else {
REQUIRE_THROWS(mol = MolFromSCSRFile(fName, pp, molFromSCSRParams));
return;
}
RDKit::Chirality::removeNonExplicit3DChirality(*(mol.get()));
CHECK(mol);
CHECK(mol->getNumAtoms() == scsiTest->totalAtomCount);
CHECK(mol->getNumBonds() == scsiTest->totalBondCount);
CHECK(getSubstanceGroups(*mol).size() == scsiTest->sgroupCount);
molFromSCSRParams.includeLeavingGroups = true;
molFromSCSRParams.scsrTemplateNames =
RDKit::v2::FileParsers::SCSRTemplateNames::UseFirstName;
std::unique_ptr<RWMol> mol2;
if (scsiTest->scsrExpandResult) {
REQUIRE_NOTHROW(mol2 = MolFromSCSRFile(fName, pp, molFromSCSRParams));
} else {
REQUIRE_THROWS(mol2 = MolFromSCSRFile(fName, pp, molFromSCSRParams));
}
// const std::unique_ptr<RDKit::RWMol> mol;
CHECK(mol2);
CHECK(mol2->getNumAtoms() == scsiTest->totalQueryAtomCount);
CHECK(mol2->getNumBonds() == scsiTest->totalQueryBondCount);
CHECK(getSubstanceGroups(*mol2).size() == scsiTest->querySgroupCount);
};
};
TEST_CASE("scsiTests", "scsiTests") {
SECTION("basics") {
std::list<ScsiMolTest::ScsiTest> scsiTests{
ScsiMolTest::ScsiTest("DNASlurpErrorImport.mol", true,
SCSRBaseHbondOptions::Auto, 81, 90, 13, 79, 88,
11,
{{0, Atom::ChiralType::CHI_TETRAHEDRAL_CW},
{22, Atom::ChiralType::CHI_TETRAHEDRAL_CW},
{41, Atom::ChiralType::CHI_TETRAHEDRAL_CW},
{61, Atom::ChiralType::CHI_TETRAHEDRAL_CW}},
{{0, Atom::ChiralType::CHI_TETRAHEDRAL_CW},
{22, Atom::ChiralType::CHI_TETRAHEDRAL_CW},
{41, Atom::ChiralType::CHI_TETRAHEDRAL_CW},
{61, Atom::ChiralType::CHI_TETRAHEDRAL_CW}}),
ScsiMolTest::ScsiTest("DNASlurpErrorSketch.mol", true,
SCSRBaseHbondOptions::Auto, 84, 93, 14, 82, 91,
12,
{{51, Atom::ChiralType::CHI_TETRAHEDRAL_CCW},
{60, Atom::ChiralType::CHI_TETRAHEDRAL_CCW},
{68, Atom::ChiralType::CHI_TETRAHEDRAL_CCW},
{76, Atom::ChiralType::CHI_TETRAHEDRAL_CCW}},
{{51, Atom::ChiralType::CHI_TETRAHEDRAL_CCW},
{60, Atom::ChiralType::CHI_TETRAHEDRAL_CCW},
{68, Atom::ChiralType::CHI_TETRAHEDRAL_CCW},
{76, Atom::ChiralType::CHI_TETRAHEDRAL_CCW}}),
ScsiMolTest::ScsiTest("ValenceErrorScsr.mol", true,
SCSRBaseHbondOptions::Auto, 38, 39, 6, 35, 36, 3),
ScsiMolTest::ScsiTest("ValenceErrorScsr2.mol", true,
SCSRBaseHbondOptions::Auto, 28, 28, 6, 25, 25, 3),
ScsiMolTest::ScsiTest("RiboseFullname.mol", true,
SCSRBaseHbondOptions::Auto, 45, 49, 8, 43, 47, 6),
ScsiMolTest::ScsiTest("Conjugate.mol", true, SCSRBaseHbondOptions::Auto,
91, 91, 14, 87, 87, 10),
ScsiMolTest::ScsiTest("ModifiedPeptide2.mol", true,
SCSRBaseHbondOptions::Auto, 438, 444, 81, 407,
413, 50),
ScsiMolTest::ScsiTest("DnaBadPairs_NoCh.mol", true,
SCSRBaseHbondOptions::Auto, 84, 94, 14, 80, 90,
10),
ScsiMolTest::ScsiTest("DnaBadPairs.mol", true,
SCSRBaseHbondOptions::UseSapAll, 84, 94, 14, 80,
90, 10),
ScsiMolTest::ScsiTest("DnaTest.mol", false,
SCSRBaseHbondOptions::UseSapAll, 254, 300, 14,
250, 296, 10),
ScsiMolTest::ScsiTest("wobblePairs2.mol", true,
SCSRBaseHbondOptions::UseSapAll, 169, 196, 26,
165, 192, 22),
ScsiMolTest::ScsiTest("wobblePairs.mol", true,
SCSRBaseHbondOptions::UseSapAll, 169, 196, 26,
165, 192, 22),
ScsiMolTest::ScsiTest("KellanRNA.mol", true,
SCSRBaseHbondOptions::UseSapAll, 299, 353, 44,
295, 349, 40),
ScsiMolTest::ScsiTest("DnaTest2.mol", true,
SCSRBaseHbondOptions::UseSapAll, 83, 97, 14, 79,
93, 10),
ScsiMolTest::ScsiTest("DnaTest3.mol", true,
SCSRBaseHbondOptions::UseSapAll, 165, 194, 26,
161, 190, 22),
ScsiMolTest::ScsiTest("KellanError.mol", true,
SCSRBaseHbondOptions::UseSapAll, 244, 263, 39,
236, 255, 31),
ScsiMolTest::ScsiTest("TestRNA2_fixed.mol", true,
SCSRBaseHbondOptions::UseSapAll, 106, 117, 17,
104, 115, 15),
ScsiMolTest::ScsiTest("DnaTest.mol", true, SCSRBaseHbondOptions::Auto,
254, 300, 38, 250, 296, 34),
ScsiMolTest::ScsiTest("wobblePairs2.mol", true,
SCSRBaseHbondOptions::Auto, 169, 196, 26, 165,
192, 22),
ScsiMolTest::ScsiTest("wobblePairs.mol", true,
SCSRBaseHbondOptions::Auto, 169, 196, 26, 165,
192, 22),
ScsiMolTest::ScsiTest("DnaBadPairs.mol", true,
SCSRBaseHbondOptions::Auto, 84, 94, 14, 80, 90,
10),
ScsiMolTest::ScsiTest("DnaTest2.mol", true, SCSRBaseHbondOptions::Auto,
83, 97, 14, 79, 93, 10),
ScsiMolTest::ScsiTest("DnaTest3.mol", true, SCSRBaseHbondOptions::Auto,
165, 194, 26, 161, 190, 22),
ScsiMolTest::ScsiTest("KellanError.mol", true,
SCSRBaseHbondOptions::Auto, 244, 263, 39, 236,
255, 31),
ScsiMolTest::ScsiTest("TestRNA2_fixed.mol", true,
SCSRBaseHbondOptions::Auto, 106, 117, 17, 104,
115, 15),
ScsiMolTest::ScsiTest("TrastuzumabMaxPlus3Register.mol", true,
SCSRBaseHbondOptions::UseSapAll, 7606, 7793, 1451,
7080, 7267, 925),
ScsiMolTest::ScsiTest("TrastuzumabMaxRegister.mol", true,
SCSRBaseHbondOptions::UseSapAll, 7576, 7763, 1445,
7053, 7240, 922),
ScsiMolTest::ScsiTest("Mixed.mol", true,
SCSRBaseHbondOptions::UseSapAll, 51, 54, 8, 51,
54, 8),
ScsiMolTest::ScsiTest("CrossLink.mol", true,
SCSRBaseHbondOptions::UseSapAll, 47, 48, 10, 45,
46, 8),
ScsiMolTest::ScsiTest("cyclic.mol", true,
SCSRBaseHbondOptions::UseSapAll, 45, 47, 8, 45,
47, 8),
ScsiMolTest::ScsiTest("Triplet.mol", true,
SCSRBaseHbondOptions::UseSapAll, 30, 30, 6, 27,
27, 3),
ScsiMolTest::ScsiTest("FromBioviaDoc.mol", true,
SCSRBaseHbondOptions::UseSapAll, 27, 26, 7, 25,
24, 5),
ScsiMolTest::ScsiTest("testSCSR.mol", true,
SCSRBaseHbondOptions::UseSapAll, 64, 66, 15, 57,
59, 8),
ScsiMolTest::ScsiTest("badAtomName.mol", false,
SCSRBaseHbondOptions::UseSapAll, 0, 0, 0, 0, 0,
0),
ScsiMolTest::ScsiTest("badClass.mol", false,
SCSRBaseHbondOptions::UseSapAll, 0, 0, 0, 0, 0,
0),
ScsiMolTest::ScsiTest("badClassTemplate.mol", false,
SCSRBaseHbondOptions::UseSapAll, 0, 0, 0, 0, 0,
0),
ScsiMolTest::ScsiTest("badMissingTemplate.mol", false,
SCSRBaseHbondOptions::UseSapAll, 0, 0, 0, 0, 0,
0),
ScsiMolTest::ScsiTest("obj3dTest.mol", true,
SCSRBaseHbondOptions::UseSapAll, 27, 26, 7, 25,
24, 5),
ScsiMolTest::ScsiTest("obj3dTest2.mol", true,
SCSRBaseHbondOptions::UseSapAll, 27, 26, 7, 25,
24, 5),
ScsiMolTest::ScsiTest("obj3dFoundTwice.mol", false,
SCSRBaseHbondOptions::UseSapAll, 27, 26, 0, 27,
26, 0),
ScsiMolTest::ScsiTest("SgroupFoundTwice.mol", false,
SCSRBaseHbondOptions::UseSapAll, 0, 0, 0, 0, 0,
0),
};
ScsiMolTest scsiMolTest;
for (auto scsiTest : scsiTests) {
BOOST_LOG(rdInfoLog) << "Test: " << scsiTest.fileName << std::endl;
scsiMolTest.testScsiFiles(&scsiTest);
}
}
}
TEST_CASE("nestedParens", "nestedParens") {
SECTION("basics") {
BOOST_LOG(rdInfoLog) << "testing names with parens" << std::endl;
std::string filename = "Ugly.mol";
INFO(filename);
std::string rdbase = getenv("RDBASE");
std::string fName =
rdbase + "/Code/GraphMol/FileParsers/test_data/macromols/" + filename;
RDKit::v2::FileParsers::MolFileParserParams pp;
pp.sanitize = true;
pp.removeHs = false;
pp.strictParsing = true;
RDKit::v2::FileParsers::MolFromSCSRParams molFromSCSRParams;
molFromSCSRParams.includeLeavingGroups = true;
molFromSCSRParams.scsrBaseHbondOptions = SCSRBaseHbondOptions::Auto;
std::unique_ptr<RDKit::RWMol> mol;
REQUIRE_NOTHROW(mol = MolFromSCSRFile(fName, pp, molFromSCSRParams));
CHECK(mol != nullptr);
CHECK(mol->getNumAtoms() == 140);
CHECK(mol->getNumBonds() == 151);
CHECK(getSubstanceGroups(*mol).size() == 11);
// check that the macro atoms parsed have the correct names. In the
// output mol these names will not appear in the SGROUP that defined the
// macroatom. we willcheck just a couple of them
std::string sgroupName;
getSubstanceGroups(*mol)[1].getProp("LABEL", sgroupName);
std::string expected = "((cPr)O(2S-Me)Et)NGly";
CHECK(sgroupName.substr(0, expected.length()) == expected);
getSubstanceGroups(*mol)[5].getProp("LABEL", sgroupName);
expected = "Phe(b-Me2)";
CHECK(sgroupName.substr(0, expected.length()) == expected);
getSubstanceGroups(*mol)[7].getProp("LABEL", sgroupName);
expected = "(PhO(2S-Me)Et)NGly";
CHECK(sgroupName.substr(0, expected.length()) == expected);
}
}
TEST_CASE("threeLetterCodeTest", "threeLetterCodeTest") {
SECTION("basics") {
std::list<ScsiMolTest::ScsiTest> scsiTests{
ScsiMolTest::ScsiTest("PepTla.mol", true,
SCSRBaseHbondOptions::UseSapAll, 26, 25, 7, 26,
25, 7),
};
ScsiMolTest scsiMolTest;
for (auto scsiTest : scsiTests) {
BOOST_LOG(rdInfoLog) << "Test: " << scsiTest.fileName << std::endl;
scsiMolTest.threeLetterCodeTest(&scsiTest);
}
}
}