diff --git a/Code/GraphMol/Atom.cpp b/Code/GraphMol/Atom.cpp index c80f4d4c4..95ffda2f4 100644 --- a/Code/GraphMol/Atom.cpp +++ b/Code/GraphMol/Atom.cpp @@ -25,7 +25,16 @@ namespace RDKit { // Determine whether or not a molecule is to the left of Carbon bool isEarlyAtom(int atomicNum) { - return (4 - PeriodicTable::getTable()->getNouterElecs(atomicNum)) > 0; + int eshift = 4 - PeriodicTable::getTable()->getNouterElecs(atomicNum); + if (eshift > 0) { + return true; + } else if (eshift < 0) { + return false; + } else { + // we make an arbitrary decision that Ge, Sn, and Pb + // are treated like early elements (part of github #2606) + return atomicNum > 14; + } } Atom::Atom() : RDProps() { diff --git a/Code/GraphMol/Kekulize.cpp b/Code/GraphMol/Kekulize.cpp index fe426c073..c69934c62 100644 --- a/Code/GraphMol/Kekulize.cpp +++ b/Code/GraphMol/Kekulize.cpp @@ -20,12 +20,6 @@ namespace RDKit { // Local utility namespace namespace { -// Determine whether or not a molecule is to the left of Carbon -bool isEarlyAtom(int atomicNum) { - // FIX: this is duplicated from Atom.cpp. It should be defined once! - return (4 - PeriodicTable::getTable()->getNouterElecs(atomicNum)) > 0; -} - void backTrack(RWMol &mol, INT_INT_DEQ_MAP &options, int lastOpt, INT_VECT &done, INT_DEQUE &aqueue, boost::dynamic_bitset<> &dBndCands, diff --git a/Code/GraphMol/MolStandardize/Charge.cpp b/Code/GraphMol/MolStandardize/Charge.cpp index 6776281c0..3093add2a 100644 --- a/Code/GraphMol/MolStandardize/Charge.cpp +++ b/Code/GraphMol/MolStandardize/Charge.cpp @@ -252,14 +252,14 @@ std::pair> *Reionizer::weakestIonized( } Uncharger::Uncharger() - : pos_h(SmartsToMol("[+!H0!$(*~[-])]")), - pos_quat(SmartsToMol("[+H0!$(*~[-])]")), - neg(SmartsToMol("[-!$(*~[+H0])]")), + : pos_h(SmartsToMol("[+,+2,+3,+4;!H0!$(*~[-])]")), + pos_noh(SmartsToMol("[+,+2,+3,+4;H0;!$(*~[-])]")), + neg(SmartsToMol("[-!$(*~[+,+2,+3,+4])]")), neg_acid(SmartsToMol("[$([O-][C,P,S]=O),$([n-]1nnnc1),$(n1[n-]nnc1)]")){}; Uncharger::Uncharger(const Uncharger &other) { pos_h = other.pos_h; - pos_quat = other.pos_quat; + pos_noh = other.pos_noh; neg = other.neg; neg_acid = other.neg_acid; }; @@ -277,7 +277,11 @@ ROMol *Uncharger::uncharge(const ROMol &mol) { // Get atom ids for matches SubstructMatch(*omol, *(this->pos_h), p_matches); - unsigned int q_matched = SubstructMatch(*omol, *(this->pos_quat), q_matches); + SubstructMatch(*omol, *(this->pos_noh), q_matches); + unsigned int q_matched = 0; + for (const auto &match : q_matches) { + q_matched += omol->getAtomWithIdx(match[0].second)->getFormalCharge(); + } unsigned int n_matched = SubstructMatch(*omol, *(this->neg), n_matches); unsigned int a_matched = SubstructMatch(*omol, *(this->neg_acid), a_matches); @@ -377,24 +381,36 @@ ROMol *Uncharger::uncharge(const ROMol &mol) { } } } - // Neutralize positive charges - std::vector p_idx_matches; - for (const auto &match : p_matches) { - for (const auto &pair : match) { - p_idx_matches.push_back(pair.second); - } + + // Neutralize cations until there is no longer a net charge remaining: + int netCharge = 0; + for (const auto &at : omol->atoms()) { + netCharge += at->getFormalCharge(); } - for (const auto &idx : p_idx_matches) { - Atom *atom = omol->getAtomWithIdx(idx); - if (!atom->getNumExplicitHs()) { - // atoms from places like Mol blocks are normally missing explicit Hs: - atom->setNumExplicitHs(atom->getTotalNumHs()); + + if (netCharge > 0) { + // Neutralize positive charges where H counts can be adjusted + std::vector p_idx_matches; + for (const auto &match : p_matches) { + for (const auto &pair : match) { + p_idx_matches.push_back(pair.second); + } } - atom->setNoImplicit(true); - while (atom->getFormalCharge() > 0 && atom->getNumExplicitHs() > 0) { - atom->setNumExplicitHs(atom->getTotalNumHs() - 1); - atom->setFormalCharge(atom->getFormalCharge() - 1); - BOOST_LOG(rdInfoLog) << "Removed positive charge.\n"; + for (const auto &idx : p_idx_matches) { + Atom *atom = omol->getAtomWithIdx(idx); + if (!atom->getNumExplicitHs()) { + // atoms from places like Mol blocks are normally missing explicit Hs: + atom->setNumExplicitHs(atom->getTotalNumHs()); + } + atom->setNoImplicit(true); + while (atom->getFormalCharge() > 0 && atom->getNumExplicitHs() > 0 && + netCharge > 0) { + atom->setNumExplicitHs(atom->getTotalNumHs() - 1); + atom->setFormalCharge(atom->getFormalCharge() - 1); + --netCharge; + BOOST_LOG(rdInfoLog) << "Removed positive charge.\n"; + } + if (!netCharge) break; } } return omol; diff --git a/Code/GraphMol/MolStandardize/Charge.h b/Code/GraphMol/MolStandardize/Charge.h index 2436bcc79..2a402c071 100644 --- a/Code/GraphMol/MolStandardize/Charge.h +++ b/Code/GraphMol/MolStandardize/Charge.h @@ -13,8 +13,8 @@ */ #include -#ifndef __RD_CHARGE_H__ -#define __RD_CHARGE_H__ +#ifndef RD_CHARGE_H +#define RD_CHARGE_H #include "MolStandardize.h" #include @@ -116,7 +116,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT Uncharger { private: bool df_canonicalOrdering = true; std::shared_ptr pos_h; - std::shared_ptr pos_quat; + std::shared_ptr pos_noh; std::shared_ptr neg; std::shared_ptr neg_acid; }; // Uncharger class diff --git a/Code/GraphMol/MolStandardize/catch_tests.cpp b/Code/GraphMol/MolStandardize/catch_tests.cpp index f9a9871ad..da1c7c95d 100644 --- a/Code/GraphMol/MolStandardize/catch_tests.cpp +++ b/Code/GraphMol/MolStandardize/catch_tests.cpp @@ -57,7 +57,7 @@ TEST_CASE("SKIP_IF_ALL_MATCH") { } } -TEST_CASE("symmetry in the uncharger") { +TEST_CASE("symmetry in the uncharger", "uncharger") { SECTION("case 1") { auto m = "C[N+](C)(C)CC(C(=O)[O-])CC(=O)[O-]"_smiles; REQUIRE(m); @@ -91,7 +91,7 @@ TEST_CASE("symmetry in the uncharger") { } } -TEST_CASE("uncharger bug with duplicates") { +TEST_CASE("uncharger bug with duplicates", "uncharger") { SECTION("case 1") { auto m = "[NH3+]CC([O-])C[O-]"_smiles; REQUIRE(m); @@ -130,7 +130,7 @@ TEST_CASE("uncharger bug with duplicates") { TEST_CASE( "github #2411: MolStandardize: FragmentRemover should not sanitize " - "fragments ") { + "fragments") { SECTION("demo") { std::string smi = "CN(C)(C)C.Cl"; bool debugParse = false; @@ -147,7 +147,7 @@ TEST_CASE( TEST_CASE( "github #2452: incorrectly removing charge from boron anions" - "fragments ") { + "fragments,uncharger") { SECTION("demo") { auto m = "C[B-](C)(C)C"_smiles; REQUIRE(m); @@ -170,4 +170,74 @@ TEST_CASE( CHECK(outm->getAtomWithIdx(1)->getFormalCharge() == 0); CHECK(MolToSmiles(*outm) == "CB(C)C"); } +} + +TEST_CASE("github #2602: Uncharger ignores dications", "uncharger") { + SECTION("demo") { + auto m = "[O-]CCC[O-].[Ca+2]"_smiles; + REQUIRE(m); + bool canonicalOrdering = true; + MolStandardize::Uncharger uncharger(canonicalOrdering); + std::unique_ptr outm(uncharger.uncharge(*m)); + REQUIRE(outm); + CHECK(outm->getAtomWithIdx(5)->getFormalCharge() == 2); + CHECK(outm->getAtomWithIdx(0)->getFormalCharge() == -1); + CHECK(outm->getAtomWithIdx(4)->getFormalCharge() == -1); + CHECK(MolToSmiles(*outm) == "[Ca+2].[O-]CCC[O-]"); + } +} + +TEST_CASE( + "github #2605: Uncharger incorrectly neutralizes cations when " + "non-neutralizable anions are present.", + "uncharger") { + SECTION("demo") { + auto m = "F[B-](F)(F)F.[NH3+]CCC"_smiles; + REQUIRE(m); + bool canonicalOrdering = true; + MolStandardize::Uncharger uncharger(canonicalOrdering); + std::unique_ptr outm(uncharger.uncharge(*m)); + REQUIRE(outm); + CHECK(outm->getAtomWithIdx(1)->getFormalCharge() == -1); + CHECK(outm->getAtomWithIdx(5)->getFormalCharge() == 1); + CHECK(MolToSmiles(*outm) == "CCC[NH3+].F[B-](F)(F)F"); + } + SECTION("multiple positively charged sites") { + auto m = "F[B-](F)(F)F.[NH3+]CC=C[NH3+]"_smiles; + REQUIRE(m); + bool canonicalOrdering = true; + MolStandardize::Uncharger uncharger(canonicalOrdering); + std::unique_ptr outm(uncharger.uncharge(*m)); + REQUIRE(outm); + CHECK(outm->getAtomWithIdx(1)->getFormalCharge() == -1); + CHECK(outm->getAtomWithIdx(5)->getFormalCharge() == 0); + CHECK(outm->getAtomWithIdx(9)->getFormalCharge() == 1); + CHECK(MolToSmiles(*outm) == "F[B-](F)(F)F.NCC=C[NH3+]"); + } + SECTION("make sure we don't go too far") { + auto m = "F[B-](F)(F)F.[NH4+2]CCC"_smiles; // totally bogus structure + REQUIRE(m); + bool canonicalOrdering = true; + MolStandardize::Uncharger uncharger(canonicalOrdering); + std::unique_ptr outm(uncharger.uncharge(*m)); + REQUIRE(outm); + CHECK(outm->getAtomWithIdx(1)->getFormalCharge() == -1); + CHECK(outm->getAtomWithIdx(5)->getFormalCharge() == 1); + CHECK(MolToSmiles(*outm) == "CCC[NH3+].F[B-](F)(F)F"); + } +} + +TEST_CASE("github #2610: Uncharger incorrectly modifying a zwitterion.", + "uncharger") { + SECTION("demo") { + auto m = "C1=CC=CC[NH+]1-[O-]"_smiles; + REQUIRE(m); + bool canonicalOrdering = true; + MolStandardize::Uncharger uncharger(canonicalOrdering); + std::unique_ptr outm(uncharger.uncharge(*m)); + REQUIRE(outm); + CHECK(outm->getAtomWithIdx(5)->getFormalCharge() == 1); + CHECK(outm->getAtomWithIdx(6)->getFormalCharge() == -1); + CHECK(MolToSmiles(*outm) == "[O-][NH+]1C=CC=CC1"); + } } \ No newline at end of file diff --git a/Code/GraphMol/atomic_data.cpp b/Code/GraphMol/atomic_data.cpp index 798a7fce8..3bd6b7893 100644 --- a/Code/GraphMol/atomic_data.cpp +++ b/Code/GraphMol/atomic_data.cpp @@ -74,7 +74,7 @@ const std::string periodicTableAtomData = 47 Ag 1.59 1.34 1.72 107.868 11 107 106.905097 -1 \n \ 48 Cd 1.69 1.48 1.58 112.412 2 114 113.9033585 -1 \n \ 49 In 1.63 1.44 1.93 114.818 3 115 114.903878 3 \n \ -50 Sn 1.46 1.385 2.17 118.711 4 120 119.9021947 4 \n \ +50 Sn 1.46 1.385 2.17 118.711 4 120 119.9021947 2 4 \n \ 51 Sb 1.46 1.4 2.2 121.76 5 121 120.9038157 3 5 7 \n \ 52 Te 1.47 1.378 2.06 127.6 6 130 129.9062244 2 4 6\n \ 53 I 1.4 1.387 2.15 126.904 7 127 126.904473 1 2 5 \n \ @@ -106,7 +106,7 @@ const std::string periodicTableAtomData = 79 Au 1.5 1.34 1.66 196.967 11 197 196.9665687 -1 \n \ 80 Hg 1.7 1.49 1.55 200.59 2 202 201.970643 -1 \n" "81 Tl 1.55 1.48 1.96 204.383 3 205 204.9744275 3 \n \ -82 Pb 1.54 1.48 2.02 207.2 4 208 207.9766521 4 \n \ +82 Pb 1.54 1.48 2.02 207.2 4 208 207.9766521 2 4 \n \ 83 Bi 1.54 1.45 1.7 208.98 5 209 208.9803987 3 5 7 \n \ 84 Po 1.68 1.46 1.7 209 6 209 208.9824304 2 \n \ 85 At 1.7 1.45 1.7 210 7 210 209.987148 1 \n \ @@ -3345,3 +3345,4 @@ const std::string isotopesAtomData[] = { 118 Og 295 295.21624 0 \n", "EOS"}; } // namespace RDKit + diff --git a/Code/GraphMol/catch_tests.cpp b/Code/GraphMol/catch_tests.cpp index 2d0a840f2..b4581a2bc 100644 --- a/Code/GraphMol/catch_tests.cpp +++ b/Code/GraphMol/catch_tests.cpp @@ -257,10 +257,12 @@ TEST_CASE("github #908: AddHs() using 3D coordinates with 2D conformations", } #endif -TEST_CASE("github #2437: Canon::rankMolAtoms results in crossed double bonds in rings", - "[bug, molops]") { +TEST_CASE( + "github #2437: Canon::rankMolAtoms results in crossed double bonds in " + "rings", + "[bug, molops]") { SECTION("underlying problem") { - std::string molb=R"CTAB(testmol + std::string molb = R"CTAB(testmol Mrv1824 05081910082D 4 4 0 0 0 0 999 V2000 @@ -274,21 +276,21 @@ TEST_CASE("github #2437: Canon::rankMolAtoms results in crossed double bonds in 2 4 2 0 0 0 0 M END )CTAB"; - bool sanitize=false; - bool removeHs=false; - std::unique_ptr mol(MolBlockToMol(molb,sanitize,removeHs)); + bool sanitize = false; + bool removeHs = false; + std::unique_ptr mol(MolBlockToMol(molb, sanitize, removeHs)); REQUIRE(mol); mol->updatePropertyCache(); - CHECK(mol->getBondWithIdx(3)->getBondType()==Bond::BondType::DOUBLE); - CHECK(mol->getBondWithIdx(3)->getBondDir()==Bond::BondDir::NONE); + CHECK(mol->getBondWithIdx(3)->getBondType() == Bond::BondType::DOUBLE); + CHECK(mol->getBondWithIdx(3)->getBondDir() == Bond::BondDir::NONE); std::vector ranks; CHECK(!mol->getRingInfo()->isInitialized()); - Canon::rankMolAtoms(*mol,ranks); + Canon::rankMolAtoms(*mol, ranks); CHECK(!mol->getRingInfo()->isInitialized()); } SECTION("as discovered") { - std::string molb = R"CTAB(testmol + std::string molb = R"CTAB(testmol Mrv1824 05081910082D 4 4 0 0 0 0 999 V2000 @@ -302,21 +304,21 @@ M END 2 4 2 0 0 0 0 M END )CTAB"; - bool sanitize = false; - bool removeHs = false; - std::unique_ptr mol(MolBlockToMol(molb, sanitize, removeHs)); - REQUIRE(mol); - mol->updatePropertyCache(); - CHECK(mol->getBondWithIdx(3)->getBondType() == Bond::BondType::DOUBLE); - CHECK(mol->getBondWithIdx(3)->getBondDir() == Bond::BondDir::NONE); - auto nmb = MolToMolBlock(*mol); - CHECK(nmb.find("2 4 2 3") == std::string::npos); - CHECK(nmb.find("2 4 2 0") != std::string::npos); - std::vector ranks; - Canon::rankMolAtoms(*mol, ranks); - nmb = MolToMolBlock(*mol); - CHECK(nmb.find("2 4 2 3") == std::string::npos); - CHECK(nmb.find("2 4 2 0") != std::string::npos); + bool sanitize = false; + bool removeHs = false; + std::unique_ptr mol(MolBlockToMol(molb, sanitize, removeHs)); + REQUIRE(mol); + mol->updatePropertyCache(); + CHECK(mol->getBondWithIdx(3)->getBondType() == Bond::BondType::DOUBLE); + CHECK(mol->getBondWithIdx(3)->getBondDir() == Bond::BondDir::NONE); + auto nmb = MolToMolBlock(*mol); + CHECK(nmb.find("2 4 2 3") == std::string::npos); + CHECK(nmb.find("2 4 2 0") != std::string::npos); + std::vector ranks; + Canon::rankMolAtoms(*mol, ranks); + nmb = MolToMolBlock(*mol); + CHECK(nmb.find("2 4 2 3") == std::string::npos); + CHECK(nmb.find("2 4 2 0") != std::string::npos); } } TEST_CASE( @@ -337,3 +339,98 @@ M END)CTAB"; CHECK(mol->getAtomWithIdx(0)->getTotalNumHs() == 0); } } + +TEST_CASE( + "github #2606: Bad valence corrections on Pb, Sn" + "[bug, molops]") { + SECTION("basics-Pb") { + std::string mb = R"CTAB( + Mrv1810 08141905562D + + 5 0 0 0 0 0 999 V2000 + -3.6316 -0.4737 0.0000 Pb 0 0 0 0 0 0 0 0 0 0 0 0 + -3.6541 0.3609 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -2.4586 -0.5188 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -3.6992 -1.5338 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -4.5789 -0.4286 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 +M CHG 5 1 4 2 -1 3 -1 4 -1 5 -1 +M END +)CTAB"; + std::unique_ptr mol(MolBlockToMol(mb)); + REQUIRE(mol); + CHECK(mol->getAtomWithIdx(0)->getFormalCharge() == 4); + CHECK(mol->getAtomWithIdx(0)->getTotalNumHs() == 0); + } + SECTION("basics-Sn") { + std::string mb = R"CTAB( + Mrv1810 08141905562D + + 5 0 0 0 0 0 999 V2000 + -3.6316 -0.4737 0.0000 Sn 0 0 0 0 0 0 0 0 0 0 0 0 + -3.6541 0.3609 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -2.4586 -0.5188 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -3.6992 -1.5338 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -4.5789 -0.4286 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 +M CHG 5 1 4 2 -1 3 -1 4 -1 5 -1 +M END +)CTAB"; + std::unique_ptr mol(MolBlockToMol(mb)); + REQUIRE(mol); + CHECK(mol->getAtomWithIdx(0)->getFormalCharge() == 4); + CHECK(mol->getAtomWithIdx(0)->getTotalNumHs() == 0); + } + SECTION("basics-Ge") { + std::string mb = R"CTAB( + Mrv1810 08141905562D + + 5 0 0 0 0 0 999 V2000 + -3.6316 -0.4737 0.0000 Ge 0 0 0 0 0 0 0 0 0 0 0 0 + -3.6541 0.3609 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -2.4586 -0.5188 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -3.6992 -1.5338 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -4.5789 -0.4286 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 +M CHG 5 1 4 2 -1 3 -1 4 -1 5 -1 +M END +)CTAB"; + std::unique_ptr mol(MolBlockToMol(mb)); + REQUIRE(mol); + CHECK(mol->getAtomWithIdx(0)->getFormalCharge() == 4); + CHECK(mol->getAtomWithIdx(0)->getTotalNumHs() == 0); + } +} +TEST_CASE( + "github #2607: Pb, Sn should support valence 2" + "[bug, molops]") { + SECTION("basics-Pb") { + std::string mb = R"CTAB( + Mrv1810 08141905562D + + 3 0 0 0 0 0 999 V2000 + -3.6316 -0.4737 0.0000 Pb 0 0 0 0 0 0 0 0 0 0 0 0 + -3.6541 0.3609 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -2.4586 -0.5188 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 +M CHG 3 1 2 2 -1 3 -1 +M END +)CTAB"; + std::unique_ptr mol(MolBlockToMol(mb)); + REQUIRE(mol); + CHECK(mol->getAtomWithIdx(0)->getFormalCharge() == 2); + CHECK(mol->getAtomWithIdx(0)->getTotalNumHs() == 0); + } + SECTION("basics-Sn") { + std::string mb = R"CTAB( + Mrv1810 08141905562D + + 3 0 0 0 0 0 999 V2000 + -3.6316 -0.4737 0.0000 Sn 0 0 0 0 0 0 0 0 0 0 0 0 + -3.6541 0.3609 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 + -2.4586 -0.5188 0.0000 O 0 5 0 0 0 0 0 0 0 0 0 0 +M CHG 3 1 2 2 -1 3 -1 +M END +)CTAB"; + std::unique_ptr mol(MolBlockToMol(mb)); + REQUIRE(mol); + CHECK(mol->getAtomWithIdx(0)->getFormalCharge() == 2); + CHECK(mol->getAtomWithIdx(0)->getTotalNumHs() == 0); + } +} \ No newline at end of file