// ---------------------------------------------------------------------------
// Overview (review documentation; the code below is unchanged)
//
// Test suite for MolHash::MolHash(), written with TEST_CASE / SECTION /
// REQUIRE / CHECK macros. Each test builds a molecule from a SMILES or
// CXSMILES literal (the `_smiles` suffix, SmilesToMol() or
// v2::SmilesParse::MolFromSmiles()) or from a CTAB raw string (the `_ctab`
// suffix), calls MolHash::MolHash(mol, MolHash::HashFunction::X) and checks
// the result against a literal expected hash or against the hash of another
// molecule. Some calls pass a third (bool) and fourth
// (SmilesWrite::CXSmilesFields) argument; the tests name those locals
// use_cx_smiles / useCxSmiles and cx_to_skip / cxToSkip.
//
// Hash functions exercised: AnonymousGraph, ElementGraph, CanonicalSmiles,
// MurckoScaffold, ExtendedMurcko, MolFormula, AtomBondCounts, DegreeVector,
// Mesomer, Regioisomer, NetCharge, SmallWorldIndexBR, SmallWorldIndexBRL,
// ArthorSubstructureOrder, HetAtomTautomer, HetAtomProtomer,
// HetAtomTautomerv2 and HetAtomProtomerv2.
//
// Test cases, in file order:
//   - "Basic MolHash": one expected string per hash function for a single
//     molecule, then HetAtomTautomer checks, including two pairs of inputs
//     that must produce the same hash.
//   - "Tautomers and chirality", "Molecular formula with fragments",
//     "Github issues" (#4222, parsed with sanitize = false).
//   - "MolHash with CX extensions": each hash is checked without the third
//     argument and again with it set to true; most of the second expectations
//     carry a CXSMILES suffix such as " |o1:1|".
//   - "tautomer v2": "matches" holds pairs of {inputs that must hash equal to
//     the first entry} and {inputs that must hash differently}; "basics"
//     holds {input, expected HetAtomTautomer, expected HetAtomTautomerv2}.
//   - "tautomer hash problem cases": "sulfur problem", "atom order",
//     "atom order 2".
//   - "GitHub Issue #6505", "Github Issue #6855 ...", "Github Issue #6472 ...",
//     "tautomer overreach", "HetAtomProtomerv2",
//     "overreach with v2 tautomer hashes and imines",
//     "v2 tautomers, carboxylic acids, amids, and related structures",
//     "github #8205 ...", "examples found in ChEMBL", "github #8405 ...",
//     "github #8654 ...".
//   - "exclude atoms with properties" / "exclude bonds with properties": set
//     MolHash::excludeFromTautomerismProp to "1" on an atom or bond and check
//     that the HetAtomTautomerv2 hash changes to the listed value.
//
// NOTE(review): in "tautomer hash problem cases" / "atom order" the loop over
//   `row` ends with `break;`, so only the first SMILES in `row` is hashed and
//   compared. This looks like a debugging leftover. Confirm that the other
//   two entries produce the same hash before removing it.
// NOTE(review): `#if 1 ... #endif` wraps the "sulfur problem" and "atom order"
//   sections. The condition is constant, so both are always compiled;
//   presumably a leftover toggle.
// NOTE(review): in "tautomer v2" / "matches" the SMILES
//   "N1C(=O)NC(=O)C2=CNNC2=1" appears twice in the same same-hash list, so the
//   second occurrence repeats the first check.
// NOTE(review): in "tautomer v2" / "matches" the comment about amino acid
//   stereochemistry precedes an entry whose SMILES carry no stereo markers.
//   An amino acid input with a stereocentre is checked in "v2 tautomers, ..."
//   / "specific problems". Confirm where that comment belongs.
// NOTE(review): the test-case names contain "anononymous" and "amids". They
//   are string literals, presumably usable for test selection, and are left
//   as is.
// NOTE(review): as seen here the file has no line structure, the `#include`
//   targets and template argument lists are missing, and the CTAB raw strings
//   are collapsed onto single lines. This looks like extraction damage.
//   Verify against the repository copy before relying on this text.
// ---------------------------------------------------------------------------
// // Copyright (C) 2019-2025 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include "MolHash.h" #include using namespace RDKit; TEST_CASE("Basic MolHash", "[molhash]") { SECTION("basics") { auto om = "C1CCCC(O)C1c1ccnc(OC)c1"_smiles; REQUIRE(om); { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::AnonymousGraph); CHECK(hsh == "***1****(*2*****2*)*1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::ElementGraph); CHECK(hsh == "COC1CC(C2CCCCC2O)CCN1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::CanonicalSmiles); CHECK(hsh == "COc1cc(C2CCCCC2O)ccn1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::MurckoScaffold); CHECK(hsh == "c1cc(C2CCCCC2)ccn1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::ExtendedMurcko); CHECK(hsh == "*c1cc(C2CCCCC2*)ccn1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::MolFormula); CHECK(hsh == "C12H17NO2"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::AtomBondCounts); CHECK(hsh == "15,16"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::DegreeVector); CHECK(hsh == "0,4,9,2"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::Mesomer); CHECK(hsh == "CO[C]1[CH][C](C2CCCCC2O)[CH][CH][N]1_0"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::Regioisomer); CHECK(hsh == 
"*O.*O*.C.C1CCCCC1.c1ccncc1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::NetCharge); CHECK(hsh == "0"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::SmallWorldIndexBR); CHECK(hsh == "B16R2"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::SmallWorldIndexBRL); CHECK(hsh == "B16R2L9"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash( m.get(), MolHash::HashFunction::ArthorSubstructureOrder); CHECK(hsh == "000f001001000c000300005f000000"); } } SECTION("tautomers") { auto om = "C(CC1=NNC=C1)C1=CNC=N1"_smiles; REQUIRE(om); { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh == "[CH]1[CH][C](CC[C]2[CH][N][CH][N]2)[N][N]1_2_0"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomProtomer); CHECK(hsh == "[CH]1[CH][C](CC[C]2[CH][N][CH][N]2)[N][N]1_2"); } } SECTION("tautomers 2") { { auto om = "C/C=C/C"_smiles; REQUIRE(om); auto hsh = MolHash::MolHash(om.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh == "C/C=C/C_0_0"); } { auto om = "C/C=N/C"_smiles; REQUIRE(om); auto hsh = MolHash::MolHash(om.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh == "C[CH][N]C_0_0"); } { auto om = "C/C=C/C=C/C"_smiles; REQUIRE(om); auto hsh = MolHash::MolHash(om.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh == "C[CH][CH][CH][CH]C_0_0"); } } SECTION("tautomers bug found in testing") { auto m1 = "CCC(=Cc1sc2cc(C)c(C)cc2[n+]1CC(O)CS(=O)(=O)[O-])C=C1[Se]c2ccc(C)cc2[NH+]1CC"_smiles; REQUIRE(m1); auto m2 = "CCC(=Cc1[se]c2ccc(C)cc2[n+]1CC)C=C1Sc2cc(C)c(C)cc2N1CC(O)CS(=O)(=O)O"_smiles; REQUIRE(m2); std::unique_ptr t1(new RWMol(*m1)); auto hsh1 = MolHash::MolHash(t1.get(), MolHash::HashFunction::HetAtomTautomer); std::unique_ptr t2(new RWMol(*m2)); auto hsh2 = 
MolHash::MolHash(t2.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh1 == hsh2); CHECK(hsh1 == "CC[C]([CH][C]1S[C]2[CH][C](C)[C](C)[CH][C]2N1CC([O])CS([O])([O])[O])" "[CH][C]1[Se][C]2[CH][CH][C](C)[CH][C]2N1CC_2_1"); } SECTION("tautomers bug found in testing2") { auto m1 = "N/C(=N\\[N+](=O)[O-])NCCCCCCCC(=O)NC(CC(=O)OCc1ccccc1)C(=O)NCCCCN/C(N)=N/[N+](=O)[O-]"_smiles; REQUIRE(m1); auto m2 = "N/C(=N\\CCCCCCCC(=O)NC(CC(=O)OCc1ccccc1)C(=O)NCCCC/N=C(\\N)N[N+](=O)[O-])N[N+](=O)[O-]"_smiles; REQUIRE(m2); std::unique_ptr t1(new RWMol(*m1)); auto hsh1 = MolHash::MolHash(t1.get(), MolHash::HashFunction::HetAtomTautomer); std::unique_ptr t2(new RWMol(*m2)); auto hsh2 = MolHash::MolHash(t2.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh1 == hsh2); CHECK(hsh1 == "[N][C]([N]CCCCCCC[C]([O])[N]C(C[C]([O])OC[C]1[CH][CH][CH][CH][CH]1)[" "C]([O])[N]CCCC[N][C]([N])[N]N([O])[O])[N]N([O])[O]_8_0"); } } TEST_CASE("Tautomers and chirality", "[molhash]") { SECTION("basics") { auto om = "C[C@H](C(=O)O)C(=O)[O-]"_smiles; REQUIRE(om); { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::CanonicalSmiles); CHECK(hsh == "C[C@@H](C(=O)[O-])C(=O)O"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomer); CHECK(hsh == "CC([C]([O])[O])[C]([O])[O]_1_-1"); } { std::unique_ptr m(new RWMol(*om)); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomProtomer); CHECK(hsh == "CC([C]([O])[O])[C]([O])[O]_2"); } } } TEST_CASE("Molecular formula with fragments", "[molhash]") { SECTION("basics") { auto om = "CC(=O)[O-].C[N+](C)(C)C"_smiles; REQUIRE(om); auto hsh = MolHash::MolHash(om.get(), MolHash::HashFunction::MolFormula); CHECK(hsh == "C6H15NO2"); } } TEST_CASE("Github issues", "[molhash]") { SECTION("Issue #4222: MolHash fails on non-standard valences") { SmilesParserParams p; p.sanitize = false; std::unique_ptr mol(SmilesToMol("C[Cl]C", p)); REQUIRE(mol); auto hsh = 
MolHash::MolHash(mol.get(), MolHash::HashFunction::MolFormula); CHECK(hsh == "C2H6Cl"); } } TEST_CASE("MolHash with CX extensions", "[molhash]") { SECTION("Tautomer") { auto mol = "C[C@@H](O)[C@@H](C)[C@@H](C)C[C@H](C1=CN=CN1)C1=CNC=N1 |o1:8,5,&1:1,3,r,c:11,18,t:9,15|"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer); CHECK( hsh == "C[C@H]([C@@H](C)[O])[C@@H](C)CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1_3_0"); } { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer, true); CHECK( hsh == "C[C@H](CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1)[C@@H](C)[C@H](C)[O]_3_0 |o1:1,&1:14,16|"); } } SECTION("no coordinates please") { auto mol = R"CTAB( Mrv2108 03032205502D 0 0 0 0 0 999 V3000 M V30 BEGIN CTAB M V30 COUNTS 15 16 0 0 1 M V30 BEGIN ATOM M V30 1 C -0.4657 -3.589 0 0 CFG=1 M V30 2 C -0.4657 -2.049 0 0 M V30 3 C 0.8679 -4.359 0 0 M V30 4 C 2.2016 -3.589 0 0 CFG=2 M V30 5 C 3.5353 -4.359 0 0 M V30 6 C 4.9422 -3.7327 0 0 M V30 7 N 5.9726 -4.8771 0 0 M V30 8 C 5.2026 -6.2108 0 0 M V30 9 N 3.6963 -5.8906 0 0 M V30 10 C 2.2016 -2.049 0 0 M V30 11 C 0.9557 -1.1438 0 0 M V30 12 N 1.4316 0.3208 0 0 M V30 13 C 2.9716 0.3208 0 0 M V30 14 N 3.4475 -1.1438 0 0 M V30 15 F -1.7994 -4.359 0 0 M V30 END ATOM M V30 BEGIN BOND M V30 1 1 1 2 CFG=1 M V30 2 1 1 3 M V30 3 1 4 3 CFG=1 M V30 4 1 4 5 M V30 5 2 5 6 M V30 6 1 6 7 M V30 7 2 7 8 M V30 8 1 8 9 M V30 9 1 5 9 M V30 10 1 4 10 M V30 11 2 10 11 M V30 12 1 11 12 M V30 13 1 12 13 M V30 14 2 13 14 M V30 15 1 10 14 M V30 16 1 1 15 M V30 END BOND M V30 BEGIN COLLECTION M V30 MDLV30/STEREL1 ATOMS=(1 1) M V30 MDLV30/STERAC1 ATOMS=(1 4) M V30 END COLLECTION M V30 END CTAB M END )CTAB"_ctab; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer, true); CHECK(hsh == "C[C@H](F)CC([C]1[CH][N][CH][N]1)[C]1[CH][N][CH][N]1_2_0 |o1:1|"); } } SECTION("Mesomer") { auto mol = 
"C[C@H](F)C[C@@](C([NH-])=O)C([O-])=N |o1:1,&1:4|"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::Mesomer); CHECK(hsh == "C[C@H](F)C[C]([C]([NH])[O])[C]([NH])[O]_-2"); } { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::Mesomer, true); CHECK(hsh == "C[C@H](F)C[C]([C]([NH])[O])[C]([NH])[O]_-2 |o1:1|"); } } SECTION("Extended Murcko") { auto mol = "CC1=CC=CC=C1[C@@H](C[C@@H](C1CC1)C1CCC1)C1=CC=CC=C1O |o1:9,&1:7|"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ExtendedMurcko); CHECK(hsh == "*c1ccccc1C(C[C@H](C1CCC1)C1CC1)c1ccccc1*"); } { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ExtendedMurcko, true); CHECK(hsh == "*c1ccccc1C(C[C@H](C1CCC1)C1CC1)c1ccccc1* |o1:9|"); } } SECTION("Murcko") { auto mol = "CC1=CC=CC=C1[C@@H](C[C@@H](C1CC1)C1CCC1)C1=CC=CC=C1O |o1:9,&1:7|"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::MurckoScaffold); CHECK(hsh == "c1ccc(C(C[C@H](C2CCC2)C2CC2)c2ccccc2)cc1"); } { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::MurckoScaffold, true); CHECK(hsh == "c1ccc(C(C[C@H](C2CCC2)C2CC2)c2ccccc2)cc1 |o1:6|"); } } SECTION("Element") { auto mol = "C([C@@H](C1CC1)C1CCC1)[C@@H](C1CCCCC1)C1=CC=CC=C1 |o1:1,&1:9,c:21,23,t:19|"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ElementGraph); CHECK(hsh == "C1CCC(C(C[C@H](C2CCC2)C2CC2)C2CCCCC2)CC1"); } { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ElementGraph, true); CHECK(hsh == "C1CCC(C(C[C@H](C2CCC2)C2CC2)C2CCCCC2)CC1 |o1:6|"); } } SECTION("Anonymous") { auto mol = "C([C@@H](C1CC1)C1CCC1)[C@@H](C1CCCCC1)C1=CC=CC=N1 |o1:1,&1:9,c:21,23,t:19|"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::AnonymousGraph); CHECK(hsh == "*1***(*(**(*2***2)*2**2)*2*****2)**1"); } { RWMol 
cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::AnonymousGraph, true); CHECK(hsh == "*1***(*(**(*2***2)*2**2)*2*****2)**1"); } } } TEST_CASE("tautomer v2") { SECTION("matches") { // pairs of {molecules with the same hash} {molecules with different hashes // from those} std::vector, std::vector>> data = { {{"CC=O", "C=CO"}, {}}, {{"CCC=O", "CC=CO"}, {"C=CCO"}}, // this next one is the specific test to ensure that amino acid // stereochemistry is not lost {{"CCC(=O)NC", "CCC(O)=NC"}, {"CC=C(O)NC"}}, {{"CC(=O)CC(=O)C", "C=C(O)CC(=O)C", "CC(=O)C=C(O)C", "C=C(O)C=C(O)C", "C=C(O)CC(O)=C"}, {}}, {{"CN=CCF", "CNC=CF"}, {"C=NCCF", "CNCCF"}}, {{"CN=C(C)F", "CNC(=C)F"}, {"C=NC(C)F"}}, {{"Cc1n[nH]cc1", "Cc1[nH][n]cc1", "CC1=NN=CC1", "CC1N=NCC=1"}, {}}, {{"O=C1C=CC(=O)C=C1"}, {"Oc1ccc(O)cc1", "O=C1C=CC(O)C=C1"}}, {{"CC(=O)CCC(=O)C", "CC(=O)CCC(O)=C", "C=C(O)CCC(O)=C"}, {"CC(O)C=CC(=O)C", "CC(=O)C=CC(O)C"}}, {{"c1ccccc1/C=C/c1ccccc1"}, {"c1ccccc1/C=C\\c1ccccc1", "c1ccccc1C=Cc1ccccc1"}}, // imine stereochemistry is lost: {{"CC/C=N/C", "CC/C=N\\C", "CCC=NC", "C/C=C/NC"}, {}}, // but only when tautomers can happen: {{"FC(F)(F)/C(F)=N/C(F)(F)F"}, {"FC(F)(F)/C(F)=N\\C(F)(F)F", "FC(F)(F)C(F)=NC(F)(F)F"}}, {{"NC(=N)CC(=O)C", "NC(N)=CC(=O)C", "NC(=N)CC(O)=C", "NC(=N)C=C(O)C"}, {}}, {{"CC(=O)C=CC", "C=C(O)C=CC"}, {"CC(=O)CC=C"}}, {{"N=C1N=CN(C)C2N=CNC=21", "NC1N=CN(C)C2=NC=NC2=1"}, {}}, { { "S=C1N=CN=C2NC=NC12", "S=C2C1N=CN=C1NC=N2", "S=C1NC=NC2N=CNC1=2", "S=C1N=CN=C2N=CNC12", "S=C2C1NC=NC1=NC=N2", "S=C2C1NC=NC=1NC=N2", }, {}, }, {{"S=C1NC=NC2N=CNC1=2", "S=C1NC=NC2NC=NC1=2", "SC1=NC=NC2N=CNC1=2"}, {}}, {{"CC1=CN=CN1", "CC1CN=CN=1"}, {}}, {{ "N1C(=O)NC(=O)C2C=NNC=21", "N1C(=O)NC(=O)C2=CNNC2=1", "N1C(=O)NC(=O)C2=CNNC2=1", "N1C(=O)NC(=O)C2CN=NC=21", }, {"N1C(=O)NC(=O)C2CN=NC2=1"}}, // --------------------------- // more stereochemistry // --------------------------- {{"C[C@H](F)C=O", "C[C@@H](F)C=O", "CC(F)C=O"}, {}}, {{"C[C@H](F)CC=O"}, {"C[C@@H](F)CC=O", "CC(F)CC=O"}}, 
{{"C/C=C/O", "C/C=C\\O", "CC=CO"}, {}}, {{"C/C=C/C=O", "C/C=C\\C=O", "CC=CC=O"}, {}}, {{"C/C=C/CC=O"}, {"C/C=C\\CC=O", "CC=CCC=O"}}, {{"C1C=CC=C2CC(=O)NC=12", "C2=C1N=C(CC1=CC=C2)O", "C1C=CC=C2CC(O)=NC=12" }, { "C1C=CC=C2C=C(O)NC=12", }}, // --------------------------- // E/Z isomers with heteroaromatic rings // --------------------------- // Stilbene with pyridyl: E and Z are NOT tautomers and should have // different hashes. Previously the algorithm incorrectly treated // aromatic heteroatoms like pyridine N as tautomeric candidates, // causing stereo to be stripped. {{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619) {"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}}, // 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should // have different hashes. The exocyclic C=C to phenyl should // preserve stereochemistry. {{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170) {"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}}, // E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers // are NOT tautomers and should have different hashes. 
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321) {"c1ccccc1N/N=C2/CCCCC2C", "c1ccccc1NN=C2CCCCC2C"}}, // --------------------------- // stereocenters near amide bonds should not be destroyed // by extension through flagged bonds // --------------------------- // proline-like stereocenter between two amide C=O groups {{"NC(=O)[C@H]1CCCN1C=O"}, {"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051) // stereocenters near amide bonds on pyrrolidine ring {{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850) // stereocenter adjacent to pyrimidine/pyrazole: enantiomers should // differ (stereocenter connected via single non-conjugated N-C // bond) {{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in // SureChEMBL // (4072338) {"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}}, // diastereomers on indole ring: aromatic C should not pull in // stereocenters {{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}, {"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799 }; for (const auto &[same, diff] : data) { std::unique_ptr m{SmilesToMol(same[0])}; REQUIRE(m); RWMol cp(*m); auto ref = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); for (auto i = 1u; i < same.size(); ++i) { INFO(same[0] + "." + same[i]); std::unique_ptr m2{SmilesToMol(same[i])}; REQUIRE(m2); RWMol cp(*m2); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == ref); } for (auto i = 0u; i < diff.size(); ++i) { INFO(same[0] + "." 
+ diff[i]); std::unique_ptr m2{SmilesToMol(diff[i])}; REQUIRE(m2); RWMol cp(*m2); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh != ref); } } } SECTION("basics") { std::vector> data = { {"C=O", "[CH2][O]_0_0", "[CH2]=[O]_0_0"}, {"CC=O", "C[CH][O]_0_0", "[C]:[C]:[O]_4_0"}, {"C=CO", "[CH2][CH][O]_1_0", "[C]:[C]:[O]_4_0"}, {"c1ccccc1", "[CH]1[CH][CH][CH][CH][CH]1_0_0", "[cH]1:[cH]:[cH]:[cH]:[cH]:[cH]:1_0_0"}, {"n1ccccc1", "[CH]1[CH][CH][N][CH][CH]1_0_0", "[C]1:[C]:[C]:[N]:[C]:[C]:1_5_0"}, {"Nc1ccccc1", "[N][C]1[CH][CH][CH][CH][CH]1_2_0", "[N]:[C]1:[C]:[C]:[C]:[C]:[C]:1_7_0"}, {"C=COC", "[CH2][CH]OC_0_0", "[CH2]=[CH]-[O]-[CH3]_0_0"}, {"CC(C)(C)C=O", "CC(C)(C)[CH][O]_0_0", "[CH3]-[C](-[CH3])(-[CH3])-[CH]=[O]_0_0"}, {"CC(C)=CO", "C[C](C)[CH][O]_1_0", "[CH3]-[C](-[CH3]):[C]:[O]_2_0"}, {"COC=O", "CO[CH][O]_0_0", "[CH3]-[O]-[CH]=[O]_0_0"}, {"CNC=O", "C[N][CH][O]_1_0", "[CH3]-[N]:[C]:[O]_2_0"}, {"CN(C)C=O", "CN(C)[CH][O]_0_0", "[CH3]-[N](-[CH3]):[C]:[O]_1_0"}, {"CC(C)(C)NC=O", "CC(C)(C)[N][CH][O]_1_0", "[CH3]-[C](-[CH3])(-[CH3])-[N]:[C]:[O]_2_0"}, {"CC(C)=O", "C[C](C)[O]_0_0", "[C]:[C](:[C]):[O]_6_0"}, {"C=C(C)O", "[CH2][C](C)[O]_1_0", "[C]:[C](:[C]):[O]_6_0"}, {"N1CCC1", "C1C[N]C1_1_0", "[CH2]1-[CH2]-[NH]-[CH2]-1_0_0"}, {"CC=CC(=O)C", "C[CH][CH][C](C)[O]_0_0", "[C]:[C](:[O]):[C]:[C]-[CH3]_5_0"}, {"N1C=CCC(F)C1", "FC1C[CH][CH][N]C1_1_0", "[F]-[CH]1-[CH2]-[C]:[C]:[N]-[CH2]-1_3_0"}, {"CCC=C(O)C", "CC[CH][C](C)[O]_1_0", "[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"}, {"CCCC(=O)C", "CCC[C](C)[O]_0_0", "[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"}, {"CCCC(O)=C", "[CH2][C]([O])CCC_1_0", "[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"}, {"C=CCC(O)C", "C=CCC(C)[O]_1_0", "[CH2]=[CH]-[CH2]-[CH](-[CH3])-[OH]_0_0"}, {"C=NC(=O)C", "[CH2][N][C](C)[O]_0_0", "[C]:[N]:[C](-[CH3]):[O]_2_0"}, {"C=NC(O)=C", "[CH2][N][C]([CH2])[O]_1_0", "[C]:[N]:[C](:[C]):[O]_5_0"}, {"CC(=O)CC(=O)C", "C[C]([O])C[C](C)[O]_0_0", "[C]:[C](:[O]):[C]:[C](:[C]):[O]_8_0"}, {"CC(=O)C=C(O)C", 
"C[C]([O])[CH][C](C)[O]_1_0", "[C]:[C](:[O]):[C]:[C](:[C]):[O]_8_0"}, {"C=C(O)C=C(O)C", "[CH2][C]([O])[CH][C](C)[O]_2_0", "[C]:[C](:[O]):[C]:[C](:[C]):[O]_8_0"}, {"C=C(O)CC(O)=C", "[CH2][C]([O])C[C]([CH2])[O]_2_0", "[C]:[C](:[O]):[C]:[C](:[C]):[O]_8_0"}, {"CC(=O)CCC(=O)C", "C[C]([O])CC[C](C)[O]_0_0", "[C]:[C](:[O]):[C]-[C]:[C](:[C]):[O]_10_0"}, {"CC(=O)C=CC(=O)C", "C[C]([O])[CH][CH][C](C)[O]_0_0", "[C]:[C](:[O]):[C]:[C]:[C](:[C]):[O]_8_0"}, }; for (const auto &tpl : data) { INFO(std::get<0>(tpl)); std::unique_ptr m{SmilesToMol(std::get<0>(tpl))}; REQUIRE(m); { RWMol cp(*m); auto hsh1 = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomer); CHECK(hsh1 == std::get<1>(tpl)); } { RWMol cp(*m); auto hsh2 = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh2 == std::get<2>(tpl)); } } } } TEST_CASE("tautomer hash problem cases") { #if 1 SECTION("sulfur problem") { auto m = R"CTAB( RDKit 2D 22 24 0 0 0 0 0 0 0 0999 V2000 -1.3203 -10.1153 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -0.9127 -10.8290 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -0.0927 -10.8317 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.3205 -10.1216 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -0.0921 -9.4072 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -0.9107 -9.4080 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -2.1477 -10.1129 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -2.5593 -9.3951 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 -2.5637 -10.8282 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 -3.3911 -10.8258 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -3.8786 -10.1547 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0 -4.6663 -10.4082 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -4.6688 -11.2356 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 -3.8825 -11.4935 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 1.1480 -10.1229 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 1.5606 -10.8402 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 1.5629 -9.4070 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 2.3903 -9.4084 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 2.8743 -10.0739 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 3.6616 -9.8194 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 3.6630 -8.9920 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 
2.8765 -8.7351 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0 2 3 1 0 5 6 2 0 6 1 1 0 1 2 2 0 11 12 1 0 12 13 2 0 13 14 1 0 14 10 2 0 1 7 1 0 4 15 1 0 3 4 2 0 15 16 2 0 7 8 2 0 15 17 1 0 17 18 1 0 18 19 2 0 7 9 1 0 4 5 1 0 9 10 1 0 10 11 1 0 19 20 1 0 20 21 2 0 21 22 1 0 22 18 1 0 M END )CTAB"_ctab; REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh.find("[s]") == std::string::npos); } SECTION("atom order") { auto m = R"CTAB( RDKit 2D 17 18 0 0 0 0 0 0 0 0999 V2000 12.9442 -15.7431 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 10.7279 -14.6717 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 10.7266 -15.4976 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 11.4381 -15.9096 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 11.4361 -14.2599 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 12.1526 -14.6678 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 12.1578 -15.4930 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 13.4251 -15.0725 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 12.9359 -14.4079 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 10.0130 -14.2612 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 10.0123 -13.4380 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 9.3004 -14.6734 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 14.2501 -15.0676 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 14.6661 -15.7779 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 14.6573 -14.3521 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 15.4893 -15.7728 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 15.9055 -16.4831 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 1 8 1 0 8 9 2 0 9 6 1 0 6 5 2 0 5 2 1 0 6 7 1 0 10 11 1 0 10 12 2 0 2 10 1 0 3 4 1 0 4 7 2 0 13 14 1 0 13 15 2 0 8 13 1 0 2 3 2 0 14 16 1 0 7 1 1 0 16 17 1 0 M END > CHEMBL503643 > None )CTAB"_ctab; REQUIRE(m); std::vector row = {"CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", "CCOC(=O)c1cc2cc(ccc2[nH]1)C(O)=O", "O(C(=O)c1cc2c(ccc(c2)C(=O)O)[nH]1)CC"}; auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); for (const auto &smi : row) { std::unique_ptr mi{SmilesToMol(smi)}; REQUIRE(mi); auto hshi = MolHash::MolHash(mi.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == hshi); break; } } #endif SECTION("atom order 
2") { // {molecules with the same hash} std::vector> data = { {"Ic1ccc(Cn2cc[n+](Cc3ccc(I)cc3)c2)cc1", "c1[n+](Cc2ccc(cc2)I)cn(Cc2ccc(cc2)I)c1"}, {"CN1C(=O)c2cccnc2NC1c1cccnc1", "c1ncccc1C1N(C)C(c2cccnc2N1)=O", "c1nc2c(cc1)C(N(C)C(N2)c1cccnc1)=O"}, {"CC(=O)OCC1=C(C(=O)[O-])N2C(=O)C(=C(Br)Br)[C@H]2S(=O)(=O)C1", "BrC(=C1C(=O)N2C(=C(COC(=O)C)CS([C@@H]21)(=O)=O)C(=O)[O-])Br"}}; for (const auto &same : data) { std::unique_ptr m{SmilesToMol(same[0])}; REQUIRE(m); RWMol cp(*m); auto ref = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); for (auto i = 1u; i < same.size(); ++i) { INFO(same[0] + "->" + same[i]); std::unique_ptr m2{SmilesToMol(same[i])}; REQUIRE(m2); RWMol cp(*m2); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); CHECK(ref == hsh); } } } } TEST_CASE("GitHub Issue #6505") { const auto m = "CCCCCC[NH3+] |SgD:6:lambda max:230:=:nm::|"_smiles; REQUIRE(m); REQUIRE(getSubstanceGroups(*m).size() == 1); const auto use_cx_smiles = true; SECTION("Do not skip any CX flags") { const auto cx_to_skip = SmilesWrite::CXSmilesFields::CX_NONE; const auto hsh1 = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2, use_cx_smiles, cx_to_skip); CHECK( hsh1 == "[CH3]-[CH2]-[CH2]-[CH2]-[CH2]-[CH2]-[NH3+]_0_0 |SgD:6:lambda max:230:=:nm::|"); } SECTION("Strip all CX flags") { const auto cx_to_skip = SmilesWrite::CXSmilesFields::CX_ALL; const auto hsh2 = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2, use_cx_smiles, cx_to_skip); CHECK(hsh2 == "[CH3]-[CH2]-[CH2]-[CH2]-[CH2]-[CH2]-[NH3+]_0_0"); } } TEST_CASE("Github Issue #6855 MakeScaffoldGeneric isotope removal") { SECTION("Extended Murcko") { auto mol = "[235U]C1CC1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ExtendedMurcko); CHECK(hsh == "*C1CC1"); } } SECTION("Anonymous") { auto mol = "[235U]1CC1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, 
MolHash::HashFunction::AnonymousGraph); CHECK(hsh == "*1**1"); } } } TEST_CASE("Github Issue #6472 non-matching element and anononymous graph") { SECTION("Element graph test1") { auto mol = "C1COC(C1)C1=NC=NC=C1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ElementGraph); CHECK(hsh == "C1COC(C2CCNCN2)C1"); } } SECTION("Element graph test2") { auto mol = "C1CC(N=CN1)C1=CC=CO1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::ElementGraph); CHECK(hsh == "C1COC(C2CCNCN2)C1"); } } SECTION("Anonymous graph test 1") { auto mol = "C1COC(C1)C1=NC=NC=C1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::AnonymousGraph); CHECK(hsh == "*1***(*2****2)**1"); } } SECTION("Anonymous graph test 2") { auto mol = "C1CC(N=CN1)C1=CC=CO1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::AnonymousGraph); CHECK(hsh == "*1***(*2****2)**1"); } } } TEST_CASE("tautomer overreach") { SECTION("as reported") { auto mol = "C1=CN(C[C@H]2CNCCO2)N=C1"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); CHECK( hsh == "[C]1:[C]:[N]:[N](-[CH2]-[C@H]2-[CH2]-[NH]-[CH2]-[CH2]-[O]-2):[C]:1_3_0"); } } SECTION("dbw example") { auto mol = "c1cccn1C[C@H](C)COC"_smiles; REQUIRE(mol); { RWMol cp(*mol); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomTautomerv2); CHECK( hsh == "[CH3]-[O]-[CH2]-[C@@H](-[CH3])-[CH2]-[n]1:[cH]:[cH]:[cH]:[cH]:1_0_0"); } } } TEST_CASE("HetAtomProtomerv2") { SECTION("matches") { // pairs of {molecules with the same hash} {molecules with different hashes // from those} std::vector, std::vector>> data = { // example from the NextMove documentation {{"Cc1c[nH]cn1", "Cc1cnc[nH]1", "Cc1c[nH]c[nH+]1"}, {}}, {{"CC=CO", "CCC=O"}, {"C=CCO"}}, }; for (const auto &[same, diff] : data) { std::unique_ptr m{SmilesToMol(same[0])}; 
REQUIRE(m); RWMol cp(*m); auto ref = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomProtomerv2); for (auto i = 1u; i < same.size(); ++i) { INFO(same[0] + "->" + same[i]); std::unique_ptr m2{SmilesToMol(same[i])}; REQUIRE(m2); RWMol cp(*m2); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomProtomerv2); CHECK(hsh == ref); } for (auto i = 0u; i < diff.size(); ++i) { INFO(same[0] + "->" + diff[i]); std::unique_ptr m2{SmilesToMol(diff[i])}; REQUIRE(m2); RWMol cp(*m2); auto hsh = MolHash::MolHash(&cp, MolHash::HashFunction::HetAtomProtomerv2); CHECK(hsh != ref); } } } } TEST_CASE("overreach with v2 tautomer hashes and imines") { SECTION("basics") { std::vector smileses = {"C[C@H](F)NC1=CCCCC1", "C[C@H](F)N=C1CCCCC1"}; for (const auto &smiles : smileses) { auto m = v2::SmilesParse::MolFromSmiles(smiles); REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == "[CH3]-[C@H](-[F])-[N]:[C]1:[C]-[CH2]-[CH2]-[CH2]-[C]:1_4_0"); } } } TEST_CASE("v2 tautomers, carboxylic acids, amids, and related structures") { SECTION("basics") { std::vector> data = { {"CC(=O)O", "[CH3]-[C](:[O]):[O]_1_0"}, {"CC(=O)OCC", "[CH3]-[CH2]-[O]-[C](-[CH3])=[O]_0_0"}, {"CC(=N)O", "[CH3]-[C](:[N]):[O]_2_0"}, {"CC(=O)NCC", "[CH3]-[CH2]-[N]:[C](-[CH3]):[O]_1_0"}, {"CC(=N)N", "[C]:[C](:[N]):[N]_6_0"}, }; for (const auto &[smiles, ref] : data) { INFO(smiles); auto m = v2::SmilesParse::MolFromSmiles(smiles); REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == ref); } } SECTION("specific problems") { std::vector> data = { // losing amino acid chirality: {"CC(C)C[C@@H](C(=O)O)N", "[CH3]-[CH](-[CH3])-[CH2]-[C@H](-[NH2])-[C](:[O]):[O]_1_0"}, // github #8090 (carboxylate) {"O=C(O)CCC", "[CH3]-[CH2]-[CH2]-[C](:[O]):[O]_1_0"}, // aromatic "imine" {"CC[C@@H](N)C1=NC=CN1", "[CH3]-[CH2]-[C@@H](-[NH2])-[C]1:[N]:[C]:[C]:[N]:1_3_0"}, }; for (const auto &[smiles, ref] : data) { INFO(smiles); auto m = 
v2::SmilesParse::MolFromSmiles(smiles); REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == ref); } } } TEST_CASE("github #8205: order dependence in tautomer hash") { SECTION("as reported") { auto mol1 = R"CTAB( RDKit 2D 0 0 0 0 0 0 0 0 0 0999 V3000 M V30 BEGIN CTAB M V30 COUNTS 15 16 0 0 0 M V30 BEGIN ATOM M V30 1 C -0.006857 -1.225143 0.000000 0 M V30 2 C -1.242571 -0.508286 0.000000 0 M V30 3 C -2.481143 -1.220000 0.000000 0 M V30 4 C -2.484000 -2.648571 0.000000 0 M V30 5 N -1.248286 -3.365429 0.000000 0 M V30 6 C -0.009714 -2.653714 0.000000 0 M V30 7 O -3.722571 -3.360572 0.000000 0 M V30 8 N -1.239714 0.920286 0.000000 0 M V30 9 C 1.549143 3.346571 0.000000 0 M V30 10 C 2.260857 2.108000 0.000000 0 M V30 11 C 1.302857 1.048286 0.000000 0 M V30 12 C -0.001143 1.632000 0.000000 0 M V30 13 C 0.151143 3.052571 0.000000 0 M V30 14 C -1.251227 -4.793997 0.000000 0 M V30 15 C -0.908467 4.010717 0.000000 0 M V30 END ATOM M V30 BEGIN BOND M V30 1 1 1 2 M V30 2 2 2 3 M V30 3 1 3 4 M V30 4 1 4 5 M V30 5 1 5 6 M V30 6 2 6 1 M V30 7 2 4 7 M V30 8 1 2 8 M V30 9 1 9 10 M V30 10 1 10 11 M V30 11 1 11 12 M V30 12 1 12 13 M V30 13 1 13 9 M V30 14 1 12 8 CFG=1 M V30 15 1 5 14 M V30 16 1 13 15 M V30 END BOND M V30 END CTAB M END)CTAB"_ctab; REQUIRE(mol1); auto mol2 = v2::SmilesParse::MolFromSmiles(MolToSmiles(*mol1)); REQUIRE(mol2); auto hsh1 = MolHash::MolHash(mol1.get(), MolHash::HashFunction::HetAtomTautomerv2); auto hsh2 = MolHash::MolHash(mol2.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh1 == hsh2); // make sure the chirality wasn't destroyed CHECK(hsh1.find("@") != std::string::npos); } SECTION("tests for the same issue from #8320 report") { std::vector smileses = { "CC(=O)N[C@H](C)S(N)(=O)=O", "O=P(O)(O)[C@@H](O)c1cccc2ccccc12", }; for (const auto &smiles : smileses) { INFO(smiles); auto m = v2::SmilesParse::MolFromSmiles(smiles); REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), 
MolHash::HashFunction::HetAtomTautomerv2); INFO(hsh); CHECK(hsh.find("@") != std::string::npos); } } } TEST_CASE("examples found in ChEMBL") { SECTION("things that should match") { std::vector> smileses = { {"S1(Nc2ccccc2N1)(=O)(=O)", "O=S1(=O)Nc2ccccc2N1"}, {"c1ccccc1CN=C=S", "S=C=NCc1ccccc1"}, {"O=C1NCCC1", "OC1=NCCC1"}, {"c1no[n+]([O-])c1", "n1o[n+](cc1)[O-]"}, }; for (const auto &[smi1, smi2] : smileses) { INFO(smi1 + " " + smi2); auto m1 = v2::SmilesParse::MolFromSmiles(smi1); REQUIRE(m1); auto hsh1 = MolHash::MolHash(m1.get(), MolHash::HashFunction::HetAtomTautomerv2); auto m2 = v2::SmilesParse::MolFromSmiles(smi2); REQUIRE(m2); auto hsh2 = MolHash::MolHash(m2.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh1 == hsh2); } } SECTION("specific problems") { std::vector> data = { {"NNC(=O)CC1=NNC(=O)C1", "[NH2]-[N]:[C](:[O]):[C]:[C]1:[C]:[C](:[O]):[N]:[N]:1_6_0"}, {"Cc1ncn2c1NC=NC2N", "[C]:[C]1:[N]:[C]:[N]2:[C](-[NH2]):[N]:[C]:[N]:[C]:1:2_7_0"}, {"Nc1nc2c(c(=O)[nH]1)CC=N2", "[N]:[C]1:[N]:[C](:[O]):[C]2:[C]:[C]:[N]:[C]:2:[N]:1_6_0"}, {"NC(N)=[N+]1CCc2ccccc2C1", "[NH2]-[C](-[NH2])=[N+]1-[CH2]-[CH2]-[c]2:[cH]:[cH]:[cH]:[cH]:[c]:2-[CH2]-1_0_0"}, {"O=S1(=Nc2ccncc2)CCCCC1", "[O]=[S]1(=[N]-[C]2:[C]:[C]:[N]:[C]:[C]:2)-[CH2]-[CH2]-[CH2]-[CH2]-[CH2]-1_4_0"}, }; for (const auto &[smiles, ref] : data) { INFO(smiles); auto m = v2::SmilesParse::MolFromSmiles(smiles); REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == ref); } } } TEST_CASE("github #8405: some tautomer mismatches") { SECTION("things that should match") { std::vector> smilesSets = { { "O=C1CC(C)NC(=O)C1", "OC1=CC(C)N=C(C1)O", "OC1=CC(C)NC(C1)=O", "O=C1CC(C)N=C(C1)O", }, }; for (const auto &smileses : smilesSets) { auto m0 = v2::SmilesParse::MolFromSmiles(smileses[0]); REQUIRE(m0); auto hsh0 = MolHash::MolHash(m0.get(), MolHash::HashFunction::HetAtomTautomerv2); for (auto i = 1u; i < smileses.size(); ++i) { auto smi = smileses[i]; INFO(smi); auto m = 
v2::SmilesParse::MolFromSmiles(smi); REQUIRE(m); auto hsh = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh == hsh0); } } } } TEST_CASE("github #8654: stereogroups incorrectly included in hash") { SECTION("as reported") { auto m = "O=C(N[C@H]1C[C@@H](C(=O)O)[C@@H]2C[C@H]12)C1CC(=O)N(Cc2ccccn2)C1 |&1:3,5,9,11|"_smiles; REQUIRE(m); auto m2 = "O=C(N[C@H]1C[C@@H](C(=O)O)[C@@H]2C[C@@H]21)C1CC(=O)N(Cc2ccccn2)C1"_smiles; REQUIRE(m2); bool useCxSmiles = true; { auto cxToSkip = SmilesWrite::CXSmilesFields::CX_ALL; auto hsh1 = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2, useCxSmiles, cxToSkip); auto hsh2 = MolHash::MolHash(m2.get(), MolHash::HashFunction::HetAtomTautomerv2, useCxSmiles, cxToSkip); CHECK(hsh1 == hsh2); } { auto cxToSkip = SmilesWrite::CXSmilesFields::CX_ENHANCEDSTEREO; auto hsh1 = MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2, useCxSmiles, cxToSkip); auto hsh2 = MolHash::MolHash(m2.get(), MolHash::HashFunction::HetAtomTautomerv2, useCxSmiles, cxToSkip); CHECK(hsh1 == hsh2); } } } TEST_CASE("exclude atoms with properties") { SECTION("basics") { auto m = "OC=CC"_smiles; REQUIRE(m); RWMol m2(*m); auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0"); for (auto i : {0, 1, 2}) { INFO("excluding atom " + std::to_string(i)); RWMol m3(*m); m3.getAtomWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1"); auto hsh2 = MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0"); } } SECTION("more complex example") { auto m = "OC=CC=C"_smiles; REQUIRE(m); RWMol m2(*m); auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0"); { RWMol m3(*m); m3.getAtomWithIdx(3)->setProp(MolHash::excludeFromTautomerismProp, "1"); auto hsh2 = MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh2 == 
"[CH2]=[CH]-[C]:[C]:[O]_3_0"); } { RWMol m3(*m); m3.getAtomWithIdx(4)->setProp(MolHash::excludeFromTautomerismProp, "1"); auto hsh2 = MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh2 == "[CH2]=[C]:[C]:[C]:[O]_4_0"); } } } TEST_CASE("exclude bonds with properties") { SECTION("basics") { auto m = "OC=CC"_smiles; REQUIRE(m); RWMol m2(*m); auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0"); for (auto i : {0, 1}) { INFO("excluding bond " + std::to_string(i)); RWMol m3(*m); m3.getBondWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1"); auto hsh2 = MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0"); } } SECTION("more complex example") { auto m = "OC=CC=C"_smiles; REQUIRE(m); RWMol m2(*m); auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0"); { RWMol m3(*m); m3.getBondWithIdx(2)->setProp(MolHash::excludeFromTautomerismProp, "1"); auto hsh2 = MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0"); } } }