diff --git a/Code/GraphMol/MolStandardize/MolStandardize.cpp b/Code/GraphMol/MolStandardize/MolStandardize.cpp index 638d2aad2..2184d7292 100644 --- a/Code/GraphMol/MolStandardize/MolStandardize.cpp +++ b/Code/GraphMol/MolStandardize/MolStandardize.cpp @@ -155,18 +155,11 @@ std::vector enumerateTautomerSmiles( cleanup(*mol, params); MolOps::sanitizeMol(*mol); - auto *tautparams = new TautomerCatalogParams(params.tautomerTransforms); - // unsigned int ntautomers = tautparams->getNumTautomers(); - TautomerEnumerator te(new TautomerCatalog(tautparams)); + TautomerEnumerator te(params); - std::vector res = te.enumerate(*mol); + auto res = te.enumerate(*mol); - std::vector tsmiles; - for (const auto &r : res) { - tsmiles.push_back(MolToSmiles(*r)); - } - - return tsmiles; + return res.smiles(); } } // end of namespace MolStandardize diff --git a/Code/GraphMol/MolStandardize/MolStandardize.h b/Code/GraphMol/MolStandardize/MolStandardize.h index ff05c74db..f6d6991c4 100644 --- a/Code/GraphMol/MolStandardize/MolStandardize.h +++ b/Code/GraphMol/MolStandardize/MolStandardize.h @@ -50,15 +50,25 @@ struct RDKIT_MOLSTANDARDIZE_EXPORT CleanupParameters { // std::vector chargeCorrections; std::string tautomerTransforms; // std::vector TautomerScores; - int maxRestarts{200}; // The maximum number of times to attempt to apply the - // series of normalizations (default 200). - int maxTautomers{1000}; // The maximum number of tautomers to enumerate (default - // 1000). - bool preferOrganic{false}; // Whether to prioritize organic fragments when choosing - // fragment parent (default False). - bool doCanonical{true}; // whether or not to apply normalizations in a canonical - // order - + int maxRestarts{200}; //! The maximum number of times to attempt to apply the + //! series of normalizations (default 200). + bool preferOrganic{false}; //! Whether to prioritize organic fragments when + //! choosing fragment parent (default False). + bool doCanonical{true}; //! Whether to apply normalizations in a + //! canonical order + int maxTautomers{1000}; //! The maximum number of tautomers to enumerate + //! (default 1000). + int maxTransforms{1000}; //! The maximum number of tautomer transformations + //! to apply (default 1000). + bool tautomerRemoveSp3Stereo{ + true}; //! Whether to remove stereochemistry from sp3 + //! centers involved in tautomerism (defaults to true) + bool tautomerRemoveBondStereo{ + true}; //! Whether to remove stereochemistry from double + //! bonds involved in tautomerism (defaults to true) + bool tautomerReassignStereo{ + true}; //! Whether enumerate() should call assignStereochemistry + //! on all generated tautomers (defaults to true) CleanupParameters() : // TODO // normalizations(""),//this->DEFAULT_TRANSFORMS), diff --git a/Code/GraphMol/MolStandardize/Tautomer.cpp b/Code/GraphMol/MolStandardize/Tautomer.cpp index 1c7ad5b60..952a1dbff 100644 --- a/Code/GraphMol/MolStandardize/Tautomer.cpp +++ b/Code/GraphMol/MolStandardize/Tautomer.cpp @@ -34,20 +34,6 @@ namespace RDKit { namespace MolStandardize { -namespace detail { -std::vector> pairwise( - const std::vector vect) { - std::vector> pvect; - for (size_t i = 0; i < vect.size() - 1; ++i) { - std::pair p = - std::pair(vect[i], vect[i + 1]); - pvect.push_back(p); - } - return pvect; -} - -} // namespace detail - namespace TautomerScoringFunctions { int scoreRings(const ROMol &mol) { int score = 0; @@ -93,7 +79,7 @@ int scoreRings(const ROMol &mol) { struct smarts_mol_holder { std::string d_smarts; - std::shared_ptr dp_mol; + ROMOL_SPTR dp_mol; smarts_mol_holder(const std::string &smarts) : d_smarts(smarts) { dp_mol.reset(SmartsToMol(smarts)); } @@ -108,7 +94,7 @@ struct SubstructTerm { std::string name; std::string smarts; int score; - std::shared_ptr matcher; + ROMOL_SPTR matcher; SubstructTerm(std::string aname, std::string asmarts, int ascore) : name(std::move(aname)), smarts(std::move(asmarts)), score(ascore) { matcher = smarts_mol_flyweight(smarts).get().dp_mol; @@ -148,7 +134,8 @@ int scoreSubstructs(const ROMol &mol) { score += matches.size() * term.score; } return score; -}; +} + int scoreHeteroHs(const ROMol &mol) { int score = 0; for (const auto &at : mol.atoms()) { @@ -158,47 +145,113 @@ int scoreHeteroHs(const ROMol &mol) { } } return score; - - return 1; -}; +} } // namespace TautomerScoringFunctions -unsigned int MAX_TAUTOMERS = 1000; +TautomerEnumerator::TautomerEnumerator(const CleanupParameters ¶ms) + : d_maxTautomers(params.maxTautomers), + d_maxTransforms(params.maxTransforms), + d_removeSp3Stereo(params.tautomerRemoveSp3Stereo), + d_removeBondStereo(params.tautomerRemoveBondStereo), + d_reassignStereo(params.tautomerReassignStereo) { + TautomerCatalogParams tautParams(params.tautomerTransforms); + dp_catalog.reset(new TautomerCatalog(&tautParams)); +} -ROMol *TautomerEnumerator::pickCanonical( - const std::vector &tautomers, - boost::function scoreFunc) const { - PRECONDITION(scoreFunc, "no scoring function"); - if (tautomers.size() == 1) { - return new ROMol(*tautomers[0]); - } - // Calculate score for each tautomer - int bestScore = std::numeric_limits::min(); - std::string bestSmiles = ""; - ROMOL_SPTR bestMol; - for (const auto &t : tautomers) { - auto score = scoreFunc(*t); -#ifdef VERBOSE_ENUMERATION - std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl; -#endif - if (score > bestScore) { - bestScore = score; - bestSmiles = MolToSmiles(*t); - bestMol = t; - } else if (score == bestScore) { - auto smiles = MolToSmiles(*t); - if (smiles < bestSmiles) { - bestSmiles = smiles; - bestMol = t; +bool TautomerEnumerator::setTautomerStereo( + const ROMol &mol, ROMol &taut, const TautomerEnumeratorResult &res) const { + bool modified = false; + for (auto atom : mol.atoms()) { + auto atomIdx = atom->getIdx(); + if (!res.d_modifiedAtoms.test(atomIdx)) { + continue; + } + auto tautAtom = taut.getAtomWithIdx(atomIdx); + // clear chiral tag on sp2 atoms + if (tautAtom->getHybridization() == Atom::SP2 || d_removeSp3Stereo) { + modified |= (tautAtom->getChiralTag() != Atom::CHI_UNSPECIFIED); + tautAtom->setChiralTag(Atom::CHI_UNSPECIFIED); + if (tautAtom->hasProp(common_properties::_CIPCode)) { + tautAtom->clearProp(common_properties::_CIPCode); + } + } else { + modified |= (tautAtom->getChiralTag() != atom->getChiralTag()); + tautAtom->setChiralTag(atom->getChiralTag()); + if (atom->hasProp(common_properties::_CIPCode)) { + tautAtom->setProp( + common_properties::_CIPCode, + atom->getProp(common_properties::_CIPCode)); } } } - return new ROMol(*bestMol); + // remove stereochemistry on bonds that are part of a tautomeric path + for (auto bond : mol.bonds()) { + auto bondIdx = bond->getIdx(); + if (!res.d_modifiedBonds.test(bondIdx)) { + continue; + } + std::vector bondsToClearDirs; + if (bond->getBondType() == Bond::DOUBLE && + bond->getStereo() > Bond::STEREOANY) { + // look around the beginning and end atoms and check for bonds with + // direction set + for (auto atom : {bond->getBeginAtom(), bond->getEndAtom()}) { + for (const auto &nbri : + boost::make_iterator_range(mol.getAtomBonds(atom))) { + const auto &obnd = mol[nbri]; + if (obnd->getBondDir() == Bond::ENDDOWNRIGHT || + obnd->getBondDir() == Bond::ENDUPRIGHT) { + bondsToClearDirs.push_back(obnd->getIdx()); + } + } + } + } + auto tautBond = taut.getBondWithIdx(bondIdx); + if (tautBond->getBondType() != Bond::DOUBLE || d_removeBondStereo) { + modified |= (tautBond->getStereo() != Bond::STEREONONE); + tautBond->setStereo(Bond::STEREONONE); + tautBond->getStereoAtoms().clear(); + for (auto bi : bondsToClearDirs) { + taut.getBondWithIdx(bi)->setBondDir(Bond::NONE); + } + } else { + const INT_VECT &sa = bond->getStereoAtoms(); + modified |= (tautBond->getStereo() != bond->getStereo() || + sa.size() != tautBond->getStereoAtoms().size()); + if (sa.size() == 2) { + tautBond->setStereoAtoms(sa.front(), sa.back()); + } + tautBond->setStereo(bond->getStereo()); + for (auto bi : bondsToClearDirs) { + taut.getBondWithIdx(bi)->setBondDir( + mol.getBondWithIdx(bi)->getBondDir()); + } + } + } + if (d_reassignStereo) { + static const bool cleanIt = true; + static const bool force = true; + MolOps::assignStereochemistry(taut, cleanIt, force); + } else { + taut.setProp(common_properties::_StereochemDone, 1); + } + return modified; } std::vector TautomerEnumerator::enumerate( const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds) const { + TautomerEnumeratorResult tresult = enumerate(mol); + if (modifiedAtoms) { + *modifiedAtoms = tresult.modifiedAtoms(); + } + if (modifiedBonds) { + *modifiedBonds = tresult.modifiedBonds(); + } + return tresult.tautomers(); +} + +TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const { #ifdef VERBOSE_ENUMERATION std::cout << "**********************************" << std::endl; #endif @@ -206,44 +259,34 @@ std::vector TautomerEnumerator::enumerate( const TautomerCatalogParams *tautparams = dp_catalog->getCatalogParams(); PRECONDITION(tautparams, ""); - PRECONDITION(!modifiedAtoms || modifiedAtoms->size() >= mol.getNumAtoms(), - "bitset too small"); - PRECONDITION(!modifiedBonds || modifiedBonds->size() >= mol.getNumBonds(), - "bitset too small"); - - std::unique_ptr> matp; - if (!modifiedAtoms) { - // we need the modified atom for internal purposes. If the user didn't give - // us one, we have to make our own: - matp.reset(new boost::dynamic_bitset<>(mol.getNumAtoms())); - modifiedAtoms = matp.get(); - } - std::unique_ptr> mbp; - if (!modifiedBonds) { - // we need the modified bonds for internal purposes. If the user didn't give - // us one, we have to make our own: - mbp.reset(new boost::dynamic_bitset<>(mol.getNumBonds())); - modifiedBonds = mbp.get(); - } + TautomerEnumeratorResult res; const std::vector &transforms = tautparams->getTransforms(); - // Enumerate all possible tautomers and return them as a list. + // Enumerate all possible tautomers and return them as a vector. + // smi is the input molecule SMILES std::string smi = MolToSmiles(mol, true); - boost::shared_ptr taut(new ROMol(mol)); - std::map> tautomers = {{smi, taut}}; + // taut is a copy of the input molecule + ROMOL_SPTR taut(new ROMol(mol)); // Create a kekulized form of the molecule to match the SMARTS against - boost::shared_ptr kekulized(new RWMol(mol)); + RWMOL_SPTR kekulized(new RWMol(mol)); MolOps::Kekulize(*kekulized, false); - std::map> kekulized_mols = { - {smi, kekulized}}; - std::vector done; - bool broken = false; + res.d_tautomers = {{smi, Tautomer(taut, kekulized, 0, 0)}}; + std::set done; + res.d_modifiedAtoms.resize(mol.getNumAtoms()); + res.d_modifiedBonds.resize(mol.getNumBonds()); + bool completed = false; + bool bailOut = false; + unsigned int nTransforms = 0; + static const std::array statusMsg{ + "completed", "max tautomers reached", "max transforms reached", + "canceled"}; - while (tautomers.size() < MAX_TAUTOMERS) { - // std::map automatically sorts tautomers into alphabetical order (SMILES) - for (const auto &tautomer : tautomers) { + while (!completed && !bailOut) { + // std::map automatically sorts res.d_tautomers into alphabetical order + // (SMILES) + for (const auto &tautomer : res.d_tautomers) { #ifdef VERBOSE_ENUMERATION std::cout << "Done : " << std::endl; for (const auto d : done) { @@ -252,56 +295,62 @@ std::vector TautomerEnumerator::enumerate( std::cout << "Looking at tautomer: " << tautomer.first << std::endl; #endif std::string tsmiles; - if (std::find(done.begin(), done.end(), tautomer.first) != done.end()) { + if (done.count(tautomer.first)) { continue; } else { // done does not contain tautomer for (const auto &transform : transforms) { - // find kekulized_mol in kekulized_mols with same smiles as taut - auto kmol = kekulized_mols.find(tautomer.first); - // if (search != - // kekulized_mols.end() for - // (const auto &mol : kekulized_mols) { if (mol.first == - // tautomer.first) { std::cout << mol.first << std::endl; - // } - // std::cout << - // MolToSmiles(*transform.Mol) - //<< std::endl; + if (bailOut) { + break; + } + // kmol is the kekulized version of the tautomer + const auto &kmol = tautomer.second.kekulized; std::vector matches; unsigned int matched = - SubstructMatch(*(kmol->second), *(transform.Mol), matches); - std::string name; - (transform.Mol)->getProp(common_properties::_Name, name); + SubstructMatch(*kmol, *(transform.Mol), matches); if (!matched) { continue; - } else { -#ifdef VERBOSE_ENUMERATION - std::cout << "kmol: " << kmol->first << std::endl; - std::cout << MolToSmiles(*(kmol->second)) << std::endl; - std::cout << "transform mol: " << MolToSmarts(*(transform.Mol)) - << std::endl; - - std::cout << "Matched: " << name << std::endl; -#endif } + ++nTransforms; +#ifdef VERBOSE_ENUMERATION + std::string name; + (transform.Mol)->getProp(common_properties::_Name, name); + std::cout << "kmol: " << kmol->first << std::endl; + std::cout << MolToSmiles(*(kmol->second)) << std::endl; + std::cout << "transform mol: " << MolToSmarts(*(transform.Mol)) + << std::endl; + + std::cout << "Matched: " << name << std::endl; +#endif + // loop over transform matches for (const auto &match : matches) { - std::vector idx_matches; - for (const auto &pair : match) { - idx_matches.push_back(pair.second); + if (nTransforms >= d_maxTransforms) { + res.d_status = TautomerEnumeratorStatus::MaxTransformsReached; + bailOut = true; + } else if (res.d_tautomers.size() >= d_maxTautomers) { + res.d_status = TautomerEnumeratorStatus::MaxTautomersReached; + bailOut = true; + } else if (d_callback.get() && !(*d_callback)(mol, res)) { + res.d_status = TautomerEnumeratorStatus::Canceled; + bailOut = true; + } + if (bailOut) { + break; } // Create a copy of in the input molecule so we can modify it // Use kekule form so bonds are explicitly single/double instead of // aromatic - boost::shared_ptr product(new ROMol(*(kmol->second))); + ROMOL_SPTR product(new ROMol(*kmol)); // Remove a hydrogen from the first matched atom and add one to the // last - Atom *first = product->getAtomWithIdx(idx_matches[0]); - Atom *last = product->getAtomWithIdx(idx_matches.back()); - modifiedAtoms->set(idx_matches[0]); - modifiedAtoms->set(idx_matches.back()); - first->setNumExplicitHs( - std::max((unsigned int)0, first->getTotalNumHs() - 1)); + int firstIdx = match.front().second; + int lastIdx = match.back().second; + Atom *first = product->getAtomWithIdx(firstIdx); + Atom *last = product->getAtomWithIdx(lastIdx); + res.d_modifiedAtoms.set(firstIdx); + res.d_modifiedAtoms.set(lastIdx); + first->setNumExplicitHs(std::max(0U, first->getTotalNumHs() - 1)); last->setNumExplicitHs(last->getTotalNumHs() + 1); // Remove any implicit hydrogens from the first and last atoms // now we have set the count explicitly @@ -309,13 +358,11 @@ std::vector TautomerEnumerator::enumerate( last->setNoImplicit(true); // Adjust bond orders unsigned int bi = 0; - std::vector> pvect = - detail::pairwise(idx_matches); - for (const auto &pair : pvect) { - Bond *bond = - product->getBondBetweenAtoms(pair.first, pair.second); + for (size_t i = 0; i < match.size() - 1; ++i) { + Bond *bond = product->getBondBetweenAtoms(match[i].second, + match[i + 1].second); ASSERT_INVARIANT(bond, "required bond not found"); - // check if bonds is specified in tatuomer.in file + // check if bonds is specified in tautomer.in file if (!transform.BondTypes.empty()) { bond->setBondType(transform.BondTypes[bi]); ++bi; @@ -326,40 +373,44 @@ std::vector TautomerEnumerator::enumerate( << std::endl; std::cout << bondtype << std::endl; #endif - if (bondtype == 1) { + if (bondtype == Bond::SINGLE) { bond->setBondType(Bond::DOUBLE); #ifdef VERBOSE_ENUMERATION std::cout << "Set bond to double" << std::endl; #endif } - if (bondtype == 2) { + if (bondtype == Bond::DOUBLE) { bond->setBondType(Bond::SINGLE); #ifdef VERBOSE_ENUMERATION std::cout << "Set bond to single" << std::endl; #endif } - modifiedBonds->set(bond->getIdx()); } + res.d_modifiedBonds.set(bond->getIdx()); } // TODO adjust charges if (!transform.Charges.empty()) { unsigned int ci = 0; - for (const auto idx : idx_matches) { - Atom *atom = product->getAtomWithIdx(idx); + for (const auto &pair : match) { + Atom *atom = product->getAtomWithIdx(pair.second); atom->setFormalCharge(atom->getFormalCharge() + - transform.Charges[ci]); - ++ci; + transform.Charges[ci++]); } } - boost::shared_ptr wproduct(new RWMol(*product)); + RWMOL_SPTR wproduct(new RWMol(*product)); #ifdef VERBOSE_ENUMERATION - // wproduct->updatePropertyCache(false); std::cout << "pre-sanitization: " << MolToSmiles(*wproduct, true, true) << std::endl; #endif - MolOps::sanitizeMol(*wproduct); - // MolOps::sanitizeMol(*static_cast(product.get())); + unsigned int failedOp; + MolOps::sanitizeMol(*wproduct, failedOp, + MolOps::SANITIZE_KEKULIZE | + MolOps::SANITIZE_SETAROMATICITY | + MolOps::SANITIZE_SETCONJUGATION | + MolOps::SANITIZE_SETHYBRIDIZATION | + MolOps::SANITIZE_ADJUSTHS); + setTautomerStereo(mol, *wproduct, res); tsmiles = MolToSmiles(*wproduct, true); #ifdef VERBOSE_ENUMERATION std::string name; @@ -367,119 +418,132 @@ std::vector TautomerEnumerator::enumerate( std::cout << "Applied rule: " << name << " to " << tautomer.first << std::endl; #endif - const bool is_in = tautomers.find(tsmiles) != tautomers.end(); - if (!is_in) { -#ifdef VERBOSE_ENUMERATION - std::cout << "New tautomer produced: " << tsmiles << std::endl; -#endif - // in addition to the above transformations, sanitzation may modify - // bonds, e.g. Cc1nc2ccccc2[nH]1 - for (size_t i = 0; i < mol.getNumBonds(); i++) { - auto molBondType = mol.getBondWithIdx(i)->getBondType(); - auto tautBondType = wproduct->getBondWithIdx(i)->getBondType(); - if (molBondType != tautBondType && !modifiedBonds->operator[](i)) { -#ifdef VERBOSE_ENUMERATION - std::cout << "Sanitization has modified bond " << i - << std::endl; -#endif - modifiedBonds->set(i); - } - } - boost::shared_ptr kekulized_product(new RWMol(*wproduct)); - tautomers[tsmiles] = wproduct; - MolOps::Kekulize(*kekulized_product, false); - kekulized_mols[tsmiles] = kekulized_product; - -#ifdef VERBOSE_ENUMERATION - std::cout << "Now completed: " << std::endl; - for (const auto &tautomer : tautomers) { - std::cout << tautomer.first << std::endl; - } -#endif - - } else { + if (res.d_tautomers.find(tsmiles) != res.d_tautomers.end()) { #ifdef VERBOSE_ENUMERATION std::cout << "Previous tautomer produced again: " << tsmiles << std::endl; #endif + continue; } +#ifdef VERBOSE_ENUMERATION + std::cout << "New tautomer produced: " << tsmiles << std::endl; +#endif + // in addition to the above transformations, sanitzation may modify + // bonds, e.g. Cc1nc2ccccc2[nH]1 + for (size_t i = 0; i < mol.getNumBonds(); i++) { + auto molBondType = mol.getBondWithIdx(i)->getBondType(); + auto tautBondType = wproduct->getBondWithIdx(i)->getBondType(); + if (molBondType != tautBondType && !res.d_modifiedBonds.test(i)) { +#ifdef VERBOSE_ENUMERATION + std::cout << "Sanitization has modified bond " << i + << std::endl; +#endif + res.d_modifiedBonds.set(i); + } + } + RWMOL_SPTR kekulized_product(new RWMol(*wproduct)); + MolOps::Kekulize(*kekulized_product, false); + res.d_tautomers[tsmiles] = Tautomer( + std::move(wproduct), std::move(kekulized_product), + res.d_modifiedAtoms.count(), res.d_modifiedBonds.count()); + +#ifdef VERBOSE_ENUMERATION + std::cout << "Now completed: " << std::endl; + for (const auto &tautomer : res.d_tautomers) { + std::cout << tautomer.first << std::endl; + } +#endif } } } - done.push_back(tautomer.first); + done.insert(tautomer.first); } - if (tautomers.size() == done.size()) { - broken = true; - break; + completed = (res.d_tautomers.size() <= done.size()); + size_t maxNumModifiedAtoms = res.d_modifiedAtoms.count(); + size_t maxNumModifiedBonds = res.d_modifiedBonds.count(); + for (auto it = res.d_tautomers.begin(); it != res.d_tautomers.end();) { + auto &taut = it->second; + if ((taut.d_numModifiedAtoms < maxNumModifiedAtoms || + taut.d_numModifiedBonds < maxNumModifiedBonds) && + setTautomerStereo(mol, *taut.tautomer, res)) { + Tautomer tautStored = std::move(taut); + it = res.d_tautomers.erase(it); + tautStored.d_numModifiedAtoms = maxNumModifiedAtoms; + tautStored.d_numModifiedBonds = maxNumModifiedBonds; + auto insertRes = res.d_tautomers.insert(std::make_pair( + MolToSmiles(*tautStored.tautomer), std::move(tautStored))); + if (insertRes.second) { + it = insertRes.first; + } + } else { + ++it; + } + } + if (bailOut && res.d_tautomers.size() < d_maxTautomers && + res.d_status == TautomerEnumeratorStatus::MaxTautomersReached) { + res.d_status = TautomerEnumeratorStatus::Completed; + bailOut = false; } } // while - if (!broken) { - BOOST_LOG(rdWarningLog) << "Tautomer enumeration stopped at maximum " - << MAX_TAUTOMERS << std::endl; + res.fillTautomersItVec(); + if (!completed) { + BOOST_LOG(rdWarningLog) + << "Tautomer enumeration stopped at " << res.d_tautomers.size() + << " tautomers: " << statusMsg.at(static_cast(res.d_status)) + << std::endl; } - // remove chirality on atoms that were modified: - for (auto atom : mol.atoms()) { - auto atomIdx = atom->getIdx(); - if ((*modifiedAtoms)[atomIdx]) { - for (auto &tautomer : tautomers) { - tautomer.second->getAtomWithIdx(atomIdx)->setChiralTag( - Atom::ChiralType::CHI_UNSPECIFIED); - } - } - } - // remove chirality on bonds that are part of a tautomeric path - for (auto bond : mol.bonds()) { - auto bondIdx = bond->getIdx(); - if ((*modifiedBonds)[bondIdx] && bond->getBondType() == Bond::DOUBLE && - bond->getStereo() > Bond::BondStereo::STEREOANY) { - // look around the beginning and end atoms and check for bonds with - // direction set - boost::dynamic_bitset<> bondsToClearDirs(mol.getNumBonds()); - ROMol::OEDGE_ITER beg, end; - boost::tie(beg, end) = mol.getAtomBonds(bond->getBeginAtom()); - while (beg != end) { - auto obnd = mol[*beg]; - if (obnd->getBondDir() == Bond::BondDir::ENDDOWNRIGHT || - obnd->getBondDir() == Bond::BondDir::ENDUPRIGHT) { - bondsToClearDirs.set(obnd->getIdx()); - } - ++beg; - } - boost::tie(beg, end) = mol.getAtomBonds(bond->getEndAtom()); - while (beg != end) { - auto obnd = mol[*beg]; - if (obnd->getBondDir() == Bond::BondDir::ENDDOWNRIGHT || - obnd->getBondDir() == Bond::BondDir::ENDUPRIGHT) { - bondsToClearDirs.set(obnd->getIdx()); - } - ++beg; - } - for (auto &tautomer : tautomers) { - tautomer.second->getBondWithIdx(bondIdx)->setStereo(Bond::STEREONONE); - tautomer.second->getBondWithIdx(bondIdx)->getStereoAtoms().clear(); - for (unsigned int bi = 0; bi < mol.getNumBonds(); ++bi) { - if (bondsToClearDirs[bi]) { - tautomer.second->getBondWithIdx(bi)->setBondDir( - Bond::BondDir::NONE); - } - } - } - } - } - // Clean up stereochemistry - for (auto &tautomer : tautomers) { - bool cleanIt = true; - bool force = true; - MolOps::assignStereochemistry(*tautomer.second, cleanIt, force); - } - - std::vector res; - for (const auto &tautomer : tautomers) { - res.push_back(tautomer.second); - } return res; } +// pickCanonical non-templated overload that avoids recomputing SMILES +ROMol *TautomerEnumerator::pickCanonical( + const TautomerEnumeratorResult &tautRes, + boost::function scoreFunc) const { + ROMOL_SPTR bestMol; + if (tautRes.d_tautomers.size() == 1) { + bestMol = tautRes.d_tautomers.begin()->second.tautomer; + } else { + // Calculate score for each tautomer + int bestScore = std::numeric_limits::min(); + std::string bestSmiles = ""; + for (const auto &t : tautRes.d_tautomers) { + auto score = scoreFunc(*t.second.tautomer); +#ifdef VERBOSE_ENUMERATION + std::cerr << " " << t.first << " " << score << std::endl; +#endif + if (score > bestScore) { + bestScore = score; + bestSmiles = t.first; + bestMol = t.second.tautomer; + } else if (score == bestScore) { + if (t.first < bestSmiles) { + bestSmiles = t.first; + bestMol = t.second.tautomer; + } + } + } + } + ROMol *res = new ROMol(*bestMol); + static const bool cleanIt = true; + static const bool force = true; + MolOps::assignStereochemistry(*res, cleanIt, force); + + return res; +} + +ROMol *TautomerEnumerator::canonicalize( + const ROMol &mol, boost::function scoreFunc) const { + auto thisCopy = TautomerEnumerator(*this); + thisCopy.setReassignStereo(false); + auto res = thisCopy.enumerate(mol); + if (res.empty()) { + BOOST_LOG(rdWarningLog) + << "no tautomers found, returning input molecule" << std::endl; + return new ROMol(mol); + } + return pickCanonical(res, scoreFunc); +} + } // namespace MolStandardize } // namespace RDKit diff --git a/Code/GraphMol/MolStandardize/Tautomer.h b/Code/GraphMol/MolStandardize/Tautomer.h index 42d941f06..e7e56afe1 100644 --- a/Code/GraphMol/MolStandardize/Tautomer.h +++ b/Code/GraphMol/MolStandardize/Tautomer.h @@ -13,9 +13,12 @@ #include #include +#include #include +#include #include #include +#include #include namespace RDKit { @@ -40,29 +43,257 @@ inline int scoreTautomer(const ROMol &mol) { } } // namespace TautomerScoringFunctions +enum class TautomerEnumeratorStatus { + Completed = 0, + MaxTautomersReached, + MaxTransformsReached, + Canceled +}; + +class Tautomer { + friend class TautomerEnumerator; + + public: + Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0) {} + Tautomer(const ROMOL_SPTR &t, const ROMOL_SPTR &k, size_t a = 0, size_t b = 0) + : tautomer(t), + kekulized(k), + d_numModifiedAtoms(a), + d_numModifiedBonds(b) {} + ROMOL_SPTR tautomer; + ROMOL_SPTR kekulized; + + private: + size_t d_numModifiedAtoms; + size_t d_numModifiedBonds; +}; + +typedef std::map SmilesTautomerMap; +typedef std::pair SmilesTautomerPair; + +//! Contains results of tautomer enumeration +class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumeratorResult { + friend class TautomerEnumerator; + + public: + class const_iterator { + public: + typedef ROMOL_SPTR value_type; + typedef std::ptrdiff_t difference_type; + typedef const ROMol *pointer; + typedef const ROMOL_SPTR &reference; + typedef std::bidirectional_iterator_tag iterator_category; + + explicit const_iterator(const SmilesTautomerMap::const_iterator &it) + : d_it(it) {} + reference operator*() const { return d_it->second.tautomer; } + pointer operator->() const { return d_it->second.tautomer.get(); } + bool operator==(const const_iterator &other) const { + return (d_it == other.d_it); + } + bool operator!=(const const_iterator &other) const { + return !(*this == other); + } + const_iterator operator++(int) { + const_iterator copy(d_it); + operator++(); + return copy; + } + const_iterator &operator++() { + ++d_it; + return *this; + } + const_iterator operator--(int) { + const_iterator copy(d_it); + operator--(); + return copy; + } + const_iterator &operator--() { + --d_it; + return *this; + } + + private: + SmilesTautomerMap::const_iterator d_it; + }; + TautomerEnumeratorResult() : d_status(TautomerEnumeratorStatus::Completed) {} + TautomerEnumeratorResult(const TautomerEnumeratorResult &other) + : d_tautomers(other.d_tautomers), + d_status(other.d_status), + d_modifiedAtoms(other.d_modifiedAtoms), + d_modifiedBonds(other.d_modifiedBonds) { + fillTautomersItVec(); + } + const const_iterator begin() const { + return const_iterator(d_tautomers.begin()); + } + const const_iterator end() const { return const_iterator(d_tautomers.end()); } + size_t size() const { return d_tautomers.size(); } + bool empty() const { return d_tautomers.empty(); } + const ROMOL_SPTR &at(size_t pos) const { + PRECONDITION(pos < d_tautomers.size(), "index out of bounds"); + return d_tautomersItVec.at(pos)->second.tautomer; + } + const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); } + const boost::dynamic_bitset<> &modifiedAtoms() const { + return d_modifiedAtoms; + } + const boost::dynamic_bitset<> &modifiedBonds() const { + return d_modifiedBonds; + } + TautomerEnumeratorStatus status() const { return d_status; } + std::vector tautomers() const { + std::vector tautomerVec; + tautomerVec.reserve(d_tautomers.size()); + std::transform( + d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec), + [](const SmilesTautomerPair &t) { return t.second.tautomer; }); + return tautomerVec; + } + std::vector operator()() const { return tautomers(); } + std::vector smiles() const { + std::vector smilesVec; + smilesVec.reserve(d_tautomers.size()); + std::transform(d_tautomers.begin(), d_tautomers.end(), + std::back_inserter(smilesVec), + [](const SmilesTautomerPair &t) { return t.first; }); + return smilesVec; + } + const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; } + + private: + void fillTautomersItVec() { + for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) { + d_tautomersItVec.push_back(it); + } + } + // the enumerated tautomers + SmilesTautomerMap d_tautomers; + // internal; vector of iterators into map items to enable random + // access to map items by index + std::vector d_tautomersItVec; + // status of the enumeration: did it complete? did it hit a limit? + // was it canceled? + TautomerEnumeratorStatus d_status; + // bit vector: flags atoms modified by the transforms + boost::dynamic_bitset<> d_modifiedAtoms; + // bit vector: flags bonds modified by the transforms + boost::dynamic_bitset<> d_modifiedBonds; +}; + +class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumeratorCallback { + public: + TautomerEnumeratorCallback() {} + virtual ~TautomerEnumeratorCallback() {} + virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &) = 0; +}; + class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumerator { public: - TautomerEnumerator() = delete; - TautomerEnumerator(TautomerCatalog *tautCat) : dp_catalog(tautCat){}; + TautomerEnumerator(TautomerCatalog *tautCat) + : dp_catalog(tautCat), + d_maxTautomers(1000), + d_maxTransforms(1000), + d_removeSp3Stereo(true), + d_removeBondStereo(true), + d_reassignStereo(true) {} + TautomerEnumerator(const CleanupParameters ¶ms = CleanupParameters()); TautomerEnumerator(const TautomerEnumerator &other) - : dp_catalog(other.dp_catalog){}; + : dp_catalog(other.dp_catalog), + d_callback(other.d_callback.get()), + d_maxTautomers(other.d_maxTautomers), + d_maxTransforms(other.d_maxTransforms), + d_removeSp3Stereo(other.d_removeSp3Stereo), + d_removeBondStereo(other.d_removeBondStereo), + d_reassignStereo(other.d_reassignStereo) {} TautomerEnumerator &operator=(const TautomerEnumerator &other) { if (this == &other) return *this; dp_catalog = other.dp_catalog; + d_callback.reset(other.d_callback.get()); + d_maxTautomers = other.d_maxTautomers; + d_maxTransforms = other.d_maxTransforms; + d_removeSp3Stereo = other.d_removeSp3Stereo; + d_removeBondStereo = other.d_removeBondStereo; + d_reassignStereo = other.d_reassignStereo; return *this; } + //! \param maxTautomers maximum number of tautomers to be generated + void setMaxTautomers(unsigned int maxTautomers) { + d_maxTautomers = maxTautomers; + } + //! \return maximum number of tautomers to be generated + unsigned int getMaxTautomers() { return d_maxTautomers; } + /*! \param maxTransforms maximum number of transformations to be applied + this limit is usually hit earlier than the maxTautomers limit + and leads to a more linear scaling of CPU time with increasing + number of tautomeric centers (see Sitzmann et al.) + */ + void setMaxTransforms(unsigned int maxTransforms) { + d_maxTransforms = maxTransforms; + } + //! \return maximum number of transformations to be applied + unsigned int getMaxTransforms() { return d_maxTransforms; } + /*! \param removeSp3Stereo; if set to true, stereochemistry information + will be removed from sp3 atoms involved in tautomerism. + This means that S-aminoacids will lose their stereochemistry after going + through tautomer enumeration because of the amido-imidol tautomerism. + This defaults to true in RDKit, false in the workflow described + by Sitzmann et al. + */ + void setRemoveSp3Stereo(bool removeSp3Stereo) { + d_removeSp3Stereo = removeSp3Stereo; + } + /*! \return whether stereochemistry information will be removed from + sp3 atoms involved in tautomerism + */ + bool getRemoveSp3Stereo() { return d_removeSp3Stereo; } + /*! \param removeBondStereo; if set to true, stereochemistry information + will be removed from double bonds involved in tautomerism. + This means that enols will lose their E/Z stereochemistry after going + through tautomer enumeration because of the keto-enolic tautomerism. + This defaults to true in RDKit and also in the workflow described + by Sitzmann et al. + */ + void setRemoveBondStereo(bool removeBondStereo) { + d_removeBondStereo = removeBondStereo; + } + /*! \return whether stereochemistry information will be removed from + double bonds involved in tautomerism + */ + bool getRemoveBondStereo() { return d_removeBondStereo; } + /*! \param reassignStereo; if set to true, assignStereochemistry + will be called on each tautomer generated by the enumerate() method. + This defaults to true. + */ + void setReassignStereo(bool reassignStereo) { + d_reassignStereo = reassignStereo; + } + /*! \return whether assignStereochemistry will be called on each + tautomer generated by the enumerate() method + */ + bool getReassignStereo() { return d_reassignStereo; } + /*! set this to an instance of a class derived from + TautomerEnumeratorCallback where operator() is overridden. + DO NOT delete the instance as ownership of the pointer is transferred + to the TautomerEnumerator + */ + void setCallback(TautomerEnumeratorCallback *callback) { + d_callback.reset(callback); + } + /*! \return pointer to an instance of a class derived from + TautomerEnumeratorCallback. + DO NOT delete the instance as ownership of the pointer is transferred + to the TautomerEnumerator + */ + TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); } - //! returns all tautomers for the input molecule + //! returns a \c TautomerEnumeratorResult structure for the input molecule /*! The enumeration rules are inspired by the publication: M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010) https://doi.org/10.1007/s10822-010-9346-4 \param mol: the molecule to be enumerated - \param modifiedAtoms: if provided this is used to return which atoms are - modified during the tautomerization - \param modifiedBonds: if provided this is used to return which bonds are - modified during the tautomerization Note: the definitions used here are that the atoms modified during tautomerization are the atoms at the beginning and end of each tautomer @@ -72,27 +303,65 @@ class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumerator { "acceptor") */ - std::vector enumerate( - const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms = nullptr, - boost::dynamic_bitset<> *modifiedBonds = nullptr) const; + TautomerEnumeratorResult enumerate(const ROMol &mol) const; - //! returns the canonical tautomer from a set of possible tautomers - /*! - Note that the canonical tautomer is very likely not the most stable tautomer - for any given conditions. The default scoring rules are designed to produce - "reasonable" tautomers, but the primary concern is that the results are - canonical: you always get the same canonical tautomer for a molecule - regardless of what the input tautomer or atom ordering were. + //! Deprecated, please use the form returning a \c TautomerEnumeratorResult + //! instead + [ + [deprecated("please use the form returning a TautomerEnumeratorResult " + "instead")]] std::vector + enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, + boost::dynamic_bitset<> *modifiedBonds = nullptr) const; - The default scoring scheme is inspired by the publication: - M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010) - https://doi.org/10.1007/s10822-010-9346-4 - - */ - ROMol *pickCanonical(const std::vector &tautomers, + //! returns the canonical tautomer from a \c TautomerEnumeratorResult + ROMol *pickCanonical(const TautomerEnumeratorResult &tautRes, boost::function scoreFunc = TautomerScoringFunctions::scoreTautomer) const; + //! returns the canonical tautomer from an iterable of possible tautomers + // When Iterable is TautomerEnumeratorResult we use the other non-templated + // overload for efficiency (TautomerEnumeratorResult already has SMILES so no + // need to recompute them) + template ::value, + int>::type = 0> + ROMol *pickCanonical(const Iterable &tautomers, + boost::function scoreFunc = + TautomerScoringFunctions::scoreTautomer) const { + ROMOL_SPTR bestMol; + if (tautomers.size() == 1) { + bestMol = *tautomers.begin(); + } else { + // Calculate score for each tautomer + int bestScore = std::numeric_limits::min(); + std::string bestSmiles = ""; + for (const auto &t : tautomers) { + auto score = scoreFunc(*t); +#ifdef VERBOSE_ENUMERATION + std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl; +#endif + if (score > bestScore) { + bestScore = score; + bestSmiles = MolToSmiles(*t); + bestMol = t; + } else if (score == bestScore) { + auto smiles = MolToSmiles(*t); + if (smiles < bestSmiles) { + bestSmiles = smiles; + bestMol = t; + } + } + } + } + ROMol *res = new ROMol(*bestMol); + static const bool cleanIt = true; + static const bool force = true; + MolOps::assignStereochemistry(*res, cleanIt, force); + + return res; + } + //! returns the canonical tautomer for a molecule /*! Note that the canonical tautomer is very likely not the most stable tautomer @@ -108,18 +377,18 @@ class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumerator { */ ROMol *canonicalize(const ROMol &mol, boost::function scoreFunc = - TautomerScoringFunctions::scoreTautomer) const { - auto tautomers = enumerate(mol); - if (!tautomers.size()) { - BOOST_LOG(rdWarningLog) - << "no tautomers found, returning input molecule" << std::endl; - return new ROMol(mol); - } - return pickCanonical(tautomers, scoreFunc); - }; + TautomerScoringFunctions::scoreTautomer) const; private: + bool setTautomerStereo(const ROMol &mol, ROMol &taut, + const TautomerEnumeratorResult &res) const; std::shared_ptr dp_catalog; + std::unique_ptr d_callback; + unsigned int d_maxTautomers; + unsigned int d_maxTransforms; + bool d_removeSp3Stereo; + bool d_removeBondStereo; + bool d_reassignStereo; }; // TautomerEnumerator class } // namespace MolStandardize diff --git a/Code/GraphMol/MolStandardize/TautomerCatalog/TautomerCatalogParams.h b/Code/GraphMol/MolStandardize/TautomerCatalog/TautomerCatalogParams.h index bda63f83e..59c95dec5 100644 --- a/Code/GraphMol/MolStandardize/TautomerCatalog/TautomerCatalogParams.h +++ b/Code/GraphMol/MolStandardize/TautomerCatalog/TautomerCatalogParams.h @@ -38,6 +38,9 @@ class RDKIT_MOLSTANDARDIZE_EXPORT TautomerCatalogParams ~TautomerCatalogParams() override; + // DEPRECATED: remove in release 2021.01 + // the function name is misleading and getTransforms().size() + // yields the same information unsigned int getNumTautomers() const { return static_cast(d_transforms.size()); } diff --git a/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp b/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp index 51658fd18..fb95071bf 100644 --- a/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp +++ b/Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp @@ -7,6 +7,8 @@ // which is included in the file license.txt, found at the root // of the RDKit source tree. // +#define NO_IMPORT_ARRAY +#include #include #include @@ -18,12 +20,183 @@ namespace python = boost::python; using namespace RDKit; namespace { +class PyTautomerEnumeratorResult { + public: + PyTautomerEnumeratorResult(const MolStandardize::TautomerEnumeratorResult &tr) + : d_tr(new MolStandardize::TautomerEnumeratorResult(std::move(tr))) { + python::list atList; + python::list bndList; + for (unsigned int i = 0; i < d_tr->modifiedAtoms().size(); ++i) { + if (d_tr->modifiedAtoms().test(i)) { + atList.append(i); + } + } + for (unsigned int i = 0; i < d_tr->modifiedBonds().size(); ++i) { + if (d_tr->modifiedBonds().test(i)) { + bndList.append(i); + } + } + d_atTuple = python::tuple(atList); + d_bndTuple = python::tuple(bndList); + } + inline const std::vector *tautomers() const { + return new std::vector(std::move(d_tr->tautomers())); + } + inline const std::vector *smiles() const { + return new std::vector(std::move(d_tr->smiles())); + } + inline const MolStandardize::SmilesTautomerMap &smilesTautomerMap() const { + return d_tr->smilesTautomerMap(); + } + inline MolStandardize::TautomerEnumeratorStatus status() const { + return d_tr->status(); + } + inline python::tuple modifiedAtoms() const { return d_atTuple; } + inline python::tuple modifiedBonds() const { return d_bndTuple; } + inline const MolStandardize::TautomerEnumeratorResult::const_iterator begin() + const { + return d_tr->begin(); + } + inline const MolStandardize::TautomerEnumeratorResult::const_iterator end() + const { + return d_tr->end(); + } + inline int size() const { return d_tr->size(); } + RDKit::ROMol *at(int pos) const { + if (pos < 0) { + pos += size(); + } + if (pos < 0 || pos >= size()) { + PyErr_SetString(PyExc_IndexError, "index out of bounds"); + python::throw_error_already_set(); + return nullptr; + } + return new RDKit::ROMol(*d_tr->at(pos)); + } + const MolStandardize::TautomerEnumeratorResult *get() { return d_tr.get(); } + + private: + boost::shared_ptr d_tr; + python::tuple d_atTuple; + python::tuple d_bndTuple; +}; + +class PyTautomerEnumeratorCallback + : public MolStandardize::TautomerEnumeratorCallback, + public python::wrapper { + public: + PyTautomerEnumeratorCallback() {} + PyTautomerEnumeratorCallback(const python::object &pyCallbackObject) { + PyTautomerEnumeratorCallback *pyCallback = + python::extract(pyCallbackObject); + *this = *pyCallback; + d_pyCallbackObject = pyCallbackObject; + pyCallback->d_cppCallback = this; + } + inline python::object getCallbackOverride() const { + return get_override("__call__"); + } + bool operator()(const ROMol &mol, + const MolStandardize::TautomerEnumeratorResult &res) { + PyTautomerEnumeratorResult pyRes(res); + return getCallbackOverride()(boost::ref(mol), boost::ref(pyRes)); + } + python::object getPyCallbackObject() { return d_pyCallbackObject; } + + private: + PyTautomerEnumeratorCallback *d_cppCallback; + python::object d_pyCallbackObject; +}; + +typedef boost::shared_ptr TAUT_SPTR; + +ROMol *getTautomerHelper(const TAUT_SPTR &self) { + return new ROMol(*self->tautomer); +} + +ROMol *getKekulizedHelper(const TAUT_SPTR &self) { + return new ROMol(*self->kekulized); +} + +python::tuple smilesTautomerMapKeysHelper( + const MolStandardize::SmilesTautomerMap &self) { + python::list keys; + for (const auto &pair : self) { + keys.append(pair.first); + } + return python::tuple(keys); +} + +python::tuple smilesTautomerMapValuesHelper( + const MolStandardize::SmilesTautomerMap &self) { + python::list values; + for (const auto &pair : self) { + values.append(TAUT_SPTR(new MolStandardize::Tautomer(pair.second))); + } + return python::tuple(values); +} + +python::tuple smilesTautomerMapItemsHelper( + const MolStandardize::SmilesTautomerMap &self) { + python::list items; + for (const auto &pair : self) { + items.append(python::make_tuple( + pair.first, TAUT_SPTR(new MolStandardize::Tautomer(pair.second)))); + } + return python::tuple(items); +} + +python::object getCallbackHelper(const MolStandardize::TautomerEnumerator &te) { + PyTautomerEnumeratorCallback *cppCallback = + dynamic_cast(te.getCallback()); + python::object res; + if (cppCallback) { + res = cppCallback->getPyCallbackObject(); + } + return res; +}; + +void setCallbackHelper(MolStandardize::TautomerEnumerator &te, + PyObject *callback) { + PRECONDITION(callback, "callback must not be NULL"); + if (callback == Py_None) { + te.setCallback(nullptr); + return; + } + python::object callbackObject(python::handle<>(python::borrowed(callback))); + python::extract extractCallback( + callbackObject); + if (extractCallback.check()) { + if (!PyCallable_Check(extractCallback()->getCallbackOverride().ptr())) { + PyErr_SetString(PyExc_AttributeError, + "The __call__ attribute in the " + "rdMolStandardize.TautomerEnumeratorCallback subclass " + "must exist and be a callable method"); + python::throw_error_already_set(); + } else { + te.setCallback(new PyTautomerEnumeratorCallback(callbackObject)); + } + } else { + PyErr_SetString(PyExc_TypeError, + "Expected an instance of a " + "rdMolStandardize.TautomerEnumeratorCallback subclass"); + python::throw_error_already_set(); + } +} + +std::string tautomerEnumeratorCallbackClassDoc = + R"DOC(Create a derived class from this abstract base class and + implement the __call__() method. + The __call__() method is called in the innermost loop of the + algorithm, and provides a mechanism to monitor or stop + its progress. + + To have your callback called, pass an instance of your + derived class to TautomerEnumerator.SetCallback())DOC"; + MolStandardize::TautomerEnumerator *EnumeratorFromParams( const MolStandardize::CleanupParameters ¶ms) { - auto tautparams = - new MolStandardize::TautomerCatalogParams(params.tautomerTransforms); - auto cat = new MolStandardize::TautomerCatalog(tautparams); - return new MolStandardize::TautomerEnumerator(cat); + return new MolStandardize::TautomerEnumerator(params); } MolStandardize::TautomerEnumerator *createDefaultEnumerator() { @@ -31,11 +204,6 @@ MolStandardize::TautomerEnumerator *createDefaultEnumerator() { return EnumeratorFromParams(ps); } -ROMol *canonicalizeHelper(const MolStandardize::TautomerEnumerator &self, - const ROMol &mol) { - return self.canonicalize(mol); -} - class pyobjFunctor { public: pyobjFunctor(python::object obj) : dp_obj(std::move(obj)) {} @@ -48,59 +216,151 @@ class pyobjFunctor { python::object dp_obj; }; +ROMol *canonicalizeHelper(const MolStandardize::TautomerEnumerator &self, + const ROMol &mol) { + return self.canonicalize(mol); +} + ROMol *canonicalizeHelper2(const MolStandardize::TautomerEnumerator &self, const ROMol &mol, python::object scoreFunc) { pyobjFunctor ftor(scoreFunc); return self.canonicalize(mol, ftor); } -double scoreHelper(const MolStandardize::TautomerEnumerator &self, - const ROMol &mol) { - RDUNUSED_PARAM(self); - return MolStandardize::TautomerScoringFunctions::scoreTautomer(mol); +inline std::vector extractPythonIterable(const python::object &o) { + if (!PyObject_HasAttrString(o.ptr(), "__iter__")) { + PyErr_SetString(PyExc_TypeError, + "the passed object should be an iterable of Mol objects"); + python::throw_error_already_set(); + return std::vector(); + } + return std::vector(python::stl_input_iterator(o), + python::stl_input_iterator()); } -std::vector enumerateHelper( - const MolStandardize::TautomerEnumerator &self, const ROMol &mol, - python::object pyModAtoms, python::object pyModBonds) { - if (pyModAtoms != python::object() || pyModBonds != python::object()) { - boost::dynamic_bitset<> modifiedAtoms(mol.getNumAtoms()); - boost::dynamic_bitset<> modifiedBonds(mol.getNumBonds()); - auto res = self.enumerate(mol, &modifiedAtoms, &modifiedBonds); - if (pyModAtoms != python::object()) { - python::list atList = python::extract(pyModAtoms); - for (unsigned int i = 0; i < mol.getNumAtoms(); ++i) { - if (modifiedAtoms[i]) { - atList.append(i); - } - } - } - if (pyModBonds != python::object()) { - python::list bndList = python::extract(pyModBonds); - for (unsigned int i = 0; i < mol.getNumBonds(); ++i) { - if (modifiedBonds[i]) { - bndList.append(i); - } - } - } - return res; - } else { - return self.enumerate(mol); +ROMol *pickCanonicalHelper(const MolStandardize::TautomerEnumerator &self, + const python::object &o) { + python::extract e(o); + if (e.check()) { + return self.pickCanonical(*e()->get()); } + return self.pickCanonical(extractPythonIterable(o)); +} + +ROMol *pickCanonicalHelper2(const MolStandardize::TautomerEnumerator &self, + const python::object &o, python::object scoreFunc) { + pyobjFunctor ftor(scoreFunc); + python::extract e(o); + if (e.check()) { + return self.pickCanonical(*e()->get(), ftor); + } + return self.pickCanonical(extractPythonIterable(o), ftor); +} + +PyTautomerEnumeratorResult *enumerateHelper( + const MolStandardize::TautomerEnumerator &self, const ROMol &mol) { + return new PyTautomerEnumeratorResult(self.enumerate(mol)); } } // namespace struct tautomer_wrapper { static void wrap() { + python::enum_( + "TautomerEnumeratorStatus") + .value("Completed", MolStandardize::TautomerEnumeratorStatus::Completed) + .value("MaxTautomersReached", + MolStandardize::TautomerEnumeratorStatus::MaxTautomersReached) + .value("MaxTransformsReached", + MolStandardize::TautomerEnumeratorStatus::MaxTransformsReached) + .value("Canceled", MolStandardize::TautomerEnumeratorStatus::Canceled); + + python::class_( + "TautomerEnumeratorCallback", + tautomerEnumeratorCallbackClassDoc.c_str(), python::init<>()) + .def("__call__", + python::pure_virtual(&PyTautomerEnumeratorCallback::operator()), + "This must be implemented in the derived class. " + "Return True if the tautomer enumeration should continue; " + "False if the tautomer enumeration should stop.\n"); + + python::class_( + "Tautomer", + "used to hold the aromatic and kekulized versions " + "of each tautomer", + python::no_init) + .add_property( + "tautomer", + python::make_function( + &getTautomerHelper, + python::return_value_policy()), + "aromatic version of the tautomer") + .add_property( + "kekulized", + python::make_function( + &getKekulizedHelper, + python::return_value_policy()), + "kekulized version of the tautomer"); + + python::class_( + "SmilesTautomerMap", + "maps SMILES strings to the respective Tautomer objects", + python::no_init) + .def(python::map_indexing_suite()) + .def("keys", &smilesTautomerMapKeysHelper) + .def("values", &smilesTautomerMapValuesHelper) + .def("items", &smilesTautomerMapItemsHelper); + + python::class_( + "TautomerEnumeratorResult", + "used to return tautomer enumeration results", python::no_init) + .add_property( + "tautomers", + python::make_function( + &PyTautomerEnumeratorResult::tautomers, + python::return_value_policy()), + "tautomers generated by the enumerator") + .add_property( + "smiles", + python::make_function( + &PyTautomerEnumeratorResult::smiles, + python::return_value_policy()), + "SMILES of tautomers generated by the enumerator") + .add_property("smilesTautomerMap", + python::make_function( + &PyTautomerEnumeratorResult::smilesTautomerMap, + python::return_value_policy< + python::reference_existing_object>()), + "dictionary mapping SMILES strings to the respective " + "Tautomer objects") + .add_property("status", &PyTautomerEnumeratorResult::status, + "whether the enumeration completed or not; see " + "TautomerEnumeratorStatus for possible values") + .add_property( + "modifiedAtoms", + python::make_function(&PyTautomerEnumeratorResult::modifiedAtoms), + "tuple of atom indices modified by the transforms") + .add_property( + "modifiedBonds", + python::make_function(&PyTautomerEnumeratorResult::modifiedBonds), + "tuple of bond indices modified by the transforms") + .def("__call__", &PyTautomerEnumeratorResult::tautomers, + python::return_value_policy(), + "tautomers generated by the enumerator") + .def("__iter__", python::range(&PyTautomerEnumeratorResult::begin, + &PyTautomerEnumeratorResult::end)) + .def("__getitem__", &PyTautomerEnumeratorResult::at, + python::return_value_policy()) + .def("__len__", &PyTautomerEnumeratorResult::size); + python::class_( "TautomerEnumerator", python::no_init) .def("__init__", python::make_constructor(createDefaultEnumerator)) .def("__init__", python::make_constructor(EnumeratorFromParams)) .def("Enumerate", &enumerateHelper, - (python::arg("self"), python::arg("mol"), - python::arg("modifiedAtoms") = python::object(), - python::arg("modifiedBonds") = python::object()), + (python::arg("self"), python::arg("mol")), + python::return_value_policy(), R"DOC(Generates the tautomers for a molecule. The enumeration rules are inspired by the publication: @@ -112,9 +372,10 @@ struct tautomer_wrapper { transform (the H "donor" and H "acceptor" in the transform) and the bonds modified during transformation are any bonds whose order is changed during the tautomer transform (these are the bonds between the "donor" and the - "acceptor"))DOC") + "acceptor").)DOC") .def("Canonicalize", &canonicalizeHelper, (python::arg("self"), python::arg("mol")), + python::return_value_policy(), R"DOC(Returns the canonical tautomer for a molecule. The default scoring scheme is inspired by the publication: @@ -125,18 +386,94 @@ struct tautomer_wrapper { for any given conditions. The default scoring rules are designed to produce "reasonable" tautomers, but the primary concern is that the results are canonical: you always get the same canonical tautomer for a molecule - regardless of what the input tautomer or atom ordering were.)DOC", - python::return_value_policy()) + regardless of what the input tautomer or atom ordering were.)DOC") .def( "Canonicalize", &canonicalizeHelper2, (python::arg("self"), python::arg("mol"), python::arg("scoreFunc")), - "returns the canonical tautomer for a molecule using a custom " - "scoring function", - python::return_value_policy()) - .def("ScoreTautomer", &scoreHelper, - (python::arg("self"), python::arg("mol")), + python::return_value_policy(), + "picks the canonical tautomer from an iterable of molecules " + "using a custom scoring function") + .def("PickCanonical", &pickCanonicalHelper, + (python::arg("self"), python::arg("iterable")), + python::return_value_policy(), + "picks the canonical tautomer from an iterable of molecules") + .def("PickCanonical", &pickCanonicalHelper2, + (python::arg("self"), python::arg("iterable"), + python::arg("scoreFunc")), + python::return_value_policy(), + "returns the canonical tautomer for a molecule using a custom " + "scoring function") + .def("ScoreTautomer", + &MolStandardize::TautomerScoringFunctions::scoreTautomer, + (python::arg("mol")), "returns the score for a tautomer using the default scoring " "scheme.") + .staticmethod("ScoreTautomer") + .def("SetMaxTautomers", + &MolStandardize::TautomerEnumerator::setMaxTautomers, + (python::arg("self"), python::arg("maxTautomers")), + "set the maximum number of tautomers to be generated.") + .def("GetMaxTautomers", + &MolStandardize::TautomerEnumerator::getMaxTautomers, + (python::arg("self")), + "returns the maximum number of tautomers to be generated.") + .def("SetMaxTransforms", + &MolStandardize::TautomerEnumerator::setMaxTransforms, + (python::arg("self"), python::arg("maxTransforms")), + "set the maximum number of transformations to be applied. " + "This limit is usually hit earlier than the maxTautomers limit " + "and leads to a more linear scaling of CPU time with increasing " + "number of tautomeric centers (see Sitzmann et al.).") + .def("GetMaxTransforms", + &MolStandardize::TautomerEnumerator::getMaxTransforms, + (python::arg("self")), + "returns the maximum number of transformations to be applied.") + .def("SetRemoveSp3Stereo", + &MolStandardize::TautomerEnumerator::setRemoveSp3Stereo, + (python::arg("self"), python::arg("removeSp3Stereo")), + "set to True if you wish stereochemistry information " + "to be removed from sp3 atoms involved in tautomerism. " + "This means that S-aminoacids will lose their stereochemistry " + "after going through tautomer enumeration because of the " + "amido-imidol tautomerism. This defaults to True in RDKit, " + "and to False in the workflow described by Sitzmann et al.") + .def("GetRemoveSp3Stereo", + &MolStandardize::TautomerEnumerator::getRemoveSp3Stereo, + (python::arg("self")), + "returns whether stereochemistry information will be removed from " + "sp3 atoms involved in tautomerism.") + .def("SetRemoveBondStereo", + &MolStandardize::TautomerEnumerator::setRemoveBondStereo, + (python::arg("self"), python::arg("removeBondStereo")), + "set to True if you wish stereochemistry information " + "to be removed from double bonds involved in tautomerism. " + "This means that enols will lose their E/Z stereochemistry " + "after going through tautomer enumeration because of the " + "keto-enolic tautomerism. This defaults to True in the " + "RDKit and also in the workflow described by Sitzmann et al.") + .def("GetRemoveBondStereo", + &MolStandardize::TautomerEnumerator::getRemoveBondStereo, + (python::arg("self")), + "returns whether stereochemistry information " + "will be removed from double bonds involved in tautomerism.") + .def("SetReassignStereo", + &MolStandardize::TautomerEnumerator::setReassignStereo, + (python::arg("self"), python::arg("reassignStereo")), + "set to True if you wish AssignStereochemistry to be called " + "on each tautomer generated by the Enumerate() method. " + "This defaults to True.") + .def("GetReassignStereo", + &MolStandardize::TautomerEnumerator::getReassignStereo, + (python::arg("self")), + "returns whether AssignStereochemistry will be called " + "on each tautomer generated by the Enumerate() method.") + .def("SetCallback", &setCallbackHelper, + "Pass an instance of a class derived from\n" + "TautomerEnumeratorCallback, which must implement the\n" + "__call__() method.") + .def("GetCallback", &getCallbackHelper, + "Get the TautomerEnumeratorCallback subclass instance,\n" + "or None if none was set.") .def_readonly( "tautomerScoreVersion", MolStandardize::TautomerScoringFunctions::tautomerScoringVersion); diff --git a/Code/GraphMol/MolStandardize/Wrap/rdMolStandardize.cpp b/Code/GraphMol/MolStandardize/Wrap/rdMolStandardize.cpp index 4bc420287..9b96d0057 100644 --- a/Code/GraphMol/MolStandardize/Wrap/rdMolStandardize.cpp +++ b/Code/GraphMol/MolStandardize/Wrap/rdMolStandardize.cpp @@ -103,9 +103,6 @@ BOOST_PYTHON_MODULE(rdMolStandardize) { .def_readwrite("maxRestarts", &RDKit::MolStandardize::CleanupParameters::maxRestarts, "maximum number of restarts") - .def_readwrite("maxTautomers", - &RDKit::MolStandardize::CleanupParameters::maxTautomers, - "maximum number of tautomers to generate") .def_readwrite("preferOrganic", &RDKit::MolStandardize::CleanupParameters::preferOrganic, "prefer organic fragments to inorganic ones when deciding " @@ -114,8 +111,29 @@ BOOST_PYTHON_MODULE(rdMolStandardize) { &RDKit::MolStandardize::CleanupParameters::doCanonical, "apply atom-order dependent normalizations (like " "uncharging) in a canonical order") - - ; + .def_readwrite("maxTautomers", + &RDKit::MolStandardize::CleanupParameters::maxTautomers, + "maximum number of tautomers to generate (defaults to " + "1000)") + .def_readwrite("maxTransforms", + &RDKit::MolStandardize::CleanupParameters::maxTransforms, + "maximum number of transforms to apply during tautomer " + "enumeration (defaults to 1000)") + .def_readwrite( + "tautomerRemoveSp3Stereo", + &RDKit::MolStandardize::CleanupParameters::tautomerRemoveSp3Stereo, + "remove stereochemistry from sp3 centers involved in " + "tautomerism (defaults to True)") + .def_readwrite( + "tautomerRemoveBondStereo", + &RDKit::MolStandardize::CleanupParameters::tautomerRemoveBondStereo, + "remove stereochemistry from double bonds involved in " + "tautomerism (defaults to True)") + .def_readwrite( + "tautomerReassignStereo", + &RDKit::MolStandardize::CleanupParameters::tautomerReassignStereo, + "call AssignStereochemistry on all generated tautomers " + "(defaults to True)"); docString = "Standardizes a molecule"; python::def("Cleanup", cleanupHelper, diff --git a/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py b/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py index 32dffe7c1..635b8f520 100644 --- a/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py +++ b/Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py @@ -6,6 +6,7 @@ from rdkit import RDConfig import os import sys import math +from datetime import datetime, timedelta import unittest from rdkit import DataStructs from rdkit import Chem @@ -228,10 +229,21 @@ chlorine [Cl] ctaut = enumerator.Canonicalize(m) self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") - tauts = enumerator.Enumerate(m) - self.assertEqual(len(tauts), 2) - ctauts = list(sorted(Chem.MolToSmiles(x) for x in tauts)) + taut_res = enumerator.Enumerate(m) + self.assertEqual(len(taut_res), 2) + ctauts = list(sorted(Chem.MolToSmiles(x) for x in taut_res)) self.assertEqual(ctauts, ['O=C1CCCCC1', 'OC1=CCCCC1']) + self.assertEqual(list(taut_res.smiles), ['O=C1CCCCC1', 'OC1=CCCCC1']) + # this tests the non-templated overload + self.assertEqual(Chem.MolToSmiles(enumerator.PickCanonical(taut_res)), "O=C1CCCCC1") + # this tests the templated overload + self.assertEqual(Chem.MolToSmiles(enumerator.PickCanonical(set(taut_res()))), "O=C1CCCCC1") + with self.assertRaises(TypeError): + enumerator.PickCanonical(1) + with self.assertRaises(TypeError): + enumerator.PickCanonical([0, 1]) + self.assertEqual(Chem.MolToSmiles(enumerator.PickCanonical( + Chem.MolFromSmiles(x) for x in ['O=C1CCCCC1', 'OC1=CCCCC1'])), "O=C1CCCCC1") def scorefunc1(mol): ' stupid tautomer scoring function ' @@ -260,32 +272,453 @@ chlorine [Cl] self.assertEqual(enumerator.ScoreTautomer(Chem.MolFromSmiles('N=c1[nH]cccc1')), 99) self.assertEqual(enumerator.ScoreTautomer(Chem.MolFromSmiles('Nc1ncccc1')), 100) + def scorefunc2(mol): + ' stupid tautomer scoring function ' + p = Chem.MolFromSmarts('O=C') + return len(mol.GetSubstructMatches(p)) + + m = Chem.MolFromSmiles("C1(=CCCCC1)O") + ctaut = enumerator.Canonicalize(m, scorefunc1) + self.assertEqual(Chem.MolToSmiles(ctaut), "OC1=CCCCC1") + ctaut = enumerator.Canonicalize(m, scorefunc2) + self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") + # make sure lambdas work + ctaut = enumerator.Canonicalize( + m, lambda x: len(x.GetSubstructMatches(Chem.MolFromSmarts('C=O')))) + self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") + + # make sure we behave if we return something bogus from the scoring function + with self.assertRaises(TypeError): + ctaut = enumerator.Canonicalize(m, lambda x: 'fail') + + self.assertEqual(enumerator.ScoreTautomer(Chem.MolFromSmiles('N=c1[nH]cccc1')), 99) + self.assertEqual(enumerator.ScoreTautomer(Chem.MolFromSmiles('Nc1ncccc1')), 100) + + res = enumerator.Enumerate(m) + # this test the specialized overload + ctaut = enumerator.PickCanonical(res, scorefunc1) + self.assertEqual(Chem.MolToSmiles(ctaut), "OC1=CCCCC1") + ctaut = enumerator.PickCanonical(res, scorefunc2) + self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") + # make sure lambdas work + ctaut = enumerator.PickCanonical( + res, lambda x: len(x.GetSubstructMatches(Chem.MolFromSmarts('C=O')))) + self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") + # make sure we behave if we return something bogus from the scoring function + with self.assertRaises(TypeError): + ctaut = enumerator.PickCanonical(res, lambda x: 'fail') + # this test the non-specialized overload + ctaut = enumerator.PickCanonical(set(res()), scorefunc1) + self.assertEqual(Chem.MolToSmiles(ctaut), "OC1=CCCCC1") + ctaut = enumerator.PickCanonical(set(res()), scorefunc2) + self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") + # make sure lambdas work + ctaut = enumerator.PickCanonical( + set(res()), lambda x: len(x.GetSubstructMatches(Chem.MolFromSmarts('C=O')))) + self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1") + # make sure we behave if we return something bogus from the scoring function + with self.assertRaises(TypeError): + ctaut = enumerator.PickCanonical(set(res()), lambda x: 'fail') + def test14TautomerDetails(self): enumerator = rdMolStandardize.TautomerEnumerator() m = Chem.MolFromSmiles("c1ccccc1CN=c1[nH]cccc1") - modatoms = [] - modbonds = [] - tauts = enumerator.Enumerate(m,modifiedAtoms=modatoms,modifiedBonds=modbonds) - self.assertEqual(len(tauts),2) - self.assertEqual(modatoms,[7,9]) - self.assertEqual(len(modbonds),7) - self.assertEqual(modbonds,[7,8,9,10,11,12,14]) - - modatoms = [] - tauts = enumerator.Enumerate(m,modifiedAtoms=modatoms) - self.assertEqual(len(tauts),2) - self.assertEqual(modatoms,[7,9]) - - modbonds = [] - tauts = enumerator.Enumerate(m,modifiedBonds=modbonds) - self.assertEqual(len(tauts),2) - self.assertEqual(len(modbonds),7) - self.assertEqual(modbonds,[7,8,9,10,11,12,14]) + taut_res = enumerator.Enumerate(m) + self.assertEqual(len(taut_res.tautomers),2) + self.assertEqual(taut_res.modifiedAtoms,(7,9)) + self.assertEqual(len(taut_res.modifiedBonds),7) + self.assertEqual(taut_res.modifiedBonds,(7,8,9,10,11,12,14)) + taut_res = enumerator.Enumerate(m) + self.assertEqual(len(taut_res.tautomers),2) + self.assertEqual(taut_res.modifiedAtoms,(7,9)) + taut_res = enumerator.Enumerate(m) + self.assertEqual(len(taut_res.tautomers),2) + self.assertEqual(len(taut_res.modifiedBonds),7) + self.assertEqual(taut_res.modifiedBonds,(7,8,9,10,11,12,14)) + def test15EnumeratorParams(self): + # Test a structure with hundreds of tautomers. + smi68 = "[H][C](CO)(NC(=O)C1=C(O)C(O)=CC=C1)C(O)=O" + m68 = Chem.MolFromSmiles(smi68) + enumerator = rdMolStandardize.TautomerEnumerator() + res68 = enumerator.Enumerate(m68) + self.assertEqual(len(res68), 292) + self.assertEqual(len(res68.tautomers), len(res68)) + self.assertEqual(res68.status, rdMolStandardize.TautomerEnumeratorStatus.MaxTransformsReached) + params = rdMolStandardize.CleanupParameters() + params.maxTautomers = 50; + enumerator = rdMolStandardize.TautomerEnumerator(params) + res68 = enumerator.Enumerate(m68) + self.assertEqual(len(res68), 50) + self.assertEqual(res68.status, rdMolStandardize.TautomerEnumeratorStatus.MaxTautomersReached) + + sAlaSmi = "C[C@H](N)C(=O)O" + sAla = Chem.MolFromSmiles(sAlaSmi) + # test remove (S)-Ala stereochemistry + self.assertEqual(sAla.GetAtomWithIdx(1).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CCW) + self.assertEqual(sAla.GetAtomWithIdx(1).GetProp("_CIPCode"), "S") + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = True + enumerator = rdMolStandardize.TautomerEnumerator(params) + res = enumerator.Enumerate(sAla) + for taut in res: + self.assertEqual(taut.GetAtomWithIdx(1).GetChiralTag(), Chem.ChiralType.CHI_UNSPECIFIED) + self.assertFalse(taut.GetAtomWithIdx(1).HasProp("_CIPCode")) + for taut in res.tautomers: + self.assertEqual(taut.GetAtomWithIdx(1).GetChiralTag(), Chem.ChiralType.CHI_UNSPECIFIED) + self.assertFalse(taut.GetAtomWithIdx(1).HasProp("_CIPCode")) + for i, taut in enumerate(res): + self.assertEqual(Chem.MolToSmiles(taut), Chem.MolToSmiles(res.tautomers[i])) + self.assertEqual(len(res), len(res.smiles)) + self.assertEqual(len(res), len(res.tautomers)) + self.assertEqual(len(res), len(res())) + self.assertEqual(len(res), len(res.smilesTautomerMap)) + for i, taut in enumerate(res.tautomers): + self.assertEqual(Chem.MolToSmiles(taut), Chem.MolToSmiles(res[i])) + self.assertEqual(Chem.MolToSmiles(taut), res.smiles[i]) + self.assertEqual(Chem.MolToSmiles(taut), Chem.MolToSmiles(res.smilesTautomerMap.values()[i].tautomer)) + for i, k in enumerate(res.smilesTautomerMap.keys()): + self.assertEqual(k, res.smiles[i]) + for i, v in enumerate(res.smilesTautomerMap.values()): + self.assertEqual(Chem.MolToSmiles(v.tautomer), Chem.MolToSmiles(res[i])) + for i, (k, v) in enumerate(res.smilesTautomerMap.items()): + self.assertEqual(k, res.smiles[i]) + self.assertEqual(Chem.MolToSmiles(v.tautomer), Chem.MolToSmiles(res[i])) + for i, smiles in enumerate(res.smiles): + self.assertEqual(smiles, Chem.MolToSmiles(res[i])) + self.assertEqual(smiles, res.smilesTautomerMap.keys()[i]) + self.assertEqual(Chem.MolToSmiles(res.tautomers[-1]), Chem.MolToSmiles(res[-1])) + self.assertEqual(Chem.MolToSmiles(res[-1]), Chem.MolToSmiles(res[len(res)-1])) + self.assertEqual(Chem.MolToSmiles(res.tautomers[-1]), Chem.MolToSmiles(res.tautomers[len(res)-1])) + with self.assertRaises(IndexError): + res[len(res)] + with self.assertRaises(IndexError): + res[-len(res)-1] + with self.assertRaises(IndexError): + res.tautomers[len(res)] + with self.assertRaises(IndexError): + res.tautomers[-len(res.tautomers)-1] + + # test retain (S)-Ala stereochemistry + self.assertEqual(sAla.GetAtomWithIdx(1).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CCW) + self.assertEqual(sAla.GetAtomWithIdx(1).GetProp("_CIPCode"), "S") + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + enumerator = rdMolStandardize.TautomerEnumerator(params) + res = enumerator.Enumerate(sAla) + for taut in res: + tautAtom = taut.GetAtomWithIdx(1) + if (tautAtom.GetHybridization() == Chem.HybridizationType.SP3): + self.assertEqual(tautAtom.GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CCW) + self.assertTrue(tautAtom.HasProp("_CIPCode")) + self.assertEqual(tautAtom.GetProp("_CIPCode"), "S") + else: + self.assertFalse(tautAtom.HasProp("_CIPCode")) + self.assertEqual(tautAtom.GetChiralTag(), Chem.ChiralType.CHI_UNSPECIFIED) + + eEnolSmi = "C/C=C/O" + eEnol = Chem.MolFromSmiles(eEnolSmi) + self.assertEqual(eEnol.GetBondWithIdx(1).GetStereo(), Chem.BondStereo.STEREOE) + # test remove enol E stereochemistry + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveBondStereo = True + enumerator = rdMolStandardize.TautomerEnumerator(params) + res = enumerator.Enumerate(eEnol) + for taut in res.tautomers: + self.assertEqual(taut.GetBondWithIdx(1).GetStereo(), Chem.BondStereo.STEREONONE) + # test retain enol E stereochemistry + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveBondStereo = False + enumerator = rdMolStandardize.TautomerEnumerator(params) + res = enumerator.Enumerate(eEnol) + for taut in res.tautomers: + if (taut.GetBondWithIdx(1).GetBondType() == Chem.BondType.DOUBLE): + self.assertEqual(taut.GetBondWithIdx(1).GetStereo(), Chem.BondStereo.STEREOE) + + zEnolSmi = "C/C=C\\O" + zEnol = Chem.MolFromSmiles(zEnolSmi) + self.assertEqual(zEnol.GetBondWithIdx(1).GetStereo(), Chem.BondStereo.STEREOZ) + # test remove enol Z stereochemistry + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveBondStereo = True + enumerator = rdMolStandardize.TautomerEnumerator(params) + res = enumerator.Enumerate(zEnol) + for taut in res: + self.assertEqual(taut.GetBondWithIdx(1).GetStereo(), Chem.BondStereo.STEREONONE) + # test retain enol Z stereochemistry + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveBondStereo = False + enumerator = rdMolStandardize.TautomerEnumerator(params) + res = enumerator.Enumerate(zEnol) + for taut in res: + if (taut.GetBondWithIdx(1).GetBondType() == Chem.BondType.DOUBLE): + self.assertEqual(taut.GetBondWithIdx(1).GetStereo(), Chem.BondStereo.STEREOZ) + + def test16EnumeratorCallback(self): + class MyTautomerEnumeratorCallback(rdMolStandardize.TautomerEnumeratorCallback): + def __init__(self, parent, timeout_ms): + super().__init__() + self._parent = parent + self._timeout = timedelta(milliseconds=timeout_ms) + self._start_time = datetime.now() + + def __call__(self, mol, res): + self._parent.assertTrue(isinstance(mol, Chem.Mol)) + self._parent.assertTrue(isinstance(res, rdMolStandardize.TautomerEnumeratorResult)) + return (datetime.now() - self._start_time < self._timeout) + + class MyBrokenCallback(rdMolStandardize.TautomerEnumeratorCallback): + pass + + class MyBrokenCallback2(rdMolStandardize.TautomerEnumeratorCallback): + __call__ = 1 + + # Test a structure with hundreds of tautomers. + smi68 = "[H][C](CO)(NC(=O)C1=C(O)C(O)=CC=C1)C(O)=O" + m68 = Chem.MolFromSmiles(smi68) + + params = rdMolStandardize.CleanupParameters() + params.maxTransforms = 10000 + params.maxTautomers = 10000 + enumerator = rdMolStandardize.TautomerEnumerator(params) + enumerator.SetCallback(MyTautomerEnumeratorCallback(self, 50.0)) + res68 = enumerator.Enumerate(m68) + # either the enumeration was canceled due to timeout + # or it has completed very quickly + hasReachedTimeout = (len(res68.tautomers) < 375 and + res68.status == rdMolStandardize.TautomerEnumeratorStatus.Canceled) + hasCompleted = (len(res68.tautomers) == 375 and + res68.status == rdMolStandardize.TautomerEnumeratorStatus.Completed) + if hasReachedTimeout: + print("Enumeration was canceled due to timeout (50 ms)", file=sys.stderr) + if hasCompleted: + print("Enumeration has completed", file=sys.stderr) + self.assertTrue(hasReachedTimeout or hasCompleted) + self.assertTrue(hasReachedTimeout ^ hasCompleted) + + enumerator = rdMolStandardize.TautomerEnumerator(params) + enumerator.SetCallback(MyTautomerEnumeratorCallback(self, 10000.0)) + res68 = enumerator.Enumerate(m68) + # either the enumeration completed + # or it ran very slowly and was canceled due to timeout + hasReachedTimeout = (len(res68.tautomers) < 375 and + res68.status == rdMolStandardize.TautomerEnumeratorStatus.Canceled) + hasCompleted = (len(res68.tautomers) == 375 and + res68.status == rdMolStandardize.TautomerEnumeratorStatus.Completed) + if hasReachedTimeout: + print("Enumeration was canceled due to timeout (10 s)", file=sys.stderr) + if hasCompleted: + print("Enumeration has completed", file=sys.stderr) + self.assertTrue(hasReachedTimeout or hasCompleted) + self.assertTrue(hasReachedTimeout ^ hasCompleted) + + enumerator = rdMolStandardize.TautomerEnumerator(params) + with self.assertRaises(AttributeError): + enumerator.SetCallback(MyBrokenCallback()) + with self.assertRaises(AttributeError): + enumerator.SetCallback(MyBrokenCallback2()) + + def test17PickCanonicalCIPChangeOnChiralCenter(self): + def get_canonical_taut(res): + best_idx = max([(rdMolStandardize.TautomerEnumerator.ScoreTautomer(t), i) + for i, t in enumerate(res.tautomers)])[1] + return res.tautomers[best_idx] + + smi = "CC\\C=C(/O)[C@@H](C)C(C)=O" + mol = Chem.MolFromSmiles(smi) + self.assertIsNotNone(mol) + self.assertEqual(mol.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(mol.GetAtomWithIdx(5).GetProp("_CIPCode"), "R") + + # here the chirality disappears as the chiral center is itself involved in tautomerism + te = rdMolStandardize.TautomerEnumerator() + can_taut = te.Canonicalize(mol) + self.assertIsNotNone(can_taut) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_UNSPECIFIED) + self.assertFalse(can_taut.GetAtomWithIdx(5).HasProp("_CIPCode")) + self.assertEqual(Chem.MolToSmiles(can_taut), "CCCC(=O)C(C)C(C)=O") + + # here the chirality stays even if the chiral center is itself involved in tautomerism + # because of the tautomerRemoveSp3Stereo parameter being set to false + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + te = rdMolStandardize.TautomerEnumerator(params) + can_taut = te.Canonicalize(mol) + self.assertIsNotNone(can_taut) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "S") + self.assertEqual(Chem.MolToSmiles(can_taut), "CCCC(=O)[C@@H](C)C(C)=O") + + # here the chirality disappears as the chiral center is itself involved in tautomerism + # the reassignStereo setting has no influence + te = rdMolStandardize.TautomerEnumerator() + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 8) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_UNSPECIFIED) + self.assertFalse(best_taut.GetAtomWithIdx(5).HasProp("_CIPCode")) + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)C(C)C(C)=O") + + # here the chirality disappears as the chiral center is itself involved in tautomerism + # the reassignStereo setting has no influence + params = rdMolStandardize.CleanupParameters() + params.tautomerReassignStereo = False + te = rdMolStandardize.TautomerEnumerator(params) + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 8) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_UNSPECIFIED) + self.assertFalse(best_taut.GetAtomWithIdx(5).HasProp("_CIPCode")) + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)C(C)C(C)=O") + + # here the chirality stays even if the chiral center is itself involved in tautomerism + # because of the tautomerRemoveSp3Stereo parameter being set to false + # as reassignStereo by default is true, the CIP code has been recomputed + # and therefore it is now S (correct) + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + te = rdMolStandardize.TautomerEnumerator(params) + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 8) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "S") + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)[C@@H](C)C(C)=O") + + # here the chirality stays even if the chiral center is itself involved in tautomerism + # because of the tautomerRemoveSp3Stereo parameter being set to false + # as reassignStereo is false, the CIP code has not been recomputed + # and therefore it is still R (incorrect) + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + params.tautomerReassignStereo = False + te = rdMolStandardize.TautomerEnumerator(params) + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 8) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "R") + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)[C@@H](C)C(C)=O") + + smi = "CC\\C=C(/O)[C@@](CC)(C)C(C)=O" + mol = Chem.MolFromSmiles(smi) + self.assertIsNotNone(mol) + self.assertEqual(mol.GetAtomWithIdx(5).GetProp("_CIPCode"), "S") + self.assertEqual(mol.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + + # here the chirality stays no matter how tautomerRemoveSp3Stereo + # is set as the chiral center is not involved in tautomerism + te = rdMolStandardize.TautomerEnumerator() + can_taut = te.Canonicalize(mol) + self.assertIsNotNone(can_taut) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "R") + self.assertEqual(Chem.MolToSmiles(can_taut), "CCCC(=O)[C@](C)(CC)C(C)=O") + + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + te = rdMolStandardize.TautomerEnumerator(params) + can_taut = te.Canonicalize(mol) + self.assertIsNotNone(can_taut) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(can_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "R") + self.assertEqual(Chem.MolToSmiles(can_taut), "CCCC(=O)[C@](C)(CC)C(C)=O") + + # as reassignStereo by default is true, the CIP code has been recomputed + # and therefore it is now R (correct) + te = rdMolStandardize.TautomerEnumerator() + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 4) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "R") + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)[C@](C)(CC)C(C)=O") + + # as reassignStereo is false, the CIP code has not been recomputed + # and therefore it is still S (incorrect) + params = rdMolStandardize.CleanupParameters() + params.tautomerReassignStereo = False + te = rdMolStandardize.TautomerEnumerator(params) + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 4) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "S") + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)[C@](C)(CC)C(C)=O") + + # as reassignStereo by default is true, the CIP code has been recomputed + # and therefore it is now R (correct) + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + te = rdMolStandardize.TautomerEnumerator(params) + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 4) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "R") + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)[C@](C)(CC)C(C)=O") + + # here the chirality stays even if the tautomerRemoveSp3Stereo parameter + # is set to false as the chiral center is not involved in tautomerism + # as reassignStereo is false, the CIP code has not been recomputed + # and therefore it is still S (incorrect) + params = rdMolStandardize.CleanupParameters() + params.tautomerRemoveSp3Stereo = False + params.tautomerReassignStereo = False + te = rdMolStandardize.TautomerEnumerator(params) + res = te.Enumerate(mol) + self.assertEqual(res.status, rdMolStandardize.TautomerEnumeratorStatus.Completed) + self.assertEqual(len(res.tautomers), 4) + best_taut = get_canonical_taut(res) + self.assertIsNotNone(best_taut) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetChiralTag(), Chem.ChiralType.CHI_TETRAHEDRAL_CW) + self.assertEqual(best_taut.GetAtomWithIdx(5).GetProp("_CIPCode"), "S") + self.assertEqual(Chem.MolToSmiles(best_taut), "CCCC(=O)[C@](C)(CC)C(C)=O") + + def test18TautomerEnumeratorResultIter(self): + smi = "Cc1nnc(NC(=O)N2CCN(Cc3ccc(F)cc3)C(=O)C2)s1" + mol = Chem.MolFromSmiles(smi) + self.assertIsNotNone(mol) + te = rdMolStandardize.TautomerEnumerator() + res = te.Enumerate(mol) + res_it = iter(res) + i = 0 + while 1: + try: + t = next(res_it) + except StopIteration: + break + self.assertEqual(Chem.MolToSmiles(t), Chem.MolToSmiles(res[i])) + i += 1 + self.assertEqual(i, len(res)) + res_it = iter(res) + i = -len(res) + while 1: + try: + t = next(res_it) + except StopIteration: + break + self.assertEqual(Chem.MolToSmiles(t), Chem.MolToSmiles(res[i])) + i += 1 + self.assertEqual(i, 0) if __name__ == "__main__": unittest.main() diff --git a/Code/GraphMol/MolStandardize/testTautomer.cpp b/Code/GraphMol/MolStandardize/testTautomer.cpp index 028f7b618..153e890e2 100644 --- a/Code/GraphMol/MolStandardize/testTautomer.cpp +++ b/Code/GraphMol/MolStandardize/testTautomer.cpp @@ -11,6 +11,8 @@ #include #include #include +#include +#include using namespace RDKit; using namespace MolStandardize; @@ -24,722 +26,311 @@ void testEnumerator() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); - // Enumerate 1,3 keto/enol tautomer. - std::string smi1 = "C1(=CCCCC1)O"; - std::shared_ptr m1(SmilesToMol(smi1)); - std::vector res = te.enumerate(*m1); - std::vector ans = {"O=C1CCCCC1", "OC1=CCCCC1"}; - TEST_ASSERT(res.size() == ans.size()); - for (size_t i = 0; i < res.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res[i]) == ans[i]); - } + std::function &)> + checkAns([te](const std::string &smi, + const std::vector &ans) { + ROMOL_SPTR m(SmilesToMol(smi)); + TautomerEnumeratorResult res = te.enumerate(*m); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + size_t sz = std::max(res.size(), ans.size()); + bool exceedingTautomer = false; + bool missingTautomer = false; + bool wrongTautomer = false; + for (size_t i = 0; i < sz; ++i) { + if (i >= res.size()) { + missingTautomer = true; + std::cerr << "missingTautomer, ans " << ans[i] << std::endl; + } else if (i >= ans.size()) { + exceedingTautomer = true; + std::cerr << "exceedingTautomer, taut " << MolToSmiles(*res.at(i)) + << std::endl; + } else if (MolToSmiles(*res[i]) != ans[i]) { + wrongTautomer = true; + std::cerr << "wrongTautomer, taut " << MolToSmiles(*res[i]) + << ", ans " << ans[i] << std::endl; + } + } + TEST_ASSERT(!(missingTautomer || exceedingTautomer || wrongTautomer)); + }); // Enumerate 1,3 keto/enol tautomer. - std::string smi2 = "C1(CCCCC1)=O"; - std::shared_ptr m2(SmilesToMol(smi2)); - std::vector res2 = te.enumerate(*m2); - std::vector ans2 = {"O=C1CCCCC1", "OC1=CCCCC1"}; - TEST_ASSERT(res2.size() == ans2.size()); - for (size_t i = 0; i < res2.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res2[i]) == ans2[i]); - } + checkAns("C1(=CCCCC1)O", {"O=C1CCCCC1", "OC1=CCCCC1"}); + + // Enumerate 1,3 keto/enol tautomer. + checkAns("C1(=CCCCC1)O", {"O=C1CCCCC1", "OC1=CCCCC1"}); + + // Enumerate 1,3 keto/enol tautomer. + checkAns("C1(CCCCC1)=O", {"O=C1CCCCC1", "OC1=CCCCC1"}); // Enumerate acetophenone keto/enol tautomer. - std::string smi3 = "C(=C)(O)C1=CC=CC=C1"; - std::shared_ptr m3(SmilesToMol(smi3)); - std::vector res3 = te.enumerate(*m3); - std::vector ans3 = {"C=C(O)c1ccccc1", "CC(=O)c1ccccc1"}; - TEST_ASSERT(res3.size() == ans3.size()); - for (size_t i = 0; i < res3.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res3[i]) == ans3[i]); - } + checkAns("C(=C)(O)C1=CC=CC=C1", {"C=C(O)c1ccccc1", "CC(=O)c1ccccc1"}); // Enumerate acetone keto/enol tautomer. - std::string smi4 = "CC(C)=O"; - std::shared_ptr m4(SmilesToMol(smi4)); - std::vector res4 = te.enumerate(*m4); - std::vector ans4 = {"C=C(C)O", "CC(C)=O"}; - TEST_ASSERT(res4.size() == ans4.size()); - for (size_t i = 0; i < res4.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res4[i]) == ans4[i]); - } + checkAns("CC(C)=O", {"C=C(C)O", "CC(C)=O"}); // keto/enol tautomer - std::string smi5 = "OC(C)=C(C)C"; - std::shared_ptr m5(SmilesToMol(smi5)); - std::vector res5 = te.enumerate(*m5); - std::vector ans5 = {"C=C(O)C(C)C", "CC(=O)C(C)C", "CC(C)=C(C)O"}; - TEST_ASSERT(res5.size() == ans5.size()); - for (size_t i = 0; i < res5.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res5[i]) == ans5[i]); - } + checkAns("OC(C)=C(C)C", {"C=C(O)C(C)C", "CC(=O)C(C)C", "CC(C)=C(C)O"}); // 1-phenyl-2-propanone enol/keto - std::string smi6 = "c1(ccccc1)CC(=O)C"; - std::shared_ptr m6(SmilesToMol(smi6)); - std::vector res6 = te.enumerate(*m6); - std::vector ans6 = {"C=C(O)Cc1ccccc1", "CC(=O)Cc1ccccc1", - "CC(O)=Cc1ccccc1"}; - TEST_ASSERT(res6.size() == ans6.size()); - for (size_t i = 0; i < res6.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res6[i]) == ans6[i]); - } + checkAns("c1(ccccc1)CC(=O)C", + {"C=C(O)Cc1ccccc1", "CC(=O)Cc1ccccc1", "CC(O)=Cc1ccccc1"}); // 1,5 keto/enol tautomer - std::string smi7 = "Oc1nccc2cc[nH]c(=N)c12"; - std::shared_ptr m7(SmilesToMol(smi7)); - std::vector res7 = te.enumerate(*m7); - std::vector ans7 = { - "N=c1[nH]ccc2cc[nH]c(=O)c12", "N=c1[nH]ccc2ccnc(O)c12", - "N=c1nccc2cc[nH]c(O)c1-2", "Nc1[nH]ccc2ccnc(=O)c1-2", - "Nc1nccc2cc[nH]c(=O)c12", "Nc1nccc2ccnc(O)c12"}; - TEST_ASSERT(res7.size() == ans7.size()); - for (size_t i = 0; i < res7.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res7[i]) == ans7[i]); - } + checkAns("Oc1nccc2cc[nH]c(=N)c12", + {"N=c1[nH]ccc2cc[nH]c(=O)c12", "N=c1[nH]ccc2ccnc(O)c12", + "N=c1nccc2cc[nH]c(O)c1-2", "Nc1[nH]ccc2ccnc(=O)c1-2", + "Nc1nccc2cc[nH]c(=O)c12", "Nc1nccc2ccnc(O)c12"}); // 1,5 keto/enol tautomer - std::string smi8 = "C1(C=CCCC1)=O"; - std::shared_ptr m8(SmilesToMol(smi8)); - std::vector res8 = te.enumerate(*m8); - std::vector ans8 = {"O=C1C=CCCC1", "O=C1CC=CCC1", "OC1=CC=CCC1", - "OC1=CCC=CC1", "OC1=CCCC=C1"}; - TEST_ASSERT(res8.size() == ans8.size()); - for (size_t i = 0; i < res8.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res8[i]) == ans8[i]); - } + checkAns("C1(C=CCCC1)=O", {"O=C1C=CCCC1", "O=C1CC=CCC1", "OC1=CC=CCC1", + "OC1=CCC=CC1", "OC1=CCCC=C1"}); // 1,5 keto/enol tautomer - std::string smi9 = "C1(=CC=CCC1)O"; - std::shared_ptr m9(SmilesToMol(smi9)); - std::vector res9 = te.enumerate(*m9); - std::vector ans9 = {"O=C1C=CCCC1", "O=C1CC=CCC1", "OC1=CC=CCC1", - "OC1=CCC=CC1", "OC1=CCCC=C1"}; - TEST_ASSERT(res9.size() == ans9.size()); - for (size_t i = 0; i < res9.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res9[i]) == ans9[i]); - } + checkAns("C1(=CC=CCC1)O", {"O=C1C=CCCC1", "O=C1CC=CCC1", "OC1=CC=CCC1", + "OC1=CCC=CC1", "OC1=CCCC=C1"}); // aliphatic imine tautomer - std::string smi10 = "C1(CCCCC1)=N"; - std::shared_ptr m10(SmilesToMol(smi10)); - std::vector res10 = te.enumerate(*m10); - std::vector ans10 = {"N=C1CCCCC1", "NC1=CCCCC1"}; - TEST_ASSERT(res10.size() == ans10.size()); - for (size_t i = 0; i < res10.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res10[i]) == ans10[i]); - } + checkAns("C1(CCCCC1)=N", {"N=C1CCCCC1", "NC1=CCCCC1"}); // aliphatic imine tautomer - std::string smi11 = "C1(=CCCCC1)N"; - std::shared_ptr m11(SmilesToMol(smi11)); - std::vector res11 = te.enumerate(*m11); - std::vector ans11 = {"N=C1CCCCC1", "NC1=CCCCC1"}; - TEST_ASSERT(res11.size() == ans11.size()); - for (size_t i = 0; i < res11.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res11[i]) == ans11[i]); - } + checkAns("C1(=CCCCC1)N", {"N=C1CCCCC1", "NC1=CCCCC1"}); // special imine tautomer - std::string smi12 = "C1(C=CC=CN1)=CC"; - std::shared_ptr m12(SmilesToMol(smi12)); - std::vector res12 = te.enumerate(*m12); - std::vector ans12 = {"CC=C1C=CC=CN1", "CC=C1C=CCC=N1", - "CCc1ccccn1"}; - TEST_ASSERT(res12.size() == ans12.size()); - for (size_t i = 0; i < res12.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res12[i]) == ans12[i]); - } + checkAns("C1(C=CC=CN1)=CC", {"CC=C1C=CC=CN1", "CC=C1C=CCC=N1", "CCc1ccccn1"}); // special imine tautomer - std::string smi13 = "C1(=NC=CC=C1)CC"; - std::shared_ptr m13(SmilesToMol(smi13)); - std::vector res13 = te.enumerate(*m13); - std::vector ans13 = {"CC=C1C=CC=CN1", "CC=C1C=CCC=N1", - "CCc1ccccn1"}; - TEST_ASSERT(res13.size() == ans13.size()); - for (size_t i = 0; i < res13.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res13[i]) == ans13[i]); - } + checkAns("C1(=NC=CC=C1)CC", {"CC=C1C=CC=CN1", "CC=C1C=CCC=N1", "CCc1ccccn1"}); // 1,3 aromatic heteroatom H shift - std::string smi14 = "O=c1cccc[nH]1"; - std::shared_ptr m14(SmilesToMol(smi14)); - std::vector res14 = te.enumerate(*m14); - std::vector ans14 = {"O=c1cccc[nH]1", "Oc1ccccn1"}; - TEST_ASSERT(res14.size() == ans14.size()); - for (size_t i = 0; i < res14.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res14[i]) == ans14[i]); - } + checkAns("O=c1cccc[nH]1", {"O=c1cccc[nH]1", "Oc1ccccn1"}); // 1,3 aromatic heteroatom H shift - std::string smi15 = "Oc1ccccn1"; - std::shared_ptr m15(SmilesToMol(smi15)); - std::vector res15 = te.enumerate(*m15); - std::vector ans15 = {"O=c1cccc[nH]1", "Oc1ccccn1"}; - TEST_ASSERT(res15.size() == ans15.size()); - for (size_t i = 0; i < res15.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res15[i]) == ans15[i]); - } + checkAns("Oc1ccccn1", {"O=c1cccc[nH]1", "Oc1ccccn1"}); // 1,3 aromatic heteroatom H shift - std::string smi16 = "Oc1ncc[nH]1"; - std::shared_ptr m16(SmilesToMol(smi16)); - std::vector res16 = te.enumerate(*m16); - std::vector ans16 = {"O=c1[nH]cc[nH]1", "Oc1ncc[nH]1"}; - TEST_ASSERT(res16.size() == ans16.size()); - for (size_t i = 0; i < res16.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res16[i]) == ans16[i]); - } + checkAns("Oc1ncc[nH]1", {"O=c1[nH]cc[nH]1", "Oc1ncc[nH]1"}); // 1,3 heteroatom H shift - std::string smi17 = "OC(C)=NC"; - std::shared_ptr m17(SmilesToMol(smi17)); - std::vector res17 = te.enumerate(*m17); - std::vector ans17 = {"C=C(O)NC", "CN=C(C)O", "CNC(C)=O"}; - TEST_ASSERT(res17.size() == ans17.size()); - for (size_t i = 0; i < res17.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res17[i]) == ans17[i]); - } + checkAns("OC(C)=NC", {"C=C(O)NC", "CN=C(C)O", "CNC(C)=O"}); // 1,3 heteroatom H shift - std::string smi18 = "CNC(C)=O"; - std::shared_ptr m18(SmilesToMol(smi18)); - std::vector res18 = te.enumerate(*m18); - std::vector ans18 = {"C=C(O)NC", "CN=C(C)O", "CNC(C)=O"}; - TEST_ASSERT(res18.size() == ans18.size()); - for (size_t i = 0; i < res18.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res18[i]) == ans18[i]); - } + checkAns("CNC(C)=O", {"C=C(O)NC", "CN=C(C)O", "CNC(C)=O"}); // 1,3 heteroatom H shift - std::string smi19 = "S=C(N)N"; - std::shared_ptr m19(SmilesToMol(smi19)); - std::vector res19 = te.enumerate(*m19); - std::vector ans19 = {"N=C(N)S", "NC(N)=S"}; - TEST_ASSERT(res19.size() == ans19.size()); - for (size_t i = 0; i < res19.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res19[i]) == ans19[i]); - } + checkAns("S=C(N)N", {"N=C(N)S", "NC(N)=S"}); // 1,3 heteroatom H shift - std::string smi20 = "SC(N)=N"; - std::shared_ptr m20(SmilesToMol(smi20)); - std::vector res20 = te.enumerate(*m20); - std::vector ans20 = {"N=C(N)S", "NC(N)=S"}; - TEST_ASSERT(res20.size() == ans20.size()); - for (size_t i = 0; i < res20.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res20[i]) == ans20[i]); - } + checkAns("SC(N)=N", {"N=C(N)S", "NC(N)=S"}); // 1,3 heteroatom H shift - std::string smi21 = "N=c1[nH]ccn(C)1"; - std::shared_ptr m21(SmilesToMol(smi21)); - std::vector res21 = te.enumerate(*m21); - std::vector ans21 = {"Cn1cc[nH]c1=N", "Cn1ccnc1N"}; - TEST_ASSERT(res21.size() == ans21.size()); - for (size_t i = 0; i < res21.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res21[i]) == ans21[i]); - } + checkAns("N=c1[nH]ccn(C)1", {"Cn1cc[nH]c1=N", "Cn1ccnc1N"}); // 1,3 heteroatom H shift - std::string smi22 = "CN=c1[nH]cncc1"; - std::shared_ptr m22(SmilesToMol(smi22)); - std::vector res22 = te.enumerate(*m22); - std::vector ans22 = {"CN=c1cc[nH]cn1", "CN=c1ccnc[nH]1", - "CNc1ccncn1"}; - TEST_ASSERT(res22.size() == ans22.size()); - for (size_t i = 0; i < res22.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res22[i]) == ans22[i]); - } + checkAns("CN=c1[nH]cncc1", + {"CN=c1cc[nH]cn1", "CN=c1ccnc[nH]1", "CNc1ccncn1"}); // 1,5 aromatic heteroatom H shift - std::string smi23 = "Oc1cccc2ccncc12"; - std::shared_ptr m23(SmilesToMol(smi23)); - std::vector res23 = te.enumerate(*m23); - std::vector ans23 = {"O=c1cccc2cc[nH]cc1-2", "Oc1cccc2ccncc12"}; - TEST_ASSERT(res23.size() == ans23.size()); - for (size_t i = 0; i < res23.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res23[i]) == ans23[i]); - } + checkAns("Oc1cccc2ccncc12", {"O=c1cccc2cc[nH]cc1-2", "Oc1cccc2ccncc12"}); // 1,5 aromatic heteroatom H shift - std::string smi24 = "O=c1cccc2cc[nH]cc1-2"; - std::shared_ptr m24(SmilesToMol(smi24)); - std::vector res24 = te.enumerate(*m24); - std::vector ans24 = {"O=c1cccc2cc[nH]cc1-2", "Oc1cccc2ccncc12"}; - TEST_ASSERT(res24.size() == ans24.size()); - for (size_t i = 0; i < res24.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res24[i]) == ans24[i]); - } + checkAns("O=c1cccc2cc[nH]cc1-2", {"O=c1cccc2cc[nH]cc1-2", "Oc1cccc2ccncc12"}); // 1,5 aromatic heteroatom H shift - std::string smi25 = "Cc1n[nH]c2ncnn12"; - std::shared_ptr m25(SmilesToMol(smi25)); - std::vector res25 = te.enumerate(*m25); - std::vector ans25 = {"C=C1NN=C2N=CNN12", "C=C1NN=C2NC=NN12", - "C=C1NNc2ncnn21", "Cc1n[nH]c2ncnn12", - "Cc1nnc2[nH]cnn12", "Cc1nnc2nc[nH]n12"}; - TEST_ASSERT(res25.size() == ans25.size()); - for (size_t i = 0; i < res25.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res25[i]) == ans25[i]); - } + checkAns("Cc1n[nH]c2ncnn12", + {"C=C1NN=C2N=CNN12", "C=C1NN=C2NC=NN12", "C=C1NNc2ncnn21", + "Cc1n[nH]c2ncnn12", "Cc1nnc2[nH]cnn12", "Cc1nnc2nc[nH]n12"}); // 1,5 aromatic heteroatom H shift - std::string smi26 = "Cc1nnc2nc[nH]n12"; - std::shared_ptr m26(SmilesToMol(smi26)); - std::vector res26 = te.enumerate(*m26); - std::vector ans26 = {"C=C1NN=C2N=CNN12", "C=C1NN=C2NC=NN12", - "C=C1NNc2ncnn21", "Cc1n[nH]c2ncnn12", - "Cc1nnc2[nH]cnn12", "Cc1nnc2nc[nH]n12"}; - TEST_ASSERT(res26.size() == ans26.size()); - for (size_t i = 0; i < res26.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res26[i]) == ans26[i]); - } + checkAns("Cc1nnc2nc[nH]n12", + {"C=C1NN=C2N=CNN12", "C=C1NN=C2NC=NN12", "C=C1NNc2ncnn21", + "Cc1n[nH]c2ncnn12", "Cc1nnc2[nH]cnn12", "Cc1nnc2nc[nH]n12"}); // 1,5 aromatic heteroatom H shift - std::string smi29 = "Oc1ccncc1"; - std::shared_ptr m29(SmilesToMol(smi29)); - std::vector res29 = te.enumerate(*m29); - std::vector ans29 = {"O=c1cc[nH]cc1", "Oc1ccncc1"}; - TEST_ASSERT(res29.size() == ans29.size()); - for (size_t i = 0; i < res29.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res29[i]) == ans29[i]); - } + checkAns("Oc1ccncc1", {"O=c1cc[nH]cc1", "Oc1ccncc1"}); // 1,5 aromatic heteroatom H shift - std::string smi27 = "Oc1c(cccc3)c3nc2ccncc12"; - std::shared_ptr m27(SmilesToMol(smi27)); - std::vector res27 = te.enumerate(*m27); - std::vector ans27 = {"O=c1c2c[nH]ccc-2nc2ccccc12", - "O=c1c2ccccc2[nH]c2ccncc12", - "Oc1c2ccccc2nc2ccncc12"}; - TEST_ASSERT(res27.size() == ans27.size()); - for (size_t i = 0; i < res27.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res27[i]) == ans27[i]); - } + checkAns("Oc1c(cccc3)c3nc2ccncc12", + {"O=c1c2c[nH]ccc-2nc2ccccc12", "O=c1c2ccccc2[nH]c2ccncc12", + "Oc1c2ccccc2nc2ccncc12"}); // 1,3 and 1,5 aromatic heteroatom H shift - std::string smi28 = "Oc1ncncc1"; - std::shared_ptr m28(SmilesToMol(smi28)); - std::vector res28 = te.enumerate(*m28); - std::vector ans28 = {"O=c1cc[nH]cn1", "O=c1ccnc[nH]1", - "Oc1ccncn1"}; - TEST_ASSERT(res28.size() == ans28.size()); - for (size_t i = 0; i < res28.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res28[i]) == ans28[i]); - } + checkAns("Oc1ncncc1", {"O=c1cc[nH]cn1", "O=c1ccnc[nH]1", "Oc1ccncn1"}); // 1,5 aromatic heteroatom H shift - std::string smi30 = "C2(=C1C(=NC=N1)[NH]C(=N2)N)O"; - std::shared_ptr m30(SmilesToMol(smi30)); - std::vector res30 = te.enumerate(*m30); - std::vector ans30 = { - "N=c1[nH]c(=O)c2[nH]cnc2[nH]1", "N=c1[nH]c(=O)c2nc[nH]c2[nH]1", - "N=c1[nH]c2ncnc-2c(O)[nH]1", "N=c1nc(O)c2[nH]cnc2[nH]1", - "N=c1nc(O)c2nc[nH]c2[nH]1", "N=c1nc2[nH]cnc2c(O)[nH]1", - "N=c1nc2nc[nH]c2c(O)[nH]1", "Nc1nc(=O)c2[nH]cnc2[nH]1", - "Nc1nc(=O)c2nc[nH]c2[nH]1", "Nc1nc(O)c2[nH]cnc2n1", - "Nc1nc(O)c2nc[nH]c2n1", "Nc1nc(O)c2ncnc-2[nH]1", - "Nc1nc2[nH]cnc2c(=O)[nH]1", "Nc1nc2nc[nH]c2c(=O)[nH]1", - "Nc1nc2ncnc-2c(O)[nH]1"}; - TEST_ASSERT(res30.size() == ans30.size()); - for (size_t i = 0; i < res30.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res30[i]) == ans30[i]); - } + checkAns("C2(=C1C(=NC=N1)[NH]C(=N2)N)O", + {"N=c1[nH]c(=O)c2[nH]cnc2[nH]1", "N=c1[nH]c(=O)c2nc[nH]c2[nH]1", + "N=c1[nH]c2ncnc-2c(O)[nH]1", "N=c1nc(O)c2[nH]cnc2[nH]1", + "N=c1nc(O)c2nc[nH]c2[nH]1", "N=c1nc2[nH]cnc2c(O)[nH]1", + "N=c1nc2nc[nH]c2c(O)[nH]1", "Nc1nc(=O)c2[nH]cnc2[nH]1", + "Nc1nc(=O)c2nc[nH]c2[nH]1", "Nc1nc(O)c2[nH]cnc2n1", + "Nc1nc(O)c2nc[nH]c2n1", "Nc1nc(O)c2ncnc-2[nH]1", + "Nc1nc2[nH]cnc2c(=O)[nH]1", "Nc1nc2nc[nH]c2c(=O)[nH]1", + "Nc1nc2ncnc-2c(O)[nH]1"}); // 1,5 aromatic heteroatom H shift - std::string smi31 = "C2(C1=C([NH]C=N1)[NH]C(=N2)N)=O"; - std::shared_ptr m31(SmilesToMol(smi31)); - std::vector res31 = te.enumerate(*m31); - std::vector ans31 = { - "N=c1[nH]c(=O)c2[nH]cnc2[nH]1", "N=c1[nH]c(=O)c2nc[nH]c2[nH]1", - "N=c1[nH]c2ncnc-2c(O)[nH]1", "N=c1nc(O)c2[nH]cnc2[nH]1", - "N=c1nc(O)c2nc[nH]c2[nH]1", "N=c1nc2[nH]cnc2c(O)[nH]1", - "N=c1nc2nc[nH]c2c(O)[nH]1", "Nc1nc(=O)c2[nH]cnc2[nH]1", - "Nc1nc(=O)c2nc[nH]c2[nH]1", "Nc1nc(O)c2[nH]cnc2n1", - "Nc1nc(O)c2nc[nH]c2n1", "Nc1nc(O)c2ncnc-2[nH]1", - "Nc1nc2[nH]cnc2c(=O)[nH]1", "Nc1nc2nc[nH]c2c(=O)[nH]1", - "Nc1nc2ncnc-2c(O)[nH]1"}; - TEST_ASSERT(res31.size() == ans31.size()); - for (size_t i = 0; i < res31.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res31[i]) == ans31[i]); - } + checkAns("C2(C1=C([NH]C=N1)[NH]C(=N2)N)=O", + {"N=c1[nH]c(=O)c2[nH]cnc2[nH]1", "N=c1[nH]c(=O)c2nc[nH]c2[nH]1", + "N=c1[nH]c2ncnc-2c(O)[nH]1", "N=c1nc(O)c2[nH]cnc2[nH]1", + "N=c1nc(O)c2nc[nH]c2[nH]1", "N=c1nc2[nH]cnc2c(O)[nH]1", + "N=c1nc2nc[nH]c2c(O)[nH]1", "Nc1nc(=O)c2[nH]cnc2[nH]1", + "Nc1nc(=O)c2nc[nH]c2[nH]1", "Nc1nc(O)c2[nH]cnc2n1", + "Nc1nc(O)c2nc[nH]c2n1", "Nc1nc(O)c2ncnc-2[nH]1", + "Nc1nc2[nH]cnc2c(=O)[nH]1", "Nc1nc2nc[nH]c2c(=O)[nH]1", + "Nc1nc2ncnc-2c(O)[nH]1"}); // 1,5 aromatic heteroatom H shift - std::string smi32 = "Oc1n(C)ncc1"; - std::shared_ptr m32(SmilesToMol(smi32)); - std::vector res32 = te.enumerate(*m32); - std::vector ans32 = {"CN1N=CCC1=O", "Cn1[nH]ccc1=O", - "Cn1nccc1O"}; - TEST_ASSERT(res32.size() == ans32.size()); - for (size_t i = 0; i < res32.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res32[i]) == ans32[i]); - } + checkAns("Oc1n(C)ncc1", {"CN1N=CCC1=O", "Cn1[nH]ccc1=O", "Cn1nccc1O"}); // 1,5 aromatic heteroatom H shift - std::string smi33 = "O=c1nc2[nH]ccn2cc1"; - std::shared_ptr m33(SmilesToMol(smi33)); - std::vector res33 = te.enumerate(*m33); - std::vector ans33 = {"O=c1ccn2cc[nH]c2n1", "O=c1ccn2ccnc2[nH]1", - "Oc1ccn2ccnc2n1"}; - TEST_ASSERT(res33.size() == ans33.size()); - for (size_t i = 0; i < res33.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res33[i]) == ans33[i]); - } + checkAns("O=c1nc2[nH]ccn2cc1", + {"O=c1ccn2cc[nH]c2n1", "O=c1ccn2ccnc2[nH]1", "Oc1ccn2ccnc2n1"}); // 1,5 aromatic heteroatom H shift - std::string smi34 = "N=c1nc[nH]cc1"; - std::shared_ptr m34(SmilesToMol(smi34)); - std::vector res34 = te.enumerate(*m34); - std::vector ans34 = {"N=c1cc[nH]cn1", "N=c1ccnc[nH]1", - "Nc1ccncn1"}; - TEST_ASSERT(res34.size() == ans34.size()); - for (size_t i = 0; i < res34.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res34[i]) == ans34[i]); - } + checkAns("N=c1nc[nH]cc1", {"N=c1cc[nH]cn1", "N=c1ccnc[nH]1", "Nc1ccncn1"}); // 1,5 aromatic heteroatom H shift - std::string smi35 = "N=c(c1)ccn2cc[nH]c12"; - std::shared_ptr m35(SmilesToMol(smi35)); - std::vector res35 = te.enumerate(*m35); - std::vector ans35 = {"N=c1ccn2cc[nH]c2c1", "Nc1ccn2ccnc2c1"}; - TEST_ASSERT(res35.size() == ans35.size()); - for (size_t i = 0; i < res35.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res35[i]) == ans35[i]); - } + checkAns("N=c(c1)ccn2cc[nH]c12", {"N=c1ccn2cc[nH]c2c1", "Nc1ccn2ccnc2c1"}); // 1,5 aromatic heteroatom H shift - std::string smi36 = "CN=c1nc[nH]cc1"; - std::shared_ptr m36(SmilesToMol(smi36)); - std::vector res36 = te.enumerate(*m36); - std::vector ans36 = {"CN=c1cc[nH]cn1", "CN=c1ccnc[nH]1", - "CNc1ccncn1"}; - TEST_ASSERT(res36.size() == ans36.size()); - for (size_t i = 0; i < res36.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res36[i]) == ans36[i]); - } + checkAns("CN=c1nc[nH]cc1", + {"CN=c1cc[nH]cn1", "CN=c1ccnc[nH]1", "CNc1ccncn1"}); // 1,7 aromatic heteroatom H shift - std::string smi37 = "c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1"; - std::shared_ptr m37(SmilesToMol(smi37)); - std::vector res37 = te.enumerate(*m37); - std::vector ans37 = {"c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1", - "c1ccc2c(c1)=NC(c1nc3ccccc3[nH]1)N=2", - "c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2"}; - TEST_ASSERT(res37.size() == ans37.size()); - for (size_t i = 0; i < res37.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res37[i]) == ans37[i]); - } + checkAns("c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1", + {"c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1", + "c1ccc2c(c1)=NC(c1nc3ccccc3[nH]1)N=2", + "c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2"}); // 1,7 aromatic heteroatom H shift - std::string smi38 = "c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2"; - std::shared_ptr m38(SmilesToMol(smi38)); - std::vector res38 = te.enumerate(*m38); - std::vector ans38 = {"c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1", - "c1ccc2c(c1)=NC(c1nc3ccccc3[nH]1)N=2", - "c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2"}; - TEST_ASSERT(res38.size() == ans38.size()); - for (size_t i = 0; i < res38.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res38[i]) == ans38[i]); - } + checkAns("c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2", + {"c1ccc2[nH]c(-c3nc4ccccc4[nH]3)nc2c1", + "c1ccc2c(c1)=NC(c1nc3ccccc3[nH]1)N=2", + "c1ccc2c(c1)NC(=C1N=c3ccccc3=N1)N2"}); // 1,9 aromatic heteroatom H shift - std::string smi39 = "CNc1ccnc2ncnn21"; - std::shared_ptr m39(SmilesToMol(smi39)); - std::vector res39 = te.enumerate(*m39); - std::vector ans39 = {"CN=c1cc[nH]c2ncnn12", - "CN=c1ccnc2[nH]cnn12", - "CN=c1ccnc2nc[nH]n12", "CNc1ccnc2ncnn12"}; - TEST_ASSERT(res39.size() == ans39.size()); - for (size_t i = 0; i < res39.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res39[i]) == ans39[i]); - } + checkAns("CNc1ccnc2ncnn21", {"CN=c1cc[nH]c2ncnn12", "CN=c1ccnc2[nH]cnn12", + "CN=c1ccnc2nc[nH]n12", "CNc1ccnc2ncnn12"}); // 1,9 aromatic heteroatom H shift - std::string smi40 = "CN=c1ccnc2nc[nH]n21"; - std::shared_ptr m40(SmilesToMol(smi40)); - std::vector res40 = te.enumerate(*m40); - std::vector ans40 = {"CN=c1cc[nH]c2ncnn12", - "CN=c1ccnc2[nH]cnn12", - "CN=c1ccnc2nc[nH]n12", "CNc1ccnc2ncnn12"}; - TEST_ASSERT(res40.size() == ans40.size()); - for (size_t i = 0; i < res40.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res40[i]) == ans40[i]); - } + checkAns("CN=c1ccnc2nc[nH]n21", {"CN=c1cc[nH]c2ncnn12", "CN=c1ccnc2[nH]cnn12", + "CN=c1ccnc2nc[nH]n12", "CNc1ccnc2ncnn12"}); // 1,11 aromatic heteroatom H shift - std::string smi41 = "Nc1ccc(C=C2C=CC(=O)C=C2)cc1"; - std::shared_ptr m41(SmilesToMol(smi41)); - std::vector res41 = te.enumerate(*m41); - std::vector ans41 = { - "N=C1C=CC(=CC2C=CC(=O)C=C2)C=C1", "N=C1C=CC(=Cc2ccc(O)cc2)C=C1", - "N=C1C=CC(C=C2C=CC(=O)C=C2)C=C1", "Nc1ccc(C=C2C=CC(=O)C=C2)cc1"}; - TEST_ASSERT(res41.size() == ans41.size()); - for (size_t i = 0; i < res41.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res41[i]) == ans41[i]); - } + checkAns("Nc1ccc(C=C2C=CC(=O)C=C2)cc1", + {"N=C1C=CC(=CC2C=CC(=O)C=C2)C=C1", "N=C1C=CC(=Cc2ccc(O)cc2)C=C1", + "N=C1C=CC(C=C2C=CC(=O)C=C2)C=C1", "Nc1ccc(C=C2C=CC(=O)C=C2)cc1"}); // 1,11 aromatic heteroatom H shift - std::string smi42 = "N=C1C=CC(=Cc2ccc(O)cc2)C=C1"; - std::shared_ptr m42(SmilesToMol(smi42)); - std::vector res42 = te.enumerate(*m42); - std::vector ans42 = { - "N=C1C=CC(=CC2C=CC(=O)C=C2)C=C1", "N=C1C=CC(=Cc2ccc(O)cc2)C=C1", - "N=C1C=CC(C=C2C=CC(=O)C=C2)C=C1", "Nc1ccc(C=C2C=CC(=O)C=C2)cc1"}; - TEST_ASSERT(res42.size() == ans42.size()); - for (size_t i = 0; i < res42.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res42[i]) == ans42[i]); - } + checkAns("N=C1C=CC(=Cc2ccc(O)cc2)C=C1", + {"N=C1C=CC(=CC2C=CC(=O)C=C2)C=C1", "N=C1C=CC(=Cc2ccc(O)cc2)C=C1", + "N=C1C=CC(C=C2C=CC(=O)C=C2)C=C1", "Nc1ccc(C=C2C=CC(=O)C=C2)cc1"}); // heterocyclic tautomer - std::string smi43 = "n1ccc2ccc[nH]c12"; - std::shared_ptr m43(SmilesToMol(smi43)); - std::vector res43 = te.enumerate(*m43); - std::vector ans43 = {"c1c[nH]c2nccc-2c1", "c1cnc2[nH]ccc2c1"}; - TEST_ASSERT(res43.size() == ans43.size()); - for (size_t i = 0; i < res43.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res43[i]) == ans43[i]); - } + checkAns("n1ccc2ccc[nH]c12", {"c1c[nH]c2nccc-2c1", "c1cnc2[nH]ccc2c1"}); // heterocyclic tautomer - std::string smi44 = "c1cc(=O)[nH]c2nccn12"; - std::shared_ptr m44(SmilesToMol(smi44)); - std::vector res44 = te.enumerate(*m44); - std::vector ans44 = {"O=c1ccn2cc[nH]c2n1", "O=c1ccn2ccnc2[nH]1", - "Oc1ccn2ccnc2n1"}; - TEST_ASSERT(res44.size() == ans44.size()); - for (size_t i = 0; i < res44.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res44[i]) == ans44[i]); - } + checkAns("c1cc(=O)[nH]c2nccn12", + {"O=c1ccn2cc[nH]c2n1", "O=c1ccn2ccnc2[nH]1", "Oc1ccn2ccnc2n1"}); // heterocyclic tautomer - std::string smi45 = "c1cnc2c[nH]ccc12"; - std::shared_ptr m45(SmilesToMol(smi45)); - std::vector res45 = te.enumerate(*m45); - std::vector ans45 = {"c1cc2cc[nH]c2cn1", "c1cc2cc[nH]cc-2n1"}; - TEST_ASSERT(res45.size() == ans45.size()); - for (size_t i = 0; i < res45.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res45[i]) == ans45[i]); - } + checkAns("c1cnc2c[nH]ccc12", {"c1cc2cc[nH]c2cn1", "c1cc2cc[nH]cc-2n1"}); // heterocyclic tautomer - std::string smi46 = "n1ccc2c[nH]ccc12"; - std::shared_ptr m46(SmilesToMol(smi46)); - std::vector res46 = te.enumerate(*m46); - std::vector ans46 = {"c1cc2[nH]ccc2cn1", "c1cc2c[nH]ccc-2n1"}; - TEST_ASSERT(res46.size() == ans46.size()); - for (size_t i = 0; i < res46.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res46[i]) == ans46[i]); - } + checkAns("n1ccc2c[nH]ccc12", {"c1cc2[nH]ccc2cn1", "c1cc2c[nH]ccc-2n1"}); // heterocyclic tautomer - std::string smi47 = "c1cnc2ccc[nH]c12"; - std::shared_ptr m47(SmilesToMol(smi47)); - std::vector res47 = te.enumerate(*m47); - std::vector ans47 = {"c1c[nH]c2ccnc-2c1", "c1cnc2cc[nH]c2c1"}; - TEST_ASSERT(res47.size() == ans47.size()); - for (size_t i = 0; i < res47.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res47[i]) == ans47[i]); - } + checkAns("c1cnc2ccc[nH]c12", {"c1c[nH]c2ccnc-2c1", "c1cnc2cc[nH]c2c1"}); // furanone tautomer - std::string smi48 = "C1=CC=C(O1)O"; - std::shared_ptr m48(SmilesToMol(smi48)); - std::vector res48 = te.enumerate(*m48); - std::vector ans48 = {"O=C1CC=CO1", "Oc1ccco1"}; - TEST_ASSERT(res48.size() == ans48.size()); - for (size_t i = 0; i < res48.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res48[i]) == ans48[i]); - } + checkAns("C1=CC=C(O1)O", {"O=C1CC=CO1", "Oc1ccco1"}); // furanone tautomer - std::string smi49 = "O=C1CC=CO1"; - std::shared_ptr m49(SmilesToMol(smi49)); - std::vector res49 = te.enumerate(*m49); - std::vector ans49 = {"O=C1CC=CO1", "Oc1ccco1"}; - TEST_ASSERT(res49.size() == ans49.size()); - for (size_t i = 0; i < res49.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res49[i]) == ans49[i]); - } + checkAns("O=C1CC=CO1", {"O=C1CC=CO1", "Oc1ccco1"}); // keten/ynol tautomer - std::string smi50 = "CC=C=O"; - std::shared_ptr m50(SmilesToMol(smi50)); - std::vector res50 = te.enumerate(*m50); - std::vector ans50 = {"CC#CO", "CC=C=O"}; - TEST_ASSERT(res50.size() == ans50.size()); - for (size_t i = 0; i < res50.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res50[i]) == ans50[i]); - } + checkAns("CC=C=O", {"CC#CO", "CC=C=O"}); // keten/ynol tautomer - std::string smi51 = "CC#CO"; - std::shared_ptr m51(SmilesToMol(smi51)); - std::vector res51 = te.enumerate(*m51); - std::vector ans51 = {"CC#CO", "CC=C=O"}; - TEST_ASSERT(res51.size() == ans51.size()); - for (size_t i = 0; i < res51.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res51[i]) == ans51[i]); - } + checkAns("CC#CO", {"CC#CO", "CC=C=O"}); // ionic nitro/aci-nitro tautomer - std::string smi52 = "C([N+](=O)[O-])C"; - std::shared_ptr m52(SmilesToMol(smi52)); - std::vector res52 = te.enumerate(*m52); - std::vector ans52 = {"CC=[N+]([O-])O", "CC[N+](=O)[O-]"}; - TEST_ASSERT(res52.size() == ans52.size()); - for (size_t i = 0; i < res52.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res52[i]) == ans52[i]); - } + checkAns("C([N+](=O)[O-])C", {"CC=[N+]([O-])O", "CC[N+](=O)[O-]"}); // ionic nitro/aci-nitro tautomer - std::string smi53 = "C(=[N+](O)[O-])C"; - std::shared_ptr m53(SmilesToMol(smi53)); - std::vector res53 = te.enumerate(*m53); - std::vector ans53 = {"CC=[N+]([O-])O", "CC[N+](=O)[O-]"}; - TEST_ASSERT(res53.size() == ans53.size()); - for (size_t i = 0; i < res53.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res53[i]) == ans53[i]); - } + checkAns("C(=[N+](O)[O-])C", {"CC=[N+]([O-])O", "CC[N+](=O)[O-]"}); // oxim nitroso tautomer - std::string smi54 = "CC(C)=NO"; - std::shared_ptr m54(SmilesToMol(smi54)); - std::vector res54 = te.enumerate(*m54); - std::vector ans54 = {"C=C(C)NO", "CC(C)=NO", "CC(C)N=O"}; - TEST_ASSERT(res54.size() == ans54.size()); - for (size_t i = 0; i < res54.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res54[i]) == ans54[i]); - } + checkAns("CC(C)=NO", {"C=C(C)NO", "CC(C)=NO", "CC(C)N=O"}); // oxim nitroso tautomer - std::string smi55 = "CC(C)N=O"; - std::shared_ptr m55(SmilesToMol(smi55)); - std::vector res55 = te.enumerate(*m55); - std::vector ans55 = {"C=C(C)NO", "CC(C)=NO", "CC(C)N=O"}; - TEST_ASSERT(res55.size() == ans55.size()); - for (size_t i = 0; i < res55.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res55[i]) == ans55[i]); - } + checkAns("CC(C)N=O", {"C=C(C)NO", "CC(C)=NO", "CC(C)N=O"}); // oxim/nitroso tautomer via phenol - std::string smi56 = "O=Nc1ccc(O)cc1"; - std::shared_ptr m56(SmilesToMol(smi56)); - std::vector res56 = te.enumerate(*m56); - std::vector ans56 = {"O=C1C=CC(=NO)C=C1", "O=NC1C=CC(=O)C=C1", - "O=Nc1ccc(O)cc1"}; - TEST_ASSERT(res56.size() == ans56.size()); - for (size_t i = 0; i < res56.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res56[i]) == ans56[i]); - } + checkAns("O=Nc1ccc(O)cc1", + {"O=C1C=CC(=NO)C=C1", "O=NC1C=CC(=O)C=C1", "O=Nc1ccc(O)cc1"}); // oxim/nitroso tautomer via phenol - std::string smi57 = "O=C1C=CC(=NO)C=C1"; - std::shared_ptr m57(SmilesToMol(smi57)); - std::vector res57 = te.enumerate(*m57); - std::vector ans57 = {"O=C1C=CC(=NO)C=C1", "O=NC1C=CC(=O)C=C1", - "O=Nc1ccc(O)cc1"}; - TEST_ASSERT(res57.size() == ans57.size()); - for (size_t i = 0; i < res57.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res57[i]) == ans57[i]); - } + checkAns("O=C1C=CC(=NO)C=C1", + {"O=C1C=CC(=NO)C=C1", "O=NC1C=CC(=O)C=C1", "O=Nc1ccc(O)cc1"}); // cyano/iso-cyanic acid tautomer - std::string smi58 = "C(#N)O"; - std::shared_ptr m58(SmilesToMol(smi58)); - std::vector res58 = te.enumerate(*m58); - std::vector ans58 = {"N#CO", "N=C=O"}; - TEST_ASSERT(res58.size() == ans58.size()); - for (size_t i = 0; i < res58.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res58[i]) == ans58[i]); - } + checkAns("C(#N)O", {"N#CO", "N=C=O"}); // cyano/iso-cyanic acid tautomer - std::string smi59 = "C(=N)=O"; - std::shared_ptr m59(SmilesToMol(smi59)); - std::vector res59 = te.enumerate(*m59); - std::vector ans59 = {"N#CO", "N=C=O"}; - TEST_ASSERT(res59.size() == ans59.size()); - for (size_t i = 0; i < res59.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res59[i]) == ans59[i]); - } + checkAns("C(=N)=O", {"N#CO", "N=C=O"}); + + // formamidinesulfinic acid tautomer + checkAns("NC(N)=S(=O)=O", + {"N=C(N)S(=O)O", "N=C(N)[SH](=O)=O", "NC(N)=S(=O)=O"}); + + // formamidinesulfinic acid tautomer + checkAns("NC(=N)S(=O)O", + {"N=C(N)S(=O)O", "N=C(N)[SH](=O)=O", "NC(N)=S(=O)=O"}); + + // formamidinesulfonic acid tautomer + checkAns("NC(=N)S(=O)(=O)O", {"N=C(N)S(=O)(=O)O"}); // isocyanide tautomer - std::string smi60 = "C#N"; - std::shared_ptr m60(SmilesToMol(smi60)); - std::vector res60 = te.enumerate(*m60); - std::vector ans60 = {"C#N", "[C-]#[NH+]"}; - TEST_ASSERT(res60.size() == ans60.size()); - for (size_t i = 0; i < res60.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res60[i]) == ans60[i]); - } + checkAns("C#N", {"C#N", "[C-]#[NH+]"}); // isocyanide tautomer - std::string smi61 = "[C-]#[NH+]"; - std::shared_ptr m61(SmilesToMol(smi61)); - std::vector res61 = te.enumerate(*m61); - std::vector ans61 = {"C#N", "[C-]#[NH+]"}; - TEST_ASSERT(res61.size() == ans61.size()); - for (size_t i = 0; i < res61.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res61[i]) == ans61[i]); - } + checkAns("[C-]#[NH+]", {"C#N", "[C-]#[NH+]"}); // phosphonic acid tautomer - std::string smi62 = "[PH](=O)(O)(O)"; - std::shared_ptr m62(SmilesToMol(smi62)); - std::vector res62 = te.enumerate(*m62); - std::vector ans62 = {"O=[PH](O)O", "OP(O)O"}; - TEST_ASSERT(res62.size() == ans62.size()); - for (size_t i = 0; i < res62.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res62[i]) == ans62[i]); - } + checkAns("[PH](=O)(O)(O)", {"O=[PH](O)O", "OP(O)O"}); // phosphonic acid tautomer - std::string smi63 = "P(O)(O)O"; - std::shared_ptr m63(SmilesToMol(smi63)); - std::vector res63 = te.enumerate(*m63); - std::vector ans63 = {"O=[PH](O)O", "OP(O)O"}; - TEST_ASSERT(res63.size() == ans63.size()); - for (size_t i = 0; i < res63.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res63[i]) == ans63[i]); - } + checkAns("P(O)(O)O", {"O=[PH](O)O", "OP(O)O"}); // Remove stereochemistry from mobile double bonds - std::string smi64 = "c1(ccccc1)/C=C(/O)\\C"; - std::shared_ptr m64(SmilesToMol(smi64)); - std::vector res64 = te.enumerate(*m64); - std::vector ans64 = {"CC(O)=Cc1ccccc1", "C=C(O)Cc1ccccc1", - "CC(=O)Cc1ccccc1"}; - TEST_ASSERT(res64.size() == ans64.size()); - for (size_t i = 0; i < res64.size(); ++i) { - // std::cout << MolToSmiles(*res64[i]) << ", " << ans64[i] << std::endl; - TEST_ASSERT(MolToSmiles(*res64[i]) == ans64[i]); - } + checkAns("c1(ccccc1)/C=C(/O)\\C", + {"C=C(O)Cc1ccccc1", "CC(=O)Cc1ccccc1", "CC(O)=Cc1ccccc1"}); // Remove stereochemistry from mobile double bonds - std::string smi65 = "C/C=C/C(C)=O"; - std::shared_ptr m65(SmilesToMol(smi65)); - std::vector res65 = te.enumerate(*m65); - std::vector ans65 = {"CC=CC(C)=O", "C=C(O)C=CC", "C=CC=C(C)O", - "C=CCC(=C)O", "C=CCC(C)=O"}; - TEST_ASSERT(res65.size() == ans65.size()); - for (size_t i = 0; i < res65.size(); ++i) { - TEST_ASSERT(MolToSmiles(*res65[i]) == ans65[i]); - } + checkAns("C/C=C/C(C)=O", {"C=C(O)C=CC", "C=CC=C(C)O", "C=CCC(=C)O", + "C=CCC(C)=O", "CC=CC(C)=O"}); // Remove stereochemistry from mobile double bonds std::string smi66 = "C/C=C\\C(C)=O"; - std::shared_ptr m66(SmilesToMol(smi66)); - std::vector res66 = te.enumerate(*m66); - std::vector ans66 = {"C=C(O)C=CC", "C=CC=C(C)O", "CC=CC(C)=O", - "C=CCC(=C)O", "C=CCC(C)=O"}; + ROMOL_SPTR m66(SmilesToMol(smi66)); + TautomerEnumeratorResult res66 = te.enumerate(*m66); + std::vector ans66 = {"C=C(O)C=CC", "C=CC=C(C)O", "C=CCC(=C)O", + "C=CCC(C)=O", "CC=CC(C)=O"}; TEST_ASSERT(res66.size() == ans66.size()); + TEST_ASSERT(res66.status() == TautomerEnumeratorStatus::Completed); std::vector sm66; for (const auto &r : res66) { @@ -752,8 +343,8 @@ void testEnumerator() { // Gaunine tautomers std::string smi67 = "N1C(N)=NC=2N=CNC2C1=O"; - std::shared_ptr m67(SmilesToMol(smi67)); - std::vector res67 = te.enumerate(*m67); + ROMOL_SPTR m67(SmilesToMol(smi67)); + TautomerEnumeratorResult res67 = te.enumerate(*m67); std::vector ans67 = { "N=c1[nH]c(=O)c2[nH]cnc2[nH]1", "N=c1[nH]c(=O)c2nc[nH]c2[nH]1", "N=c1[nH]c2ncnc-2c(O)[nH]1", "N=c1nc(O)c2[nH]cnc2[nH]1", @@ -764,20 +355,218 @@ void testEnumerator() { "Nc1nc2[nH]cnc2c(=O)[nH]1", "Nc1nc2nc[nH]c2c(=O)[nH]1", "Nc1nc2ncnc-2c(O)[nH]1"}; TEST_ASSERT(res67.size() == ans67.size()); + TEST_ASSERT(res67.status() == TautomerEnumeratorStatus::Completed); std::vector sm67; for (const auto &r : res67) { sm67.push_back(MolToSmiles(*r)); } - // sort both for alphabetical order + // sort both by alphabetical order std::sort(sm67.begin(), sm67.end()); std::sort(ans67.begin(), ans67.end()); TEST_ASSERT(sm67 == ans67); // Test a structure with hundreds of tautomers. std::string smi68 = "[H][C](CO)(NC(=O)C1=C(O)C(O)=CC=C1)C(O)=O"; - std::shared_ptr m68(SmilesToMol(smi68)); - std::vector res68 = te.enumerate(*m68); - TEST_ASSERT(res68.size() == 375); + ROMOL_SPTR m68(SmilesToMol(smi68)); + TautomerEnumeratorResult res68 = te.enumerate(*m68); + // the maxTransforms limit is hit before the maxTautomers one + TEST_ASSERT(res68.size() == 292); + TEST_ASSERT(res68.status() == TautomerEnumeratorStatus::MaxTransformsReached); + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; +} + +void testEnumeratorParams() { + BOOST_LOG(rdInfoLog) + << "-----------------------\n Testing TautomerEnumerator params" + << std::endl; + + // Test a structure with hundreds of tautomers. + std::string smi68 = "[H][C](CO)(NC(=O)C1=C(O)C(O)=CC=C1)C(O)=O"; + ROMOL_SPTR m68(SmilesToMol(smi68)); + + { + TautomerEnumerator te; + TautomerEnumeratorResult res68 = te.enumerate(*m68); + TEST_ASSERT(res68.size() == 292); + TEST_ASSERT(res68.status() == + TautomerEnumeratorStatus::MaxTransformsReached); + } + { + CleanupParameters params; + params.maxTautomers = 50; + TautomerEnumerator te(params); + TautomerEnumeratorResult res68 = te.enumerate(*m68); + TEST_ASSERT(res68.size() == 50); + TEST_ASSERT(res68.status() == + TautomerEnumeratorStatus::MaxTautomersReached); + } + std::string sAlaSmi = "C[C@H](N)C(=O)O"; + ROMOL_SPTR sAla(SmilesToMol(sAlaSmi)); + { + // test remove (S)-Ala stereochemistry + TEST_ASSERT(sAla->getAtomWithIdx(1)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CCW); + TEST_ASSERT(sAla->getAtomWithIdx(1)->getProp( + common_properties::_CIPCode) == "S"); + CleanupParameters params; + params.tautomerRemoveSp3Stereo = true; + TautomerEnumerator te(params); + TautomerEnumeratorResult res = te.enumerate(*sAla); + for (const auto taut : res) { + TEST_ASSERT(taut->getAtomWithIdx(1)->getChiralTag() == + Atom::CHI_UNSPECIFIED); + TEST_ASSERT( + !taut->getAtomWithIdx(1)->hasProp(common_properties::_CIPCode)); + } + } + { + // test retain (S)-Ala stereochemistry + TEST_ASSERT(sAla->getAtomWithIdx(1)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CCW); + TEST_ASSERT(sAla->getAtomWithIdx(1)->getProp( + common_properties::_CIPCode) == "S"); + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + TautomerEnumerator te(params); + TautomerEnumeratorResult res = te.enumerate(*sAla); + for (const auto taut : res) { + const auto tautAtom = taut->getAtomWithIdx(1); + if (tautAtom->getHybridization() == Atom::SP3) { + TEST_ASSERT(tautAtom->hasProp(common_properties::_CIPCode)); + TEST_ASSERT( + tautAtom->getProp(common_properties::_CIPCode) == "S"); + TEST_ASSERT(tautAtom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW); + } else { + TEST_ASSERT(!tautAtom->hasProp(common_properties::_CIPCode)); + TEST_ASSERT(tautAtom->getChiralTag() == Atom::CHI_UNSPECIFIED); + } + } + } + std::string eEnolSmi = "C/C=C/O"; + ROMOL_SPTR eEnol(SmilesToMol(eEnolSmi)); + TEST_ASSERT(eEnol->getBondWithIdx(1)->getStereo() == Bond::STEREOE); + { + // test remove enol E stereochemistry + CleanupParameters params; + params.tautomerRemoveBondStereo = true; + TautomerEnumerator te(params); + TautomerEnumeratorResult res = te.enumerate(*eEnol); + for (const auto taut : res) { + TEST_ASSERT(taut->getBondWithIdx(1)->getStereo() == Bond::STEREONONE); + } + } + { + // test retain enol E stereochemistry + CleanupParameters params; + params.tautomerRemoveBondStereo = false; + TautomerEnumerator te(params); + TautomerEnumeratorResult res = te.enumerate(*eEnol); + for (const auto taut : res) { + if (taut->getBondWithIdx(1)->getBondType() == Bond::DOUBLE) { + TEST_ASSERT(taut->getBondWithIdx(1)->getStereo() == Bond::STEREOE); + } + } + } + std::string zEnolSmi = "C/C=C\\O"; + ROMOL_SPTR zEnol(SmilesToMol(zEnolSmi)); + TEST_ASSERT(zEnol->getBondWithIdx(1)->getStereo() == Bond::STEREOZ); + { + // test remove enol Z stereochemistry + CleanupParameters params; + params.tautomerRemoveBondStereo = true; + TautomerEnumerator te(params); + TautomerEnumeratorResult res = te.enumerate(*zEnol); + for (const auto taut : res) { + TEST_ASSERT(taut->getBondWithIdx(1)->getStereo() == Bond::STEREONONE); + } + } + { + // test retain enol Z stereochemistry + CleanupParameters params; + params.tautomerRemoveBondStereo = false; + TautomerEnumerator te(params); + TautomerEnumeratorResult res = te.enumerate(*zEnol); + for (const auto taut : res) { + if (taut->getBondWithIdx(1)->getBondType() == Bond::DOUBLE) { + TEST_ASSERT(taut->getBondWithIdx(1)->getStereo() == Bond::STEREOZ); + } + } + } + + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; +} + +void testEnumeratorCallback() { + class MyTautomerEnumeratorCallback : public TautomerEnumeratorCallback { + public: + MyTautomerEnumeratorCallback(double timeoutMs) + : d_timeoutMs(timeoutMs), d_start(std::chrono::system_clock::now()) {} + bool operator()(const ROMol &, const TautomerEnumeratorResult &) override { + double elapsedMs = std::chrono::duration_cast( + std::chrono::system_clock::now() - d_start) + .count(); + return (elapsedMs < d_timeoutMs); + } + + private: + double d_timeoutMs; + std::chrono::time_point d_start; + }; + + BOOST_LOG(rdInfoLog) + << "-----------------------\n Testing TautomerEnumerator callback" + << std::endl; + + // Test a structure with hundreds of tautomers. + std::string smi68 = "[H][C](CO)(NC(=O)C1=C(O)C(O)=CC=C1)C(O)=O"; + ROMOL_SPTR m68(SmilesToMol(smi68)); + + CleanupParameters params; + params.maxTransforms = 10000; + params.maxTautomers = 10000; + { + TautomerEnumerator te(params); + te.setCallback(new MyTautomerEnumeratorCallback(50.0)); + TautomerEnumeratorResult res68 = te.enumerate(*m68); + // either the enumeration was canceled due to timeout + // or it has completed very quickly + bool hasReachedTimeout = + (res68.size() < 375 && + res68.status() == TautomerEnumeratorStatus::Canceled); + bool hasCompleted = (res68.size() == 375 && + res68.status() == TautomerEnumeratorStatus::Completed); + if (hasReachedTimeout) { + std::cerr << "Enumeration was canceled due to timeout (50 ms)" + << std::endl; + } + if (hasCompleted) { + std::cerr << "Enumeration has completed" << std::endl; + } + TEST_ASSERT(hasReachedTimeout || hasCompleted); + TEST_ASSERT(hasReachedTimeout ^ hasCompleted); + } + { + TautomerEnumerator te(params); + te.setCallback(new MyTautomerEnumeratorCallback(10000.0)); + TautomerEnumeratorResult res68 = te.enumerate(*m68); + // either the enumeration completed + // or it ran very slowly and was canceled due to timeout + bool hasReachedTimeout = + (res68.size() < 375 && + res68.status() == TautomerEnumeratorStatus::Canceled); + bool hasCompleted = (res68.size() == 375 && + res68.status() == TautomerEnumeratorStatus::Completed); + if (hasReachedTimeout) { + std::cerr << "Enumeration was canceled due to timeout (10 s)" + << std::endl; + } + if (hasCompleted) { + std::cerr << "Enumeration has completed" << std::endl; + } + TEST_ASSERT(hasReachedTimeout || hasCompleted); + TEST_ASSERT(hasReachedTimeout ^ hasCompleted); + } + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } @@ -860,8 +649,15 @@ void testCanonicalize() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); @@ -870,11 +666,6 @@ void testCanonicalize() { TEST_ASSERT(mol); std::unique_ptr res{te.canonicalize(*mol)}; TEST_ASSERT(res); - if (MolToSmiles(*res) != itm.second) { - std::cerr << itm.first << " -> " << MolToSmiles(*res) << " instead of " - << itm.second << std::endl; - } - TEST_ASSERT(MolToSmiles(*res) == itm.second); } BOOST_LOG(rdInfoLog) << "Finished" << std::endl; @@ -889,16 +680,23 @@ void testPickCanonical() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); for (const auto &itm : canonTautomerData) { std::unique_ptr mol{SmilesToMol(itm.first)}; TEST_ASSERT(mol); - auto tauts = te.enumerate(*mol); - std::unique_ptr res{te.pickCanonical(tauts)}; + auto tautRes = te.enumerate(*mol); + std::unique_ptr res{te.pickCanonical(tautRes)}; TEST_ASSERT(res); // std::cerr << itm.first<<" -> "<( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); @@ -933,6 +738,7 @@ void testCustomScoreFunc() { std::unique_ptr mol{SmilesToMol(itm.first)}; TEST_ASSERT(mol); { + // this uses the non-templated pickCanonical() function std::unique_ptr res{ te.canonicalize(*mol, [](const ROMol &m) -> int { return MolStandardize::TautomerScoringFunctions::scoreRings(m); @@ -941,9 +747,32 @@ void testCustomScoreFunc() { TEST_ASSERT(MolToSmiles(*res) == itm.second); } { - auto tauts = te.enumerate(*mol); + // this uses the non-templated pickCanonical() overload + auto tautRes = te.enumerate(*mol); std::unique_ptr res{ - te.pickCanonical(tauts, [](const ROMol &m) -> int { + te.pickCanonical(tautRes, [](const ROMol &m) -> int { + return MolStandardize::TautomerScoringFunctions::scoreRings(m); + })}; + TEST_ASSERT(res); + TEST_ASSERT(MolToSmiles(*res) == itm.second); + } + { + // this tests the templated pickCanonical() overload on a std::vector + auto tautRes = te.enumerate(*mol); + std::unique_ptr res{ + te.pickCanonical(tautRes.tautomers(), [](const ROMol &m) -> int { + return MolStandardize::TautomerScoringFunctions::scoreRings(m); + })}; + TEST_ASSERT(res); + TEST_ASSERT(MolToSmiles(*res) == itm.second); + } + { + // this tests the templated pickCanonical() overload + // with a different iterable container + auto tautRes = te.enumerate(*mol); + std::set tautomerSet(tautRes.begin(), tautRes.end()); + std::unique_ptr res{ + te.pickCanonical(tautomerSet, [](const ROMol &m) -> int { return MolStandardize::TautomerScoringFunctions::scoreRings(m); })}; TEST_ASSERT(res); @@ -963,15 +792,22 @@ void testEnumerationProblems() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); #if 1 { // from the discussion of #2908 auto mol = "O=C(C1=C[NH+]=CC=C1)[O-]"_smiles; - auto tauts = te.enumerate(*mol); - TEST_ASSERT(tauts.size() == 1); + auto tautRes = te.enumerate(*mol); + TEST_ASSERT(tautRes.size() == 1); } #endif { // one of the examples from the tautobase paper @@ -979,11 +815,11 @@ void testEnumerationProblems() { "[S:1]=[c:2]1[nH+:3][c:5]([NH2:9])[nH:8][c:7]2[c:4]1[n:6][nH:10][n:11]2"_smiles; TEST_ASSERT(m); - auto tauts = te.enumerate(*m); + auto tautRes = te.enumerate(*m); // for (auto taut : tauts) { // std::cerr << MolToSmiles(*taut) << std::endl; // } - TEST_ASSERT(tauts.size() == 12); + TEST_ASSERT(tautRes.size() == 12); } BOOST_LOG(rdInfoLog) << "Finished" << std::endl; @@ -998,28 +834,35 @@ void testPickCanonical2() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); { auto mol = "CN=c1nc[nH]cc1"_smiles; TEST_ASSERT(mol); - auto tauts = te.enumerate(*mol); - for (const auto &taut : tauts) { + auto tautRes = te.enumerate(*mol); + for (const auto &taut : tautRes) { std::cerr << MolToSmiles(*taut) << std::endl; } - std::unique_ptr canon{te.pickCanonical(tauts)}; + std::unique_ptr canon{te.pickCanonical(tautRes)}; std::cerr << "res: " << MolToSmiles(*canon) << std::endl; } { auto mol = "CN=c1[nH]cccc1"_smiles; TEST_ASSERT(mol); - auto tauts = te.enumerate(*mol); - for (const auto &taut : tauts) { + auto tautRes = te.enumerate(*mol); + for (const auto &taut : tautRes) { std::cerr << MolToSmiles(*taut) << std::endl; } - std::unique_ptr canon{te.pickCanonical(tauts)}; + std::unique_ptr canon{te.pickCanonical(tautRes)}; std::cerr << "res: " << MolToSmiles(*canon) << std::endl; } BOOST_LOG(rdInfoLog) << "Finished" << std::endl; @@ -1035,12 +878,35 @@ void testEnumerateDetails() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); { auto mol = "c1ccccc1CN=c1[nH]cccc1"_smiles; TEST_ASSERT(mol); + + auto tautRes = te.enumerate(*mol); + TEST_ASSERT(tautRes.size() == 2); + TEST_ASSERT(tautRes.modifiedAtoms().count() == 2); + TEST_ASSERT(tautRes.modifiedBonds().count() == 7); + TEST_ASSERT(tautRes.modifiedAtoms().test(7)); + TEST_ASSERT(tautRes.modifiedAtoms().test(9)); + TEST_ASSERT(!tautRes.modifiedBonds().test(0)); + TEST_ASSERT(tautRes.modifiedBonds().test(7)); + TEST_ASSERT(tautRes.modifiedBonds().test(8)); + TEST_ASSERT(tautRes.modifiedBonds().test(14)); + } + { + // test the deprecated form + auto mol = "c1ccccc1CN=c1[nH]cccc1"_smiles; + TEST_ASSERT(mol); boost::dynamic_bitset<> atomsModified(mol->getNumAtoms()); boost::dynamic_bitset<> bondsModified(mol->getNumBonds()); @@ -1068,15 +934,22 @@ void testGithub2990() { rdbase + "/Data/MolStandardize/tautomerTransforms.in"; auto tautparams = std::unique_ptr( new TautomerCatalogParams(tautomerFile)); - unsigned int ntautomers = tautparams->getNumTautomers(); - TEST_ASSERT(ntautomers == 34); + // DEPRECATED, remove from here in release 2021.01 + { + unsigned int ntransforms = tautparams->getNumTautomers(); + TEST_ASSERT(ntransforms == 36); + } + // DEPRECATED, remove until here in release 2021.01 + + unsigned int ntransforms = tautparams->getTransforms().size(); + TEST_ASSERT(ntransforms == 36); TautomerEnumerator te(new TautomerCatalog(tautparams.get())); { // atom stereo auto mol = "COC(=O)[C@@H](N)CO"_smiles; TEST_ASSERT(mol); - auto tauts = te.enumerate(*mol); - for (const auto &taut : tauts) { + auto res = te.enumerate(*mol); + for (const auto &taut : res) { auto smi = MolToSmiles(*taut); // std::cerr << smi << std::endl; TEST_ASSERT(smi.find("@H") == std::string::npos); @@ -1086,8 +959,8 @@ void testGithub2990() { // atom stereo, atoms not in the tautomer zone are still ok auto mol = "C[C@](Cl)(F)COC(=O)[C@@H](N)CO"_smiles; TEST_ASSERT(mol); - auto tauts = te.enumerate(*mol); - for (const auto &taut : tauts) { + auto res = te.enumerate(*mol); + for (const auto &taut : res) { auto smi = MolToSmiles(*taut); // std::cerr << smi << std::endl; TEST_ASSERT(smi.find("@H") == std::string::npos); @@ -1111,8 +984,8 @@ void testGithub2990() { TEST_ASSERT(mol->getBondBetweenAtoms(4, 5)->getStereo() > Bond::BondStereo::STEREOANY); - auto tauts = te.enumerate(*mol); - for (const auto &taut : tauts) { + auto res = te.enumerate(*mol); + for (const auto &taut : res) { TEST_ASSERT(taut->getBondBetweenAtoms(0, 1)->getBondDir() != Bond::BondDir::NONE); TEST_ASSERT(taut->getBondBetweenAtoms(2, 3)->getBondDir() != @@ -1131,10 +1004,338 @@ void testGithub2990() { BOOST_LOG(rdInfoLog) << "Finished" << std::endl; } +void testPickCanonicalCIPChangeOnChiralCenter() { + BOOST_LOG(rdInfoLog) + << "-----------------------\n testPickCanonicalCIPChangeOnChiralCenter" + << std::endl; + + struct CanonicalTaut { + static ROMOL_SPTR get(const TautomerEnumeratorResult &res) { + std::vector scores; + scores.reserve(res.size()); + std::transform(res.begin(), res.end(), std::back_inserter(scores), + [](const ROMOL_SPTR &m) { + return TautomerScoringFunctions::scoreTautomer(*m); + }); + std::vector indices(res.size()); + std::iota(indices.begin(), indices.end(), 0); + int bestIdx = + *std::max_element(indices.begin(), indices.end(), + [scores](const size_t &a, const size_t &b) { + if (scores.at(a) != scores.at(b)) { + return (scores.at(a) < scores.at(b)); + } + return (a < b); + }); + TEST_ASSERT(*std::max_element(scores.begin(), scores.end()) == + scores.at(bestIdx)); + return res.at(bestIdx); + } + }; + + auto mol = "CC\\C=C(/O)[C@@H](C)C(C)=O"_smiles; + TEST_ASSERT(mol.get()); + TEST_ASSERT(mol->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(mol->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "R"); + { + // here the chirality disappears as the chiral center is itself involved in + // tautomerism + TautomerEnumerator te; + ROMOL_SPTR canTaut(te.canonicalize(*mol)); + TEST_ASSERT(canTaut.get()); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_UNSPECIFIED); + TEST_ASSERT( + !canTaut->getAtomWithIdx(5)->hasProp(common_properties::_CIPCode)); + TEST_ASSERT(MolToSmiles(*canTaut) == "CCCC(=O)C(C)C(C)=O"); + } + { + // here the chirality stays even if the chiral center is itself involved in + // tautomerism because of the tautomerRemoveSp3Stereo parameter being set to + // false + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + TautomerEnumerator te(params); + ROMOL_SPTR canTaut(te.canonicalize(*mol)); + TEST_ASSERT(canTaut.get()); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "S"); + TEST_ASSERT(MolToSmiles(*canTaut) == "CCCC(=O)[C@@H](C)C(C)=O"); + } + { + // here the chirality disappears as the chiral center is itself involved in + // tautomerism; the reassignStereo setting has no influence + TautomerEnumerator te; + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 8); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_UNSPECIFIED); + TEST_ASSERT( + !bestTaut->getAtomWithIdx(5)->hasProp(common_properties::_CIPCode)); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)C(C)C(C)=O"); + } + { + // here the chirality disappears as the chiral center is itself involved in + // tautomerism; the reassignStereo setting has no influence + CleanupParameters params; + params.tautomerReassignStereo = false; + TautomerEnumerator te(params); + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 8); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_UNSPECIFIED); + TEST_ASSERT( + !bestTaut->getAtomWithIdx(5)->hasProp(common_properties::_CIPCode)); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)C(C)C(C)=O"); + } + { + // here the chirality stays even if the chiral center is itself involved in + // tautomerism because of the tautomerRemoveSp3Stereo parameter being set to + // false. As reassignStereo by default is true, the CIP code has been + // recomputed and therefore it is now S (correct) + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + TautomerEnumerator te(params); + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 8); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "S"); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)[C@@H](C)C(C)=O"); + } + { + // here the chirality stays even if the chiral center is itself involved in + // tautomerism because of the tautomerRemoveSp3Stereo parameter being set to + // false. As reassignStereo is false, the CIP code has not been recomputed + // and therefore it is still R (incorrect) + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + params.tautomerReassignStereo = false; + TautomerEnumerator te(params); + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 8); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "R"); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)[C@@H](C)C(C)=O"); + } + + mol = "CC\\C=C(/O)[C@@](CC)(C)C(C)=O"_smiles; + TEST_ASSERT(mol.get()); + TEST_ASSERT(mol->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(mol->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "S"); + // here the chirality stays no matter how tautomerRemoveSp3Stereo + // is set as the chiral center is not involved in tautomerism + { + TautomerEnumerator te; + ROMOL_SPTR canTaut(te.canonicalize(*mol)); + TEST_ASSERT(canTaut.get()); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "R"); + TEST_ASSERT(MolToSmiles(*canTaut) == "CCCC(=O)[C@](C)(CC)C(C)=O"); + } + { + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + TautomerEnumerator te(params); + ROMOL_SPTR canTaut(te.canonicalize(*mol)); + TEST_ASSERT(canTaut.get()); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(canTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "R"); + TEST_ASSERT(MolToSmiles(*canTaut) == "CCCC(=O)[C@](C)(CC)C(C)=O"); + } + { + // as reassignStereo by default is true, the CIP code has been recomputed + // and therefore it is now R (correct) + TautomerEnumerator te; + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 4); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "R"); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)[C@](C)(CC)C(C)=O"); + } + { + // as reassignStereo is false, the CIP code has not been recomputed + // and therefore it is still S (incorrect) + CleanupParameters params; + params.tautomerReassignStereo = false; + TautomerEnumerator te(params); + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 4); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "S"); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)[C@](C)(CC)C(C)=O"); + } + { + // as reassignStereo by default is true, the CIP code has been recomputed + // and therefore it is now R (correct) + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + TautomerEnumerator te(params); + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 4); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "R"); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)[C@](C)(CC)C(C)=O"); + } + { + // here the chirality stays even if the tautomerRemoveSp3Stereo parameter + // is set to false as the chiral center is not involved in tautomerism. + // As reassignStereo is false, the CIP code has not been recomputed + // and therefore it is still S (incorrect) + CleanupParameters params; + params.tautomerRemoveSp3Stereo = false; + params.tautomerReassignStereo = false; + TautomerEnumerator te(params); + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 4); + ROMOL_SPTR bestTaut = CanonicalTaut::get(res); + TEST_ASSERT(bestTaut.get()); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getChiralTag() == + Atom::CHI_TETRAHEDRAL_CW); + TEST_ASSERT(bestTaut->getAtomWithIdx(5)->getProp( + common_properties::_CIPCode) == "S"); + TEST_ASSERT(MolToSmiles(*bestTaut) == "CCCC(=O)[C@](C)(CC)C(C)=O"); + } + + BOOST_LOG(rdInfoLog) << "Finished" << std::endl; +} + +void testTautomerEnumeratorResult_const_iterator() { + BOOST_LOG(rdInfoLog) + << "-----------------------\n testTautomerEnumeratorResult_const_iterator" + << std::endl; + // CHEMBL3480964 + RWMOL_SPTR mol = "Cc1nnc(NC(=O)N2CCN(Cc3ccc(F)cc3)C(=O)C2)s1"_smiles; + TautomerEnumerator te; + auto res = te.enumerate(*mol); + TEST_ASSERT(res.status() == TautomerEnumeratorStatus::Completed); + TEST_ASSERT(res.size() == 12); + auto it = res.begin(); + auto it2 = res.begin(); + // Test semantic requirements of bidirectional_iterator + // https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator + TEST_ASSERT(it == it2); + TEST_ASSERT(it++ == it2); + TEST_ASSERT(it == ++it2); + TEST_ASSERT(it == it2); + TEST_ASSERT(it-- == it2); + TEST_ASSERT(it == --it2); + TEST_ASSERT(it == it2); + ++it; + ++it2; + TEST_ASSERT(++(--it) == it2); + TEST_ASSERT(--(++it) == it2); + TEST_ASSERT(std::addressof(--it) == std::addressof(it)); + ++it; + TEST_ASSERT(it == it2); + it--; + --it2; + TEST_ASSERT(it == it2); + TEST_ASSERT(*it == res[0]); + TEST_ASSERT(*it++ == res.at(0)); + TEST_ASSERT(*it == res[1]); + TEST_ASSERT(*++it == res.at(2)); + TEST_ASSERT(*it == res[2]); + ++it; + TEST_ASSERT(*it == res[3]); + ++it; + TEST_ASSERT(*it == res[4]); + it++; + TEST_ASSERT(*it == res[5]); + TEST_ASSERT(*it-- == res.at(5)); + TEST_ASSERT(*it == res[4]); + TEST_ASSERT(*--it == res.at(3)); + TEST_ASSERT(*it == res[3]); + --it; + TEST_ASSERT(*it == res[2]); + --it; + TEST_ASSERT(*it == res[1]); + it--; + TEST_ASSERT(*it == res[0]); + std::ptrdiff_t i = 0; + for (auto t : res) { + TEST_ASSERT(t == res[i++]); + } + i = 0; + for (auto it = res.begin(); it != res.end(); ++it) { + TEST_ASSERT(std::distance(res.begin(), it) == i); + TEST_ASSERT(*it == res[i]); + TEST_ASSERT(it->getNumAtoms() == res[i++]->getNumAtoms()); + } + i = res.size(); + for (auto it = res.end(); it != res.begin();) { + TEST_ASSERT(std::distance(res.begin(), it) == i); + TEST_ASSERT(*--it == res[--i]); + TEST_ASSERT(it->getNumAtoms() == res[i]->getNumAtoms()); + } + i = 0; + for (const auto &pair : res.smilesTautomerMap()) { + TEST_ASSERT(pair.first == MolToSmiles(*res[i])); + TEST_ASSERT(pair.second.tautomer == res[i++]); + } + i = 0; + for (auto it = res.smilesTautomerMap().begin(); + it != res.smilesTautomerMap().end(); ++it) { + TEST_ASSERT(std::distance(res.smilesTautomerMap().begin(), it) == i); + TEST_ASSERT(it->first == MolToSmiles(*res[i])); + TEST_ASSERT(it->second.tautomer == res[i++]); + } + i = res.smilesTautomerMap().size(); + for (auto it = res.smilesTautomerMap().end(); + it != res.smilesTautomerMap().begin();) { + TEST_ASSERT(std::distance(res.smilesTautomerMap().begin(), it) == i); + TEST_ASSERT((--it)->first == MolToSmiles(*res[--i])); + TEST_ASSERT(it->second.tautomer == res[i]); + } +} + int main() { RDLog::InitLogs(); #if 1 testEnumerator(); + testEnumeratorParams(); + testEnumeratorCallback(); testCanonicalize(); testPickCanonical(); testCustomScoreFunc(); @@ -1143,5 +1344,7 @@ int main() { testPickCanonical2(); testEnumerateDetails(); testGithub2990(); + testPickCanonicalCIPChangeOnChiralCenter(); + testTautomerEnumeratorResult_const_iterator(); return 0; } diff --git a/Code/GraphMol/TautomerQuery/TautomerQuery.cpp b/Code/GraphMol/TautomerQuery/TautomerQuery.cpp index f99f794e6..26d998cda 100644 --- a/Code/GraphMol/TautomerQuery/TautomerQuery.cpp +++ b/Code/GraphMol/TautomerQuery/TautomerQuery.cpp @@ -132,22 +132,19 @@ TautomerQuery *TautomerQuery::fromMol( new MolStandardize::TautomerCatalogParams(tautomerFile)); MolStandardize::TautomerEnumerator tautomerEnumerator( new MolStandardize::TautomerCatalog(tautomerParams.get())); - boost::dynamic_bitset<> modifiedAtomsBitSet(query.getNumAtoms()); - boost::dynamic_bitset<> modifiedBondsBitSet(query.getNumBonds()); - const auto tautomers = tautomerEnumerator.enumerate( - query, &modifiedAtomsBitSet, &modifiedBondsBitSet); + const auto res = tautomerEnumerator.enumerate(query); std::vector modifiedAtoms; - modifiedAtoms.reserve(modifiedAtomsBitSet.count()); + modifiedAtoms.reserve(res.modifiedAtoms().count()); for (size_t i = 0; i < query.getNumAtoms(); i++) { - if (modifiedAtomsBitSet[i]) { + if (res.modifiedAtoms().test(i)) { modifiedAtoms.push_back(i); } } std::vector modifiedBonds; - modifiedBonds.reserve(modifiedBondsBitSet.count()); + modifiedBonds.reserve(res.modifiedBonds().count()); for (size_t i = 0; i < query.getNumBonds(); i++) { - if (modifiedBondsBitSet[i]) { + if (res.modifiedBonds().test(i)) { modifiedBonds.push_back(i); } } @@ -169,8 +166,9 @@ TautomerQuery *TautomerQuery::fromMol( delete queryBond; } - return new TautomerQuery(tautomers, (ROMol *)templateMolecule, modifiedAtoms, - modifiedBonds); + return new TautomerQuery(res.tautomers(), + static_cast(templateMolecule), + modifiedAtoms, modifiedBonds); } bool TautomerQuery::matchTautomer( diff --git a/Data/MolStandardize/tautomerTransforms.in b/Data/MolStandardize/tautomerTransforms.in index 686146f8a..3559f7334 100644 --- a/Data/MolStandardize/tautomerTransforms.in +++ b/Data/MolStandardize/tautomerTransforms.in @@ -29,8 +29,8 @@ oxim/nitroso via phenol f [O!H0]-[N]=[C]-[C]=[C]-[C]=[OH0] oxim/nitroso via phenol r [O!H0]-[c]=[c]-[c]=[c]-[N]=[OH0] cyano/iso-cyanic acid f [O!H0]-[C]#[N] == cyano/iso-cyanic acid r [N!H0]=[C]=[O] #- -//formamidinesulfinic acid f [O,N;!H0]-[C]=[S,Se,Te]=[O] =-- // TODO: WAT!? -//formamidinesulfinic acid r [O!H0]-[S,Se,Te]-[C]=[O,N] =-- +formamidinesulfinic acid f [O,N;!H0]-[C]=[S,Se,Te;v6]=[O] =-- +formamidinesulfinic acid r [O!H0]-[S,Se,Te;v4]-[C]=[O,N] ==- isocyanide f [C-0!H0]#[N+0] # -+ isocyanide r [N+!H0]#[C-] # -+ phosphonic acid f [OH]-[PH0] = diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 3be860061..ea4bbe7ed 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -10,6 +10,13 @@ - The Open3DAlign functionality is now in its own separate library - `O3AAlign` in cmake. If you are working in C++ and using O3A functionality, you'll need to link against this library as well now. +- Due to improvements in the tautomer enumeration code, the method + `TautomerEnumerator::enumerate` now returns a `TautomerEnumeratorResult` + object instead of a vector of molecules. Note that if you are iterating over + the results of a call to `enumerate()` you shouldn't need to change your code. + If you want to invoke the old (and deprecated, see below) form from C++, call + `TautomerNumerator::enumerate(mol, nullptr)` or explicitly pass a + `boost::dynamic_bitset*` to capture the modified atoms. ## Code removed in this release: - To improve API consistency of the exceptions in RDKit with the default ones in @@ -23,6 +30,10 @@ the namespace QueryOps. Please use `QueryOps::replaceAtomWithQueryAtom()` instead. The version in the `FileParserUtils` namespace will be removed in the next release. +- The method `std::vector TautomerEnumerator::enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds = nullptr)` + is deprecated and will be removed in a future release. + Please use `TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol,bool reassignStereo = true)` + instead.