// // Created by Gareth Jones on 5/7/2020. // // Copyright 2020 Schrodinger, Inc // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. #include "TautomerQuery.h" #include #include #include #include #include #include #include #include #include #include #include #include // #define VERBOSE #ifdef RDK_USE_BOOST_SERIALIZATION #include #include #include #include #include #endif #ifdef VERBOSE #include #endif namespace { using namespace RDKit; // Adapted from Code/GraphMol/Substruct/SubstructUtils.cpp#removeDuplicates void removeTautomerDuplicates(std::vector &matches, std::vector *matchingTautomers, int nAtoms) { // // This works by tracking the indices of the atoms in each match vector. // This can lead to unexpected behavior when looking at rings and queries // that don't specify bond orders. For example querying this molecule: // C1CCC=1 // with the pattern constructed from SMARTS C~C~C~C will return a // single match, despite the fact that there are 4 different paths // when valence is considered. The defense of this behavior is // that the 4 paths are equivalent in the semantics of the query. // Also, OELib returns the same results // std::set> seen; std::vector res; res.reserve(matches.size()); for (auto &&match : matches) { boost::dynamic_bitset<> val(nAtoms); for (const auto &ci : match) { val.set(ci.second); } auto pos = seen.lower_bound(val); if (pos == seen.end() || *pos != val) { res.push_back(std::move(match)); seen.insert(pos, std::move(val)); } else if (matchingTautomers) { auto position = res.size(); matchingTautomers->erase(matchingTautomers->begin() + position); } } res.shrink_to_fit(); matches = std::move(res); } } // namespace namespace RDKit { bool TautomerQueryCanSerialize() { #ifdef RDK_USE_BOOST_SERIALIZATION return true; #else return false; #endif } class TautomerQueryMatcher { private: const TautomerQuery &d_tautomerQuery; const SubstructMatchParameters &d_params; std::vector *d_matchingTautomers; public: TautomerQueryMatcher(const TautomerQuery &tautomerQuery, const SubstructMatchParameters ¶ms, std::vector *matchingTautomers) : d_tautomerQuery(tautomerQuery), d_params(params), d_matchingTautomers(matchingTautomers) {} bool match(const ROMol &mol, const std::span &match) { #ifdef VERBOSE std::cout << "Checking template match" << std::endl; #endif for (auto tautomer : d_tautomerQuery.getTautomers()) { #ifdef VERBOSE std::cout << "Checking Tautomer " << MolToSmiles(*tautomer) << std::endl; #endif if (d_tautomerQuery.matchTautomer(mol, *tautomer, match, d_params)) { auto matchingTautomer = d_params.extraFinalCheck ? d_params.extraFinalCheck(mol, match) : true; if (matchingTautomer) { #ifdef VERBOSE std::cout << "Got Match " << std::endl; #endif if (d_matchingTautomers) { d_matchingTautomers->push_back(tautomer); } } return matchingTautomer; } } return false; } }; TautomerQuery::TautomerQuery(std::vector tautomers, const ROMol *const templateMolecule, std::vector modifiedAtoms, std::vector modifiedBonds) : d_tautomers(std::move(tautomers)), d_templateMolecule(templateMolecule), d_modifiedAtoms(std::move(modifiedAtoms)), d_modifiedBonds(std::move(modifiedBonds)) {} TautomerQuery *TautomerQuery::fromMol( const ROMol &query, const std::string &tautomerTransformFile) { auto tautomerParams = std::unique_ptr( new MolStandardize::TautomerCatalogParams(tautomerTransformFile)); MolStandardize::TautomerEnumerator tautomerEnumerator( new MolStandardize::TautomerCatalog(tautomerParams.get())); const auto res = tautomerEnumerator.enumerate(query); std::vector modifiedAtoms; modifiedAtoms.reserve(res.modifiedAtoms().count()); for (size_t i = 0; i < query.getNumAtoms(); i++) { if (res.modifiedAtoms().test(i)) { modifiedAtoms.push_back(i); } } std::vector modifiedBonds; modifiedBonds.reserve(res.modifiedBonds().count()); for (size_t i = 0; i < query.getNumBonds(); i++) { if (res.modifiedBonds().test(i)) { modifiedBonds.push_back(i); } } auto templateMolecule = new RWMol(query); for (auto idx : modifiedAtoms) { const auto atom = query.getAtomWithIdx(idx); const auto queryAtom = new QueryAtom(atom->getAtomicNum()); // Forward original queries if (atom->hasQuery()) { auto originalAtomQuery = static_cast(atom)->getQuery(); queryAtom->setQuery(originalAtomQuery->copy()); } const bool updateLabel = false; const bool preserveProps = true; templateMolecule->replaceAtom(idx, queryAtom, updateLabel, preserveProps); delete queryAtom; } for (auto idx : modifiedBonds) { auto bondQuery = makeSingleOrDoubleOrAromaticBondQuery(); auto queryBond = new QueryBond(); queryBond->setQuery(bondQuery); templateMolecule->replaceBond(idx, queryBond, true); delete queryBond; } // Copy molecule properties from the original query to all tautomers. // Tautomer enumeration uses quickCopy for performance, which doesn't copy // molecule properties. We need to preserve them here for proper CXSMILES // output (e.g. link nodes). auto tautomers = res.tautomers(); for (auto &tautomer : tautomers) { tautomer->updateProps(query); } return new TautomerQuery(std::move(tautomers), static_cast(templateMolecule), modifiedAtoms, modifiedBonds); } bool TautomerQuery::matchTautomer( const ROMol &mol, const ROMol &tautomer, const std::span &match, const SubstructMatchParameters ¶ms) const { for (auto idx : d_modifiedAtoms) { const auto queryAtom = tautomer.getAtomWithIdx(idx); const auto targetAtom = mol.getAtomWithIdx(match[idx]); #ifdef VERBOSE std::cout << "Query atom " << queryAtom->getSymbol() << " target atom " << targetAtom->getSymbol() << std::endl; #endif if (!atomCompat(queryAtom, targetAtom, params)) { #ifdef VERBOSE std::cout << "Atom incompatibility" << std::endl; #endif return false; } } for (auto idx : d_modifiedBonds) { const auto queryBond = tautomer.getBondWithIdx(idx); const auto beginIdx = queryBond->getBeginAtomIdx(); const auto endIdx = queryBond->getEndAtomIdx(); const auto targetBeginIdx = match[beginIdx]; const auto targetEndIdx = match[endIdx]; const auto targetBond = mol.getBondBetweenAtoms(targetBeginIdx, targetEndIdx); #ifdef VERBOSE std::cout << "Query bond " << queryBond->getBondType() << " target bond " << targetBond->getBondType() << std::endl; #endif if (!bondCompat(queryBond, targetBond, params)) { #ifdef VERBOSE std::cout << "Bond incompatibility" << std::endl; #endif return false; } } #ifdef VERBOSE std::cout << "Tautomer match" << std::endl; #endif return true; } std::vector TautomerQuery::substructOf( const ROMol &mol, const SubstructMatchParameters ¶ms, std::vector *matchingTautomers) const { if (matchingTautomers) { matchingTautomers->clear(); } #ifdef VERBOSE std::cout << "Tautomer search with query " << MolToSmiles(*d_templateMolecule) << " on " << MolToSmiles(mol) << " max matches " << params.maxMatches << std::endl; #endif SubstructMatchParameters templateParams(params); // need to check all mappings of template to target templateParams.uniquify = false; TautomerQueryMatcher tautomerQueryMatcher(*this, params, matchingTautomers); // use this functor as a final check to see if any tautomer matches the target auto checker = [&tautomerQueryMatcher]( const ROMol &mol, const std::span &match) mutable { return tautomerQueryMatcher.match(mol, match); }; templateParams.extraFinalCheck = checker; auto matches = RDKit::SubstructMatch(mol, *d_templateMolecule, templateParams); #ifdef VERBOSE std::cout << "Found " << matches.size() << " matches " << std::endl; #endif if (params.uniquify && matches.size() > 1) { removeTautomerDuplicates(matches, matchingTautomers, mol.getNumAtoms()); #ifdef VERBOSE std::cout << "After removing duplicates " << matches.size() << " matches " << std::endl; #endif } return matches; } bool TautomerQuery::isSubstructOf(const ROMol &mol, const SubstructMatchParameters ¶ms) { SubstructMatchParameters params2(params); params2.maxMatches = 1; auto matches = substructOf(mol, params2); return matches.size() > 0; } ExplicitBitVect *TautomerQuery::patternFingerprintTemplate( unsigned int fpSize) const { return PatternFingerprintMol(*d_templateMolecule, fpSize, nullptr, nullptr, true); } ExplicitBitVect *TautomerQuery::patternFingerprintTarget(const ROMol &target, unsigned int fpSize) { return PatternFingerprintMol(target, fpSize, nullptr, nullptr, true); } std::vector SubstructMatch( const ROMol &mol, const TautomerQuery &query, const SubstructMatchParameters ¶ms) { return query.substructOf(mol, params); } void TautomerQuery::toStream(std::ostream &ss) const { #ifndef RDK_USE_BOOST_SERIALIZATION PRECONDITION(0, "Boost SERIALIZATION is not enabled") #else boost::archive::text_oarchive ar(ss); ar << *this; #endif } std::string TautomerQuery::serialize() const { std::stringstream ss; toStream(ss); return ss.str(); } void TautomerQuery::initFromStream(std::istream &ss) { #ifndef RDK_USE_BOOST_SERIALIZATION PRECONDITION(0, "Boost SERIALIZATION is not enabled") #else boost::archive::text_iarchive ar(ss); ar >> *this; #endif } void TautomerQuery::initFromString(const std::string &text) { std::stringstream ss(text); initFromStream(ss); } } // namespace RDKit