// // Created by Gareth Jones on 5/7/2020. // // Copyright 2020 Schrodinger, Inc // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. #include "TautomerQuery.h" #include #include #include #include #include #include #include #include #include #include // #define VERBOSE #ifdef VERBOSE #include #endif namespace { using namespace RDKit; // Adapted from Code/GraphMol/Substruct/SubstructUtils.cpp#removeDuplicates void removeTautomerDuplicates(std::vector &matches, std::vector *matchingTautomers, int nAtoms) { // // This works by tracking the indices of the atoms in each match vector. // This can lead to unexpected behavior when looking at rings and queries // that don't specify bond orders. For example querying this molecule: // C1CCC=1 // with the pattern constructed from SMARTS C~C~C~C will return a // single match, despite the fact that there are 4 different paths // when valence is considered. The defense of this behavior is // that the 4 paths are equivalent in the semantics of the query. // Also, OELib returns the same results // std::vector> seen; std::vector res; for (size_t i = 0; i < matches.size(); i++) { auto match = matches[i]; boost::dynamic_bitset<> val(nAtoms); for (const auto &ci : match) { val.set(ci.second); } if (std::find(seen.begin(), seen.end(), val) == seen.end()) { // it's something new res.push_back(match); seen.push_back(val); } else if (matchingTautomers) { int position = res.size(); matchingTautomers->erase(matchingTautomers->begin() + position); } } matches = res; } } // namespace namespace RDKit { class TautomerQueryMatcher { private: const TautomerQuery &d_tautomerQuery; const SubstructMatchParameters &d_params; std::vector *d_matchingTautomers; public: TautomerQueryMatcher(const TautomerQuery &tautomerQuery, const SubstructMatchParameters ¶ms, std::vector *matchingTautomers) : d_tautomerQuery(tautomerQuery), d_params(params), d_matchingTautomers(matchingTautomers) {} bool match(const ROMol &mol, const std::vector &match) { #ifdef VERBOSE std::cout << "Checking template match" << std::endl; #endif for (auto tautomer : d_tautomerQuery.getTautomers()) { #ifdef VERBOSE std::cout << "Checking Tautomer " << MolToSmiles(*tautomer) << std::endl; #endif if (d_tautomerQuery.matchTautomer(mol, *tautomer, match, d_params)) { auto matchingTautomer = d_params.extraFinalCheck ? d_params.extraFinalCheck(mol, match) : true; if (matchingTautomer) { #ifdef VERBOSE std::cout << "Got Match " << std::endl; #endif if (d_matchingTautomers) d_matchingTautomers->push_back(tautomer); } return matchingTautomer; } } return false; } }; TautomerQuery::TautomerQuery(const std::vector &tautomers, const ROMol *const templateMolecule, const std::vector &modifiedAtoms, const std::vector &modifiedBonds) : d_tautomers(tautomers), d_templateMolecule(templateMolecule), d_modifiedAtoms(modifiedAtoms), d_modifiedBonds(modifiedBonds) {} TautomerQuery *TautomerQuery::fromMol( const ROMol &query, const std::string &tautomerTransformFile) { auto tautomerFile = !tautomerTransformFile.empty() ? tautomerTransformFile : std::string(getenv("RDBASE")) + "/Data/MolStandardize/tautomerTransforms.in"; auto tautomerParams = std::unique_ptr( new MolStandardize::TautomerCatalogParams(tautomerFile)); MolStandardize::TautomerEnumerator tautomerEnumerator( new MolStandardize::TautomerCatalog(tautomerParams.get())); const auto res = tautomerEnumerator.enumerate(query); std::vector modifiedAtoms; modifiedAtoms.reserve(res.modifiedAtoms().count()); for (size_t i = 0; i < query.getNumAtoms(); i++) { if (res.modifiedAtoms().test(i)) { modifiedAtoms.push_back(i); } } std::vector modifiedBonds; modifiedBonds.reserve(res.modifiedBonds().count()); for (size_t i = 0; i < query.getNumBonds(); i++) { if (res.modifiedBonds().test(i)) { modifiedBonds.push_back(i); } } auto templateMolecule = new RWMol(query); for (auto idx : modifiedAtoms) { const auto atom = templateMolecule->getAtomWithIdx(idx); const auto queryAtom = new QueryAtom(atom->getAtomicNum()); const bool updateLabel = false; const bool preserveProps = true; templateMolecule->replaceAtom(idx, queryAtom, updateLabel, preserveProps); delete queryAtom; } for (auto idx : modifiedBonds) { auto bondQuery = makeSingleOrDoubleOrAromaticBondQuery(); auto queryBond = new QueryBond(); queryBond->setQuery(bondQuery); templateMolecule->replaceBond(idx, queryBond, true); delete queryBond; } return new TautomerQuery(res.tautomers(), static_cast(templateMolecule), modifiedAtoms, modifiedBonds); } bool TautomerQuery::matchTautomer( const ROMol &mol, const ROMol &tautomer, const std::vector &match, const SubstructMatchParameters ¶ms) const { for (auto idx : d_modifiedAtoms) { const auto queryAtom = tautomer.getAtomWithIdx(idx); const auto targetAtom = mol.getAtomWithIdx(match[idx]); #ifdef VERBOSE std::cout << "Query atom " << queryAtom->getSymbol() << " target atom " << targetAtom->getSymbol() << std::endl; #endif if (!atomCompat(queryAtom, targetAtom, params)) { #ifdef VERBOSE std::cout << "Atom incompatibility" << std::endl; #endif return false; } } for (auto idx : d_modifiedBonds) { const auto queryBond = tautomer.getBondWithIdx(idx); const auto beginIdx = queryBond->getBeginAtomIdx(); const auto endIdx = queryBond->getEndAtomIdx(); const auto targetBeginIdx = match[beginIdx]; const auto targetEndIdx = match[endIdx]; const auto targetBond = mol.getBondBetweenAtoms(targetBeginIdx, targetEndIdx); #ifdef VERBOSE std::cout << "Query bond " << queryBond->getBondType() << " target bond " << targetBond->getBondType() << std::endl; #endif if (!bondCompat(queryBond, targetBond, params)) { #ifdef VERBOSE std::cout << "Bond incompatibility" << std::endl; #endif return false; } } #ifdef VERBOSE std::cout << "Tautomer match" << std::endl; #endif return true; } std::vector TautomerQuery::substructOf( const ROMol &mol, const SubstructMatchParameters ¶ms, std::vector *matchingTautomers) const { if (matchingTautomers) { matchingTautomers->clear(); } #ifdef VERBOSE std::cout << "Tautomer search with query " << MolToSmiles(*d_templateMolecule) << " on " << MolToSmiles(mol) << " max matches " << params.maxMatches << std::endl; #endif SubstructMatchParameters templateParams(params); // need to check all mappings of template to target templateParams.uniquify = false; TautomerQueryMatcher tautomerQueryMatcher(*this, params, matchingTautomers); // use this functor as a final check to see if any tautomer matches the target auto checker = [&tautomerQueryMatcher]( const ROMol &mol, const std::vector &match) mutable { return tautomerQueryMatcher.match(mol, match); }; templateParams.extraFinalCheck = checker; auto matches = RDKit::SubstructMatch(mol, *d_templateMolecule, templateParams); #ifdef VERBOSE std::cout << "Found " << matches.size() << " matches " << std::endl; #endif if (params.uniquify && matches.size() > 1) { removeTautomerDuplicates(matches, matchingTautomers, mol.getNumAtoms()); #ifdef VERBOSE std::cout << "After removing duplicates " << matches.size() << " matches " << std::endl; #endif } return matches; } bool TautomerQuery::isSubstructOf(const ROMol &mol, const SubstructMatchParameters ¶ms) { SubstructMatchParameters params2(params); params2.maxMatches = 1; auto matches = substructOf(mol, params2); return matches.size() > 0; } ExplicitBitVect *TautomerQuery::patternFingerprintTemplate( unsigned int fpSize) const { return PatternFingerprintMol(*d_templateMolecule, fpSize, nullptr, nullptr, true); } ExplicitBitVect *TautomerQuery::patternFingerprintTarget(const ROMol &target, unsigned int fpSize) { return PatternFingerprintMol(target, fpSize, nullptr, nullptr, true); } std::vector SubstructMatch( const ROMol &mol, const TautomerQuery &query, const SubstructMatchParameters ¶ms) { return query.substructOf(mol, params); } } // namespace RDKit