// Copyright (c) 2017-2019, Novartis Institutes for BioMedical Research Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Novartis Institutes for BioMedical Research Inc. // nor the names of its contributors may be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #include "SubstructLibrary.h" #include #ifdef RDK_THREADSAFE_SSS #include #include #endif #include #include namespace RDKit { bool SubstructLibraryCanSerialize() { #ifdef RDK_USE_BOOST_SERIALIZATION return true; #else return false; #endif } struct Bits { const ExplicitBitVect *queryBits; const FPHolderBase *fps; bool recursionPossible; bool useChirality; bool useQueryQueryMatches; Bits(const FPHolderBase *fps, const ROMol &m, bool recursionPossible, bool useChirality, bool useQueryQueryMatches) : fps(fps), recursionPossible(recursionPossible), useChirality(useChirality), useQueryQueryMatches(useQueryQueryMatches) { if (fps) { queryBits = fps->makeFingerprint(m); } else { queryBits = nullptr; } } bool check(unsigned int idx) const { if (fps) { return fps->passesFilter(idx, *queryBits); } return true; } }; unsigned int SubstructLibrary::addMol(const ROMol &m) { unsigned int size = mols->addMol(m); if (fps) { unsigned int fpsize = fps->addMol(m); CHECK_INVARIANT(size == fpsize, "#mols different than #fingerprints in SubstructLibrary"); } return size; } namespace { // Return true if the pattern contains a ring query bool query_needs_rings(const ROMol &in_query) { for (auto &atom: in_query.atoms()) { if(atom->hasQuery()) { if (describeQuery(atom).find("Ring") != std::string::npos) { return true; } } } for (auto &bond: in_query.bonds()) { if(bond->hasQuery()) { if (describeQuery(bond).find("Ring") != std::string::npos) { return true; } } } return false; } // end is exclusive here void SubSearcher(const ROMol &in_query, const Bits &bits, const MolHolderBase &mols, std::vector &idxs, unsigned int start, unsigned int end, unsigned int numThreads, std::atomic &counter, const int maxResults, const bool needs_rings) { ROMol query(in_query); MatchVectType matchVect; for (unsigned int idx = start; idx < end && (maxResults == -1 || counter < maxResults); idx += numThreads) { if (!bits.check(idx)) { continue; } // need shared_ptr as it (may) control the lifespan of the // returned molecule! const boost::shared_ptr &m = mols.getMol(idx); ROMol *mol = m.get(); if (needs_rings && (!mol->getRingInfo() || !mol->getRingInfo()->isInitialized())) { // I have no idea what happens when symmetrizeSSSR gets called // on the same molecule twice in two threads. // This most likely WILL NOT HAPPEN since only one molholder // likely needs ring info. MolOps::symmetrizeSSSR(*mol); } if (SubstructMatch(*mol, query, matchVect, bits.recursionPossible, bits.useChirality, bits.useQueryQueryMatches)) { // this is squishy when updating the counter. While incrementing is // atomic // several substructure runs can update the counter beyond the maxResults // This okay: if we get one or two extra, we can fix it on the way out if (maxResults != -1 && counter >= maxResults) { break; } idxs.push_back(idx); if (maxResults != -1) { counter++; } } } } // end is inclusive here void SubSearchMatchCounter(const ROMol &in_query, const Bits &bits, const MolHolderBase &mols, unsigned int start, unsigned int end, int numThreads, std::atomic &counter, bool needs_rings) { ROMol query(in_query); MatchVectType matchVect; for (unsigned int idx = start; idx < end; idx += numThreads) { if (!bits.check(idx)) { continue; } // need shared_ptr as it (may) controls the lifespan of the // returned molecule! const boost::shared_ptr &m = mols.getMol(idx); ROMol *mol = m.get(); if (needs_rings && (!mol->getRingInfo() || !mol->getRingInfo()->isInitialized())) { // I have no idea what happens when symmetrizeSSSR gets called // on the same molecule twice in two threads. // This most likely WILL NOT HAPPEN since only one molholder // likely needs ring info. MolOps::symmetrizeSSSR(*mol); } if (SubstructMatch(*mol, query, matchVect, bits.recursionPossible, bits.useChirality, bits.useQueryQueryMatches)) { counter++; } } } std::vector internalGetMatches( const ROMol &query, MolHolderBase &mols, const FPHolderBase *fps, unsigned int startIdx, unsigned int endIdx, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads = -1, int maxResults = 1000) { PRECONDITION(startIdx < mols.size(), "startIdx out of bounds"); PRECONDITION(endIdx > startIdx, "endIdx > startIdx"); numThreads = (int)getNumThreadsToUse(numThreads); endIdx = std::min(mols.size(), endIdx); if (endIdx < static_cast(numThreads)) { numThreads = endIdx; } std::vector> thread_group; std::atomic counter(0); std::vector> internal_results(numThreads); bool needs_rings = query_needs_rings(query); Bits bits(fps, query, recursionPossible, useChirality, useQueryQueryMatches); for (int thread_group_idx = 0; thread_group_idx < numThreads; ++thread_group_idx) { // need to use boost::ref otherwise things are passed by value thread_group.emplace_back( std::async(std::launch::async, SubSearcher, std::ref(query), bits, std::ref(mols), std::ref(internal_results[thread_group_idx]), startIdx + thread_group_idx, endIdx, numThreads, std::ref(counter), maxResults, needs_rings)); } for (auto &fut : thread_group) { fut.get(); } delete bits.queryBits; std::vector results; for (int thread_group_idx = 0; thread_group_idx < numThreads; ++thread_group_idx) { results.insert(results.end(), internal_results[thread_group_idx].begin(), internal_results[thread_group_idx].end()); } // this is so we don't really have to do locking on the atomic counter... if (maxResults != -1 && rdcast(results.size()) > maxResults) { results.resize(maxResults); } return results; } int internalMatchCounter(const ROMol &query, MolHolderBase &mols, const FPHolderBase *fps, unsigned int startIdx, unsigned int endIdx, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads = -1) { PRECONDITION(startIdx < mols.size(), "startIdx out of bounds"); PRECONDITION(endIdx > startIdx, "endIdx > startIdx"); endIdx = std::min(mols.size(), endIdx); numThreads = (int)getNumThreadsToUse(numThreads); if (endIdx < static_cast(numThreads)) { numThreads = endIdx; } std::vector> thread_group; std::atomic counter(0); bool needs_rings = query_needs_rings(query); Bits bits(fps, query, recursionPossible, useChirality, useQueryQueryMatches); for (int thread_group_idx = 0; thread_group_idx < numThreads; ++thread_group_idx) { // need to use boost::ref otherwise things are passed by value thread_group.emplace_back( std::async(std::launch::async, SubSearchMatchCounter, std::ref(query), bits, std::ref(mols), startIdx + thread_group_idx, endIdx, numThreads, std::ref(counter), needs_rings)); } for (auto &thread : thread_group) { thread.get(); } delete bits.queryBits; return (int)counter; } } std::vector SubstructLibrary::getMatches( const ROMol &query, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads, int maxResults) { return getMatches(query, 0, mols->size(), recursionPossible, useChirality, useQueryQueryMatches, numThreads, maxResults); } std::vector SubstructLibrary::getMatches( const ROMol &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads, int maxResults) { return internalGetMatches(query, *mols, fps, startIdx, endIdx, recursionPossible, useChirality, useQueryQueryMatches, numThreads, maxResults); } unsigned int SubstructLibrary::countMatches(const ROMol &query, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads) { return countMatches(query, 0, mols->size(), recursionPossible, useChirality, useQueryQueryMatches, numThreads); } unsigned int SubstructLibrary::countMatches( const ROMol &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads) { return internalMatchCounter(query, *mols, fps, startIdx, endIdx, recursionPossible, useChirality, useQueryQueryMatches, numThreads); } bool SubstructLibrary::hasMatch(const ROMol &query, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads) { const int maxResults = 1; return getMatches(query, recursionPossible, useChirality, useQueryQueryMatches, numThreads, maxResults) .size() > 0; } bool SubstructLibrary::hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible, bool useChirality, bool useQueryQueryMatches, int numThreads) { const int maxResults = 1; return getMatches(query, startIdx, endIdx, recursionPossible, useChirality, useQueryQueryMatches, numThreads, maxResults) .size() > 0; } void SubstructLibrary::toStream(std::ostream &ss) const { #ifndef RDK_USE_BOOST_SERIALIZATION PRECONDITION(0, "Boost SERIALIZATION is not enabled") #else boost::archive::text_oarchive ar(ss); ar << *this; #endif } std::string SubstructLibrary::Serialize() const { std::stringstream ss; toStream(ss); return ss.str(); } void SubstructLibrary::initFromStream(std::istream &ss) { #ifndef RDK_USE_BOOST_SERIALIZATION PRECONDITION(0, "Boost SERIALIZATION is not enabled") #else boost::archive::text_iarchive ar(ss); ar >> *this; #endif } void SubstructLibrary::initFromString(const std::string &text) { std::stringstream ss(text); initFromStream(ss); } }