Files
rdkit/Code/GraphMol/SubstructLibrary/SubstructLibrary.h
Brian Kelley d8722b1c64 First entry of the SubstructLibrary module (#1493)
* First entry of the SubstructLibrary module

* Switch to shared_ptrs for memory safety

* run clang-format and add a test

* only build when threadsafe_sss is enabled, only test when test_multithreaded is enabled

* Adds PRECONDITION

* Adds exceptions for out of range errors

* Changes getQueryBits to makeQueryBits for clarity

* Adds API point to add Binary and Smiles directly

* Replaces 0 with NULL for clarity.

* Removes debugging code.

* Makes endIdx non-inclusive

* Adds substruct searching options

* Adds preconditions

* Small documentation cleanup and removes a line of code.

* Add DataStructs library dependency

* Actually uses passed in parameters

* Adds TrustedSmiles/Binary inputs

* Adds more api testing options/trusted smiles and binary inputs

* Finalizes API adds docs and doctests

* Minor copyright fixes and such
2017-07-19 10:39:42 +02:00

466 lines
17 KiB
C++

// Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RDKIT_SUBSTRUCT_LIBRARY
#define RDKIT_SUBSTRUCT_LIBRARY
#include <GraphMol/RDKitBase.h>
#include <GraphMol/MolPickler.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Fingerprints/Fingerprints.h>
#include <DataStructs/ExplicitBitVect.h>
#include <DataStructs/BitOps.h>
namespace RDKit {
//! Base class API for holding molecules for substructure search.
/*!
This is an API that hides the implementation details used for
indexing molecules for substructure searching. It simply
provides an API for adding and getting molecules from a set.
*/
class MolHolderBase {
 public:
  virtual ~MolHolderBase() {}

  //! Add a new molecule to the substructure search library.
  /*!
    \param m  the molecule to add
    \return the index of the molecule in the library
  */
  virtual unsigned int addMol(const ROMol &m) = 0;

  //! Get the molecule stored at position \c idx.
  /*!
    Implementations should throw IndexError when \c idx is out of range.
  */
  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const = 0;

  //! Get the current library size
  virtual unsigned int size() const = 0;
};
//! Concrete class that holds molecules in memory
/*!
This is currently one of the faster implementations.
However it is very memory intensive.
*/
class MolHolder : public MolHolderBase {
std::vector<boost::shared_ptr<ROMol> > mols;
public:
MolHolder() : MolHolderBase(), mols() {}
virtual unsigned int addMol(const ROMol &m) {
mols.push_back(boost::make_shared<ROMol>(m));
return size() - 1;
}
virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
if(idx >= mols.size())
throw IndexErrorException(idx);
return mols[idx];
}
virtual unsigned int size() const {
return rdcast<unsigned int>(mols.size());
}
};
//! Concrete class that holds binary cached molecules in memory
/*!
This implementation uses quite a bit less memory than the
non cached implementation. However, due to the reduced speed
it should be used in conjunction with a pattern fingerprinter.
See RDKit::FPHolder
*/
class CachedMolHolder : public MolHolderBase {
  // One pickled (binary) molecule per library entry.
  std::vector<std::string> mols;

 public:
  CachedMolHolder() : MolHolderBase(), mols() {}

  //! Pickles \c m, stores the pickle and returns the index it was stored at.
  /*!
    The pickle is built in a local string and only then moved into the
    library, so an exception raised while pickling leaves the holder
    unchanged instead of leaving an empty entry behind.
  */
  virtual unsigned int addMol(const ROMol &m) {
    std::string pickle;
    MolPickler::pickleMol(m, pickle);
    mols.push_back(std::string());
    mols.back().swap(pickle);  // hand over the buffer without copying it
    return size() - 1;
  }

  //! Adds a pickled binary molecule, no validity checking of the input
  //! is done.
  /*!
    \return the index the pickle was stored at
  */
  unsigned int addBinary(const std::string &pickle) {
    mols.push_back(pickle);
    return size() - 1;
  }

  //! Rebuilds the molecule stored at \c idx from its pickle.
  /*!
    A new molecule is created on every call.
    Throws IndexErrorException when \c idx is not a valid index.
  */
  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
    if (idx >= mols.size()) throw IndexErrorException(idx);
    boost::shared_ptr<ROMol> mol(new ROMol);
    MolPickler::molFromPickle(mols[idx], mol.get());
    return mol;
  }

  //! Number of pickles currently held.
  virtual unsigned int size() const {
    return rdcast<unsigned int>(mols.size());
  }
};
//! Concrete class that holds smiles strings in memory
/*!
This implementation uses quite a bit less memory than the
cached binary or uncached implementation. However, due to the
reduced speed it should be used in conjunction with a pattern
fingerprinter.
See RDKit::FPHolder
*/
class CachedSmilesMolHolder : public MolHolderBase {
  // One SMILES string per library entry.
  std::vector<std::string> mols;

 public:
  CachedSmilesMolHolder() : MolHolderBase(), mols() {}

  //! Stores the isomeric SMILES of \c m and returns the index it was
  //! stored at.
  virtual unsigned int addMol(const ROMol &m) {
    const bool doIsomericSmiles = true;
    const std::string smiles = MolToSmiles(m, doIsomericSmiles);
    mols.push_back(smiles);
    return size() - 1;
  }

  //! Add a smiles to the dataset, no validation is done
  //! to the inputs.
  /*!
    \return the index the smiles was stored at
  */
  unsigned int addSmiles(const std::string &smiles) {
    mols.push_back(smiles);
    return size() - 1;
  }

  //! Parses the SMILES stored at \c idx into a new molecule.
  /*!
    The returned pointer wraps whatever SmilesToMol produces for the string.
    Throws IndexErrorException when \c idx is not a valid index.
  */
  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
    if (idx < mols.size()) {
      return boost::shared_ptr<ROMol>(SmilesToMol(mols[idx]));
    }
    throw IndexErrorException(idx);
  }

  //! Number of SMILES strings currently held.
  virtual unsigned int size() const {
    return rdcast<unsigned int>(mols.size());
  }
};
//! Concrete class that holds trusted smiles strings in memory
/*!
A trusted smiles is essentially a smiles string that
RDKit has generated. This indicates that fewer
sanitization steps are required. See
http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
This implementation uses quite a bit less memory than the
cached binary or uncached implementation. However, due to the
reduced speed it should be used in conjunction with a pattern
fingerprinter.
See RDKit::FPHolder
*/
class CachedTrustedSmilesMolHolder : public MolHolderBase {
  // One SMILES string per library entry.
  std::vector<std::string> mols;

 public:
  CachedTrustedSmilesMolHolder() : MolHolderBase(), mols() {}

  //! Stores the isomeric SMILES of \c m and returns the index it was
  //! stored at.
  virtual unsigned int addMol(const ROMol &m) {
    bool doIsomericSmiles = true;
    mols.push_back(MolToSmiles(m, doIsomericSmiles));
    return size() - 1;
  }

  //! Add a smiles to the dataset, no validation is done
  //! to the inputs.
  /*!
    \return the index the smiles was stored at
  */
  unsigned int addSmiles(const std::string &smiles) {
    mols.push_back(smiles);
    return size() - 1;
  }

  //! Parses the SMILES stored at \c idx without sanitization and updates
  //! the property cache of the result.
  /*!
    Since addSmiles() does not validate its input, the stored string may be
    unparsable; if SmilesToMol hands back a NULL molecule, an empty pointer
    is returned instead of dereferencing it.
    Throws IndexErrorException when \c idx is not a valid index.
  */
  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
    if (idx >= mols.size()) throw IndexErrorException(idx);
    // Take ownership immediately so the molecule is released even if
    // updatePropertyCache() throws.
    boost::shared_ptr<RWMol> m(SmilesToMol(mols[idx], 0, false));
    if (m.get()) {
      m->updatePropertyCache();
    }
    return boost::shared_ptr<ROMol>(m);
  }

  //! Number of SMILES strings currently held.
  virtual unsigned int size() const {
    return rdcast<unsigned int>(mols.size());
  }
};
//! Base FPI for the fingerprinter used to rule out impossible matches
class FPHolderBase {
std::vector<ExplicitBitVect *> fps;
public:
virtual ~FPHolderBase() {
for (size_t i = 0; i < fps.size(); ++i) delete fps[i];
}
//! Adds a molecule to the fingerprinter
unsigned int addMol( const ROMol &m) {
fps.push_back(makeFingerprint(m));
return rdcast<unsigned int>(fps.size() - 1);
}
//! Adds a raw bit vector to the fingerprinter
unsigned int addFingerprint( const ExplicitBitVect &v ) {
fps.push_back( new ExplicitBitVect(v) );
return rdcast<unsigned int>(fps.size() - 1);
}
//! Return false if a substructure search can never match the molecule
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
if(idx >= fps.size())
throw IndexErrorException(idx);
return AllProbeBitsMatch(query, *fps[idx]);
}
//! Get the bit vector at the specified index (throws IndexError if out of range)
const ExplicitBitVect &getFingerprint(unsigned int idx) const {
if(idx >= fps.size())
throw IndexErrorException(idx);
return *fps[idx];
}
//! make the query vector
//! Caller owns the vector!
virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
};
//! Uses the pattern fingerprinter to rule out matches
class PatternHolder : public FPHolderBase {
public:
//! Caller owns the vector!
virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const {
return PatternFingerprintMol(m, 2048);
}
};
//! Substructure search a library of molecules
/*! This class allows for multithreaded substructure searches of
large datasets.
The implementations can use fingerprints to speed up searches
and have molecules cached as binary forms to reduce memory
usage.
*/
class SubstructLibrary {
  boost::shared_ptr<MolHolderBase> molholder;
  boost::shared_ptr<FPHolderBase> fpholder;
  // Raw aliases of molholder.get() / fpholder.get(); fps is NULL when the
  // library was built without fingerprints.
  MolHolderBase *mols;  // used for a small optimization
  FPHolderBase *fps;

 public:
  //! Creates a library backed by an in-memory MolHolder and no fingerprints
  SubstructLibrary()
      : molholder(new MolHolder),
        fpholder(),
        mols(molholder.get()),
        fps(NULL) {}

  //! Creates a library on top of the given molecule holder, without
  //! fingerprints
  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
      : molholder(molecules), fpholder(), mols(molholder.get()), fps(NULL) {}

  //! Creates a library on top of the given molecule and fingerprint holders
  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
                   boost::shared_ptr<FPHolderBase> fingerprints)
      : molholder(molecules),
        fpholder(fingerprints),
        mols(molholder.get()),
        fps(fpholder.get()) {}

  //! Get the underlying molecule holder implementation
  MolHolderBase &getMolHolder() {
    PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
    return *mols;
  }

  //! Get the underlying molecule holder implementation (read-only)
  const MolHolderBase &getMolecules() const {
    PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
    return *mols;
  }

  //! Get the underlying fingerprint implementation.
  /*! Throws a value error if no fingerprints have been set */
  FPHolderBase &getFingerprints() {
    if (!fps)
      throw ValueErrorException("Substruct Library does not have fingerprints");
    return *fps;
  }

  //! Get the underlying fingerprint implementation (read-only).
  /*! Throws a value error if no fingerprints have been set */
  const FPHolderBase &getFingerprints() const {
    if (!fps)
      throw ValueErrorException("Substruct Library does not have fingerprints");
    return *fps;
  }

  //! Add a molecule to the library
  /*!
    \param mol Molecule to add

    returns index for the molecule in the library
  */
  unsigned int addMol(const ROMol &mol);

  //! Get the matching indices for the query
  /*!
    \param query       Query to match against molecules
    \param recursionPossible  flags whether or not recursive matches are
                       allowed [ default true ]
    \param useChirality  use atomic CIP codes as part of the comparison
                       [ default true ]
    \param useQueryQueryMatches  if set, the contents of atom and bond
                       queries will be used as part of the matching
                       [ default false ]
    \param numThreads  If -1 use all available processors [default -1]
    \param maxResults  Maximum results to return, -1 means return all
                       [default -1]
  */
  std::vector<unsigned int> getMatches(const ROMol &query,
                                       bool recursionPossible = true,
                                       bool useChirality = true,
                                       bool useQueryQueryMatches = false,
                                       int numThreads = -1,
                                       int maxResults = -1);

  //! Get the matching indices for the query between the given indices
  /*!
    \param query       Query to match against molecules
    \param startIdx    Start index of the search
    \param endIdx      Ending idx (non-inclusive) of the search.
    \param recursionPossible  flags whether or not recursive matches are
                       allowed [ default true ]
    \param useChirality  use atomic CIP codes as part of the comparison
                       [ default true ]
    \param useQueryQueryMatches  if set, the contents of atom and bond
                       queries will be used as part of the matching
                       [ default false ]
    \param numThreads  If -1 use all available processors [default -1]
    \param maxResults  Maximum results to return, -1 means return all
                       [default -1]
  */
  std::vector<unsigned int> getMatches(const ROMol &query,
                                       unsigned int startIdx,
                                       unsigned int endIdx,
                                       bool recursionPossible = true,
                                       bool useChirality = true,
                                       bool useQueryQueryMatches = false,
                                       int numThreads = -1,
                                       int maxResults = -1);

  //! Return the number of matches for the query
  /*!
    \param query       Query to match against molecules
    \param recursionPossible  flags whether or not recursive matches are
                       allowed [ default true ]
    \param useChirality  use atomic CIP codes as part of the comparison
                       [ default true ]
    \param useQueryQueryMatches  if set, the contents of atom and bond
                       queries will be used as part of the matching
                       [ default false ]
    \param numThreads  If -1 use all available processors [default -1]
  */
  unsigned int countMatches(const ROMol &query, bool recursionPossible = true,
                            bool useChirality = true,
                            bool useQueryQueryMatches = false,
                            int numThreads = -1);

  //! Return the number of matches for the query between the given indices
  /*!
    \param query       Query to match against molecules
    \param startIdx    Start index of the search
    \param endIdx      Ending idx (non-inclusive) of the search.
    \param recursionPossible  flags whether or not recursive matches are
                       allowed [ default true ]
    \param useChirality  use atomic CIP codes as part of the comparison
                       [ default true ]
    \param useQueryQueryMatches  if set, the contents of atom and bond
                       queries will be used as part of the matching
                       [ default false ]
    \param numThreads  If -1 use all available processors [default -1]
  */
  unsigned int countMatches(const ROMol &query, unsigned int startIdx,
                            unsigned int endIdx, bool recursionPossible = true,
                            bool useChirality = true,
                            bool useQueryQueryMatches = false,
                            int numThreads = -1);

  //! Returns true if any match exists for the query
  /*!
    \param query       Query to match against molecules
    \param recursionPossible  flags whether or not recursive matches are
                       allowed [ default true ]
    \param useChirality  use atomic CIP codes as part of the comparison
                       [ default true ]
    \param useQueryQueryMatches  if set, the contents of atom and bond
                       queries will be used as part of the matching
                       [ default false ]
    \param numThreads  If -1 use all available processors [default -1]
  */
  bool hasMatch(const ROMol &query, bool recursionPossible = true,
                bool useChirality = true, bool useQueryQueryMatches = false,
                int numThreads = -1);

  //! Returns true if any match exists for the query between the specified
  //! indices
  /*!
    \param query       Query to match against molecules
    \param startIdx    Start index of the search
    \param endIdx      Ending idx (non-inclusive) of the search.
    \param recursionPossible  flags whether or not recursive matches are
                       allowed [ default true ]
    \param useChirality  use atomic CIP codes as part of the comparison
                       [ default true ]
    \param useQueryQueryMatches  if set, the contents of atom and bond
                       queries will be used as part of the matching
                       [ default false ]
    \param numThreads  If -1 use all available processors [default -1]
  */
  bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx,
                bool recursionPossible = true, bool useChirality = true,
                bool useQueryQueryMatches = false, int numThreads = -1);

  //! Returns the molecule at the given index
  /*!
    \param idx       Index of the molecule in the library
  */
  boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
    // expects implementation to throw IndexError if out of range
    PRECONDITION(mols, "molholder is null in SubstructLibrary");
    return mols->getMol(idx);
  }

  //! Returns the molecule at the given index
  /*!
    \param idx       Index of the molecule in the library

    Const-qualified (like getMol()) so that const libraries can be indexed.
  */
  boost::shared_ptr<ROMol> operator[](unsigned int idx) const {
    // expects implementation to throw IndexError if out of range
    PRECONDITION(mols, "molholder is null in SubstructLibrary");
    return mols->getMol(idx);
  }

  //! return the number of molecules in the library
  unsigned int size() const {
    PRECONDITION(mols, "molholder is null in SubstructLibrary");
    // use the same pointer that the precondition just validated
    return mols->size();
  }
};
}
#endif