Files
rdkit/Code/GraphMol/SynthonSpaceSearch/Synthon.cpp
David Cosgrove b76122b510 Synthon fingerprint search (#8025)
* First pass at splitting molecule.

* Interim commit.  Reading libraries from file in original format.

* Basic search seems to be working.

* Pattern fingerprint screening.

* Connector region heuristic.

* Fixed triazole (aromatic/non-aromatic connectors).

* Fix search with non-split parent query, where query is substructure of a single reagent.

* Remove duplicate hits by reaction/reagents used.

* Implement largest fragment heuristic.

* Extra test files.

* Read/write binary file.
Program for conversion from text format to binary format.

* Remove empty reagent sets on reading, probably due to synthon number counting from 1 rather than 0.

* Tidy SSSearch functions.

* Stash pending major surgery for triazole bug.

* Revert to using unique_ptr.
Correct use of reagent order.

* Function to summarise Hyperspace.

* Delay building hits till end and put cutoff on number.

* Earlier bale-out in getHitReagents.

* Streamline checkConnectorRegions.

* Remove free functions for search.

* Correct name of Python test.

* First stage of Python wrappers.

* Rename namespace.

* Parameters object.

* Mysterious windows export thing.

* Fix bug - not matching number of connectors in fragment and synthon.

* Back like it was.  The connector count wasn't the problem.

* Put the substructure results into their own class.

* gcc 14 didn't like my use of std::reduce.
Update expected test results.

* Remove write statement.

* Tidy.

* Tidy.

* Enable random sample of hits.

* Test that complex SMARTS works.
Update Python wrappers.

* Rename Hyperspace to SynthonSpace.

* More renaming.
Python test.

* Enable Python test.
Remove write.

* Plug memory leak.

* Response to Greg's initial look.

* More response to Greg's initial look.

* get the windows DLL builds working

* Do away with mutable.
Purge a few more uses of reagent in favour of synthon.
Remove the c++ exe for converting text to binary databases.

* Better Synthon c'tor.

* More feedback from Greg.

* Tidy the Python wrapper.

* Remove tags from catch tests.

* Don't allow copying of SubstructureResults.

* Revert to allow copying of SubstructureResults.  The Python wrapper needs it.

* Refinements based on CLion/clangd suggestions.

* Allow for map numbers in connectors in space file.

* Refactor to make the searcher a separate class from the space.

* Transfer Greg's review suggestions from Hyperspace merge.

* First cut of fingerprint searcher.

* Python wrapper.
Some tidying.

* Better random selection.

* Fix bug in preparing frags for fingerprints.
Re-factor.

* Minor refactor.

* Sort hits by similarity if available.

* Option for a few different fingerprint types.  Pending a better solution.

* Write fingerprints to binary file.

* Use any fingerprint generator for similarity searching.  No Python wrapper yet.

* Python wrapper.

* Change random selection to use distribution weighted by number of hits in each reaction.

* Lots of suggestions from CLion/clang.

* Use boost discrete_distribution for cross-platform consistency.

* Tidy test up.

* Try boost rng as well.

* uniform_int_distribution to boost also.

* Small tidy.

* Method to write enumerated library.

* Windows export thing.

* Windows export thing.

* Allow for commas in tab-separated fields.

* win64 dll builds now work

* More aliphatic synthon, aromatic product joy.

* Force ring finding if it hasn't been done.

* Fingerprint hits not being sorted if maxHits reached.

* Remove debugging write.  Doh!

* Response to review of SynthonSpace2.

* Missed one.

* Add test file.

* Hand merge Greg's #8050.

* Discard nodiscard.

* Move include of export.h inside include guards.

* Response to review.

* Fix memory leaks.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
2024-11-29 13:20:15 +01:00

194 lines
5.9 KiB
C++

//
// Copyright (C) David Cosgrove 2024.
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <DataStructs/ExplicitBitVect.h>
#include <GraphMol/MolOps.h>
#include <GraphMol/MolPickler.h>
#include <GraphMol/Fingerprints/Fingerprints.h>
#include <GraphMol/SynthonSpaceSearch/Synthon.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
namespace RDKit::SynthonSpaceSearch {
// Build a Synthon from its SMILES string and identifier.
//
// The SMILES is parsed with sanitization switched off, and the resulting
// molecule is given the synthon ID as its _Name property.
//
// \param smi  SMILES string of the synthon
// \param id   identifier of the synthon
// \throws ValueErrorException if the SMILES cannot be parsed.
Synthon::Synthon(const std::string &smi, const std::string &id)
    : d_smiles(smi), d_id(id) {
  v2::SmilesParse::SmilesParserParams parserParams;
  parserParams.sanitize = false;
  dp_origMol = v2::SmilesParse::MolFromSmiles(d_smiles, parserParams);
  if (dp_origMol) {
    dp_origMol->setProp<std::string>(common_properties::_Name, d_id);
    return;
  }
  // An unparsable SMILES should be rare: whoever built the SynthonSpace
  // is assumed to know what they're doing, so the likeliest cause is a
  // corrupted or incorrect file.  Treat it as fatal.
  throw ValueErrorException("Unparsable synthon SMILES " + d_smiles +
                            " with ID " + d_id);
}
// Copy constructor.
//
// Makes deep copies of the original molecule, the search molecule, the
// pattern fingerprint and the connector regions, so the new object shares
// no state with `other`.  This mirrors what operator= does.
//
// Any of the owned pointers may be null in `other`: a Synthon built from
// SMILES, for example, has no search molecule or pattern fingerprint until
// setSearchMol() has been called.  Each pointer is therefore only
// dereferenced when it is set, and a null member is copied as null.
Synthon::Synthon(const Synthon &other)
    : d_smiles(other.d_smiles),
      d_id(other.d_id),
      dp_origMol(other.dp_origMol
                     ? std::make_unique<ROMol>(*other.dp_origMol)
                     : nullptr),
      dp_searchMol(other.dp_searchMol
                       ? std::make_unique<ROMol>(*other.dp_searchMol)
                       : nullptr),
      dp_pattFP(other.dp_pattFP
                    ? std::make_unique<ExplicitBitVect>(*other.dp_pattFP)
                    : nullptr) {
  d_connRegions.reserve(other.d_connRegions.size());
  for (const auto &region : other.d_connRegions) {
    d_connRegions.push_back(region ? std::make_shared<ROMol>(*region)
                                   : nullptr);
  }
}
// Copy assignment.
//
// Replaces the contents of this Synthon with deep copies of everything
// held by `other`.  Owned pointers that are null in `other` become null
// here.  Self-assignment is a no-op.
Synthon &Synthon::operator=(const Synthon &other) {
  if (this != &other) {
    d_smiles = other.d_smiles;
    d_id = other.d_id;

    dp_origMol =
        other.dp_origMol ? std::make_unique<ROMol>(*other.dp_origMol) : nullptr;
    dp_searchMol = other.dp_searchMol
                       ? std::make_unique<ROMol>(*other.dp_searchMol)
                       : nullptr;
    dp_pattFP = other.dp_pattFP
                    ? std::make_unique<ExplicitBitVect>(*other.dp_pattFP)
                    : nullptr;

    // Drop whatever regions we had, then clone the source's one by one.
    d_connRegions.clear();
    for (const std::shared_ptr<ROMol> &srcRegion : other.d_connRegions) {
      d_connRegions.push_back(std::make_shared<ROMol>(*srcRegion));
    }
  }
  return *this;
}
const std::unique_ptr<ROMol> &Synthon::getOrigMol() const { return dp_origMol; }
const std::unique_ptr<ROMol> &Synthon::getSearchMol() const {
return dp_searchMol;
}
const std::unique_ptr<ExplicitBitVect> &Synthon::getPattFP() const {
return dp_pattFP;
}
const std::vector<std::shared_ptr<ROMol>> &Synthon::getConnRegions() const {
return d_connRegions;
}
void Synthon::setSearchMol(std::unique_ptr<RWMol> mol) {
dp_searchMol = std::move(mol);
finishInitialization();
}
void Synthon::writeToDBStream(std::ostream &os) const {
streamWrite(os, d_smiles);
streamWrite(os, d_id);
MolPickler::pickleMol(*dp_origMol, os, PicklerOps::AllProps);
MolPickler::pickleMol(*dp_searchMol, os, PicklerOps::AllProps);
const auto pattFPstr = getPattFP()->toString();
streamWrite(os, pattFPstr);
streamWrite(os, getConnRegions().size());
for (const auto &cr : getConnRegions()) {
MolPickler::pickleMol(*cr, os, PicklerOps::AllProps);
}
}
// Populate this synthon from a binary stream.
//
// Fields are read in the order writeToDBStream() emits them: SMILES, ID,
// original molecule, search molecule, pattern fingerprint, connector
// region count, then that many connector region molecules.  Existing
// contents are overwritten as each field is read.
//
// \param is  stream to read from
void Synthon::readFromDBStream(std::istream &is) {
  streamRead(is, d_smiles, 0);
  streamRead(is, d_id, 0);

  // Each molecule is default-constructed first and then filled in place
  // from its pickle.
  dp_origMol = std::make_unique<ROMol>();
  MolPickler::molFromPickle(is, *dp_origMol);

  dp_searchMol = std::make_unique<ROMol>();
  MolPickler::molFromPickle(is, *dp_searchMol);

  std::string fpString;
  streamRead(is, fpString, 0);
  dp_pattFP = std::make_unique<ExplicitBitVect>(fpString);

  size_t regionCount = 0;
  streamRead(is, regionCount);
  d_connRegions.resize(regionCount);
  for (auto &region : d_connRegions) {
    region = std::make_shared<ROMol>();
    MolPickler::molFromPickle(is, *region);
  }
}
// Label every atom and bond of the original molecule with two integer
// properties: "molNum", set to the supplied value, and "idx", set to the
// atom's or bond's own index.  Does nothing if there is no original
// molecule.
//
// \param molNum  value stored in the "molNum" property
void Synthon::tagAtomsAndBonds(const int molNum) const {
  if (dp_origMol) {
    for (const auto atom : dp_origMol->atoms()) {
      atom->setProp<int>("molNum", molNum);
      atom->setProp<int>("idx", atom->getIdx());
    }
    for (const auto bond : dp_origMol->bonds()) {
      bond->setProp<int>("molNum", molNum);
      bond->setProp<int>("idx", bond->getIdx());
    }
  }
}
void Synthon::finishInitialization() {
dp_pattFP.reset(PatternFingerprintMol(*dp_searchMol, 2048));
d_connRegions.clear();
if (const auto cr = getConnRegion(*dp_searchMol); cr) {
std::vector<std::unique_ptr<ROMol>> tmpFrags;
MolOps::getMolFrags(*cr, tmpFrags, false);
for (auto &f : tmpFrags) {
d_connRegions.push_back(std::shared_ptr<ROMol>(f.release()));
}
}
}
std::unique_ptr<ROMol> getConnRegion(const ROMol &mol) {
boost::dynamic_bitset<> inFrag(mol.getNumAtoms());
for (const auto a : mol.atoms()) {
if (!a->getAtomicNum() && a->getIsotope()) {
inFrag[a->getIdx()] = true;
for (const auto &n1 : mol.atomNeighbors(a)) {
inFrag[n1->getIdx()] = true;
for (const auto &n2 : mol.atomNeighbors(n1)) {
inFrag[n2->getIdx()] = true;
for (const auto &n3 : mol.atomNeighbors(n2)) {
inFrag[n3->getIdx()] = true;
}
}
}
}
}
if (!inFrag.count()) {
return std::unique_ptr<RWMol>();
}
std::unique_ptr<RWMol> molCp(new RWMol(mol));
molCp->beginBatchEdit();
for (const auto aCp : molCp->atoms()) {
if (!inFrag[aCp->getIdx()]) {
molCp->removeAtom(aCp);
} else {
if (!aCp->getAtomicNum()) {
aCp->setIsotope(1);
if (aCp->hasQuery()) {
aCp->expandQuery(makeAtomIsotopeQuery(1), Queries::COMPOSITE_OR);
}
}
}
}
molCp->commitBatchEdit();
return molCp;
}
} // namespace RDKit::SynthonSpaceSearch