RASCAL MCES (#6568)

This commit is contained in:
David Cosgrove
2023-08-27 12:51:49 +01:00
committed by GitHub
parent 9184a143d8
commit 2dd9c5f3cd
28 changed files with 5815 additions and 3 deletions

View File

@@ -87,6 +87,7 @@ add_subdirectory(MolDraw2D)
add_subdirectory(FMCS)
add_subdirectory(MolHash)
add_subdirectory(MMPA)
add_subdirectory(RascalMCES)
add_subdirectory(CIPLabeler)
add_subdirectory(Deprotect)
@@ -193,6 +194,6 @@ rdkit_catch_test(queryTestsCatch catch_queries.cpp
rdkit_catch_test(molbundleTestsCatch catch_molbundle.cpp
LINK_LIBRARIES SmilesParse GraphMol)
rdkit_catch_test(pickleTestsCatch catch_pickles.cpp
LINK_LIBRARIES FileParsers SmilesParse GraphMol)

View File

@@ -0,0 +1,16 @@
# Build script for the RascalMCES library, its tests and (optionally) its
# Python wrappers.

# The library itself: the MCES search, both clustering front ends and their
# supporting sources.
rdkit_library(RascalMCES
RascalMCES.cpp RascalCluster.cpp RascalButinaCluster.cpp
lap_a_la_scipy.cpp PartitionSet.cpp RascalResult.cpp
LINK_LIBRARIES SmilesParse FileParsers ChemTransforms SubstructMatch GraphMol)
# PRIVATE: the macro is defined only while compiling this library's own
# sources (presumably it switches RDKIT_RASCALMCES_EXPORT to "export" -
# confirm against RDGeneral/export.h).
target_compile_definitions(RascalMCES PRIVATE RDKIT_RASCALMCES_BUILD)
# Public headers, installed under GraphMol/RascalMCES.
# NOTE(review): RascalDetails.h and PartitionSet.h are not listed here -
# confirm that they are meant to stay uninstalled.
rdkit_headers(RascalMCES.h RascalOptions.h RascalClusterOptions.h RascalResult.h
DEST GraphMol/RascalMCES)
# Catch-based unit tests for the MCES search and for the clustering code.
rdkit_catch_test(testRascalMCES mces_catch.cpp LINK_LIBRARIES RascalMCES)
rdkit_catch_test(testRascalCluster mces_cluster_catch.cpp LINK_LIBRARIES RascalMCES)
# The Python wrappers live in Wrap/ and are only built on request.
if (RDK_BUILD_PYTHON_WRAPPERS)
add_subdirectory(Wrap)
endif ()

View File

@@ -0,0 +1,220 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <algorithm>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include "PartitionSet.h"
namespace RDKit {
namespace RascalMCES {
// Build the initial partition set used by the clique search.
//
// modProd    : adjacency of the modular product graph, one bitset per vertex.
// vtxPairs   : for each modular product vertex, the pair of line graph
//              vertices (first graph, second graph) that it stands for.
// vtx1Labels : labels of the vertices of the first line graph.
// vtx2Labels : labels of the vertices of the second line graph.
// lowerBound : index of the last partition that counts as being "below Pex"
//              (Raymond's term in the paper).
//
// Copies of all four containers are kept behind shared pointers.
PartitionSet::PartitionSet(const std::vector<boost::dynamic_bitset<>> &modProd,
                           const std::vector<std::pair<int, int>> &vtxPairs,
                           const std::vector<unsigned int> &vtx1Labels,
                           const std::vector<unsigned int> &vtx2Labels,
                           unsigned int lowerBound)
    : d_ModProd(
          std::make_shared<std::vector<boost::dynamic_bitset<>>>(modProd)),
      d_VtxPairs(std::make_shared<std::vector<std::pair<int, int>>>(vtxPairs)),
      d_vtx1Labels(std::make_shared<std::vector<unsigned int>>(vtx1Labels)),
      d_vtx2Labels(std::make_shared<std::vector<unsigned int>>(vtx2Labels)) {
  d_vtx1Counts.assign(d_vtx1Labels->size(), 0);
  d_vtx2Counts.assign(d_vtx2Labels->size(), 0);

  // A vertex of one line graph can match only one vertex of the other, so
  // the starting partitions group together every pair that shares the same
  // first-graph vertex.  A new partition is opened each time the first
  // member of the pair changes.
  int currentFirst = -1;
  for (size_t pairIdx = 0; pairIdx < vtxPairs.size(); ++pairIdx) {
    const auto &pr = vtxPairs[pairIdx];
    if (pr.first != currentFirst) {
      d_parts.emplace_back();
      currentFirst = pr.first;
    }
    d_parts.back().push_back(pairIdx);
    ++d_vtx1Counts[pr.first];
    ++d_vtx2Counts[pr.second];
  }
  if (d_parts.empty()) {
    return;
  }

  // Order the partitions by size so that the ones above lowerBound are the
  // first-graph vertices with the fewest possible partners.  This ordering
  // has a dramatic effect on speed, and is probably what Raymond means by
  // "Perform an initial partitioning of the vertices... using the labeled
  // edge projection procedure."
  sortPartitions();

  // True if vtx is connected in the modular product to any member of part.
  const auto touchesPartition = [&modProd](
                                    unsigned int vtx,
                                    const std::vector<unsigned int> &part) {
    return std::any_of(
        part.begin(), part.end(),
        [&modProd, vtx](unsigned int member) { return modProd[vtx][member]; });
  };

  // Push vertices from above Pex down into the first partition at or below
  // it that holds nothing they are connected to.  This also helps a large
  // clique to be found early.
  for (size_t upper = d_parts.size() - 1; upper > lowerBound; --upper) {
    std::vector<unsigned int> kept;
    for (const auto vtx : d_parts[upper]) {
      bool moved = false;
      for (size_t lower = 0; lower <= lowerBound && !moved; ++lower) {
        if (!touchesPartition(vtx, d_parts[lower])) {
          d_parts[lower].push_back(vtx);
          moved = true;
        }
      }
      if (!moved) {
        kept.push_back(vtx);
      }
    }
    d_parts[upper].swap(kept);
  }

  // Drop the partitions that were emptied by the reassignment.
  const auto isEmptyPart = [](const std::vector<unsigned int> &part) {
    return part.empty();
  };
  d_parts.erase(std::remove_if(d_parts.begin(), d_parts.end(), isEmptyPart),
                d_parts.end());

  // Re-sort so that the large partitions are dealt with as late as possible,
  // then gather the per-label counts needed by upperBound().
  sortPartitions();
  calcVtxTypeCounts();
}
// Upper bound on the size of the clique still obtainable from the current
// partitions: for each label, the smaller of the number of first-graph and
// second-graph vertices of that label that are still present.
int PartitionSet::upperBound() {
  int bound = 0;
  auto count2 = d_vtx2TypeCounts.begin();
  for (auto count1 = d_vtx1TypeCounts.begin();
       count1 != d_vtx1TypeCounts.end(); ++count1, ++count2) {
    bound += std::min(*count1, *count2);
  }
  return bound;
}
// Remove the last vertex of the last partition and return it, discarding the
// partition if that leaves it empty and updating the vertex counts.
// Throws std::runtime_error if there are no partitions left.
unsigned int PartitionSet::popLastVertex() {
  if (d_parts.empty()) {
    throw std::runtime_error("PartitionSet set is empty.");
  }
  auto &lastPart = d_parts.back();
  const unsigned int vtx = lastPart.back();
  lastPart.pop_back();
  if (lastPart.empty()) {
    // lastPart is not touched again after this.
    d_parts.pop_back();
  }
  decrementVertexCounts(vtx);
  return vtx;
}
// Remove from every partition the vertices that are not connected to vtx_num
// in the modular product, then drop emptied partitions and re-sort.
void PartitionSet::pruneVertices(unsigned int vtx_num) {
  const auto &adjacency = *d_ModProd;
  for (auto &part : d_parts) {
    for (size_t pos = 0; pos < part.size();) {
      const auto member = part[pos];
      if (adjacency[member][vtx_num]) {
        ++pos;
        continue;
      }
      decrementVertexCounts(member);
      // Overwrite with the last member and shrink; pos is deliberately not
      // advanced so that the swapped-in member is examined next.
      part[pos] = part.back();
      part.pop_back();
    }
  }
  const auto isEmptyPart = [](const std::vector<unsigned int> &part) {
    return part.empty();
  };
  d_parts.erase(std::remove_if(d_parts.begin(), d_parts.end(), isEmptyPart),
                d_parts.end());
  sortPartitions();
}
// Sort the partitions into descending order of size.
//
// std::sort leaves the relative order of equal elements undefined, and
// different compilers do it differently.  Where more than one MCES is
// possible that would change the partition order, hence the search tree
// traversal and which of the equivalent answers is returned.  To keep the
// results consistent, partitions of equal size are ordered by their first
// member.  The extra cost is small on average.
void PartitionSet::sortPartitions() {
  const auto biggerFirst = [](const std::vector<unsigned int> &lhs,
                              const std::vector<unsigned int> &rhs) {
    if (lhs.size() != rhs.size()) {
      return lhs.size() > rhs.size();
    }
    if (lhs.empty()) {
      // two empty partitions are equivalent
      return false;
    }
    return lhs.front() < rhs.front();
  };
  std::sort(d_parts.begin(), d_parts.end(), biggerFirst);
}
// Fill d_vtx1TypeCounts and d_vtx2TypeCounts: for each label, the number of
// line graph vertices carrying that label that currently appear in at least
// one partition.  Both vectors are sized to the largest label seen in either
// graph, so they can be compared element by element.
// Both label vectors must be non-empty.
void PartitionSet::calcVtxTypeCounts() {
  const unsigned int largestLabel =
      std::max(*std::max_element(d_vtx1Labels->begin(), d_vtx1Labels->end()),
               *std::max_element(d_vtx2Labels->begin(), d_vtx2Labels->end()));

  const auto tally = [largestLabel](const std::vector<int> &counts,
                                    const std::vector<unsigned int> &labels,
                                    std::vector<int> &typeCounts) {
    typeCounts.assign(largestLabel + 1, 0);
    for (size_t vtx = 0; vtx < counts.size(); ++vtx) {
      if (counts[vtx] != 0) {
        ++typeCounts[labels[vtx]];
      }
    }
  };
  tally(d_vtx1Counts, *d_vtx1Labels, d_vtx1TypeCounts);
  tally(d_vtx2Counts, *d_vtx2Labels, d_vtx2TypeCounts);
}
// Record that modular product vertex vtxNum has left the partitions: lower
// the usage count of each of its two line graph vertices and, when one of
// those counts reaches zero, lower the count for that vertex's label too.
void PartitionSet::decrementVertexCounts(int vtxNum) {
  const auto &pr = (*d_VtxPairs)[vtxNum];
  if (--d_vtx1Counts[pr.first] == 0) {
    --d_vtx1TypeCounts[(*d_vtx1Labels)[pr.first]];
  }
  if (--d_vtx2Counts[pr.second] == 0) {
    --d_vtx2TypeCounts[(*d_vtx2Labels)[pr.second]];
  }
}
std::ostream &operator<<(std::ostream &os, const PartitionSet &pt) {
for (size_t i = 0; i < pt.d_parts.size(); ++i) {
os << i << " :: " << pt.d_parts[i].size() << " ::";
for (auto &mem : pt.d_parts[i]) {
os << " " << mem << " (" << (*pt.d_VtxPairs)[mem].first << ","
<< (*pt.d_VtxPairs)[mem].second << ")";
}
os << std::endl;
}
os << "vtx1_counts :";
for (auto vc : pt.d_vtx1Counts) {
os << " " << vc;
}
os << std::endl;
os << "vtx2_counts :";
for (auto vc : pt.d_vtx2Counts) {
os << " " << vc;
}
os << std::endl;
return os;
}
} // namespace RascalMCES
} // namespace RDKit

View File

@@ -0,0 +1,73 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#ifndef RASCALMCES_PARTITION_SET_H
#define RASCALMCES_PARTITION_SET_H
#include <map>
#include <vector>
#include <boost/dynamic_bitset.hpp>
namespace RDKit {
namespace RascalMCES {
// The set of partitions of the modular product graph vertices that drives
// the clique search.  Each partition is a list of modular product vertex
// numbers; the class also tracks how many times each line graph vertex, and
// each vertex label, is still represented in the partitions so that an upper
// bound on the obtainable clique size can be computed.
class PartitionSet {
public:
// Make a partition set from the modular product and the labels
// of the vertices of the two line graphs. Each element in vtxPairs
// has a row/column in modProd and holds the (first graph, second graph)
// vertex numbers it stands for. The partitions are sorted
// into descending order of sizes. Vertices in partitions above index
// lowerBound are moved into partitions at or below it where possible.
PartitionSet(const std::vector<boost::dynamic_bitset<>> &modProd,
const std::vector<std::pair<int, int>> &vtxPairs,
const std::vector<unsigned int> &vtx1Labels,
const std::vector<unsigned int> &vtx2Labels,
unsigned int lowerBound);
// True if there are no partitions left.
bool isEmpty() const { return d_parts.empty(); }
// The number of partitions currently held.
size_t numParts() const { return d_parts.size(); }
// Compute the upper bound on the clique that can be extracted from
// the current partition.
int upperBound();
// Writes the partitions and the per-vertex counts to the stream.
friend std::ostream &operator<<(std::ostream &os, const PartitionSet &pt);
// removes the last element of the last partition and returns
// its value. Throws a runtime_error if empty.
unsigned int popLastVertex();
// remove from the partitions any vertex not connected to the given
// vertex
void pruneVertices(unsigned int vtx_num);
private:
// Copies of the constructor arguments, held behind shared pointers.
// NOTE(review): presumably shared so that copying a PartitionSet does not
// duplicate them - confirm against the callers.
std::shared_ptr<const std::vector<boost::dynamic_bitset<>>> d_ModProd;
std::shared_ptr<const std::vector<std::pair<int, int>>> d_VtxPairs;
std::shared_ptr<const std::vector<unsigned int>> d_vtx1Labels;
std::shared_ptr<const std::vector<unsigned int>> d_vtx2Labels;
// The partitions themselves: each inner vector holds indices into
// d_VtxPairs (equivalently, rows of d_ModProd).
std::vector<std::vector<unsigned int>> d_parts;
// counts of the number of times each vertex appears in the partitions
std::vector<int> d_vtx1Counts, d_vtx2Counts;
// counts of the number of vertices of each label (d_vtx1Labels /
// d_vtx2Labels) that still appear in the partitions
std::vector<int> d_vtx1TypeCounts, d_vtx2TypeCounts;
// Sorts d_parts into descending size, ties broken on the first member.
void sortPartitions();
// Builds d_vtx1TypeCounts and d_vtx2TypeCounts from the vertex counts.
void calcVtxTypeCounts();
// Updates the vertex and label counts for a vertex that has just been
// removed from the partitions.
void decrementVertexCounts(int vtxNum);
};
} // namespace RascalMCES
} // namespace RDKit
#endif // RASCALMCES_PARTITION_SET_H

View File

@@ -0,0 +1,118 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
// This file contains an implementation of Butina clustering
// (Butina JCICS 39 747-750 (1999)) using the RascalMCES
// Johnson similarity metric. It is largely a transliteration
// of $RDBASE/rdkit/ML/Cluster/Butina.py.
#include <algorithm>
#include <iterator>
#include <vector>
#include <set>
#include <GraphMol/ROMol.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalDetails.h>
namespace RDKit {
namespace RascalMCES {
namespace details {
// Turn the proximity graph into one neighbour list per molecule.  Each list
// starts with the molecule's own number, followed by the molecules it has a
// stored MCES result with, in descending order of similarity.  The lists are
// returned largest first, with lists of equal size in descending
// lexicographic order.
std::vector<std::vector<unsigned int>> buildNborLists(
    const std::vector<std::vector<ClusNode>> &proxGraph) {
  std::vector<std::vector<unsigned int>> nborLists;
  for (size_t molNum = 0; molNum < proxGraph.size(); ++molNum) {
    // (neighbour number, similarity) for every edge of this molecule
    std::vector<std::pair<unsigned int, double>> simNbors;
    for (const auto &node : proxGraph[molNum]) {
      if (!node.d_res) {
        continue;
      }
      // The edge may be stored either way round; take the other end.
      const auto other =
          molNum == node.d_mol1Num ? node.d_mol2Num : node.d_mol1Num;
      simNbors.push_back({other, node.d_sim});
    }
    std::sort(simNbors.begin(), simNbors.end(),
              [](const std::pair<unsigned int, double> &lhs,
                 const std::pair<unsigned int, double> &rhs) -> bool {
                return lhs.second > rhs.second;
              });

    std::vector<unsigned int> nborList;
    nborList.reserve(simNbors.size() + 1);
    nborList.push_back(molNum);
    for (const auto &simNbor : simNbors) {
      nborList.push_back(simNbor.first);
    }
    nborLists.push_back(nborList);
  }

  std::sort(nborLists.begin(), nborLists.end(),
            [](const std::vector<unsigned int> &lhs,
               const std::vector<unsigned int> &rhs) -> bool {
              return lhs.size() != rhs.size() ? lhs.size() > rhs.size()
                                              : lhs > rhs;
            });
  return nborLists;
}
// Form the clusters from the neighbour lists.  Repeatedly, the first (i.e.
// largest) list becomes a cluster, its members are removed from every
// remaining list, emptied lists are dropped and the survivors are re-sorted
// largest first (ties in descending lexicographic order).
//
// Members are removed by set membership rather than by overwriting them with
// a sentinel value, so every unsigned int is a valid molecule number.
//
// This function destroys nborLists: it is empty on return.
std::vector<std::vector<unsigned int>> formClusters(
    std::vector<std::vector<unsigned int>> &nborLists) {
  std::vector<std::vector<unsigned int>> clusters;
  while (!nborLists.empty()) {
    clusters.push_back(nborLists.front());
    const std::set<unsigned int> clustered(clusters.back().begin(),
                                           clusters.back().end());
    // This empties the front list as well, since all of its members have
    // just been clustered.
    for (auto &nborList : nborLists) {
      nborList.erase(std::remove_if(nborList.begin(), nborList.end(),
                                    [&clustered](unsigned int n) -> bool {
                                      return clustered.count(n) != 0;
                                    }),
                     nborList.end());
    }
    nborLists.erase(
        std::remove_if(nborLists.begin(), nborLists.end(),
                       [](const std::vector<unsigned int> &nl) -> bool {
                         return nl.empty();
                       }),
        nborLists.end());
    std::sort(nborLists.begin(), nborLists.end(),
              [](const std::vector<unsigned int> &nl1,
                 const std::vector<unsigned int> &nl2) -> bool {
                if (nl1.size() == nl2.size()) {
                  return nl1 > nl2;
                }
                return nl1.size() > nl2.size();
              });
  }
  return clusters;
}
} // namespace details
// Butina clustering of the molecules: build the proximity graph, convert it
// to per-molecule neighbour lists and peel the clusters off those lists.
// Returns the clusters as vectors of indices into mols.
std::vector<std::vector<unsigned int>> rascalButinaCluster(
    const std::vector<std::shared_ptr<ROMol>> &mols,
    const RascalClusterOptions &clusOpts) {
  auto nborLists =
      details::buildNborLists(details::buildProximityGraph(mols, clusOpts));
  // formClusters consumes nborLists.
  return details::formClusters(nborLists);
}
} // namespace RascalMCES
} // namespace RDKit

View File

@@ -0,0 +1,382 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
// This file contains an implementation of the clustering algorithm
// described in
// 'A Line Graph Algorithm for Clustering Chemical Structures Based
// on Common Substructural Cores', JW Raymond, PW Willett.
// https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf
// https://eprints.whiterose.ac.uk/77598/
// It uses the RASCAL MCES algorithm to perform a fuzzy clustering
// of a set of molecules.
#include <algorithm>
#include <iterator>
#include <list>
#include <thread>
#include <vector>
#include <RDGeneral/RDThreads.h>
#include <GraphMol/ROMol.h>
#include <GraphMol/MolOps.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalDetails.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalResult.h>
namespace RDKit {
namespace RascalMCES {
namespace details {
// Compute the proximity graph entry for one pair of molecules.  toDo holds
// the two molecule numbers, the molecule list, the MCES options and the
// clustering options.  The similarity is 0.0 if rascalMCES returns nothing or
// a result with no bond matches.  Otherwise the best result has its small
// fragments trimmed and is cut down to the largest maxNumFrags fragments, and
// is kept in the node only if its similarity still reaches the threshold.
ClusNode calcMolMolSimilarity(
    const std::tuple<
        size_t, size_t, const std::vector<std::shared_ptr<ROMol>> *,
        const RascalOptions *, const RascalClusterOptions *> &toDo) {
  const auto &[mol1Num, mol2Num, mols, opts, clusOpts] = toDo;
  auto res = rascalMCES(*(*mols)[mol1Num], *(*mols)[mol2Num], *opts);

  ClusNode cn;
  cn.d_mol1Num = mol1Num;
  cn.d_mol2Num = mol2Num;
  cn.d_sim = 0.0;
  // An empty result set means tier1Sim and tier2Sim were above the
  // threshold, but no MCES was found.
  if (res.empty() || res.front().getBondMatches().empty()) {
    return cn;
  }

  auto &best = res.front();
  best.trimSmallFrags();
  best.largestFragsOnly(clusOpts->maxNumFrags);
  cn.d_sim = best.getSimilarity();
  if (cn.d_sim >= opts->similarityThreshold) {
    cn.d_res = std::make_shared<RascalResult>(best);
  }
  return cn;
}
// Build the proximity graph for the molecules as a square, symmetric matrix
// of ClusNodes: entry [i][j] describes the comparison of molecules i and j.
// Returns an empty matrix if fewer than 2 molecules are given.
// The similarity threshold for the MCES calculations is taken from
// clusOpts.similarityCutoff; all other RascalOptions are left at their
// defaults.
std::vector<std::vector<ClusNode>> buildProximityGraph(
const std::vector<std::shared_ptr<ROMol>> &mols,
const RascalClusterOptions &clusOpts) {
if (mols.size() < 2) {
return std::vector<std::vector<ClusNode>>();
}
// Start with every entry a default node, i.e. with a null d_res.
std::vector<std::vector<ClusNode>> proxGraph =
std::vector<std::vector<ClusNode>>(
mols.size(), std::vector<ClusNode>(mols.size(), ClusNode()));
// One work item per unordered pair of molecules (i < j).  Each item carries
// pointers to the shared inputs so that it can be handed to
// calcMolMolSimilarity as a single argument.
std::vector<
std::tuple<size_t, size_t, const std::vector<std::shared_ptr<ROMol>> *,
const RascalOptions *, const RascalClusterOptions *>>
toDo;
RascalOptions opts;
opts.similarityThreshold = clusOpts.similarityCutoff;
for (size_t i = 0; i < mols.size() - 1; ++i) {
for (size_t j = i + 1; j < mols.size(); ++j) {
toDo.push_back({i, j, &mols, &opts, &clusOpts});
}
}
// Processes work items [start, finish), clamped to the size of toDo, writing
// each result to the same index in molSims.  Different index ranges
// therefore write to different elements.
auto buildProxGraphPart =
[](const std::vector<std::tuple<
size_t, size_t, const std::vector<std::shared_ptr<ROMol>> *,
const RascalOptions *, const RascalClusterOptions *>> &toDo,
std::vector<ClusNode> &molSims, size_t start, size_t finish) -> void {
if (start > toDo.size()) {
return;
}
if (finish > toDo.size()) {
finish = toDo.size();
}
std::transform(toDo.begin() + start, toDo.begin() + finish,
molSims.begin() + start, calcMolMolSimilarity);
};
std::vector<ClusNode> molSims(toDo.size());
#if RDK_BUILD_THREADSAFE_SSS
auto numThreads = getNumThreadsToUse(clusOpts.numThreads);
if (numThreads > 1) {
// Give each thread a contiguous slice of the work.  The "1 +" makes the
// slices together cover at least the whole list; trailing threads may get
// a short or empty slice.
// NOTE(review): an exception escaping calcMolMolSimilarity inside a worker
// thread is not caught here - confirm that this is acceptable.
size_t eachThread = 1 + (toDo.size() / numThreads);
size_t start = 0;
std::vector<std::thread> threads;
for (unsigned int i = 0U; i < numThreads; ++i, start += eachThread) {
threads.push_back(std::thread(buildProxGraphPart, std::ref(toDo),
std::ref(molSims), start,
start + eachThread));
}
for (auto &t : threads) {
t.join();
}
} else {
std::transform(toDo.begin(), toDo.end(), molSims.begin(),
calcMolMolSimilarity);
}
#else
// Single-threaded build: do all the comparisons in this thread.
std::transform(toDo.begin(), toDo.end(), molSims.begin(),
calcMolMolSimilarity);
#endif
// Store each result in both halves of the matrix.
for (const auto &cn : molSims) {
proxGraph[cn.d_mol1Num][cn.d_mol2Num] =
proxGraph[cn.d_mol2Num][cn.d_mol1Num] = cn;
}
return proxGraph;
}
// Split the proximity graph into its disconnected components,
// returning vectors of the molecule numbers of the disconnected
// graphs.
std::vector<std::vector<unsigned int>> disconnectProximityGraphs(
std::vector<std::vector<ClusNode>> &proxGraph) {
std::vector<std::vector<unsigned int>> subGraphs;
std::vector<bool> done(proxGraph.size(), false);
auto nextStart = std::find(done.begin(), done.end(), false);
while (nextStart != done.end()) {
std::list<unsigned int> nodes;
std::list<unsigned int> toDo(1, std::distance(done.begin(), nextStart));
while (!toDo.empty()) {
auto nextNode = toDo.front();
toDo.pop_front();
if (!done[nextNode]) {
nodes.push_back(nextNode);
}
done[nextNode] = true;
for (size_t i = 0; i < proxGraph.size(); ++i) {
if (!done[i] && proxGraph[nextNode][i].d_res) {
toDo.push_back(i);
nodes.push_back(i);
done[i] = true;
}
}
}
nodes.sort();
subGraphs.push_back(std::vector(nodes.begin(), nodes.end()));
nextStart = std::find(done.begin(), done.end(), false);
}
return subGraphs;
}
// Calculate G_{ij} for the molecule:
//   numAtoms + b * (1 - a * (numFrags - 1)) * numBonds
// where numFrags is the number of fragments with more than p bonds or, if
// there are none of those, the total number of fragments.
double g_ij(const std::shared_ptr<ROMol> &mol, double a, double b,
            unsigned int p) {
  const auto frags = MolOps::getMolFrags(*mol, false);
  int numCounted = static_cast<int>(
      std::count_if(frags.begin(), frags.end(), [p](const auto &frag) {
        return frag->getNumBonds() > p;
      }));
  if (numCounted == 0) {
    numCounted = frags.size();
  }
  const double atomTerm = mol->getNumAtoms();
  const double bondTerm =
      b * (1.0 - a * (numCounted - 1)) * mol->getNumBonds();
  return atomTerm + bondTerm;
}
std::vector<std::vector<unsigned int>> makeSubClusters(
const std::vector<ClusNode> &nbors, const RascalClusterOptions &clusOpts) {
std::vector<std::vector<unsigned int>> subClusters;
std::vector<const ClusNode *> tmpNbors;
for (const auto &n : nbors) {
tmpNbors.push_back(&n);
}
while (!tmpNbors.empty()) {
subClusters.push_back(std::vector<unsigned int>{
tmpNbors.front()->d_mol1Num, tmpNbors.front()->d_mol2Num});
auto m1 = tmpNbors.front()->d_res->getMcesMol();
auto g_12 = g_ij(m1, clusOpts.a, clusOpts.b, clusOpts.minFragSize);
for (size_t i = 1; i < tmpNbors.size(); ++i) {
auto m2 = tmpNbors[i]->d_res->getMcesMol();
auto g_13 = g_ij(m2, clusOpts.a, clusOpts.b, clusOpts.minFragSize);
auto results = RDKit::RascalMCES::rascalMCES(*m1, *m2);
if (results.empty() || results.front().getBondMatches().empty()) {
continue;
}
auto res = results.front();
auto g_12_13 =
g_ij(res.getMcesMol(), clusOpts.a, clusOpts.b, clusOpts.minFragSize);
double sim = g_12_13 / std::min(g_12, g_13);
if (sim > clusOpts.minIntraClusterSim) {
subClusters.back().push_back(tmpNbors[i]->d_mol2Num);
subClusters.back().push_back(tmpNbors[i]->d_mol1Num);
tmpNbors[i] = nullptr;
}
}
tmpNbors.front() = nullptr;
tmpNbors.erase(std::remove(tmpNbors.begin(), tmpNbors.end(), nullptr),
tmpNbors.end());
std::sort(subClusters.back().begin(), subClusters.back().end());
subClusters.back().erase(
std::unique(subClusters.back().begin(), subClusters.back().end()),
subClusters.back().end());
}
return subClusters;
}
// Form the initial clusters for one connected component (subGraph) of the
// proximity graph.  For each molecule in the component, its neighbours
// within the component are sorted by descending similarity and split into
// sub-clusters by makeSubClusters.  The pooled sub-clusters are returned
// largest first, with exact duplicates removed.  A component of fewer than
// 2 molecules yields no clusters.
std::vector<std::vector<unsigned int>> formInitialClusters(
    const std::vector<unsigned int> &subGraph,
    const std::vector<std::vector<ClusNode>> &proxGraph,
    const RascalClusterOptions &clusOpts) {
  std::vector<std::vector<unsigned int>> clusters;
  if (subGraph.size() < 2) {
    return clusters;
  }
  for (auto i : subGraph) {
    std::vector<ClusNode> nbors;
    for (auto j : subGraph) {
      if (proxGraph[i][j].d_res) {
        nbors.push_back(proxGraph[i][j]);
      }
    }
    std::sort(nbors.begin(), nbors.end(),
              [](const ClusNode &c1, const ClusNode &c2) -> bool {
                return c1.d_sim > c2.d_sim;
              });
    if (!nbors.empty()) {
      auto subClusters = makeSubClusters(nbors, clusOpts);
      clusters.insert(clusters.end(), subClusters.begin(), subClusters.end());
    }
  }
  // Largest clusters first.  Ties are broken on the full contents rather than
  // just the first member: that makes identical clusters adjacent, which
  // std::unique needs in order to remove all of them, and it makes the order
  // fully deterministic.  It refines the first-member ordering, because a
  // lexicographic comparison looks at the first member first.
  std::sort(clusters.begin(), clusters.end(),
            [](const std::vector<unsigned int> &c1,
               const std::vector<unsigned int> &c2) -> bool {
              if (c1.size() != c2.size()) {
                return c1.size() > c2.size();
              }
              return c1 < c2;
            });
  clusters.erase(std::unique(clusters.begin(), clusters.end()), clusters.end());
  return clusters;
}
// Merge clusters that overlap heavily.  Working on a copy of the input, each
// cluster is compared with every later one; if the fraction of the smaller
// cluster that the two have in common exceeds clusOpts.clusterMergeSim, the
// later cluster is folded into the earlier one.  Clusters that were absorbed
// are removed at the end of each pass over the later clusters.
// The input clusters must be sorted, as std::set_intersection requires.
std::vector<std::vector<unsigned int>> mergeClusters(
    const std::vector<std::vector<unsigned int>> &clusters,
    const RascalClusterOptions &clusOpts) {
  auto merged = clusters;
  if (merged.size() < 2) {
    return merged;
  }
  const auto isAbsorbed = [](const std::vector<unsigned int> &c) -> bool {
    return c.empty();
  };
  for (size_t i = 0; i + 1 < merged.size(); ++i) {
    auto &target = merged[i];
    for (size_t j = i + 1; j < merged.size(); ++j) {
      auto &candidate = merged[j];
      std::vector<unsigned int> shared;
      std::set_intersection(target.begin(), target.end(), candidate.begin(),
                            candidate.end(), std::back_inserter(shared));
      const double smaller =
          std::min(double(target.size()), double(candidate.size()));
      const double overlap = double(shared.size()) / smaller;
      if (!(overlap > clusOpts.clusterMergeSim)) {
        continue;
      }
      target.insert(target.end(), candidate.begin(), candidate.end());
      candidate.clear();
      // Keep the merged cluster sorted and unique for the next comparison.
      std::sort(target.begin(), target.end());
      target.erase(std::unique(target.begin(), target.end()), target.end());
    }
    merged.erase(std::remove_if(merged.begin(), merged.end(), isAbsorbed),
                 merged.end());
  }
  return merged;
}
// Reorder the members of each cluster, in place, into descending order of
// their mean similarity to the other members of the same cluster.  The
// similarities are read from the proximity graph.
// Clusters with fewer than 2 members are left untouched: there is nothing to
// order, and the mean over zero other members would be 0.0/0.
void sortClusterMembersByMeanSim(
    const std::vector<std::vector<ClusNode>> &proxGraph,
    std::vector<std::vector<unsigned int>> &clusters) {
  for (auto &clus : clusters) {
    if (clus.size() < 2) {
      continue;
    }
    std::vector<std::pair<unsigned int, double>> clusSims;
    clusSims.reserve(clus.size());
    for (unsigned int i = 0U; i < clus.size(); ++i) {
      double totSim = 0.0;
      for (unsigned int j = 0U; j < clus.size(); ++j) {
        if (i != j) {
          totSim += proxGraph[clus[i]][clus[j]].d_sim;
        }
      }
      clusSims.push_back({clus[i], totSim / (clus.size() - 1)});
    }
    std::sort(clusSims.begin(), clusSims.end(),
              [](const std::pair<unsigned int, double> &p1,
                 const std::pair<unsigned int, double> &p2) -> bool {
                return p1.second > p2.second;
              });
    std::transform(
        clusSims.begin(), clusSims.end(), clus.begin(),
        [](const std::pair<unsigned int, double> &p) -> unsigned int {
          return p.first;
        });
  }
}
std::vector<std::vector<unsigned int>> makeClusters(
const std::vector<std::vector<unsigned int>> &subGraphs,
const std::vector<std::vector<ClusNode>> &proxGraph,
const RascalClusterOptions &clusOpts) {
std::vector<std::vector<unsigned int>> clusters;
for (const auto &sg : subGraphs) {
auto theseClusters = formInitialClusters(sg, proxGraph, clusOpts);
auto mergedClusters = mergeClusters(theseClusters, clusOpts);
clusters.insert(clusters.end(), mergedClusters.begin(),
mergedClusters.end());
}
std::sort(clusters.begin(), clusters.end(),
[](const std::vector<unsigned int> &c1,
const std::vector<unsigned int> &c2) -> bool {
return c1.size() > c2.size();
});
return clusters;
}
std::vector<unsigned int> collectSingletons(
const std::vector<std::vector<ClusNode>> &proxGraph) {
std::vector<unsigned int> singletons;
for (size_t i = 0; i < proxGraph.size(); ++i) {
bool single = true;
for (const auto &cn : proxGraph[i]) {
if (cn.d_res) {
single = false;
break;
}
}
if (single) {
singletons.push_back(i);
}
}
return singletons;
}
} // namespace details
// Fuzzy clustering of the molecules.  Builds the proximity graph, clusters
// each of its connected components, appends the singletons as the final
// cluster (which may be empty) and orders the members of every cluster by
// mean similarity to the rest of their cluster.
std::vector<std::vector<unsigned int>> rascalCluster(
    const std::vector<std::shared_ptr<ROMol>> &mols,
    const RascalClusterOptions &clusOpts) {
  auto proxGraph = details::buildProximityGraph(mols, clusOpts);
  auto clusters = details::makeClusters(
      details::disconnectProximityGraphs(proxGraph), proxGraph, clusOpts);
  clusters.push_back(details::collectSingletons(proxGraph));
  details::sortClusterMembersByMeanSim(proxGraph, clusters);
  return clusters;
}
} // namespace RascalMCES
} // namespace RDKit

View File

@@ -0,0 +1,53 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
// Options for Rascal Clustering. In general, the option names and defaults
// are taken from the paper:
// 'A Line Graph Algorithm for Clustering Chemical Structures Based
// on Common Substructural Cores', JW Raymond, PW Willett.
// https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf
// https://eprints.whiterose.ac.uk/77598/
#include <RDGeneral/export.h>
#ifndef RASCALCLUSTEROPTIONS_H
#define RASCALCLUSTEROPTIONS_H
namespace RDKit {
namespace RascalMCES {
// Options controlling rascalCluster and rascalButinaCluster.
struct RDKIT_RASCALMCES_EXPORT RascalClusterOptions {
// Similarity cutoff for clustering. Initial clusters will have molecule
// pairs of at least this similarity. It is also used as the similarity
// threshold for the MCES calculations between pairs of molecules.
double similarityCutoff = 0.7;
// Penalty score for each unconnected component in the MCES.
double a = 0.05;
// Weight of matched bonds over matched atoms.
double b = 2.0;
// Minimum number of atoms in a fragment for it to be included in the MCES.
// Also p in the paper.
// NOTE(review): g_ij in RascalCluster.cpp compares this value with the
// number of bonds in a fragment, not the number of atoms - confirm which
// is intended.
unsigned int minFragSize =
3;
// Two pairs of molecules are included in the same cluster if the similarity
// between their MCESs is greater than this. S_a in the paper.
double minIntraClusterSim = 0.9;
// Two clusters are merged if the fraction of molecules they have in common
// is greater than this. S_b in the paper.
double clusterMergeSim = 0.6;
// The maximum number of fragments in any MCES. Otherwise the MCES can be a
// lot of small fragments scattered across the molecule - it tries too hard
// to find a match, sometimes.
unsigned int maxNumFrags = 2;
// The number of threads to use. If > 0, will use that number. If <= 0, will
// use the number of hardware threads plus this number. So if the number of
// hardware threads is 8, and numThreads is -1, it will use 7 threads.
int numThreads = -1;
};
} // namespace RascalMCES
} // namespace RDKit
#endif // RASCALCLUSTEROPTIONS_H

View File

@@ -0,0 +1,94 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef RDKIT_RASCAL_DETAILS_H
#define RDKIT_RASCAL_DETAILS_H
#include <map>
#include <GraphMol/RascalMCES/RascalOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>
namespace RDKit {
class ROMol;
namespace RascalMCES {
// Forward declaration; the class-key matches the definition in
// RascalClusterOptions.h, where this type is a struct.
struct RascalClusterOptions;
namespace details {
// One entry of the proximity graph used by the clustering code: the outcome
// of comparing a pair of molecules.  The scalar members have default
// initialisers so that a default-constructed node never holds indeterminate
// values.
struct ClusNode {
  // The MCES result for the pair.  It is only set when the similarity reached
  // the threshold; a null pointer means "no edge between these molecules".
  std::shared_ptr<RascalResult> d_res;
  // Similarity between the two molecules.
  double d_sim{0.0};
  // Indices of the two molecules in the input molecule list.
  unsigned int d_mol1Num{0};
  unsigned int d_mol2Num{0};
};
// First-tier similarity estimate for the two molecules.
// NOTE(review): degSeqs1 and degSeqs2 are passed by non-const reference, so
// they are presumably filled in by this call for later use by tier2Sim -
// confirm against the implementation.
RDKIT_RASCALMCES_EXPORT double tier1Sim(
const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
std::map<int, std::vector<std::pair<int, int>>> &degSeqs1,
std::map<int, std::vector<std::pair<int, int>>> &degSeqs2);
// Second-tier similarity estimate for the two molecules, which takes the
// degree sequences and the bond labels of both molecules as input.
RDKIT_RASCALMCES_EXPORT double tier2Sim(
const ROMol &mol1, const ROMol &mol2,
const std::map<int, std::vector<std::pair<int, int>>> &degSeqs1,
const std::map<int, std::vector<std::pair<int, int>>> &degSeqs2,
const std::vector<unsigned int> &bondLabels1,
const std::vector<unsigned int> &bondLabels2);
// Produces the bond labels for the two molecules, returning them in
// bondLabels1 and bondLabels2.
RDKIT_RASCALMCES_EXPORT void getBondLabels(
const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
const RascalOptions &opts, std::vector<unsigned int> &bondLabels1,
std::vector<unsigned int> &bondLabels2);
// Builds the matrix of pairwise comparisons between the molecules that both
// clustering methods start from. Note that, unlike its neighbours, this
// declaration carries no export macro.
std::vector<std::vector<ClusNode>> buildProximityGraph(
const std::vector<std::shared_ptr<ROMol>> &mols,
const RascalClusterOptions &clusOpts);
// Ordering predicate for two results.
// NOTE(review): presumably true when res1 should sort ahead of res2 -
// confirm against the implementation.
RDKIT_RASCALMCES_EXPORT bool resultCompare(const RascalResult &res1,
const RascalResult &res2);
// Converts a clique of vertex numbers into bondMatches using vtxPairs.
// NOTE(review): swapped presumably says that the two molecules were
// exchanged before the search - confirm against the implementation.
RDKIT_RASCALMCES_EXPORT void extractClique(
const std::vector<unsigned int> &clique,
const std::vector<std::pair<int, int>> &vtxPairs, bool swapped,
std::vector<std::pair<int, int>> &bondMatches);
// do some simple cleaning of the SMARTS, to make it more user-friendly.
// The string is modified in place.
RDKIT_RASCALMCES_EXPORT void cleanSmarts(std::string &smarts);
// Primarily for debugging, these write out the corresponding bonds/atoms
// in Python list format, for ease of cut/paste into a highlighted image
// creation.
RDKIT_RASCALMCES_EXPORT void printBondMatches(const RascalResult &res,
std::ostream &os);
RDKIT_RASCALMCES_EXPORT void printAtomMatches(const RascalResult &res,
std::ostream &os);
// This prints out the scores in the order they are used in resultCompare.
RDKIT_RASCALMCES_EXPORT void printScores(const RascalResult &res,
std::ostream &os);
// Calculate the Johnson similarity between the two molecules using the given
// bondMatches. It's the fraction of the 2 molecules that are in common,
// somewhat akin to the Tanimoto - the square of the number of atoms plus
// number of bonds in the MCES divided by the product of the sums of the number
// of atoms and bonds in the 2 molecules.
// It has nothing to do with lying UK politicians.
RDKIT_RASCALMCES_EXPORT double johnsonSimilarity(
const std::vector<std::pair<int, int>> &bondMatches,
const std::vector<std::pair<int, int>> &atomMatches,
const RDKit::ROMol &mol1, const RDKit::ROMol &mol2);
} // namespace details
} // namespace RascalMCES
} // namespace RDKit
#endif // RDKIT_RASCAL_DETAILS_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,73 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef RDKIT_RASCAL_MCES_H
#define RDKIT_RASCAL_MCES_H
#include <vector>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>
namespace RDKit {
class ROMol;
namespace RascalMCES {
// Find one or more MCESs between the two molecules. The MCES is the
// Maximum Common Edge Substructure, and is the largest set of bonds
// common to the 2 molecules.
/*!
*
* @param mol1 : first molecule
* @param mol2 : second molecule for MCES determination.
* @param opts : (optional) set of options controlling the MCES determination.
*               Default-constructed options are used if it is omitted.
* @return : vector of RascalResult objects. It may be empty; the clustering
*           code treats that as "no MCES was found".
*/
RDKIT_RASCALMCES_EXPORT std::vector<RascalResult> rascalMCES(
const ROMol &mol1, const ROMol &mol2,
const RascalOptions &opts = RascalOptions());
// Cluster the molecules using the Johnson similarity from rascalMCES
// and the algorithm of
// 'A Line Graph Algorithm for Clustering Chemical Structures Based
// on Common Substructural Cores', JW Raymond, PW Willett.
// https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf
// https://eprints.whiterose.ac.uk/77598/
// This is a fuzzy clustering algorithm, so a molecule may appear in more than
// one cluster. The final cluster is all the molecules that didn't fit into
// another cluster (the singletons); it is always present, and is empty if
// there are no singletons.
/*!
*
* @param mols : molecules to cluster
* @param clusOpts : (optional) cluster options. Default-constructed options
*                   are used if it is omitted.
* @return clusters as vector of vectors of unsigned ints - indices into the
* input mols vector
*/
RDKIT_RASCALMCES_EXPORT std::vector<std::vector<unsigned int>> rascalCluster(
const std::vector<std::shared_ptr<ROMol>> &mols,
const RascalClusterOptions &clusOpts = RascalClusterOptions());
// Cluster the molecules using the Johnson similarity from rascalMCES and
// the Butina algorithm. Butina JCICS 39 747-750 (1999).
/*!
*
* @param mols : molecules to cluster
* @param clusOpts : (optional) cluster options. If omitted, a
*                   default-constructed RascalClusterOptions is used.
* @return clusters as vector of vectors of unsigned ints - indices into the
* input mols vector
*/
RDKIT_RASCALMCES_EXPORT std::vector<std::vector<unsigned int>>
rascalButinaCluster(
const std::vector<std::shared_ptr<ROMol>> &mols,
const RascalClusterOptions &clusOpts = RascalClusterOptions());
} // namespace RascalMCES
} // namespace RDKit
#endif // RDKIT_RASCAL_MCES_H

View File

@@ -0,0 +1,50 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef RASCALOPTIONS_H
#define RASCALOPTIONS_H
namespace RDKit {
namespace RascalMCES {
// Options controlling an MCES determination; passed as the optional third
// argument of rascalMCES(). Every member has a default, so a
// default-constructed RascalOptions is usable as-is. For the int members
// below, -1 means "no limit" as described in each member's comment.
struct RDKIT_RASCALMCES_EXPORT RascalOptions {
double similarityThreshold =
0.7; // if calculated below this, no MCES will be evaluated.
bool completeAromaticRings =
true; // if true, partial aromatic rings won't be returned
bool ringMatchesRingOnly =
false; // if true, ring bonds won't match non-ring bonds
bool singleLargestFrag =
false; /* if true, only return a single fragment for the MCES. Default
is to produce multiple matching fragments if necessary. */
int minFragSize =
-1; /* minimum number of atoms in any fragment - -1 means no minimum */
int maxFragSeparation = -1; /* biggest through-bond distance that bonds can
match. -1 means no limit. */
bool allBestMCESs =
false; /* If true, all MCESs are returned, in order of diminishing score.
This is likely to result in higher run times. */
int timeout = 60; // max run time, in seconds. -1 means no max.
bool doEquivBondPruning =
false; /* This might make the code run a bit faster in some
circumstances, but on average it is very marginal. */
bool returnEmptyMCES = false; /* if true, if the similarity thresholds aren't
matched still return a RascalResult with the
tier1 and tier2 sims filled in. */
int maxBondMatchPairs = 1000; /* Too many matching bond (vertex) pairs can
cause it to run out of memory. This is a
reasonable default for my Mac. */
};
} // namespace RascalMCES
} // namespace RDKit
#endif // RASCALOPTIONS_H

View File

@@ -0,0 +1,815 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <regex>
#include <set>
#include <boost/dynamic_bitset.hpp>
#include <GraphMol/MolOps.h>
#include <GraphMol/QueryAtom.h>
#include <GraphMol/QueryBond.h>
#include <GraphMol/QueryOps.h>
#include <GraphMol/SmilesParse/SmartsWrite.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/RascalMCES/RascalDetails.h>
#include <GraphMol/RascalMCES/RascalResult.h>
namespace RDKit {
namespace RascalMCES {
// Build a result from a clique found in the modular product of the two
// molecules' line graphs.
//
// mol1/mol2 and adjMatrix1/adjMatrix2 are given in the order the search used
// them; if `swapped` is true the stored molecules are exchanged so that
// d_mol1/d_mol2 and the .first/.second members of the match pairs agree.
// After the bond and atom matches are built, the optional maximum fragment
// separation and single-largest-fragment filters are applied.
RascalResult::RascalResult(const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
                           const std::vector<std::vector<int>> &adjMatrix1,
                           const std::vector<std::vector<int>> &adjMatrix2,
                           const std::vector<unsigned int> &clique,
                           const std::vector<std::pair<int, int>> &vtx_pairs,
                           bool timedOut, bool swapped, double tier1Sim,
                           double tier2Sim, bool ringMatchesRingOnly,
                           bool singleLargestFrag, int maxFragSep)
    : d_timedOut(timedOut),
      d_tier1Sim(tier1Sim),
      d_tier2Sim(tier2Sim),
      d_ringMatchesRingOnly(ringMatchesRingOnly),
      d_maxFragSep(maxFragSep) {
  // Pick which input plays the role of "molecule 1" in the stored result.
  const RDKit::ROMol &firstMol = swapped ? mol2 : mol1;
  const RDKit::ROMol &secondMol = swapped ? mol1 : mol2;
  const std::vector<std::vector<int>> &firstAdjMatrix =
      swapped ? adjMatrix2 : adjMatrix1;

  d_mol1.reset(new RDKit::ROMol(firstMol));
  d_mol2.reset(new RDKit::ROMol(secondMol));

  details::extractClique(clique, vtx_pairs, swapped, d_bondMatches);
  matchCliqueAtoms(firstAdjMatrix);

  // -1 means no limit on the separation between fragments.
  const bool limitFragSeparation = d_maxFragSep != -1;
  if (limitFragSeparation) {
    applyMaxFragSep();
  }
  if (singleLargestFrag) {
    largestFragOnly();
  }
}
// Construct a result that only records the tier 1 and tier 2 similarity
// estimates. No molecules or matches are stored, so d_mol1 and d_mol2 stay
// null and the getters that check for them return their "empty" value.
RascalResult::RascalResult(double tier1Sim, double tier2Sim)
: d_tier1Sim(tier1Sim), d_tier2Sim(tier2Sim) {}
// Deep-copying constructor: the molecules held by `other` are duplicated
// rather than shared, and every scalar member is carried over.
//
// The original omitted d_ringMatchesRingOnly and d_maxFragSep, so a copy
// silently fell back to the defaults (false / -1) and could, for example,
// generate a different SMARTS string than the object it was copied from.
// Initialisers are listed in member declaration order.
RascalResult::RascalResult(const RascalResult &other)
    : d_bondMatches(other.d_bondMatches),
      d_atomMatches(other.d_atomMatches),
      d_smarts(other.d_smarts),
      d_timedOut(other.d_timedOut),
      d_tier1Sim(other.d_tier1Sim),
      d_tier2Sim(other.d_tier2Sim),
      d_ringMatchesRingOnly(other.d_ringMatchesRingOnly),
      d_maxFragSep(other.d_maxFragSep),
      d_numFrags(other.d_numFrags),
      d_ringNonRingBondScore(other.d_ringNonRingBondScore),
      d_atomMatchScore(other.d_atomMatchScore),
      d_maxDeltaAtomAtomDist(other.d_maxDeltaAtomAtomDist),
      d_largestFragSize(other.d_largestFragSize) {
  if (other.d_mol1) {
    d_mol1.reset(new ROMol(*other.d_mol1));
  }
  if (other.d_mol2) {
    d_mol2.reset(new ROMol(*other.d_mol2));
  }
  if (other.d_mcesMol) {
    d_mcesMol.reset(new ROMol(*other.d_mcesMol));
  }
}
// Deep-copying assignment.
//
// Fixes relative to the original:
//  * d_tier1Sim, d_tier2Sim, d_ringMatchesRingOnly and d_maxFragSep were not
//    assigned, so the target kept its own stale values;
//  * when `other` held a null molecule pointer the target kept its old
//    molecule instead of becoming null as well.
RascalResult &RascalResult::operator=(const RascalResult &other) {
  if (this == &other) {
    return *this;
  }
  d_bondMatches = other.d_bondMatches;
  d_atomMatches = other.d_atomMatches;
  d_smarts = other.d_smarts;
  d_timedOut = other.d_timedOut;
  d_tier1Sim = other.d_tier1Sim;
  d_tier2Sim = other.d_tier2Sim;
  d_ringMatchesRingOnly = other.d_ringMatchesRingOnly;
  d_maxFragSep = other.d_maxFragSep;
  d_numFrags = other.d_numFrags;
  d_ringNonRingBondScore = other.d_ringNonRingBondScore;
  d_atomMatchScore = other.d_atomMatchScore;
  d_maxDeltaAtomAtomDist = other.d_maxDeltaAtomAtomDist;
  d_largestFragSize = other.d_largestFragSize;

  // Molecules are duplicated, never shared; a null source clears the target.
  if (other.d_mol1) {
    d_mol1.reset(new ROMol(*other.d_mol1));
  } else {
    d_mol1.reset();
  }
  if (other.d_mol2) {
    d_mol2.reset(new ROMol(*other.d_mol2));
  } else {
    d_mol2.reset();
  }
  if (other.d_mcesMol) {
    d_mcesMol.reset(new ROMol(*other.d_mcesMol));
  } else {
    d_mcesMol.reset();
  }
  return *this;
}
// Cut the result down to its single largest fragment; shorthand for
// largestFragsOnly(1).
void RascalResult::largestFragOnly() { largestFragsOnly(1); }
// Keep only the `numFrags` largest fragments (by atom count) of the MCES.
// Does nothing if numFrags is 0, if the MCES has fewer than numFrags
// fragments, or if the result holds no molecules (e.g. it was built from the
// tier similarities only) - the original dereferenced a null pointer in the
// last case.
void RascalResult::largestFragsOnly(unsigned int numFrags) {
  std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
  if (!mol1_frags) {
    return;
  }
  // getMolFrags() returns boost::shared_ptr. Ho-hum.
  auto frags = RDKit::MolOps::getMolFrags(*mol1_frags, false);
  if (numFrags < 1 || frags.size() < numFrags) {
    return;
  }
  // Largest fragment first, so the tail can simply be chopped off.
  std::sort(frags.begin(), frags.end(),
            [](const boost::shared_ptr<ROMol> &f1,
               const boost::shared_ptr<ROMol> &f2) -> bool {
              return f1->getNumAtoms() > f2->getNumAtoms();
            });
  frags.erase(frags.begin() + numFrags, frags.end());
  rebuildFromFrags(frags);
}
// Remove from the MCES every fragment with fewer than `minFragSize` atoms.
// Does nothing if the result holds no molecules (e.g. it was built from the
// tier similarities only) - the original dereferenced a null pointer then.
void RascalResult::trimSmallFrags(unsigned int minFragSize) {
  std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
  if (!mol1_frags) {
    return;
  }
  // getMolFrags() returns boost::shared_ptr. Ho-hum.
  auto frags = RDKit::MolOps::getMolFrags(*mol1_frags, false);
  frags.erase(std::remove_if(frags.begin(), frags.end(),
                             [&](const boost::shared_ptr<ROMol> &f) -> bool {
                               return f->getNumAtoms() < minFragSize;
                             }),
              frags.end());
  rebuildFromFrags(frags);
}
// Johnson similarity of the two molecules based on the current atom and bond
// matches; 0.0 if the result does not hold both molecules.
double RascalResult::getSimilarity() const {
  const bool haveBothMols = d_mol1 && d_mol2;
  return haveBothMols ? details::johnsonSimilarity(d_bondMatches,
                                                   d_atomMatches, *d_mol1,
                                                   *d_mol2)
                      : 0.0;
}
// Cut the atom and bond matches down to those whose mol1 atom/bond appears
// in one of `frags`. The fragments are expected to carry the ORIG_INDEX
// property set by makeMolFrags(); atoms/bonds without it are ignored.
//
// Fixes relative to the original:
//  * d_largestFragSize was taken from frags.front(), which is only the
//    largest fragment if the caller happened to sort them (trimSmallFrags()
//    does not); it is now the true maximum;
//  * the cached atom match score and MCES molecule depend on the matches
//    being changed here, so they are invalidated as well;
//  * a result without molecules is left untouched instead of dereferencing
//    a null pointer.
void RascalResult::rebuildFromFrags(
    const std::vector<boost::shared_ptr<ROMol>> &frags) {
  if (!d_mol1 || !d_mol2) {
    return;
  }
  // Force the re-creation of the SMARTS and other properties next time
  // they're needed.
  d_smarts = "";
  d_maxFragSep = -1;
  d_ringNonRingBondScore = -1;
  d_atomMatchScore = -1;
  d_maxDeltaAtomAtomDist = -1;
  d_largestFragSize = -1;
  d_mcesMol.reset();

  // for now, this is always called after fragmenting d_mol1, but just for
  // safety, protect against the frags coming from d_mol2 in some future
  // use.
  boost::dynamic_bitset<> fragAtoms(
      std::max(d_mol1->getNumAtoms(), d_mol2->getNumAtoms()));
  boost::dynamic_bitset<> fragBonds(
      std::max(d_mol1->getNumBonds(), d_mol2->getNumBonds()));
  unsigned int largestFrag = 0;
  for (const auto &f : frags) {
    largestFrag = std::max(largestFrag, f->getNumAtoms());
    for (auto atom : f->atoms()) {
      if (atom->hasProp("ORIG_INDEX")) {
        fragAtoms.set(atom->getProp<int>("ORIG_INDEX"));
      }
    }
    for (auto bond : f->bonds()) {
      if (bond->hasProp("ORIG_INDEX")) {
        fragBonds.set(bond->getProp<int>("ORIG_INDEX"));
      }
    }
  }

  // Keep the surviving matches in their existing order.
  std::vector<std::pair<int, int>> newAtomMatches;
  for (const auto &am : d_atomMatches) {
    if (fragAtoms[am.first]) {
      newAtomMatches.push_back(am);
    }
  }
  d_atomMatches = newAtomMatches;

  std::vector<std::pair<int, int>> newBondMatches;
  for (const auto &bm : d_bondMatches) {
    if (fragBonds[bm.first]) {
      newBondMatches.push_back(bm);
    }
  }
  d_bondMatches = newBondMatches;

  d_numFrags = frags.size();
  d_largestFragSize = static_cast<int>(largestFrag);
}
// Build a SMARTS string describing the MCES from the current atom and bond
// matches. Returns an empty string if the result does not hold both
// molecules. The raw string from MolToSmarts() is post-processed by
// details::cleanSmarts() before being returned.
std::string RascalResult::createSmartsString() const {
if (!d_mol1 || !d_mol2) {
return "";
}
RWMol smartsMol;
// maps an atom index in d_mol1 to the index of its query atom in smartsMol
std::map<int, unsigned int> atomMap;
auto mol1Rings = d_mol1->getRingInfo();
auto mol2Rings = d_mol2->getRingInfo();
for (const auto &am : d_atomMatches) {
RDKit::QueryAtom a;
auto mol1Atom = d_mol1->getAtomWithIdx(am.first);
a.setQuery(RDKit::makeAtomNumQuery(mol1Atom->getAtomicNum()));
auto mol2Atom = d_mol2->getAtomWithIdx(am.second);
// if the matched atoms are different elements, the query accepts either.
if (mol1Atom->getAtomicNum() != mol2Atom->getAtomicNum()) {
a.expandQuery(RDKit::makeAtomNumQuery(mol2Atom->getAtomicNum()),
Queries::COMPOSITE_OR);
}
// Aromaticity is only constrained when both atoms agree on it.
if (mol1Atom->getIsAromatic() && mol2Atom->getIsAromatic()) {
a.expandQuery(RDKit::makeAtomAromaticQuery(), Queries::COMPOSITE_AND,
true);
} else if (!mol1Atom->getIsAromatic() && !mol2Atom->getIsAromatic()) {
a.expandQuery(RDKit::makeAtomAliphaticQuery(), Queries::COMPOSITE_AND,
true);
}
// With ringMatchesRingOnly, non-aromatic atoms that are in a ring in both
// molecules also get an in-ring constraint.
if (d_ringMatchesRingOnly && !mol1Atom->getIsAromatic() &&
!mol2Atom->getIsAromatic() &&
mol1Rings->numAtomRings(mol1Atom->getIdx()) &&
mol2Rings->numAtomRings(mol2Atom->getIdx())) {
a.expandQuery(RDKit::makeAtomInRingQuery(), Queries::COMPOSITE_AND, true);
}
auto ai = smartsMol.addAtom(&a);
atomMap.insert(std::make_pair(am.first, ai));
}
for (const auto &bm : d_bondMatches) {
RDKit::QueryBond b;
auto mol1Bond = d_mol1->getBondWithIdx(bm.first);
// NOTE(review): atomMap[] default-inserts 0 for an atom that is not in
// d_atomMatches; presumably every matched bond's atoms are matched - confirm.
b.setBeginAtomIdx(atomMap[mol1Bond->getBeginAtomIdx()]);
b.setEndAtomIdx(atomMap[mol1Bond->getEndAtomIdx()]);
b.setQuery(makeBondOrderEqualsQuery(mol1Bond->getBondType()));
auto mol2Bond = d_mol2->getBondWithIdx(bm.second);
// if the matched bonds have different types, the query accepts either.
if (mol1Bond->getBondType() != mol2Bond->getBondType()) {
b.expandQuery(makeBondOrderEqualsQuery(mol2Bond->getBondType()),
Queries::COMPOSITE_OR);
}
// Same in-ring rule as for the atoms above.
if (d_ringMatchesRingOnly && !mol1Bond->getIsAromatic() &&
!mol2Bond->getIsAromatic() &&
mol1Rings->numBondRings(mol1Bond->getIdx()) &&
mol2Rings->numBondRings(mol2Bond->getIdx())) {
b.expandQuery(RDKit::makeBondIsInRingQuery(), Queries::COMPOSITE_AND,
true);
}
smartsMol.addBond(&b, false);
}
std::string smt = RDKit::MolToSmarts(smartsMol, true);
details::cleanSmarts(smt);
return smt;
}
namespace {
// Return the index of the atom shared by the two bonds, or -1 if they have
// no atom in common. The end-point pairs are tested in a fixed order
// (begin/begin, end/begin, begin/end, end/end) and the first hit wins.
int common_atom_in_bonds(const RDKit::Bond *bond1, const RDKit::Bond *bond2) {
  const auto begin1 = bond1->getBeginAtomIdx();
  const auto end1 = bond1->getEndAtomIdx();
  const auto begin2 = bond2->getBeginAtomIdx();
  const auto end2 = bond2->getEndAtomIdx();

  if (begin1 == begin2) {
    return begin1;
  }
  if (end1 == begin2) {
    return end1;
  }
  if (begin1 == end2) {
    return begin1;
  }
  if (end1 == end2) {
    return end1;
  }
  return -1;
}
} // namespace
// Derive d_atomMatches (pairs of mol1 atom index, mol2 atom index) from
// d_bondMatches.
//
// mol1_adj_matrix is indexed by pairs of mol1 bond indices; a non-zero entry
// is taken to mean the two bonds share an atom. Does nothing if there are no
// bond matches.
//
// Working values in mol1Matches: -1 = atom not in the clique, -2 = atom in
// the clique but not yet assigned, >= 0 = index of the matching mol2 atom.
void RascalResult::matchCliqueAtoms(
const std::vector<std::vector<int>> &mol1_adj_matrix) {
if (d_bondMatches.empty()) {
return;
}
std::vector<int> mol1Matches(d_mol1->getNumAtoms(), -1);
// set the clique atoms to -2 in mol1Matches, to mark them as yet undecided.
for (const auto &bm : d_bondMatches) {
auto bond1 = d_mol1->getBondWithIdx(bm.first);
mol1Matches[bond1->getBeginAtomIdx()] = -2;
mol1Matches[bond1->getEndAtomIdx()] = -2;
}
// First, use the line graphs to match atoms that have 2 matching bonds
// incident on them.
// (size() - 1 cannot wrap round here: the empty case returned above.)
for (size_t i = 0; i < d_bondMatches.size() - 1; ++i) {
const auto &pair1 = d_bondMatches[i];
auto bond1_1 = d_mol1->getBondWithIdx(pair1.first);
auto bond2_1 = d_mol2->getBondWithIdx(pair1.second);
for (size_t j = i + 1; j < d_bondMatches.size(); ++j) {
const auto &pair2 = d_bondMatches[j];
if (mol1_adj_matrix[pair1.first][pair2.first]) {
// the 2 bonds are incident on the same atom, so the 2 atoms must match
auto bond1_2 = d_mol1->getBondWithIdx(pair2.first);
auto bond2_2 = d_mol2->getBondWithIdx(pair2.second);
auto mol1Atom = common_atom_in_bonds(bond1_1, bond1_2);
auto mol2Atom = common_atom_in_bonds(bond2_1, bond2_2);
// NOTE(review): only mol1Atom is checked; mol2Atom is presumably never -1
// for a valid clique - confirm.
if (mol1Atom != -1) {
mol1Matches[mol1Atom] = mol2Atom;
// the atoms at the far ends of each bond pair must match as well.
auto omol1Atom = bond1_1->getOtherAtomIdx(mol1Atom);
auto omol2Atom = bond2_1->getOtherAtomIdx(mol2Atom);
mol1Matches[omol1Atom] = omol2Atom;
omol1Atom = bond1_2->getOtherAtomIdx(mol1Atom);
omol2Atom = bond2_2->getOtherAtomIdx(mol2Atom);
mol1Matches[omol1Atom] = omol2Atom;
}
}
}
}
// if there are -2 entries in mol1Matches there's more to do.
if (std::count(mol1Matches.begin(), mol1Matches.end(), -2)) {
// Any -2 entries in mol1Matches are down to isolated bonds, which are a bit
// tricky.
for (const auto &pair1 : d_bondMatches) {
auto bond1_1 = d_mol1->getBondWithIdx(pair1.first);
if (mol1Matches[bond1_1->getBeginAtomIdx()] == -2 &&
mol1Matches[bond1_1->getEndAtomIdx()] == -2) {
auto bond2_1 = d_mol2->getBondWithIdx(pair1.second);
if (bond1_1->getBeginAtom()->getAtomicNum() !=
bond1_1->getEndAtom()->getAtomicNum()) {
// it's fairly straightforward:
// orient the bond so that the begin atoms have the same element.
if (bond1_1->getBeginAtom()->getAtomicNum() ==
bond2_1->getBeginAtom()->getAtomicNum()) {
mol1Matches[bond1_1->getBeginAtomIdx()] =
bond2_1->getBeginAtomIdx();
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getEndAtomIdx();
} else {
mol1Matches[bond1_1->getBeginAtomIdx()] = bond2_1->getEndAtomIdx();
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getBeginAtomIdx();
}
} else if (bond1_1->getBeginAtom()->getTotalNumHs() !=
bond1_1->getEndAtom()->getTotalNumHs()) {
// try it on number of hydrogens
// NOTE(review): only the mol1 bond's H counts are compared here; the mol2
// bond's H counts are not consulted - confirm this is intended.
if (bond1_1->getBeginAtom()->getTotalNumHs() >
bond1_1->getEndAtom()->getTotalNumHs()) {
mol1Matches[bond1_1->getBeginAtomIdx()] =
bond2_1->getBeginAtomIdx();
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getEndAtomIdx();
} else {
mol1Matches[bond1_1->getBeginAtomIdx()] = bond2_1->getEndAtomIdx();
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getBeginAtomIdx();
}
} else {
// it probably doesn't matter
mol1Matches[bond1_1->getBeginAtomIdx()] = bond2_1->getBeginAtomIdx();
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getEndAtomIdx();
}
}
}
}
// Collect the assigned atoms, in increasing mol1 atom index order.
for (size_t i = 0u; i < d_mol1->getNumAtoms(); ++i) {
if (mol1Matches[i] >= 0) {
d_atomMatches.push_back(std::make_pair(i, mol1Matches[i]));
}
}
}
// If two fragments of the MCES are more than d_maxFragSep bonds apart in
// either molecule, discard the smaller of the two (the later one on a tie),
// then rebuild d_bondMatches and d_atomMatches from the surviving fragments.
//
// Fixes relative to the original:
//  * when fragment i itself was discarded, the inner loop carried on and
//    dereferenced the now-null frags1[i]; the loop now stops for that i;
//  * a result without molecules no longer dereferences a null pointer.
//
// NOTE(review): frags1[k] and frags2[k] are assumed to be the same MCES
// fragment in the two molecules. Nothing visible here establishes that the
// two getMolFrags() calls return fragments in corresponding order (or even
// in equal number) - confirm.
void RascalResult::applyMaxFragSep() {
  std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
  if (!mol1_frags) {
    return;
  }
  auto frags1 = RDKit::MolOps::getMolFrags(*mol1_frags, false);
  if (frags1.size() < 2) {
    return;
  }

  // Shortest through-bond distance between any atom of frag1 and any atom of
  // frag2, looked up in the parent molecule's distance matrix.
  auto fragFragDist = [](const boost::shared_ptr<RDKit::ROMol> &frag1,
                         const boost::shared_ptr<RDKit::ROMol> &frag2,
                         const double *pathMatrix, int num_atoms) -> int {
    int minDist = std::numeric_limits<int>::max();
    for (auto at1 : frag1->atoms()) {
      int at1Idx = at1->getProp<int>("ORIG_INDEX");
      for (auto at2 : frag2->atoms()) {
        int at2Idx = at2->getProp<int>("ORIG_INDEX");
        int dist = std::nearbyint(pathMatrix[at1Idx * num_atoms + at2Idx]);
        if (dist < minDist) {
          minDist = dist;
        }
      }
    }
    return minDist;
  };

  std::unique_ptr<RDKit::ROMol> mol2Frags(makeMolFrags(2));
  if (!mol2Frags) {
    return;
  }
  auto frags2 = RDKit::MolOps::getMolFrags(*mol2Frags, false);

  // These arrays must not be deleted - they are cached in the molecule and
  // deleted when it is.  The distance matrix will be re-calculated in case
  // something's been copied over somewhere.
  auto mol1Dists = RDKit::MolOps::getDistanceMat(*d_mol1, false, false, true);
  auto mol2Dists = RDKit::MolOps::getDistanceMat(*d_mol2, false, false, true);

  bool deletedFrag = false;
  for (size_t i = 0; i < frags1.size() - 1; ++i) {
    if (!frags1[i]) {
      continue;
    }
    for (size_t j = i + 1; j < frags1.size(); ++j) {
      if (!frags1[j]) {
        continue;
      }
      int mol1Dist =
          fragFragDist(frags1[i], frags1[j], mol1Dists, d_mol1->getNumAtoms());
      int mol2Dist =
          fragFragDist(frags2[i], frags2[j], mol2Dists, d_mol2->getNumAtoms());
      if (mol1Dist > d_maxFragSep || mol2Dist > d_maxFragSep) {
        deletedFrag = true;
        if (frags1[i]->getNumAtoms() < frags1[j]->getNumAtoms()) {
          frags1[i].reset();
          frags2[i].reset();
          // fragment i is gone, so there is nothing left to compare it with.
          break;
        } else {
          frags1[j].reset();
          frags2[j].reset();
        }
      }
    }
  }

  if (deletedFrag) {
    // rebuild the d_bondMatches
    std::vector<std::pair<int, int>> new_bond_matches;
    for (auto &frag : frags1) {
      if (!frag) {
        continue;
      }
      for (auto b : frag->bonds()) {
        int b_idx = b->getProp<int>("ORIG_INDEX");
        for (auto &bm : d_bondMatches) {
          if (b_idx == bm.first) {
            new_bond_matches.push_back(bm);
            break;
          }
        }
      }
    }
    d_bondMatches = new_bond_matches;
    // and the d_atomMatches
    std::vector<std::pair<int, int>> new_atom_matches;
    for (auto &frag : frags1) {
      if (!frag) {
        continue;
      }
      for (auto a : frag->atoms()) {
        int a_idx = a->getProp<int>("ORIG_INDEX");
        for (auto &am : d_atomMatches) {
          if (a_idx == am.first) {
            new_atom_matches.push_back(am);
            break;
          }
        }
      }
    }
    d_atomMatches = new_atom_matches;
  }
}
// Return a copy of molecule 1 or 2 (selected by molNum) reduced to the atoms
// and bonds of the clique. Every atom and bond that is kept carries the
// property ORIG_INDEX, holding its index in the original molecule.
// Returns nullptr if molNum is neither 1 nor 2, or if the requested molecule
// is not set. The caller owns the returned molecule.
RDKit::ROMol *RascalResult::makeMolFrags(int molNum) const {
  if (molNum != 1 && molNum != 2) {
    return nullptr;
  }
  const bool useFirst = (molNum == 1);
  std::shared_ptr<RDKit::ROMol> sourceMol = useFirst ? d_mol1 : d_mol2;
  if (!sourceMol) {
    return nullptr;
  }

  // Flag the atoms and bonds of the chosen molecule that are in the clique.
  std::vector<char> atomInClique(sourceMol->getNumAtoms(), 0);
  for (const auto &atomMatch : d_atomMatches) {
    atomInClique[useFirst ? atomMatch.first : atomMatch.second] = 1;
  }
  std::vector<char> bondInClique(sourceMol->getNumBonds(), 0);
  for (const auto &bondMatch : d_bondMatches) {
    bondInClique[useFirst ? bondMatch.first : bondMatch.second] = 1;
  }

  auto *cliqueMol = new RDKit::RWMol(*sourceMol);
  cliqueMol->beginBatchEdit();
  for (auto &atom : cliqueMol->atoms()) {
    if (atomInClique[atom->getIdx()]) {
      atom->setProp<int>("ORIG_INDEX", atom->getIdx());
    } else {
      cliqueMol->removeAtom(atom);
    }
  }
  for (auto &bond : cliqueMol->bonds()) {
    if (bondInClique[bond->getIdx()]) {
      bond->setProp<int>("ORIG_INDEX", bond->getIdx());
    } else {
      cliqueMol->removeBond(bond->getBeginAtomIdx(), bond->getEndAtomIdx());
    }
  }
  cliqueMol->commitBatchEdit();
  return cliqueMol;
}
// Calculate a score for how many bonds in the clique don't match
// cyclic/non-cyclic
int RascalResult::calcRingNonRingScore() const {
if (!d_mol1 || !d_mol2) {
return 0;
}
int score = 0;
for (const auto &bm : d_bondMatches) {
auto nbr1 = d_mol1->getRingInfo()->numBondRings(bm.first);
auto nbr2 = d_mol2->getRingInfo()->numBondRings(bm.second);
if ((nbr1 && !nbr2) || (!nbr1 && nbr2)) {
++score;
}
}
return score;
}
// Calculate a score for how well the atoms in the clique from mol1 match the
// atoms for the clique in mol2. The atom scores are made up of H count and
// summed for the molecule. Its so that, for example, an OH in mol1 that could
// match an OH or OMe matches the OH for preference.
int RascalResult::calcAtomMatchScore() const {
if (!d_mol1 || !d_mol2) {
return 0;
}
int score = 0;
for (const auto &am : d_atomMatches) {
int num_h_1 = d_mol1->getAtomWithIdx(am.first)->getTotalNumHs();
int num_h_2 = d_mol2->getAtomWithIdx(am.second)->getTotalNumHs();
score += std::abs(num_h_1 - num_h_2);
}
return score;
}
// Largest difference, over all pairs of matched atoms, between the
// through-bond distance separating the pair in mol1 and the distance
// separating the corresponding pair in mol2. Returns 0 if there are no atom
// matches or the result does not hold both molecules.
//
// Changes relative to the original: the molecule pointers are checked, like
// the other calc*() functions do, and std::abs is used rather than the
// unqualified abs.
int RascalResult::calcMaxDeltaAtomAtomDistScore() const {
  // Possibly this could be improved, to be the total of the minimum distances
  // between each fragment.
  if (!d_mol1 || !d_mol2 || d_atomMatches.empty()) {
    return 0;
  }

  // These arrays are cached so shouldn't be deleted.  The final 'true' in the
  // call is to force recalculation, just in case there's some other type
  // copied over from the input molecule.
  const auto *mol1Dists =
      RDKit::MolOps::getDistanceMat(*d_mol1, false, false, true);
  const auto *mol2Dists =
      RDKit::MolOps::getDistanceMat(*d_mol2, false, false, true);

  // The distance matrices are flat arrays indexed by idx1 * num_atoms + idx2.
  auto dist = [](int idx1, int idx2, const double *dists,
                 int num_atoms) -> int {
    return int(std::nearbyint(dists[idx1 * num_atoms + idx2]));
  };

  int score = 0;
  // size() - 1 cannot wrap round: the empty case returned above.
  for (size_t i = 0; i < d_atomMatches.size() - 1; ++i) {
    for (size_t j = i + 1; j < d_atomMatches.size(); ++j) {
      auto d1 = dist(d_atomMatches[i].first, d_atomMatches[j].first, mol1Dists,
                     d_mol1->getNumAtoms());
      auto d2 = dist(d_atomMatches[i].second, d_atomMatches[j].second,
                     mol2Dists, d_mol2->getNumAtoms());
      auto deltaDist = std::abs(d1 - d2);
      if (deltaDist > score) {
        score = deltaDist;
      }
    }
  }
  return score;
}
// Number of atoms in the largest fragment of the MCES, measured on mol1.
// Returns 0 if the result does not hold both molecules.
//
// The original ran one std::count pass over the atom->fragment mapping per
// fragment, and compared the int mapping entries with an unsigned loop
// index. This version tallies all the fragment sizes in a single pass and
// gives the same answer.
int RascalResult::calcLargestFragSize() const {
  if (!d_mol1 || !d_mol2) {
    return 0;
  }
  std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
  // mapping[a] is the index of the fragment that atom a belongs to.
  std::vector<int> mapping;
  auto numFrags = RDKit::MolOps::getMolFrags(*mol1_frags, mapping);

  std::vector<int> fragSizes(numFrags, 0);
  for (auto fragIdx : mapping) {
    ++fragSizes[fragIdx];
  }
  int largest = 0;
  for (auto fragSize : fragSizes) {
    largest = std::max(largest, fragSize);
  }
  return largest;
}
// Number of contiguous fragments in the MCES, computed on first use and then
// cached. Returns 0 if the result does not hold both molecules.
int RascalResult::getNumFrags() const {
  if (d_mol1 && d_mol2) {
    if (d_numFrags == -1) {
      std::vector<int> fragOfAtom;
      std::unique_ptr<RDKit::ROMol> cliqueMol(makeMolFrags(1));
      d_numFrags = RDKit::MolOps::getMolFrags(*cliqueMol, fragOfAtom);
    }
    return d_numFrags;
  }
  return 0;
}
// Ring/non-ring bond mismatch score, computed on first use and then cached.
// Returns 0 if the result does not hold both molecules.
int RascalResult::getRingNonRingBondScore() const {
  if (d_mol1 && d_mol2) {
    const bool notYetComputed = (d_ringNonRingBondScore == -1);
    if (notYetComputed) {
      d_ringNonRingBondScore = calcRingNonRingScore();
    }
    return d_ringNonRingBondScore;
  }
  return 0;
}
// Atom match (H count difference) score, computed on first use and then
// cached. Returns 0 if the result does not hold both molecules.
int RascalResult::getAtomMatchScore() const {
  if (d_mol1 && d_mol2) {
    const bool notYetComputed = (d_atomMatchScore == -1);
    if (notYetComputed) {
      d_atomMatchScore = calcAtomMatchScore();
    }
    return d_atomMatchScore;
  }
  return 0;
}
// Maximum difference in through-bond distance between pairs of matched
// atoms, computed on first use and then cached. Returns 0 if the result does
// not hold both molecules.
int RascalResult::getMaxDeltaAtomAtomDist() const {
  if (d_mol1 && d_mol2) {
    const bool notYetComputed = (d_maxDeltaAtomAtomDist == -1);
    if (notYetComputed) {
      d_maxDeltaAtomAtomDist = calcMaxDeltaAtomAtomDistScore();
    }
    return d_maxDeltaAtomAtomDist;
  }
  return 0;
}
// Number of atoms in the largest contiguous fragment of the MCES, computed
// on first use and then cached. Returns 0 if the result does not hold both
// molecules.
int RascalResult::getLargestFragSize() const {
  if (d_mol1 && d_mol2) {
    const bool notYetComputed = (d_largestFragSize == -1);
    if (notYetComputed) {
      d_largestFragSize = calcLargestFragSize();
    }
    return d_largestFragSize;
  }
  return 0;
}
// SMARTS string for the MCES, built on first use (or after it has been
// cleared) and then cached. Returns an empty string if the result does not
// hold both molecules.
std::string RascalResult::getSmarts() const {
  if (d_mol1 && d_mol2) {
    const bool needsBuilding = d_smarts.empty();
    if (needsBuilding) {
      d_smarts = createSmartsString();
    }
    return d_smarts;
  }
  return "";
}
// Return the MCES as a molecule, built from a copy of d_mol1 by removing the
// bonds and atoms that are not in the match. The molecule is built on first
// use and then cached in d_mcesMol. If d_mol1 is not set, the (null) cached
// pointer is returned.
const std::shared_ptr<ROMol> RascalResult::getMcesMol() const {
if (d_mcesMol || !d_mol1) {
return d_mcesMol;
}
// flag the mol1 bonds and atoms that are part of the MCES.
boost::dynamic_bitset<> mol1Bonds(d_mol1->getNumBonds());
for (const auto &bm : d_bondMatches) {
mol1Bonds.set(bm.first);
}
boost::dynamic_bitset<> mol1Atoms(d_mol1->getNumAtoms());
for (const auto &am : d_atomMatches) {
mol1Atoms.set(am.first);
}
std::shared_ptr<RWMol> tmpMol(new RWMol(*d_mol1));
MolOps::KekulizeIfPossible(*tmpMol);
tmpMol->beginBatchEdit();
for (auto &bond : tmpMol->bonds()) {
if (!mol1Bonds[bond->getIdx()]) {
// The bond is about to be removed. For end atoms that don't take implicit
// Hs, or that are aromatic non-carbons, the explicit H count is bumped to
// make up for the lost bond.
// NOTE(review): bo is the bond type enumerator, and it is added to the H
// count as a number. That presumably relies on its numeric value being the
// bond order, which may not hold for a bond left aromatic when
// kekulization was not possible - confirm.
auto bo = bond->getBondType();
if (bond->getBeginAtom()->getNoImplicit() ||
(bond->getBeginAtom()->getIsAromatic() &&
bond->getBeginAtom()->getAtomicNum() != 6)) {
bond->getBeginAtom()->setNumExplicitHs(
bond->getBeginAtom()->getNumExplicitHs() + bo);
}
if (bond->getEndAtom()->getNoImplicit() ||
(bond->getEndAtom()->getIsAromatic() &&
bond->getEndAtom()->getAtomicNum() != 6)) {
bond->getEndAtom()->setNumExplicitHs(
bond->getEndAtom()->getNumExplicitHs() + bo);
}
tmpMol->removeBond(bond->getBeginAtomIdx(), bond->getEndAtomIdx());
}
}
for (auto atom : tmpMol->atoms()) {
if (!mol1Atoms[atom->getIdx()]) {
tmpMol->removeAtom(atom);
}
}
tmpMol->commitBatchEdit();
MolOps::removeHs(*tmpMol);
MolOps::sanitizeMol(*tmpMol);
d_mcesMol = tmpMol;
return d_mcesMol;
}
namespace details {
// Strict ordering used to rank results: returns true if res1 should come
// before res2. The criteria are applied in turn, each one only being
// evaluated if all the earlier ones tied:
//   more bond matches, fewer fragments, larger largest fragment, lower
//   ring/non-ring score, lower atom match score, lower max delta distance,
//   and finally the lexically smaller SMARTS string.
bool resultCompare(const RascalResult &res1, const RascalResult &res2) {
  const auto numBonds1 = res1.getBondMatches().size();
  const auto numBonds2 = res2.getBondMatches().size();
  if (numBonds1 != numBonds2) {
    return numBonds1 > numBonds2;
  }

  const int numFrags1 = res1.getNumFrags();
  const int numFrags2 = res2.getNumFrags();
  if (numFrags1 != numFrags2) {
    return numFrags1 < numFrags2;
  }

  const int largestFrag1 = res1.getLargestFragSize();
  const int largestFrag2 = res2.getLargestFragSize();
  if (largestFrag1 != largestFrag2) {
    return largestFrag1 > largestFrag2;
  }

  const int ringScore1 = res1.getRingNonRingBondScore();
  const int ringScore2 = res2.getRingNonRingBondScore();
  if (ringScore1 != ringScore2) {
    return ringScore1 < ringScore2;
  }

  const int atomScore1 = res1.getAtomMatchScore();
  const int atomScore2 = res2.getAtomMatchScore();
  if (atomScore1 != atomScore2) {
    return atomScore1 < atomScore2;
  }

  const int deltaDist1 = res1.getMaxDeltaAtomAtomDist();
  const int deltaDist2 = res2.getMaxDeltaAtomAtomDist();
  if (deltaDist1 != deltaDist2) {
    return deltaDist1 < deltaDist2;
  }

  return res1.getSmarts() < res2.getSmarts();
}
// Translate a clique (indices into vtxPairs) into a sorted list of bond
// matches. Each selected vertex pair is copied into bondMatches, with its two
// members exchanged if `swapped` is true. Any previous contents of
// bondMatches are discarded.
void extractClique(const std::vector<unsigned int> &clique,
                   const std::vector<std::pair<int, int>> &vtxPairs,
                   bool swapped,
                   std::vector<std::pair<int, int>> &bondMatches) {
  bondMatches.clear();
  for (const auto vtx : clique) {
    const auto &vtxPair = vtxPairs[vtx];
    if (!swapped) {
      bondMatches.push_back(vtxPair);
      continue;
    }
    bondMatches.emplace_back(vtxPair.second, vtxPair.first);
  }
  std::sort(bondMatches.begin(), bondMatches.end());
}
// Tidy up a SMARTS string in place: common verbose atom primitives such as
// [#6&A] are replaced by their short forms, and explicit single bond symbols
// are dropped in the patterns listed below. The replacements are re-applied
// until the string stops changing.
//
// The original iterated with `for (auto [patt, repl] : repls)`, which copied
// a std::regex and a std::string for every rule on every pass; the rules are
// now bound by const reference.
void cleanSmarts(std::string &smarts) {
  static const std::vector<std::pair<std::regex, std::string>> repls{
      {std::regex(R"(\[#6&A\])"), "C"},
      {std::regex(R"(\[#6&A&R\])"), "[C&R]"},
      {std::regex(R"(\[#6&a\])"), "c"},
      {std::regex(R"(\[#7&A\])"), "N"},
      {std::regex(R"(\[#7&A&R\])"), "[N&R]"},
      {std::regex(R"(\[#7&a\])"), "n"},
      {std::regex(R"(\[#8&A\])"), "O"},
      {std::regex(R"(\[#8&A&R\])"), "[O&R]"},
      {std::regex(R"(\[#8&a\])"), "o"},
      {std::regex(R"(\[#9&A\])"), "F"},
      {std::regex(R"(\[#16&A\])"), "S"},
      {std::regex(R"(\[#16&a\])"), "s"},
      {std::regex(R"(\[#17&A\])"), "Cl"},
      {std::regex(R"(\[#35&A\])"), "Br"},
      {std::regex(R"(\[#53&A\])"), "I"},
      {std::regex(R"(([A-Z])-([cnops]))"), "$1$2"},
      {std::regex(R"(([cnops][1-9]*)-([A-Z]))"), "$1$2"},
      {std::regex(R"(([A-Z][1-9]*)-([A-Z]))"), "$1$2"},
      {std::regex(R"(([A-Z])-([1-9]))"), "$1$2"}};
  // Sometimes it needs more than 1 pass through: regex_replace only handles
  // non-overlapping matches, so e.g. C-C-C becomes CC-C on the first pass.
  std::string start_smt = "";
  while (start_smt != smarts) {
    start_smt = smarts;
    for (const auto &[patt, repl] : repls) {
      smarts = std::regex_replace(smarts, patt, repl);
    }
  }
}
// Write the bond matches to `os` as two lines: the matched bond indices in
// molecule 1, then the corresponding bond indices in molecule 2. Each line
// also gives the number of matches.
void printBondMatches(const RascalResult &res, std::ostream &os) {
  const auto matches = res.getBondMatches();

  os << "Bond 1 matches : " << matches.size() << " : [";
  for (const auto &match : matches) {
    os << match.first << ",";
  }
  os << "]" << std::endl;

  os << "Bond 2 matches : " << matches.size() << " : [";
  for (const auto &match : matches) {
    os << match.second << ",";
  }
  os << "]" << std::endl;
}
// Write the atom matches to `os` as two lines: the matched atom indices in
// molecule 1, then the corresponding atom indices in molecule 2. Each line
// also gives the number of matches.
void printAtomMatches(const RascalResult &res, std::ostream &os) {
  const auto matches = res.getAtomMatches();

  os << "Atom 1 matches : " << matches.size() << " : [";
  for (const auto &match : matches) {
    os << match.first << ",";
  }
  os << "]" << std::endl;

  os << "Atom 2 matches : " << matches.size() << " : [";
  for (const auto &match : matches) {
    os << match.second << ",";
  }
  os << "]" << std::endl;
}
// Write, on one line, the values that resultCompare() ranks results by, in
// the order it uses them: number of bond matches, number of fragments,
// largest fragment size, ring/non-ring score, atom match score, max delta
// atom-atom distance and the SMARTS string.
void printScores(const RascalResult &res, std::ostream &os) {
  const char *const sep = " : ";
  os << res.getBondMatches().size() << sep << res.getNumFrags() << sep
     << res.getLargestFragSize() << sep << res.getRingNonRingBondScore() << sep
     << res.getAtomMatchScore() << sep << res.getMaxDeltaAtomAtomDist() << sep
     << res.getSmarts() << std::endl;
}
// Johnson similarity: the square of (matched atoms + matched bonds) divided
// by the product of (atoms + bonds) of the two molecules.
//
// Changes relative to the original:
//  * the denominator was computed as a product of unsigned ints, which can
//    wrap round before being converted to double; the arithmetic is now done
//    in double throughout;
//  * if either molecule has no atoms and no bonds the denominator is zero;
//    0.0 is returned rather than the result of dividing by zero.
double johnsonSimilarity(const std::vector<std::pair<int, int>> &bondMatches,
                         const std::vector<std::pair<int, int>> &atomMatches,
                         const RDKit::ROMol &mol1, const RDKit::ROMol &mol2) {
  const double numCommon =
      static_cast<double>(bondMatches.size() + atomMatches.size());
  const double mol1Size = static_cast<double>(mol1.getNumAtoms()) +
                          static_cast<double>(mol1.getNumBonds());
  const double mol2Size = static_cast<double>(mol2.getNumAtoms()) +
                          static_cast<double>(mol2.getNumBonds());
  const double denom = mol1Size * mol2Size;
  if (denom == 0.0) {
    return 0.0;
  }
  return numCommon * numCommon / denom;
}
} // namespace details
} // namespace RascalMCES
} // namespace RDKit

View File

@@ -0,0 +1,153 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
// A class to hold the results of a RASCAL MCES determination
// between 2 molecules. Contains the bonds and atoms that
// correspond between the molecules, and also a SMARTS pattern
// defining the MCES.
//
#include <RDGeneral/export.h>
#ifndef RASCALRESULT_H
#define RASCALRESULT_H
#include <vector>
#include <GraphMol/ROMol.h>
namespace RDKit {
namespace RascalMCES {
class RDKIT_RASCALMCES_EXPORT RascalResult {
public:
RascalResult(const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
const std::vector<std::vector<int>> &adjMatrix1,
const std::vector<std::vector<int>> &adjMatrix2,
const std::vector<unsigned int> &clique,
const std::vector<std::pair<int, int>> &vtx_pairs, bool timedOut,
bool swapped, double tier1Sim, double tier2Sim,
bool ringMatchesRingOnly, bool singleLargestFrag,
int minFragSep);
// For when the tier[12]Sim didn't hit the threshold, but it
// might be of interest what the estimates of similarity were.
RascalResult(double tier1Sim, double tier2Sim);
RascalResult(const RascalResult &other);
RascalResult(RascalResult &&other) = default;
~RascalResult() = default;
RascalResult &operator=(const RascalResult &other);
RascalResult &operator=(RascalResult &&other) = default;
// Cut the result down to the single largest fragment. This is
// irrecoverably destructive.
void largestFragOnly();
void largestFragsOnly(unsigned int numFrags = 2);
void trimSmallFrags(unsigned int minFragSize = 3);
std::vector<std::pair<int, int>> getBondMatches() const {
return d_bondMatches;
}
std::vector<std::pair<int, int>> getAtomMatches() const {
return d_atomMatches;
}
// The following 5 functions are used in resultCompare to rank
// 2 MCES of the same size for the same pair of molecules.
// returns the number of contiguous fragments in the MCES.
int getNumFrags() const;
// returns how many bonds in the clique don't match
// cyclic/non-cyclic i.e. count as a matche in the MCES but
// are ring bonds in one of the molecules and not in the other.
int getRingNonRingBondScore() const;
// returns a score for how well the atoms in the clique from mol1 match the
// atoms for the clique in mol2. Currently, the atom scores are the
// difference in H count for matching atoms, and summed for the molecule. Its
// so that, for example, an OH in mol1 that could match an OH or OMe matches
// the OH for preference.
int getAtomMatchScore() const;
// returns a score for the maximum difference in through-bond distance for
// pairs of matching atoms in the 2 molecules. An MCES where 2 atoms
// are far apart in one molecule and the corresponding atoms are close
// together in the other will get a high score by this measure.
int getMaxDeltaAtomAtomDist() const;
// returns the number of atoms in the largest contiguous fragment
// in the MCES.
int getLargestFragSize() const;
std::string getSmarts() const;
const std::shared_ptr<ROMol> getMcesMol() const;
bool getTimedOut() const { return d_timedOut; };
double getTier1Sim() const { return d_tier1Sim; }
double getTier2Sim() const { return d_tier2Sim; }
double getSimilarity() const;
private:
// The 2 molecules the result refers to.
// NOTE(review): whether these are copies of the inputs or shared with the
// caller is decided in the constructors, which are not visible here.
std::shared_ptr<ROMol> d_mol1;
std::shared_ptr<ROMol> d_mol2;
// Returned by getMcesMol().  Mutable so that a const member function can
// assign it - presumably filled in lazily; confirm in the implementation.
mutable std::shared_ptr<ROMol> d_mcesMol;
// Pairs of matching bond and atom indices.
// NOTE(review): presumably first = index in mol1, second = index in mol2.
std::vector<std::pair<int, int>> d_bondMatches;
std::vector<std::pair<int, int>> d_atomMatches;
// SMARTS for the MCES.  Mutable so that a const member function can assign
// it - presumably cached from createSmartsString(); confirm.
mutable std::string d_smarts;
// Reported by getTimedOut().
bool d_timedOut{false};
// Reported by getTier1Sim() and getTier2Sim().
// NOTE(review): unlike the other scalar members these have no default
// member initializer, so every constructor must set them.
double d_tier1Sim;
double d_tier2Sim;
bool d_ringMatchesRingOnly{false};
// Fragment separation limit used by applyMaxFragSep().
int d_maxFragSep{-1};
// These are used for sorting the results.
// NOTE(review): the -1 defaults presumably mean "not yet calculated", with
// mutable allowing const getters to fill them in on demand - confirm.
mutable int d_numFrags{-1};
mutable int d_ringNonRingBondScore{-1};
mutable int d_atomMatchScore{-1};
mutable int d_maxDeltaAtomAtomDist{-1};
mutable int d_largestFragSize{-1};
// Assuming the frags are all part of the original MCES, just cut it
// down to what's in the frags.
void rebuildFromFrags(const std::vector<boost::shared_ptr<ROMol>> &frags);
// Builds the SMARTS string for the MCES.
std::string createSmartsString() const;
// NOTE(review): presumably derives d_atomMatches from the matched bonds
// using the adjacency matrix of mol1 - confirm in the implementation.
void matchCliqueAtoms(const std::vector<std::vector<int>> &mol1_adj_matrix);
// If the clique involves a fragment that is more than d_maxFragSep from
// any other frag in either molecule, discard the smaller frag.
void applyMaxFragSep();
// Make the fragments for either mol1 or mol2. If molNum is not 1 or 2,
// returns nullptr.
RDKit::ROMol *makeMolFrags(int molNum) const;
// Calculation routines for the sorting scores held in the mutable members
// above.
int calcRingNonRingScore() const;
int calcAtomMatchScore() const;
int calcLargestFragSize() const;
// If there are multiple fragments, can be helpful as a tie-breaker. It's the
// maximum difference between through-bond distances between matching atoms in
// the 2 molecules.
int calcMaxDeltaAtomAtomDistScore() const;
};
} // namespace RascalMCES
} // namespace RDKit
#endif // RASCALRESULT_H

View File

@@ -0,0 +1,8 @@
# Build script for the rdRascalMCES Python extension.
# Make sure the wrapper is not compiled with the definition that is used
# when building the RascalMCES library itself.
remove_definitions(-DRDKIT_RASCALMCES_BUILD)
rdkit_python_extension(rdRascalMCES
        rdRascalMCES.cpp
        DEST Chem
        LINK_LIBRARIES RascalMCES)

# The test is named after this module so that it can be selected and
# reported unambiguously by CTest.
add_pytest(pyRascalMCES ${CMAKE_CURRENT_SOURCE_DIR}/testRascalMCES.py)

View File

@@ -0,0 +1,217 @@
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDBoost/python.h>
#include <RDBoost/Wrap.h>
#include <GraphMol/ROMol.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>
namespace python = boost::python;
namespace {
// Builds a Python list of 2-tuples from a vector of int pairs, keeping the
// order of the input.
python::list convertVecPairInt(const std::vector<std::pair<int, int>> &vec) {
  python::list tuples;
  for (auto it = vec.begin(); it != vec.end(); ++it) {
    tuples.append(python::make_tuple(it->first, it->second));
  }
  return tuples;
}
// Python-facing accessor: the bond matches of the result as a list of
// 2-tuples.
python::list bondMatches(const RDKit::RascalMCES::RascalResult &res) {
  const auto &matchedBonds = res.getBondMatches();
  return convertVecPairInt(matchedBonds);
}
// Python-facing accessor: the atom matches of the result as a list of
// 2-tuples.
python::list atomMatches(const RDKit::RascalMCES::RascalResult &res) {
  const auto &matchedAtoms = res.getAtomMatches();
  return convertVecPairInt(matchedAtoms);
}
// Python-facing helper that forwards to RascalResult::largestFragOnly(),
// which modifies the result in place (the Python docstring for this function
// notes that it cannot be undone).
void largestFragmentOnly(RDKit::RascalMCES::RascalResult &res) {
res.largestFragOnly();
}
// Registers the Python class RascalResult.  The class has no Python
// constructor (python::no_init); instances are only obtained from the
// functions in this module that return results.
struct RascalResult_wrapper {
  static void wrap() {
    std::string docString = "Used to return RASCAL MCES results.";
    python::class_<RDKit::RascalMCES::RascalResult>(
        "RascalResult", docString.c_str(), python::no_init)
        .def_readonly("smartsString",
                      &RDKit::RascalMCES::RascalResult::getSmarts,
                      "SMARTS string defining the MCES.")
        // bondMatches() and atomMatches() go through convertVecPairInt, so
        // each returns a flat list of 2-tuples.
        .def("bondMatches", &bondMatches,
             "A function returning a list of tuples, "
             "each tuple containing the indices of a pair of matching bonds "
             "in the MCES, from mol1 and mol2 respectively.")
        .def("atomMatches", &atomMatches, "Likewise for atoms.")
        .def(
            "largestFragmentOnly", &largestFragmentOnly,
            "Function that cuts the MCES down to the single largest frag. This cannot be undone.")
        .def_readonly("similarity",
                      &RDKit::RascalMCES::RascalResult::getSimilarity,
                      "Johnson similarity between 2 molecules.")
        .def_readonly("numFragments",
                      &RDKit::RascalMCES::RascalResult::getNumFrags,
                      "Number of fragments in MCES.")
        .def_readonly("largestFragmentSize",
                      &RDKit::RascalMCES::RascalResult::getLargestFragSize,
                      "Number of atoms in largest fragment.")
        .def_readonly("timedOut", &RDKit::RascalMCES::RascalResult::getTimedOut,
                      "Whether it timed out.");
  }
};
} // namespace
namespace RDKit {
// Python entry point for FindMCES.  Uses default options when py_opts is
// None, runs the search with the GIL released and returns the results as a
// Python list of RascalResult objects.
python::list findMCESWrapper(const ROMol &mol1, const ROMol &mol2,
                             python::object py_opts) {
  RascalMCES::RascalOptions runOpts;
  const bool optsSupplied = !py_opts.is_none();
  if (optsSupplied) {
    runOpts = python::extract<RascalMCES::RascalOptions>(py_opts);
  }

  std::vector<RDKit::RascalMCES::RascalResult> mcesResults;
  {
    // Only the C++ calculation runs without the GIL; the Python objects
    // below are built after it has been re-acquired.
    NOGIL gil;
    mcesResults = RascalMCES::rascalMCES(mol1, mol2, runOpts);
  }

  python::list pyResults;
  for (auto it = mcesResults.begin(); it != mcesResults.end(); ++it) {
    pyResults.append(*it);
  }
  return pyResults;
}
// Converts a Python sequence of molecules into a vector of shared pointers.
// Calls throw_value_error("molecule is None") for the first element that
// tests false.
std::vector<std::shared_ptr<ROMol>> extractMols(python::object mols) {
  const unsigned int numMols =
      python::extract<unsigned int>(mols.attr("__len__")());
  std::vector<std::shared_ptr<ROMol>> molPtrs(numMols);
  for (unsigned int idx = 0; idx != numMols; ++idx) {
    if (!mols[idx]) {
      throw_value_error("molecule is None");
    }
    molPtrs[idx] = python::extract<std::shared_ptr<ROMol>>(mols[idx]);
  }
  return molPtrs;
}
// Converts the clusters into a Python list of lists, one inner list of
// unsigned ints per cluster, preserving order.
python::list packOutputMols(
    const std::vector<std::vector<unsigned int>> &clusters) {
  python::list pyClusters;
  for (size_t c = 0; c < clusters.size(); ++c) {
    const std::vector<unsigned int> &cluster = clusters[c];
    python::list members;
    for (size_t m = 0; m < cluster.size(); ++m) {
      members.append(cluster[m]);
    }
    pyClusters.append(members);
  }
  return pyClusters;
}
// Python entry point for RascalCluster.  Uses default cluster options when
// py_opts is None, runs the clustering with the GIL released and returns
// the clusters as a Python list of lists.
python::list rascalClusterWrapper(python::object mols, python::object py_opts) {
  RascalMCES::RascalClusterOptions clusOpts;
  const bool optsSupplied = !py_opts.is_none();
  if (optsSupplied) {
    clusOpts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
  }

  // The molecules are extracted from the Python sequence while the GIL is
  // still held.
  auto molPtrs = extractMols(mols);

  std::vector<RDKit::UINT_VECT> foundClusters;
  {
    NOGIL gil;
    foundClusters = RascalMCES::rascalCluster(molPtrs, clusOpts);
  }
  return packOutputMols(foundClusters);
}
// Python entry point for RascalButinaCluster.  Uses default cluster options
// when py_opts is None, runs the clustering with the GIL released and
// returns the clusters as a Python list of lists.
python::list rascalButinaClusterWrapper(python::object mols,
                                        python::object py_opts) {
  RascalMCES::RascalClusterOptions clusOpts;
  const bool optsSupplied = !py_opts.is_none();
  if (optsSupplied) {
    clusOpts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
  }

  // The molecules are extracted from the Python sequence while the GIL is
  // still held.
  auto molPtrs = extractMols(mols);

  std::vector<RDKit::UINT_VECT> foundClusters;
  {
    NOGIL gil;
    foundClusters = RascalMCES::rascalButinaCluster(molPtrs, clusOpts);
  }
  return packOutputMols(foundClusters);
}
// Defines the rdRascalMCES Python module: the RascalResult and RascalOptions
// classes and the FindMCES, RascalCluster and RascalButinaCluster functions.
BOOST_PYTHON_MODULE(rdRascalMCES) {
  python::scope().attr("__doc__") =
      "Module containing implementation of RASCAL Maximum Common Edge Substructure algorithm.";
  RascalResult_wrapper::wrap();

  std::string docString = "RASCAL Options";
  python::class_<RDKit::RascalMCES::RascalOptions, boost::noncopyable>(
      "RascalOptions", docString.c_str())
      .def_readwrite(
          "similarityThreshold",
          &RDKit::RascalMCES::RascalOptions::similarityThreshold,
          "Threshold below which MCES won't be run. Between 0.0 and 1.0, default=0.7.")
      .def_readwrite(
          "completeAromaticRings",
          &RDKit::RascalMCES::RascalOptions::completeAromaticRings,
          "If True (default), partial aromatic rings won't be returned.")
      // NOTE(review): the "(default)" below is carried over unchanged -
      // confirm it against the initial value in RascalOptions.h.
      .def_readwrite("ringMatchesRingOnly",
                     &RDKit::RascalMCES::RascalOptions::ringMatchesRingOnly,
                     "If True (default), ring bonds won't match non-ring bonds.")
      .def_readwrite(
          "minFragSize", &RDKit::RascalMCES::RascalOptions::minFragSize,
          "Imposes a minimum on the number of atoms in a fragment that may be part of the MCES. Default -1 means no minimum.")
      .def_readwrite(
          "maxFragSeparation",
          &RDKit::RascalMCES::RascalOptions::maxFragSeparation,
          "Maximum number of bonds between fragments in the MCES for both to be reported. Default -1 means no maximum. If exceeded, the smaller fragment will be removed.")
      .def_readwrite(
          "allBestMCESs", &RDKit::RascalMCES::RascalOptions::allBestMCESs,
          "If True, reports all MCESs found of the same maximum size. Default False means just report the first found.")
      .def_readwrite(
          "timeout", &RDKit::RascalMCES::RascalOptions::timeout,
          "Maximum time (in seconds) to spend on an individual MCESs determination. Default 60, -1 means no limit.");

  // Adjacent string literals are concatenated, so each line of the function
  // docstrings below ends with an explicit newline.
  docString =
      "Find one or more MCESs between the 2 molecules given. Returns a list of "
      "RascalResult objects.\n"
      "- mol1\n"
      "- mol2 The two molecules for which to find the MCES\n"
      "- opts Optional RascalOptions object changing the default run mode.\n";
  python::def("FindMCES", &RDKit::findMCESWrapper,
              (python::arg("mol1"), python::arg("mol2"),
               python::arg("opts") = python::object()),
              docString.c_str());

  // NOTE(review): the clustering wrappers extract a RascalClusterOptions
  // from opts, but no Python class of that type is registered in this
  // module, so only opts=None looks usable from here - confirm and expose
  // the class if it is not registered elsewhere.
  docString =
      "Use the RASCAL MCES similarity metric to do fuzzy clustering. Returns a list of lists "
      "of molecule indices, each inner list being a cluster. The last cluster is all the "
      "molecules that didn't fit into another cluster (the singletons).\n"
      "- mols List of molecules to be clustered\n"
      "- opts Optional RascalClusterOptions object changing the default run mode.\n";
  python::def("RascalCluster", &RDKit::rascalClusterWrapper,
              (python::arg("mols"), python::arg("opts") = python::object()),
              docString.c_str());

  docString =
      "Use the RASCAL MCES similarity metric to do Butina clustering"
      " (Butina JCICS 39 747-750 (1999)). Returns a list of lists of molecule indices,"
      " each inner list being a cluster. The last cluster is all the"
      " molecules that didn't fit into another cluster (the singletons).\n"
      "- mols List of molecules to be clustered\n"
      "- opts Optional RascalClusterOptions object changing the default run mode.\n";
  python::def("RascalButinaCluster", &RDKit::rascalButinaClusterWrapper,
              (python::arg("mols"), python::arg("opts") = python::object()),
              docString.c_str());
}
} // namespace RDKit

View File

@@ -0,0 +1,119 @@
# Copyright (c) 2023 David Cosgrove and other RDKit contributors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# These tests are just to check that the Python wrappers are working
# ok. The bulk of the tests are in the C++ code.
import os
import unittest
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import rdRascalMCES
class TestCase(unittest.TestCase):
  """Checks that the rdRascalMCES Python wrappers can be called."""

  def setUp(self):
    pass

  def _readCdk2Mols(self):
    # Reads the cdk2 set from $RDBASE/Contrib/Fastcluster/cdk2.smi.
    smiFile = Path(os.environ['RDBASE']) / 'Contrib' / 'Fastcluster' / 'cdk2.smi'
    supplier = Chem.SmilesMolSupplier(str(smiFile), '\t', 1, 0, False)
    return [mol for mol in supplier]

  def test1(self):
    # Explicitly passing a default options object.
    chlorobenzene = Chem.MolFromSmiles("c1ccccc1Cl")
    fluorobenzene = Chem.MolFromSmiles("c1ccccc1F")
    results = rdRascalMCES.FindMCES(chlorobenzene, fluorobenzene, rdRascalMCES.RascalOptions())
    self.assertEqual(len(results), 1)
    best = results[0]
    self.assertEqual(best.smartsString, 'c1:c:c:c:c:c:1')
    self.assertEqual(len(best.bondMatches()), 6)
    self.assertEqual(len(best.atomMatches()), 6)

  def test2(self):
    # Test single largest fragment extraction
    ad1 = Chem.MolFromSmiles("CN(C)c1ccc(CC(=O)NCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL153934")
    ad2 = Chem.MolFromSmiles("N(C)c1ccc(CC(=O)NCCCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL157336")
    results = rdRascalMCES.FindMCES(ad1, ad2, rdRascalMCES.RascalOptions())
    self.assertEqual(len(results), 1)
    best = results[0]
    self.assertEqual(best.smartsString,
                     'N(-C)-c1:c:c:c(-CC(=O)-NCCCCCCCCCC):c:c:1.NC12CC3CC(-C1)-CC(-C2)-C3')
    # The result is modified in place.
    best.largestFragmentOnly()
    self.assertEqual(best.smartsString, 'N(-C)-c1:c:c:c(-CC(=O)-NCCCCCCCCCC):c:c:1')

  def test3(self):
    # Test not specifying options
    chlorobenzene = Chem.MolFromSmiles("c1ccccc1Cl")
    fluorobenzene = Chem.MolFromSmiles("c1ccccc1F")
    results = rdRascalMCES.FindMCES(chlorobenzene, fluorobenzene)
    self.assertEqual(len(results), 1)
    best = results[0]
    self.assertEqual(best.smartsString, 'c1:c:c:c:c:c:1')
    self.assertEqual(len(best.bondMatches()), 6)
    self.assertEqual(len(best.atomMatches()), 6)

  def test4(self):
    # Test setting non-default option
    mol1 = Chem.MolFromSmiles('Oc1cccc2C(=O)C=CC(=O)c12')
    mol2 = Chem.MolFromSmiles('O1C(=O)C=Cc2cc(OC)c(O)cc12')
    # With the default options no result is returned for this pair.
    self.assertEqual(len(rdRascalMCES.FindMCES(mol1, mol2)), 0)
    # With the threshold lowered to 0.5 one result is returned.
    opts = rdRascalMCES.RascalOptions()
    opts.similarityThreshold = 0.5
    self.assertEqual(len(rdRascalMCES.FindMCES(mol1, mol2, opts)), 1)

  def testRascalCluster(self):
    clusters = rdRascalMCES.RascalCluster(self._readCdk2Mols())
    self.assertEqual(len(clusters), 8)
    expClusters = [7, 7, 6, 2, 2, 2, 2, 20]
    for expClusSize, clus in zip(expClusters, clusters):
      self.assertEqual(expClusSize, len(clus))

  def testRascalButinaCluster(self):
    clusters = rdRascalMCES.RascalButinaCluster(self._readCdk2Mols())
    self.assertEqual(len(clusters), 29)
    expClusters = [
      6, 6, 6, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    for expClusSize, clus in zip(expClusters, clusters):
      self.assertEqual(expClusSize, len(clus))
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
  unittest.main()

View File

@@ -0,0 +1,529 @@
CHEMBL1907596_1 CN1CCC[C@H]1COc2cccnc2
CHEMBL1907596_2 C(Oc1cncnc1)[C@@H]2CCN2
CHEMBL1907596_3 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_4 Fc1ncccc1OC[C@@H]2CCN2
CHEMBL1907596_5 Fc1ncc(\C=C\c2cc(OC[C@@H]3CCN3)cnc2Cl)cc1Br
CHEMBL1907596_6 Clc1ncc(OC[C@@H]2CCCN2)cc1\C=C\c3ccnc(Br)c3
CHEMBL1907596_7 Fc1cc(\C=C\c2cc(OC[C@@H]3CCCN3)cnc2Cl)ccn1
CHEMBL1907596_8 Clc1ncc(OC[C@@H]2CCN2)cc1\C=C\c3ccnc(Br)c3
CHEMBL1907596_9 Clc1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_10 Clc1ncc(OC[C@@H]2CCCN2)cc1c3cccnc3
CHEMBL1907596_11 Fc1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
CHEMBL1907596_12 Brc1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
CHEMBL1907596_13 Ic1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
CHEMBL1907596_14 Clc1ccc(cn1)c2cc(OC[C@@H]3CCN3)cnc2Cl
CHEMBL1907596_15 Fc1ccc(cn1)c2cc(OC[C@@H]3CCN3)cnc2Cl
CHEMBL1907596_16 Clc1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
CHEMBL1907596_17 Clc1ncc(OC[C@@H]2CCN2)cc1\C=C\c3ccncc3
CHEMBL1907596_18 Fc1cc(ccn1)c2cc(OC[C@@H]3CCN3)cnc2Cl
CHEMBL1907596_19 Fc1cc(\C=C\c2cc(OC[C@@H]3CCN3)cnc2Cl)ccn1
CHEMBL1907596_20 Fc1ccnc[n+]1c2cc(OC[C@@H]3CCN3)cnc2Cl
CHEMBL1907596_21 Clc1ncc(OC[C@@H]2CCN2)cc1c3ccnc(Br)c3
CHEMBL1907596_22 Clc1ncc(OC[C@@H]2CCCN2)cc1c3ccc(Br)nc3
CHEMBL1907596_23 Fc1ccc(cn1)c2cc(OC[C@@H]3CCCN3)cnc2Cl
CHEMBL1907596_24 Fc1ncc(cc1Br)c2cc(OC[C@@H]3CCN3)cnc2Cl
CHEMBL1907596_25 Clc1ncc(OC[C@@H]2CCN2)cc1c3ccc(Br)nc3
CHEMBL1907596_26 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_27 Clc1ncc(OCC2CCCN2)cc1\C=C\c3ccccn3
CHEMBL1907596_28 CN1CCC1COc2cnc(Cl)c(\C=C\c3ccncc3)c2
CHEMBL1907596_29 CN1CCCC1COc2cnc(Cl)c(\C=C\c3ccccn3)c2
CHEMBL1907596_30 CN1CCCC1COc2cnc(Cl)c(\C=C\c3cccnc3)c2
CHEMBL1907596_31 CN1CCCC1COc2cnc(Cl)c(\C=C\c3ccncc3)c2
CHEMBL1907596_32 CN1CCCC1COc2cnc(Cl)c(CCc3ccncc3)c2
CHEMBL1907596_33 Clc1ncc(OCC2CCN2)cc1\C=C\c3ccncc3
CHEMBL1907596_34 Clc1ncc(OCC2CCCN2)cc1CCc3ccncc3
CHEMBL1907596_35 Clc1ncc(OCC2CCCN2)cc1\C=C\c3ccncc3
CHEMBL1907596_36 Clc1ncc(OCC2CCCN2)cc1\C=C\c3cccnc3
CHEMBL1907596_37 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_38 CN1CCCC1c2ccc(Br)nc2
CHEMBL1907596_39 CN1CCCC1c2ccc(Cl)nc2
CHEMBL1907596_40 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_41 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_42 C1C[C@H]2CCC(N2)C(=C1)c3cccnc3
CHEMBL1907596_43 Clc1ccc(cn1)C2=CCC[C@H]3CCC2N3
CHEMBL1907596_44 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_45 CO[C@@H]1CC=C2CCN3CCC4=C(CC(=O)OC4)[C@@]23C1
CHEMBL1907596_46 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_47 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_48 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_49 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_50 Clc1ncc(cn1)C2CC3CCC2N3
CHEMBL1907596_51 C1CC2CCC(N2)C(=C1)c3cccnc3
CHEMBL1907596_52 Clc1ccc(cn1)C2=CCCC3CCC2N3
CHEMBL1907596_53 CN1CCC[C@H]1COc2cccnc2
CHEMBL1907596_54 CN1CCC[C@H]1COc2cncc(CCc3ccccc3)c2
CHEMBL1907596_55 CN1CCC[C@H]1COc2cncc(\C=C\c3ccccc3)c2
CHEMBL1907596_56 CN1CCC[C@H]1COc2cncc(c2)c3oc4ccccc4c3
CHEMBL1907596_57 CN1CCC[C@H]1COc2cncc(c2)C#Cc3ccccc3
CHEMBL1907596_58 CN1CCC[C@H]1COc2cncc(c2)c3cncnc3
CHEMBL1907596_59 CN1CCC[C@H]1COc2cncc(c2)c3ccc(F)c(Cl)c3
CHEMBL1907596_60 CN1CCC[C@H]1COc2cncc(c2)c3ccc(Cl)cc3Cl
CHEMBL1907596_61 CN1CCC[C@H]1COc2cncc(c2)c3ccc(Cl)cc3
CHEMBL1907596_62 CN1CCC[C@H]1COc2cncc(c2)c3ccc(C)cc3
CHEMBL1907596_63 CN1CCC[C@H]1COc2cncc(c2)c3ccc(F)cc3
CHEMBL1907596_64 CN1CCC[C@H]1COc2cncc(c2)c3cccc(N)c3
CHEMBL1907596_65 CN1CCC[C@H]1COc2cncc(c2)c3cccc(c3)[N+](=O)[O-]
CHEMBL1907596_66 CN1CCC[C@H]1COc2cncc(c2)c3ccc(cc3)C(F)(F)F
CHEMBL1907596_67 COc1ccc(cc1)c2cncc(OC[C@@H]3CCCN3C)c2
CHEMBL1907596_68 CN1CCC[C@H]1COc2cncc(c2)c3ccccc3C=O
CHEMBL1907596_69 CN1CCC[C@H]1COc2cncc(CCc3ccncc3)c2
CHEMBL1907596_70 CN1CCC[C@H]1COc2cncc(c2)c3ccccc3
CHEMBL1907596_71 CN1CCC[C@H]1COc2cncc(c2)C#Cc3ccc(C)cc3
CHEMBL1907596_72 CN1CCC[C@H]1COc2cncc(\C=C\c3ccncc3)c2
CHEMBL1907596_73 CN1CCC[C@H]1COc2cncc(c2)c3ccc4ccccc4c3
CHEMBL1907596_74 CN1CCC[C@H]1COc2cncc(c2)c3cccs3
CHEMBL1907596_75 CN1CCC[C@H]1COc2cncc(c2)c3occc3
CHEMBL1907596_76 CN1CCC[C@H]1COc2cncc(c2)c3cccnc3
CHEMBL1907596_77 CN1CCC[C@H]1COc2cncc(c2)c3cc4ccccc4n3C
CHEMBL1907596_78 CN1CCC[C@H]1COc2cncc(c2)c3cnc4ccccc4c3
CHEMBL1907596_79 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_80 Brc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_81 Ic1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_82 Fc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_83 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_84 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_85 CN1CCCC1c2ccc(Br)nc2
CHEMBL1907596_86 CN1CCCC1c2ccc(Cl)nc2
CHEMBL1907596_87 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_88 Ic1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_89 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_90 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_91 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_92 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_93 C1C[C@H]2CCC(N2)C(=C1)c3cccnc3
CHEMBL1907596_94 Clc1ccc(cn1)C2=CCC[C@H]3CCC2N3
CHEMBL1907596_95 C1C[C@H]2CCC(N2)C(=C1)c3cncnc3
CHEMBL1907596_96 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_97 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_98 Ic1cncc(O[C@H]2CCN2)c1
CHEMBL1907596_99 C(Oc1cccnc1)[C@@H]2CCN2
CHEMBL1907596_100 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_101 Brc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_102 Fc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_103 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_104 Nc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_105 C(Oc1cccnc1)[C@@H]2CCN2
CHEMBL1907596_106 Fc1ncccc1OC[C@@H]2CCN2
CHEMBL1907596_107 Clc1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_108 [O-][N+](=O)c1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_109 CCOc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_110 CCCc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_111 Fc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_112 C(Oc1cncc(c1)c2ccccc2)[C@@H]3CCN3
CHEMBL1907596_113 CC(=O)NCc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_114 Cc1cncc(OC[C@H]2CCN2)c1
CHEMBL1907596_115 Oc1cncc(OC[C@H]2CCN2)c1
CHEMBL1907596_116 Clc1cncc(OC[C@H]2CCN2)c1
CHEMBL1907596_117 COc1ccc(OC[C@H]2CCN2)cn1
CHEMBL1907596_118 Clc1ccc(OC[C@H]2CCN2)cn1
CHEMBL1907596_119 Cc1ccc(OC[C@H]2CCN2)cn1
CHEMBL1907596_120 Cc1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_121 Brc1ccc(OC[C@H]2CCN2)cn1
CHEMBL1907596_122 Fc1ccc(OC[C@H]2CCN2)cn1
CHEMBL1907596_123 Fc1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_124 Brc1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_125 CCc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_126 Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_127 Nc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_128 FC(F)(F)c1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_129 Brc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_130 Clc1cc(OC[C@@H]2CCN2)cnc1Cl
CHEMBL1907596_131 Clc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_132 Fc1ncccc1OC[C@H]2CCN2
CHEMBL1907596_133 Clc1ncc(OC[C@@H]2CCN2)cc1c3ccccc3
CHEMBL1907596_134 Clc1ncc(OC[C@@H]2CCN2)cc1Br
CHEMBL1907596_135 N#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_136 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_137 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_138 Ic1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_139 Ic1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_140 Nc1cc(cnc1Cl)C2CC3CCC2N3
CHEMBL1907596_141 Clc1ncc(cc1I)C2CC3CCC2N3
CHEMBL1907596_142 Clc1ncc(cc1Br)C2CC3CCC2N3
CHEMBL1907596_143 Clc1ncc(cc1N=[N+]=[N-])C2CC3CCC2N3
CHEMBL1907596_144 Clc1ncc(cc1c2ccccc2)C3CC4CCC3N4
CHEMBL1907596_145 Fc1cc(cnc1Cl)C2CC3CCC2N3
CHEMBL1907596_146 Clc1cc(cnc1Cl)C2CC3CCC2N3
CHEMBL1907596_147 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_148 Fc1ncc(cc1c2ccccc2)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_149 Fc1ncc(cc1c2ccccc2)[C@@H]3C[C@H]4CC[C@@H]3N4
CHEMBL1907596_150 Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
CHEMBL1907596_151 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_152 Brc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_153 Fc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_154 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_155 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_156 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_157 CN1[C@@H]2CC[C@H]1[C@@H](C2)c3ccc(Cl)nc3
CHEMBL1907596_158 Cc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_159 CN1[C@H]2CC[C@@H]1[C@H](C2)c3ccc(Cl)nc3
CHEMBL1907596_160 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_161 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_162 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_163 CN1C2CCC1C(C2)c3ccc(Cl)nc3
CHEMBL1907596_164 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_165 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_166 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_167 C(Oc1cccnc1)[C@@H]2CCN2
CHEMBL1907596_168 Fc1ncccc1OC[C@@H]2CCN2
CHEMBL1907596_169 Fc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_170 Fc1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_171 Brc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_172 Clc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_173 Clc1ncccc1OC[C@@H]2CCN2
CHEMBL1907596_174 Ic1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_175 Ic1ccc(OC[C@@H]2CCN2)cn1
CHEMBL1907596_176 Brc1ncccc1OC[C@@H]2CCN2
CHEMBL1907596_177 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_178 CCCCCCc1cncc(OC[C@@H]2CCCN2C)c1
CHEMBL1907596_179 CCCCc1cncc(OC[C@@H]2CCCN2C)c1
CHEMBL1907596_180 CC(C)Cc1cncc(OC[C@@H]2CCCN2C)c1
CHEMBL1907596_181 CCCc1cncc(OC[C@@H]2CCCN2C)c1
CHEMBL1907596_182 CN1CCC[C@H]1COc2cncc(N)c2
CHEMBL1907596_183 CN1CCC[C@H]1COc2cncc(Br)c2
CHEMBL1907596_184 CN1CCC[C@H]1COc2cncc(Cl)c2
CHEMBL1907596_185 CCc1cncc(OC[C@@H]2CCCN2C)c1
CHEMBL1907596_186 CN1CCC[C@H]1COc2cncc(C)c2
CHEMBL1907596_187 CN1CCC[C@H]1COc2ccc(Cl)nc2
CHEMBL1907596_188 CN1CCC[C@H]1COc2cccnc2
CHEMBL1907596_189 CN1CCC[C@H]1COc2cncc(F)c2
CHEMBL1907596_190 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_191 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_192 Brc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_193 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_194 Fc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_195 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_196 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_197 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_198 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_199 Cl.Cl.C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_200 Cl.[O-][N+](=O)c1ccc2C3CNCC(C3)c2c1
CHEMBL1907596_201 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_202 Fc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_203 Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
CHEMBL1907596_204 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_205 COc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_206 Fc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_207 Fc1ncc(cc1c2cccc(Cl)c2)C3CC4CCC3N4
CHEMBL1907596_208 Fc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_209 Fc1ncc(cc1c2ccc(Cl)cc2)C3CC4CCC3N4
CHEMBL1907596_210 [O-][N+](=O)c1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_211 COc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_212 [O-][N+](=O)c1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_213 Nc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_214 Nc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_215 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_216 CN1CCC[C@H]1COc2cncc(c2)C#Cc3ccccc3
CHEMBL1907596_217 CN1CCC[C@H]1COc2cncc(c2)C#CCO
CHEMBL1907596_218 CN1CCC[C@H]1COc2cncc(c2)C#CCCCCO
CHEMBL1907596_219 CN1CCC[C@H]1COc2cncc(c2)C#CCCCCF
CHEMBL1907596_220 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_221 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_222 Ic1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_223 Nc1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_224 C=Cc1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_225 Fc1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_226 Clc1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_227 Brc1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_228 C#Cc1cncc(c1)C2CC3CCC2N3
CHEMBL1907596_229 CN1C2CCC1C(C2)c3cncc(I)c3
CHEMBL1907596_230 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_231 Clc1ccc(cn1)C2CC3CCCCC2N3
CHEMBL1907596_232 Clc1ccc(cn1)[C@@H]2CC3CNC2C3
CHEMBL1907596_233 Clc1ccc(cn1)[C@@H]2CC3CC2CN3
CHEMBL1907596_234 Clc1ccc(cn1)C2CC3CCC2CN3
CHEMBL1907596_235 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_236 Clc1ccc(cn1)C2CC3CCC2NC3
CHEMBL1907596_237 Clc1ccc(cn1)[C@@H]2CC3CNC2C3
CHEMBL1907596_238 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_239 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_240 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_241 C(Oc1cccnc1)C2CCN2
CHEMBL1907596_242 CN1CCCC1COc2cccnc2
CHEMBL1907596_243 C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_244 Clc1ccc2cc3C4CNCC(C4)c3cc2n1
CHEMBL1907596_245 Cc1ccc2cc3C4CNCC(C4)c3cc2n1
CHEMBL1907596_246 Cc1cnc2cc3C4CNCC(C4)c3cc2c1
CHEMBL1907596_247 [O-][N+](=O)c1ccc2C3CNCC(C3)c2c1
CHEMBL1907596_248 Fc1cc2C3CNCC(C3)c2cc1F
CHEMBL1907596_249 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_250 Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
CHEMBL1907596_251 Clc1ncc(cc1c2ccccc2)C3CC4CCC3N4
CHEMBL1907596_252 Fc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_253 Fc1ncc(cc1c2ccc(Cl)cc2)C3CC4CCC3N4
CHEMBL1907596_254 Fc1ncc(cc1c2ccc(cc2)C#N)C3CC4CCC3N4
CHEMBL1907596_255 Fc1ncc(cc1c2ccc(Cl)c(Cl)c2)C3CC4CCC3N4
CHEMBL1907596_256 CN1C2CCC1C(C2)c3cnc(Cl)c(c3)c4ccccc4
CHEMBL1907596_257 Cc1ccc(cc1)c2cc(cnc2Cl)C3CC4CCC3N4
CHEMBL1907596_258 COc1ccc(cc1)c2cc(cnc2Cl)C3CC4CCC3N4
CHEMBL1907596_259 Cc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_260 Cc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_261 CN1CCCC1c2ccc(Br)nc2
CHEMBL1907596_262 CN1CCCC1c2ccc(Cl)nc2
CHEMBL1907596_263 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_264 C=CC1=CC=C2C3CNCC(C3)CN2C1=O
CHEMBL1907596_265 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_266 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_267 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_268 C1CN(C[C@@H]2NC[C@H]12)c3cccnc3
CHEMBL1907596_269 Clc1ccc(cn1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_270 N#Cc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_271 COc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_272 Brc1ncc(cc1C#N)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_273 N\C(=N\O)\c1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_274 C1C[C@H]2CN([C@H]2CN1)c3cccnc3
CHEMBL1907596_275 Brc1ncc(cc1C#N)N2C[C@@H]3CCNC[C@H]23
CHEMBL1907596_276 C1CN(C[C@@H]2NC[C@H]12)c3cncnc3
CHEMBL1907596_277 Clc1ccc(nn1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_278 Brc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_279 Clc1cc(cnc1Cl)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_280 COc1ccc(cn1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_281 CCOc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_282 Clc1ccc(cn1)N2C[C@@H]3CCNC[C@H]23
CHEMBL1907596_283 Clc1cc(cnc1Cl)N2C[C@@H]3CCNC[C@H]23
CHEMBL1907596_284 Brc1cc(cnc1Br)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_285 Cc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_286 Cc1cc(cnc1Cl)N2C[C@@H]3CCNC[C@H]23
CHEMBL1907596_287 Cc1cc(cnc1Cl)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_288 COc1cc(cnc1Br)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_289 Clc1ccc(cn1)N2C[C@H]3CCNC[C@@H]23
CHEMBL1907596_290 Clc1cc(cnc1Cl)N2C[C@H]3CCNC[C@@H]23
CHEMBL1907596_291 N#Cc1cncc(c1)N2C[C@@H]3CCNC[C@H]23
CHEMBL1907596_292 Cc1cc(cnc1Cl)N2C[C@H]3CCNC[C@@H]23
CHEMBL1907596_293 COc1cc(cnc1Br)N2C[C@H]3CCNC[C@@H]23
CHEMBL1907596_294 Brc1ncc(cc1C#N)N2C[C@H]3CCNC[C@@H]23
CHEMBL1907596_295 COc1cncc(c1)N2C[C@H]3CCNC[C@@H]23
CHEMBL1907596_296 CCOc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_297 C1CN(C[C@H]2NC[C@@H]12)c3cccnc3
CHEMBL1907596_298 Clc1ccc(cn1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_299 Brc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_300 Clc1cc(cnc1Cl)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_301 Brc1cc(cnc1Br)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_302 COc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_303 Cc1cc(cnc1Cl)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_304 N#Cc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
CHEMBL1907596_305 COc1cc(cnc1Br)N2CC[C@@H]3CN[C@@H]3C2
CHEMBL1907596_306 C1CN(C[C@H]2NC[C@@H]12)c3cncnc3
CHEMBL1907596_307 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
CHEMBL1907596_308 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_309 Cl.Cl.Clc1ccc(cn1)C2CC3CCCC2N3
CHEMBL1907596_310 Cl.Cl.Cl.Clc1ccc(cn1)C2CC3CC2CN3
CHEMBL1907596_311 Cl.Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_312 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_313 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_314 Clc1ccc(cn1)N2CC3CC2CN3
CHEMBL1907596_315 C1CC2CNC1CN2c3cccnc3
CHEMBL1907596_316 C1NC2CC1CN(C2)c3cccnc3
CHEMBL1907596_317 C1NCC2CC1CN2c3cccnc3
CHEMBL1907596_318 C1NC2CC1N(C2)c3cccnc3
CHEMBL1907596_319 [I-].C[N+]1(C)CC2CC1CN2c3ccc(Cl)nc3
CHEMBL1907596_320 Fc1ccc(cn1)N2CC3CC2CN3
CHEMBL1907596_321 COc1ccc(cn1)N2CC3CC2CN3
CHEMBL1907596_322 N#Cc1cncc(c1)N2CC3CC2CN3
CHEMBL1907596_323 Oc1cc(cnc1Cl)N2CC3CC2CN3
CHEMBL1907596_324 COc1cncc(c1)N2CC3CC2CN3
CHEMBL1907596_325 Cc1cc(cnc1Cl)N2CC3CC2CN3
CHEMBL1907596_326 Clc1cc(cnc1Cl)N2CC3CC2CN3
CHEMBL1907596_327 COc1cc(cnc1Cl)N2CC3CC2CN3
CHEMBL1907596_328 C1NC2CC1N(C2)c3cncnc3
CHEMBL1907596_329 C1NC2CC1N(C2)c3cc4ncccc4s3
CHEMBL1907596_330 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_331 Fc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_332 C1CC2NC1CC2c3cccnc3
CHEMBL1907596_333 C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_334 Fc1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_335 Clc1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_336 Cl.Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
CHEMBL1907596_337 Cl.Fc1ncc(cc1c2ccc(Cl)cc2)C3CC4CCC3N4
CHEMBL1907596_338 Cl.Fc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_339 Cl.Nc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_340 Cl.Nc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_341 Cl.[O-][N+](=O)c1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_342 Cl.Fc1ncc(cc1c2cccc(Cl)c2)C3CC4CCC3N4
CHEMBL1907596_343 Cl.Fc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_344 Cl.Nc1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_345 Cl.COc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_346 Cl.[O-][N+](=O)c1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
CHEMBL1907596_347 Cl.Cl.C1CC2NC1CC2c3cncc(c3)c4ccccc4
CHEMBL1907596_348 Cl.Cl.Fc1cccc(c1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_349 Cl.Cl.[O-][N+](=O)c1cccc(c1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_350 Cl.Cl.Cl.Clc1cccc(c1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_351 Cl.Cl.[O-][N+](=O)c1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_352 Cl.Cl.COc1cccc(c1)c2cncc(c2)C3CC4CCC3N4
CHEMBL1907596_353 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_354 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_355 CN1[C@@H]2CC[C@@H]1[C@H](C2)c3cncc(c3)c4ccnc(F)c4
CHEMBL1907596_356 CN1[C@@H]2CC[C@@H]1[C@H](C2)c3cncc(c3)c4cccnc4F
CHEMBL1907596_357 CN1[C@@H]2CC[C@@H]1[C@H](C2)c3cncc(c3)c4cccc(F)n4
CHEMBL1907596_358 CN1C2CCC1C(C2)c3cncc(c3)c4ccc(F)nc4
CHEMBL1907596_359 Clc1ccc(cn1)N2CC3CC(C2)N3
CHEMBL1907596_360 C1C2CN(CC1N2)c3cccnc3
CHEMBL1907596_361 Brc1ccc(cn1)N2CC3CC(C2)N3
CHEMBL1907596_362 Clc1ccc(nn1)N2CC3CC(C2)N3
CHEMBL1907596_363 CN1C2CC1CN(C2)c3ccc(Cl)nc3
CHEMBL1907596_364 Clc1ccc(cn1)N2CC3CC2CN3
CHEMBL1907596_365 C[C@H](CCOC(=O)N1CC(C)C1)N(C)C.OC(=O)C(=O)O
CHEMBL1907596_366 Cc1cc(on1)[C@H]2C[C@H]3CC[C@H]2N3
CHEMBL1907596_367 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_368 Clc1ccc(cn1)C2=CCCC3CCC2N3
CHEMBL1907596_369 Cc1cc(on1)[C@H]2C[C@H]3CC[C@H]2N3
CHEMBL1907596_370 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_371 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_372 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_373 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_374 C(Oc1cncc(c1)N2C[C@H]3CNC[C@H]3C2)c4ccccc4
CHEMBL1907596_375 CC(C)Oc1cncc(c1)N2C[C@H]3CNC[C@H]3C2.OC(=O)\C=C\C(=O)O
CHEMBL1907596_376 OC(=O)C(F)(F)F.C1NC[C@H]2CN(C[C@@H]12)c3cncc(c3)c4ccccc4
CHEMBL1907596_377 Cl.Clc1ccc(cn1)N2C[C@H]3CNC[C@H]3C2
CHEMBL1907596_378 OC(=O)C(F)(F)F.Brc1ccc(cn1)N2C[C@H]3CNC[C@H]3C2
CHEMBL1907596_379 OC(=O)C(F)(F)F.Brc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
CHEMBL1907596_380 CCCOc1cncc(c1)N2C[C@H]3CNC[C@H]3C2.OC(=O)\C=C\C(=O)O
CHEMBL1907596_381 Cl.Cl.C1NC[C@H]2CN(C[C@@H]12)c3cccnc3
CHEMBL1907596_382 Cl.Cl.Oc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
CHEMBL1907596_383 Cl.Cl.COc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
CHEMBL1907596_384 Cl.Cl.CCOc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
CHEMBL1907596_385 CN1CCC[C@H]1c2cccnc2
CHEMBL1907596_386 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_387 Clc1ccc(OC[C@H]2CCN2)cn1
CHEMBL1907596_388 C1NC2CC1N(C2)c3cccnc3
CHEMBL1907596_389 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_390 CN1CCC[C@H]1COc2cnc(Cl)c(OCc3ccc(Cl)nc3)c2
CHEMBL1907596_391 CN1CCC[C@H]1COc2cnc(Cl)c(OCc3ccnc(F)c3)c2
CHEMBL1907596_392 Fc1cc(COc2cc(OC[C@@H]3CCCN3)cnc2Cl)ccn1
CHEMBL1907596_393 CN1CCC[C@H]1COc2cnc(Cl)c(c2)c3ccnc(F)c3
CHEMBL1907596_394 Clc1ccc(Oc2cc(OC[C@@H]3CCCN3)cnc2Cl)cn1
CHEMBL1907596_395 [11CH3]N1CCC[C@H]1COc2cnc(Cl)c(\C=C\c3ccncc3)c2
CHEMBL1907596_396 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
CHEMBL1907596_397 Clc1ncc(cc1c2ccccc2)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_398 Fc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_399 Fc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_400 Clc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_401 Clc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_402 Clc1ncc(cc1c2cccc(Br)c2)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_403 [O-][N+](=O)c1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_404 [O-][N+](=O)c1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_405 Nc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_406 Nc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_407 COc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_408 COc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_409 CN(C)c1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
CHEMBL1907596_410 Clc1ccc(nn1)N2C[C@H]3C[C@@H]2CN3
CHEMBL1907596_411 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_412 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_413 BrC1=CC=C2[C@H]3CNC[C@H](C3)CN2C1=O
CHEMBL1907596_414 BrN1C[C@H]2C[C@H](C1)C3=CC=CC(=O)N3C2
CHEMBL1907596_415 FC(F)(F)C1=CC=C2[C@H]3CNC[C@H](C3)CN2C1=O
CHEMBL1907596_416 CC1=CC=C2[C@H]3CNC[C@H](C3)CN2C1=O
CHEMBL1907596_417 IN1C[C@H]2C[C@H](C1)C3=CC=CC(=O)N3C2
CHEMBL1907596_418 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_419 Cl.OCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_420 Cl.OCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_421 Cl.OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_422 Cl.OCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_423 Cl.OCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_424 Cl.OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_425 C1NC[C@H]2CN(C[C@@H]12)c3cccnc3
CHEMBL1907596_426 C1C[C@@H]2CN(C[C@@H]2N1)c3cccnc3
CHEMBL1907596_427 Clc1cccc(NC(=O)c2cncc(n2)N3C[C@H]4CNC[C@H]4C3)c1
CHEMBL1907596_428 FC(F)(F)c1ccccc1CNC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3
CHEMBL1907596_429 O=C(N1CCc2ccccc2C1)c3cncc(c3)N4C[C@H]5CNC[C@H]5C4
CHEMBL1907596_430 Fc1ccccc1CCNC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3
CHEMBL1907596_431 Ic1cccc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
CHEMBL1907596_432 Clc1ccc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)cc1
CHEMBL1907596_433 Cc1cccc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
CHEMBL1907596_434 Cc1cc(C)cc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
CHEMBL1907596_435 COc1cc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)cc(OC)c1
CHEMBL1907596_436 Fc1cc(F)cc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
CHEMBL1907596_437 Clc1ccc(cn1)C2CC3CCC2N3
CHEMBL1907596_438 Cc1cc(on1)C2CC3CCC2N3
CHEMBL1907596_439 Cl.Clc1ccc(cn1)C2=CC3CCC2N3
CHEMBL1907596_440 Cl.Fc1ccc(cn1)C2=CC3CCC2N3
CHEMBL1907596_441 Cl.C1CC2NC1C=C2c3cccnc3
CHEMBL1907596_442 OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_443 OCC[C@@H]1C[C@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_444 CNC(=O)OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_445 CN(C)C(=O)OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_446 O=C(NC1CC1)OCC[C@H]2C[C@@H]2c3cncc(OC[C@@H]4CCN4)c3
CHEMBL1907596_447 O=C(OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2)N4CCCC4
CHEMBL1907596_448 O=C(Nc1ccccc1)OCC[C@H]2C[C@@H]2c3cncc(OC[C@@H]4CCN4)c3
CHEMBL1907596_449 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_450 COCC[C@@H]1C[C@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_451 OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_452 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_453 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_454 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_455 C(Oc1cccnc1)[C@@H]2CCN2
CHEMBL1907596_456 C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_457 O=C(C1CC1)N2CC3CNC(C3)C2
CHEMBL1907596_458 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_459 OC(=O)C(F)(F)F.FC(F)C[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_460 OC(=O)C(F)(F)F.FCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_461 CC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_462 OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_463 FC(F)CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_464 FCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_465 CCNC(=O)OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_466 O=C(Nc1ccccc1)OCCc2cc(on2)c3cncc(OC[C@@H]4CCN4)c3
CHEMBL1907596_467 CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_468 FC(F)(F)CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_469 OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_470 FC(F)CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_471 FCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_472 O=C(Nc1ccccc1)OCCc2cc(on2)c3cncc(OC[C@@H]4CCN4)c3
CHEMBL1907596_473 CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_474 C(Oc1cccnc1)[C@@H]2CCN2
CHEMBL1907596_475 Ic1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_476 Fc1ncccc1OC[C@H]2NCC=C2
CHEMBL1907596_477 Ic1cncc(OC[C@H]2NCC=C2)c1
CHEMBL1907596_478 Ic1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_479 C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_480 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_481 CCOC(=O)NCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_482 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCCN3C)c2.OC(=O)C(F)(F)F
CHEMBL1907596_483 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCNC3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_484 OC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_485 OCCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_486 CCOC(=O)NCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_487 CC(C)OC(=O)NCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_488 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_489 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCCN3C)c2.OC(=O)C(F)(F)F
CHEMBL1907596_490 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCNC3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_491 OC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_492 OCCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_493 OC(=O)CC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
CHEMBL1907596_494 OC(=O)C(F)(F)F.FC(F)(F)OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
CHEMBL1907596_495 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_496 O=C1N2C[C@@H]3CNC[C@@H](C3)C2=CC=C1c4cccnc4
CHEMBL1907596_497 COc1ccc(cc1Cl)C#CC(=O)N2C[C@@H]3CNC[C@@H](C3)C2
CHEMBL1907596_498 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
CHEMBL1907596_499 O=C1N2C[C@@H]3CNC[C@@H](C3)C2=CC=C1c4cccnc4
CHEMBL1907596_500 CCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_501 CCCCC#Cc1cncc(OC[C@H]2CCN2)c1
CHEMBL1907596_502 CCCCC#Cc1cncc(OC[C@H]2CCCN2)c1
CHEMBL1907596_503 [N-]=[N+]=NCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_504 C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_505 C1NC[C@H]2CN(C[C@@H]12)c3cccnc3
CHEMBL1907596_506 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
CHEMBL1907596_507 CCOCC[C@H]1C[C@@H]1c2cncc(c2)N3C[C@H]4CNC[C@H]4C3.OC(=O)C(F)(F)F
CHEMBL1907596_508 CN(C)CCOc1cncc(c1)N2CC3CNCC(C3)C2
CHEMBL1907596_509 C1NCC2CC1CN(C2)c3cccnc3
CHEMBL1907596_510 C1NCC2CC1c3cc4nccnc4cc23
CHEMBL1907596_511 Fc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_512 Clc1ccc(cn1)[C@H]2C[C@H]3CC[C@H]2N3
CHEMBL1907596_513 C1C[C@H]2N[C@H]1C[C@@H]2c3cncc(c3)c4ccncc4
CHEMBL1907596_514 Fc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_515 Clc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_516 Nc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_517 COc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_518 C1C[C@H]2N[C@H]1C[C@@H]2c3cncc(c3)c4cccnc4
CHEMBL1907596_519 Clc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_520 Nc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_521 COc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
CHEMBL1907596_522 C1CNCCN(C1)c2cccnc2
CHEMBL1907596_523 CN1CCC[C@H]1COc2cncc(Br)c2
CHEMBL1907596_524 CN1CCC[C@H]1COc2cccnc2
CHEMBL1907596_525 CN1CCC[C@H]1COc2cncc(c2)c3ccccc3
CHEMBL1907596_526 C1NC2CC1N(C2)c3cccnc3
CHEMBL1907596_527 COCC[C@@H]1C[C@H]1c2cncc(c2)N3CCCNCC3
CHEMBL1907596_528 COCC[C@H]1C[C@@H]1c2cncc(c2)N3CCCNCC3
CHEMBL1907596_529 OCCc1cc(on1)c2cncc(c2)N3CCCNCC3

View File

@@ -0,0 +1,55 @@
CHEMBL214_1 CCCN(CCC)[C@@H]1CCc2ccc3[nH]c(cc3c2C1)C#N
CHEMBL214_2 Oc1cccc2CC[C@@H]3[C@@H](CN3CC=C)c12
CHEMBL214_3 COc1ccccc1N2CCN(CCN3C(=O)CC4(CCCC4)CC3=O)CC2
CHEMBL214_4 C[C@H]1C[C@@H](CCN1C[C@H](O)COc2cccc3[nH]c(C)cc23)c4cc5c(F)cccc5s4
CHEMBL214_5 NCCc1c[nH]c2ccc(OCc3cccc(COc4ccc5[nH]cc(CCN)c5c4)c3)cc12
CHEMBL214_6 COc1ccccc1N2CCN(CCCCNS(=O)(=O)c3ccc(C)cc3)CC2
CHEMBL214_7 CCCN(CCCc1c[nH]c2ccc(F)cc12)C3COc4c(F)ccc(C(=O)NC)c4C3
CHEMBL214_8 O=C1NCc2ccc(OCCCCN3CCN(CC3)c4cccc5ccccc45)cc12
CHEMBL214_9 Fc1ccc2cccc(N3CCN(CCCOc4ccc5CNC(=O)c5c4)CC3)c2c1
CHEMBL214_10 Fc1cccc2cccc(N3CCN(CCCOc4ccc5CNC(=O)c5c4)CC3)c12
CHEMBL214_11 O=C1NCc2ccc(OCCCCN3CCN(CC3)c4cccc5CCCc45)cc12
CHEMBL214_12 Fc1cc2CNC(=O)c2cc1OCCCN3CCN(CC3)c4cccc5ccccc45
CHEMBL214_13 COc1ccccc1N2CCN(CCN(C(=O)C34CCC(I)(CC3)C4)c5ccccn5)CC2
CHEMBL214_14 Cl.Cl.Cl.COc1ccccc1N2CCN(CCN(C(=O)C3CCCCC3)c4ccccn4)CC2
CHEMBL214_15 COc1ccccc1N2CCN(CCN(C(=O)C34C5C6C3C7C4C5C67CF)c8ccccn8)CC2
CHEMBL214_16 Oc1ccccc1N2CCN(CCN(C(=O)C34C5C6C3C7C4C5C67CF)c8ccccn8)CC2
CHEMBL214_17 O=C(Nc1cccnc1)Nc2cccc(CCN3CCN(CC3)c4ccccc4)c2
CHEMBL214_18 O=C(N1CCC(CCN2CCN(CC2)c3nsc4ccccc34)CC1)c5occc5
CHEMBL339_2 CNc1cc(OC)c(cc1Cl)C(=O)N[C@H]2CCN(C2)C3C4CCCC3CCC4
CHEMBL339_3 CCN1CCC[C@H]1CNC(=O)c2c(O)c(CCF)cc(OC)c2OC
CHEMBL339_4 CCN1CCC[C@H]1CNC(=O)c2cc(I)cc(OC)c2OC
CHEMBL339_5 CC(C)Oc1ccccc1N2CCN(Cc3ccc(CN4CCCCC4=O)n3C)CC2
CHEMBL339_6 CCN(CC)C(=O)N[C@@H]1C[C@H]2[C@@H](CC3=CCc4cccc2c34)N(C)C1
CHEMBL339_7 OC(=O)C(=O)O.Oc1ccc2CC[C@H](CN3CCc4ccccc4C3)Oc2c1
CHEMBL339_8 COc1ccccc1N2CCN(Cc3ccc([nH]3)c4ccccc4)CC2
CHEMBL339_9 Fc1ccc(CN2CN(c3ccccc3)C4(CCN(CCCC(=O)c5ccc(F)cc5)CC4)C2=O)cc1
CHEMBL339_10 CCCN1CCc2cccc3c2[C@H]1Cc4ccc(O)c(O)c34
CHEMBL339_11 O=C1CCc2ccc(OCCCN3CCN(CC3)c4cccc5sccc45)cc2N1
CHEMBL339_12 O=C1Nc2cc(OCCCN3CCN(CC3)c4cccc5sccc45)ccc2C=C1
CHEMBL1946_1 CCCC(=O)NCCC1CCc2c(OC)ccc3ccc(OC)c1c23
CHEMBL1946_3 COc1cccc(Cc2oc3ccc(OC)cc3c2CCNC(=O)C)c1
CHEMBL1946_4 COc1ccc2oc(Cc3ccccc3OC)c(CCNC(=O)C)c2c1
CHEMBL1946_5 CCC(=O)NC[C@@H]1C[C@H]1c2cccc3nc(CCCCc4ccccc4)oc23
CHEMBL1946_6 COc1ccc2[nH]cc(CCNC(=O)C)c2c1
CHEMBL1946_7 COc1cc2c(CCNC(=O)C)c(I)[nH]c2cc1[N+](=O)[O-]
CHEMBL1946_8 COc1ccc2[nH]cc(CCNC(=O)C)c2c1
CHEMBL1946_9 COc1ccc2cc(cc(CCNC(=O)C)c2c1)c3cccc(CBr)c3
CHEMBL1946_10 CCCC(=O)NCCCc1cc(OC)ccc1OCc2ccccc2
CHEMBL1946_11 CC(C)C1=C(CCNC(=O)C)c2c(C1)ccc3OCCc23
CHEMBL1946_12 CCNC(=O)NCCC1=C(Cc2ccc3OCCc3c12)C(C)C
CHEMBL1946_13 COc1ccc2cccc(\C=C\NC(=O)C)c2c1
CHEMBL1946_14 COc1ccc2cccc(CCC(=O)NS(=O)(=O)C)c2n1
CHEMBL1946_15 CCC(=O)NC[C@@H]1CCc2ccccc2[C@@H]1c3ccccc3
CHEMBL273_1 CCCN(CCC)[C@H]1CCc2cccc(C(=O)C)c2C1
CHEMBL273_2 COc1ccc2CCC(CCN3CCN(CC3)c4ccccn4)Cc2c1
CHEMBL273_3 CCCCN(CCCC)C(=O)c1cccc(CN2CCN(CC2)c3ccccc3OC(C)C)c1
CHEMBL273_4 CCCN(CCC)C1CCc2cccc(O)c2C1
CHEMBL273_5 COc1ccccc1N2CCN(CCCCN3C(=O)c4ccccc4C3=O)CC2
CHEMBL273_6 Cl.Cl.COc1ccccc1N2CCN(CCNC(=O)C34C[C@H]5CC(C[C@@H]3C5)C4)CC2
CHEMBL273_7 Cl.Cl.COc1ccccc1N2CCN(CCNC(=O)C34C[C@@H]5C[C@@H](C[C@@H](C5)C3)C4)CC2
CHEMBL273_8 Cl.C(Cc1ccccc1)N2CCN(CC2)c3cccc4ccoc34
CHEMBL273_9 CCCNC1CCc2ccc3[nH]cc(C=O)c3c2C1
CHEMBL273_10 CCCN(CCC)C1CCc2ccc3[nH]cc(C=O)c3c2C1
CHEMBL273_11 COc1ccccc1N2CCN(CCCCN3C(=O)c4ccccc4C3=O)CC2
CHEMBL273_12 CCCN(CCC)C1CCc2cccc(O)c2C1

View File

@@ -0,0 +1,227 @@
// This is a mildly modified version of the code in SciPy's
// scipy.optimize.linear_sum_assignment, extracted from
// rectangular_lsap.cpp.
// https://github.com/scipy/scipy/blob/main/scipy/optimize/rectangular_lsap/rectangular_lsap.cpp
// As such it is subject to the following notice:
/*
Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This code implements the shortest augmenting path algorithm for the
rectangular assignment problem. This implementation is based on the
pseudocode described in pages 1685-1686 of:
DF Crouse. On implementing 2D rectangular assignment algorithms.
IEEE Transactions on Aerospace and Electronic Systems
52(4):1679-1696, August 2016
doi: 10.1109/TAES.2016.140952
Author: PM Larsen
*/
#include <algorithm>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>
namespace RDKit {
namespace RascalMCES {
// Returns the index permutation that puts the elements of v into ascending
// order: v[result.front()] is the smallest element and v[result.back()] the
// largest. Elements are compared with operator<; the relative order of
// equal elements is whatever std::sort produces.
template <typename T>
std::vector<size_t> argsort_iter(const std::vector<T> &v) {
  // Indices are ordered by the values they refer to, not by themselves.
  const auto byValue = [&v](size_t lhs, size_t rhs) {
    return v[lhs] < v[rhs];
  };
  // Start from the identity permutation 0, 1, ..., v.size() - 1.
  std::vector<size_t> order;
  order.reserve(v.size());
  for (size_t pos = 0; pos < v.size(); ++pos) {
    order.push_back(pos);
  }
  std::sort(order.begin(), order.end(), byValue);
  return order;
}
// Searches for the cheapest augmenting path that starts at row i and ends at
// a column that has no row assigned to it yet (the "sink").
//
// nc                - number of columns in the cost matrix.
// cost              - flattened cost matrix, read as cost[row * nc + col].
// u, v              - per-row and per-column values subtracted from the raw
//                     cost to form the reduced cost (the caller maintains
//                     them as the dual variables).
// path              - output: path[j] is set to the row from which column j
//                     was reached most cheaply.
// row4col           - row4col[j] is the row currently assigned to column j,
//                     or static_cast<size_t>(-1) if the column is free.
// shortestPathCosts - reset here, then holds the cheapest cost found so far
//                     for reaching each column.
// i                 - the row the search starts from.
// SR, SC            - reset here, then flag the rows / columns visited
//                     during the search.
// remaining         - scratch list of the columns not yet visited; at least
//                     nc entries are written.
// p_minVal          - output: receives the cost of the path found. It is
//                     only written when a sink is found.
//
// Returns the index of the sink column, or -1 if the cheapest remaining
// column still costs std::numeric_limits<double>::max() (infeasible matrix).
static int augmenting_path(size_t nc, std::vector<int> &cost,
std::vector<double> &u, std::vector<double> &v,
std::vector<size_t> &path,
std::vector<size_t> &row4col,
std::vector<double> &shortestPathCosts, size_t i,
std::vector<bool> &SR, std::vector<bool> &SC,
std::vector<size_t> &remaining, double *p_minVal) {
double minVal = 0;
// Crouse's pseudocode uses set complements to keep track of remaining
// nodes. Here we use a vector, as it is more efficient in C++.
size_t num_remaining = nc;
for (size_t it = 0; it < nc; it++) {
// Filling this up in reverse order ensures that the solution of a
// constant cost matrix is the identity matrix (c.f. #11602 - presumably
// an issue number in the SciPy tracker this code was taken from).
remaining[it] = nc - it - 1;
}
std::fill(SR.begin(), SR.end(), false);
std::fill(SC.begin(), SC.end(), false);
std::fill(shortestPathCosts.begin(), shortestPathCosts.end(),
std::numeric_limits<double>::max());
// find shortest augmenting path
int sink = -1;
while (sink == -1) {
// Converting -1 to size_t is well defined: it wraps to the largest
// representable value, which serves as a "no column chosen yet" marker.
// index is overwritten in the loop below every time lowest is lowered or
// tied, and the function returns before index is used if lowest is still
// at its initial maximum.
size_t index = -1;
double lowest = std::numeric_limits<double>::max();
SR[i] = true;
for (size_t it = 0; it < num_remaining; it++) {
size_t j = remaining[it];
// Cost of reaching column j by going through row i.
double r = minVal + cost[i * nc + j] - u[i] - v[j];
if (r < shortestPathCosts[j]) {
path[j] = i;
shortestPathCosts[j] = r;
}
// When multiple nodes have the minimum cost, we select one which
// gives us a new sink node. This is particularly important for
// integer cost matrices with small coefficients.
if (shortestPathCosts[j] < lowest ||
(shortestPathCosts[j] == lowest &&
row4col[j] == static_cast<size_t>(-1))) {
lowest = shortestPathCosts[j];
index = it;
}
}
minVal = lowest;
if (minVal ==
std::numeric_limits<double>::max()) { // infeasible cost matrix
return -1;
}
size_t j = remaining[index];
if (row4col[j] == static_cast<size_t>(-1)) {
// Column j is free, so the path ends here.
sink = j;
} else {
// Column j is taken: continue the search from the row that owns it.
i = row4col[j];
}
SC[j] = true;
// Drop column j from the unvisited list by overwriting its slot with the
// last live entry and shrinking the live range by one.
remaining[index] = remaining[--num_remaining];
}
*p_minVal = minVal;
return sink;
}
// Solves the rectangular linear assignment problem for costsMat, choosing
// the pairing of rows and columns that MAXIMIZES the summed cost. The solver
// underneath minimizes, so the costs are negated on the way in.
//
// costsMat - the cost matrix, one inner vector per row. The first row fixes
//            the number of columns; any row with fewer entries than that
//            makes the call fail rather than read out of bounds.
// a, b     - outputs. On success, (a[k], b[k]) for k in [0, min(rows, cols))
//            is the k-th assigned (row, column) pair, in ascending row order.
//            They are grown if they are too small to hold the result but
//            are never shrunk, so entries beyond min(rows, cols) are left as
//            the caller supplied them.
//
// Returns 0 on success (including an empty matrix, in which case a and b are
// not touched), or -1 if no feasible assignment could be found or the matrix
// is malformed. a and b are not modified when -1 is returned.
int lap_maximize(const std::vector<std::vector<int>> &costsMat,
                 std::vector<size_t> &a, std::vector<size_t> &b) {
  if (costsMat.empty() || costsMat.front().empty()) {
    return 0;
  }
  size_t nr = costsMat.size();
  size_t nc = costsMat.front().size();
  // Every row is indexed up to nc below, so a short row would be read past
  // its end.
  for (const auto &row : costsMat) {
    if (row.size() < nc) {
      return -1;
    }
  }
  // The solver needs at least as many columns as rows, so a tall matrix is
  // solved in transposed form and the answer mapped back at the end.
  const bool transpose = nc < nr;
  std::vector<int> cost(nc * nr);
  // for maximization, take -ve of costs.
  for (size_t i = 0; i < nr; ++i) {
    for (size_t j = 0; j < nc; ++j) {
      if (transpose) {
        cost[j * nr + i] = -costsMat[i][j];
      } else {
        cost[i * nc + j] = -costsMat[i][j];
      }
    }
  }
  if (transpose) {
    std::swap(nc, nr);
  }

  // initialize variables
  const size_t unassigned = static_cast<size_t>(-1);
  std::vector<double> u(nr, 0);
  std::vector<double> v(nc, 0);
  std::vector<double> shortestPathCosts(nc);
  std::vector<size_t> path(nc, unassigned);
  std::vector<size_t> col4row(nr, unassigned);
  std::vector<size_t> row4col(nc, unassigned);
  std::vector<bool> SR(nr);
  std::vector<bool> SC(nc);
  std::vector<size_t> remaining(nc);

  // iteratively build the solution
  for (size_t curRow = 0; curRow < nr; curRow++) {
    double minVal = 0.0;
    const int sink =
        augmenting_path(nc, cost, u, v, path, row4col, shortestPathCosts,
                        curRow, SR, SC, remaining, &minVal);
    if (sink < 0) {
      return -1;
    }

    // update dual variables
    u[curRow] += minVal;
    for (size_t i = 0; i < nr; i++) {
      if (SR[i] && i != curRow) {
        u[i] += minVal - shortestPathCosts[col4row[i]];
      }
    }
    for (size_t j = 0; j < nc; j++) {
      if (SC[j]) {
        v[j] -= minVal - shortestPathCosts[j];
      }
    }

    // augment previous solution: walk back from the sink along path,
    // re-assigning each column to the row it was reached from, until the
    // row that started this round is reached.
    size_t j = static_cast<size_t>(sink);
    while (true) {
      const size_t i = path[j];
      row4col[j] = i;
      std::swap(col4row[i], j);
      if (i == curRow) {
        break;
      }
    }
  }

  // After the optional swap nr is the smaller dimension, which is the number
  // of pairs written below. Make sure the outputs can hold them.
  if (a.size() < nr) {
    a.resize(nr);
  }
  if (b.size() < nr) {
    b.resize(nr);
  }
  if (transpose) {
    // In the transposed problem col4row is indexed by original column and
    // holds original rows, so sorting its indices by value lists the pairs
    // in ascending original-row order.
    size_t k = 0;
    for (const auto origCol : argsort_iter(col4row)) {
      a[k] = col4row[origCol];
      b[k] = origCol;
      ++k;
    }
  } else {
    for (size_t i = 0; i < nr; i++) {
      a[i] = i;
      b[i] = col4row[i];
    }
  }
  return 0;
}
} // namespace RascalMCES
} // namespace RDKit

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,152 @@
//
// Copyright (C) 2023 David Cosgrove
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <chrono>
#include <random>
#include <vector>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include "catch.hpp"
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>
// Clusters the molecules in Contrib/Fastcluster/cdk2.smi with default
// RascalClusterOptions and checks the number of clusters and their sizes.
TEST_CASE("Small test", "[basics]") {
  // getenv returns a null pointer when the variable is not set, and a
  // std::string must not be built from one, so fail the test cleanly instead.
  const char *rdbase = getenv("RDBASE");
  const bool haveRdbase = rdbase != nullptr;
  REQUIRE(haveRdbase);
  std::string fName(rdbase);
  fName += "/Contrib/Fastcluster/cdk2.smi";
  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Skip records for which the supplier returned no molecule.
    if (mol) {
      mols.push_back(mol);
    }
  }
  RDKit::RascalMCES::RascalClusterOptions clusOpts;
  auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
  const std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Clusters the molecules in data/test_cluster1.smi using rascalCluster's
// default options and checks the number of clusters and their sizes.
TEST_CASE("BLSets subset", "[basics]") {
  // getenv returns a null pointer when the variable is not set, and a
  // std::string must not be built from one, so fail the test cleanly instead.
  const char *rdbase = getenv("RDBASE");
  const bool haveRdbase = rdbase != nullptr;
  REQUIRE(haveRdbase);
  std::string fName(rdbase);
  fName += "/Code/GraphMol/RascalMCES/data/test_cluster1.smi";
  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Skip records for which the supplier returned no molecule.
    if (mol) {
      mols.push_back(mol);
    }
  }
  auto clusters = RDKit::RascalMCES::rascalCluster(mols);
  const std::vector<size_t> expSizes{8, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 21};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Clusters the molecules in data/chembl_1907596.smi with similarityCutoff
// set to 0.7 and checks the number of clusters and their sizes.
TEST_CASE("ChEMBL 1907596") {
  // getenv returns a null pointer when the variable is not set, and a
  // std::string must not be built from one, so fail the test cleanly instead.
  const char *rdbase = getenv("RDBASE");
  const bool haveRdbase = rdbase != nullptr;
  REQUIRE(haveRdbase);
  std::string fName(rdbase);
  fName += "/Code/GraphMol/RascalMCES/data/chembl_1907596.smi";
  // Report the input path only if an assertion below fails, rather than
  // printing it to stdout on every run.
  INFO("Reading molecules from " << fName);
  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Skip records for which the supplier returned no molecule.
    if (mol) {
      mols.push_back(mol);
    }
  }
  RDKit::RascalMCES::RascalClusterOptions clusOpts;
  clusOpts.similarityCutoff = 0.7;
  auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
  const std::vector<size_t> expSizes{342, 71, 64, 33, 23, 11, 10, 6, 6, 5, 5,
                                     4,   3,  3,  3,  3,  3,  2,  2,  2, 14};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Runs rascalButinaCluster on Contrib/Fastcluster/cdk2.smi with default
// options and checks that the cluster sizes add up to the number of
// molecules read, and that the number of clusters and their sizes are as
// expected.
TEST_CASE("Small Butina test", "[basics]") {
  // getenv returns a null pointer when the variable is not set, and a
  // std::string must not be built from one, so fail the test cleanly instead.
  const char *rdbase = getenv("RDBASE");
  const bool haveRdbase = rdbase != nullptr;
  REQUIRE(haveRdbase);
  std::string fName(rdbase);
  fName += "/Contrib/Fastcluster/cdk2.smi";
  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Skip records for which the supplier returned no molecule.
    if (mol) {
      mols.push_back(mol);
    }
  }
  RDKit::RascalMCES::RascalClusterOptions clusOpts;
  auto clusters = RDKit::RascalMCES::rascalButinaCluster(mols, clusOpts);
  // Counted as size_t so the comparison with mols.size() does not mix signed
  // and unsigned values.
  size_t numMols = 0;
  for (const auto &cl : clusters) {
    numMols += cl.size();
  }
  REQUIRE(numMols == mols.size());
  const std::vector<size_t> expSizes{6, 6, 6, 2, 2, 2, 1, 1, 1, 1,
                                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                     1, 1, 1, 1, 1, 1, 1, 1, 1};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Repeats the clustering of Contrib/Fastcluster/cdk2.smi with numThreads set
// to 2 and then to -2, checking that the clusters match those expected from
// the default settings.
TEST_CASE("Small test, smaller number of threads", "[basics]") {
  // I'm not sure how to test whether this has had the desired effect,
  // but at least we'll know that it runs ok.
  // getenv returns a null pointer when the variable is not set, and a
  // std::string must not be built from one, so fail the test cleanly instead.
  const char *rdbase = getenv("RDBASE");
  const bool haveRdbase = rdbase != nullptr;
  REQUIRE(haveRdbase);
  std::string fName(rdbase);
  fName += "/Contrib/Fastcluster/cdk2.smi";
  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Skip records for which the supplier returned no molecule.
    if (mol) {
      mols.push_back(mol);
    }
  }
  const std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
  {
    RDKit::RascalMCES::RascalClusterOptions clusOpts;
    clusOpts.numThreads = 2;
    auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
    REQUIRE(clusters.size() == expSizes.size());
    for (size_t i = 0; i < expSizes.size(); ++i) {
      REQUIRE(clusters[i].size() == expSizes[i]);
    }
  }
  {
    // NOTE(review): a negative value is presumably interpreted relative to
    // the number of hardware threads - confirm against RascalClusterOptions.
    RDKit::RascalMCES::RascalClusterOptions clusOpts;
    clusOpts.numThreads = -2;
    auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
    REQUIRE(clusters.size() == expSizes.size());
    for (size_t i = 0; i < expSizes.size(); ++i) {
      REQUIRE(clusters[i].size() == expSizes[i]);
    }
  }
}

View File

@@ -1417,8 +1417,38 @@ or into a generic framework:
Maximum Common Substructure
***************************
The FindMCS function finds a maximum common substructure (MCS) of two
or more molecules:
There are 2 methods for finding maximum common substructures. The first, FindMCS,
finds a single fragment maximum common substructure (MCS) of two or more molecules:
The second, RascalMCES, finds the maximum common edge substructure (MCES) between two
molecules and can return a multi-fragment MCES. The difference is demonstrated with the
following pair of molecules:
+-------------------------------------+
| .. image:: images/mcs_example_1.png |
+-------------------------------------+
| .. image:: images/mcs_example_2.png |
+-------------------------------------+
FMCS gives this maximum common substructure:
+-------------------------------------+
| .. image:: images/mcs_example_3.png |
+-------------------------------------+
| .. image:: images/mcs_example_4.png |
+-------------------------------------+
Whereas RascalMCES gives:
+-------------------------------------+
| .. image:: images/mcs_example_5.png |
+-------------------------------------+
| .. image:: images/mcs_example_6.png |
+-------------------------------------+
FindMCS
=======
FindMCS operates on 2 or more molecules:
.. doctest::
@@ -1555,6 +1585,135 @@ return the best match found in that time. If timeout is reached then the
(The MCS after 50 seconds contained 511 atoms.)
RascalMCES
==========
RascalMCES can only work on 2 molecules at a time:
.. doctest::
>>> from rdkit.Chem import rdRascalMCES
>>> mol1 = Chem.MolFromSmiles("CN(C)c1ccc(CC(=O)NCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL153934")
>>> mol2 = Chem.MolFromSmiles("CN(C)c1ccc(CC(=O)NCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL152361")
>>> res = rdRascalMCES.FindMCES(mol1, mol2)
>>> res[0].smartsString
'CN(-C)-c1:c:c:c(-CC(=O)-NCCCCCCC):c:c:1.NC12CC3CC(-C1)-CC(-C2)-C3'
>>> len(res[0].bondMatches())
33
It returns a list of RascalResult objects. Each RascalResult contains the 2 molecules that
the result pertains to, the SMARTS string of the MCES, the lists of atoms and bonds in the
two molecules that match, the Johnson similarity between the 2 molecules, the number of
fragments in the MCES, the number of atoms in the largest fragment and whether the run
timed out or not. There is also the method largestFragmentOnly(), which cuts the MCES
down to the largest single fragment. This is a non-reversible change, so if you want both
results, take a copy first.
By default, the MCES algorithm returns the first result it finds of maximum size. Because of
symmetry, there may be other equivalent solutions with the same number of atoms and bonds,
but with different equivalent bonds matched to each other. If you want to see all MCESs of
maximum size, you can use the option allBestMCESs = True. This will increase the run time,
partly because more branches in the search tree must be examined, but mostly because sorting
the multiple results is quite time-consuming. The results are returned in a consistent order
sorted by number of bond matches, then number of fragments (fewer first), then largest
fragment size and so on. Some of these aren't trivial to compute. The adamantane example
above is particularly extreme: not only is there extensive symmetry about the
adamantane end and 2-fold symmetry at the phenyl end, but there are also several points at
which the matching alkyl chain can be broken, all of which give rise to valid MCESs of the
same size. In this case, sorting into a consistent order takes significantly longer than
determining the MCESs in the first place.
The MCES differs from a conventional MCS in that it is the maximum common substructure based
on bonds rather than atoms. Often the result is the same, but not always.
The Johnson similarity is akin to a Tanimoto similarity, but expressed in terms of the
atoms and bonds in the MCES. It is the square of the sum of the number of atoms and bonds
in the MCES divided by the product of the sums of the numbers of atoms and bonds in the
2 input molecules. It has values between 0.0 (no MCES between the molecules) and 1.0 (the
molecules are identical). A key source of efficiency in the RASCAL algorithm is a fast and
correct prediction of a maximum value for the Johnson similarity between 2 molecules and
hence the maximum size of the MCES. The first step in the algorithm is then a screening,
whereby the full MCES determination is not performed if the predicted similarity is less
than some desired threshold. The final similarity between the 2 molecules may be less
than the threshold, but it will never be higher than the predicted upper bound. RASCAL
stems from RApid Similarity CALculation.
The default settings for RascalMCES are good for general use, but they may be altered
by passing an optional RascalOptions object:
.. doctest::
>>> mol1 = Chem.MolFromSmiles('Oc1cccc2C(=O)C=CC(=O)c12')
>>> mol2 = Chem.MolFromSmiles('O1C(=O)C=Cc2cc(OC)c(O)cc12')
>>> results = rdRascalMCES.FindMCES(mol1, mol2)
>>> len(results)
0
>>> opts = rdRascalMCES.RascalOptions()
>>> opts.similarityThreshold = 0.5
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
>>> len(results)
1
>>> f'{results[0].similarity:.2f}'
'0.37'
>>> results[0].smartsString
'Oc1:c:c:c:c:c:1.[#6]=O'
>>> opts.minFragSize = 3
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
>>> len(results)
1
>>> f'{results[0].similarity:.2f}'
'0.25'
>>> results[0].smartsString
'Oc1:c:c:c:c:c:1'
In this case, the upper bound on the similarity score is below the default threshold
of 0.7, so no results are returned. Setting the threshold to 0.5 produces the second
result although, as can be seen, the final similarity is substantially below the
threshold. This example also shows a disadvantage of the MCES method, which is that
it can produce small fragments in the MCES which are rarely helpful. The option
minFragSize can be used to over-ride the default value of -1, which means no minimum
size.
Like FindMCS, there is a ringMatchesRingOnly option, and also there's
completeAromaticRings, which is True by default, and means that MCESs won't be returned
with partial aromatic rings matching:
.. doctest::
>>> mol1 = Chem.MolFromSmiles('C1CCCC1c1ccncc1')
>>> mol2 = Chem.MolFromSmiles('C1CCCC1c1ccccc1')
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
>>> f'{results[0].similarity:.2f}'
'0.27'
>>> results[0].smartsString
'C1CCCC1-c'
>>> opts.completeAromaticRings = False
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
>>> f'{results[0].similarity:.2f}'
'0.76'
>>> results[0].smartsString
'C1CCCC1-c(:c:c):c:c'
This result may look a bit odd, with a single aromatic carbon in the first SMARTS
string. This is a consequence of the fact that the MCES works on matching bonds.
A better, atom-centric, representation might be C1CCC[$(C-c)]1. When the
completeAromaticRings option is set to False, a larger MCES is found, with just
the pyridine nitrogen atom not matching the corresponding phenyl carbon atom.
Clustering with Rascal
======================
There are 2 clustering methods available using the Johnson metric. The first,
RascalCluster, is a fuzzy method described in 'A Line Graph Algorithm for
Clustering Chemical Structures Based on Common Substructural Cores', JW Raymond,
PW Willett
(https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf also
available at https://eprints.whiterose.ac.uk/77598/).
The second, RascalButinaCluster, uses the Butina sphere-exclusion algorithm
(Butina JCICS 39 747-750 (1999)). Because of the time-consuming nature of the MCES
determination, these clustering methods can be slow to run, so are best used
on small sets (no more than a few hundred molecules) of small molecules.
Fingerprinting and Molecular Similarity
***************************************

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB