mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
RASCAL MCES (#6568)
This commit is contained in:
@@ -87,6 +87,7 @@ add_subdirectory(MolDraw2D)
|
||||
add_subdirectory(FMCS)
|
||||
add_subdirectory(MolHash)
|
||||
add_subdirectory(MMPA)
|
||||
add_subdirectory(RascalMCES)
|
||||
|
||||
add_subdirectory(CIPLabeler)
|
||||
add_subdirectory(Deprotect)
|
||||
@@ -193,6 +194,6 @@ rdkit_catch_test(queryTestsCatch catch_queries.cpp
|
||||
|
||||
rdkit_catch_test(molbundleTestsCatch catch_molbundle.cpp
|
||||
LINK_LIBRARIES SmilesParse GraphMol)
|
||||
|
||||
|
||||
rdkit_catch_test(pickleTestsCatch catch_pickles.cpp
|
||||
LINK_LIBRARIES FileParsers SmilesParse GraphMol)
|
||||
|
||||
16
Code/GraphMol/RascalMCES/CMakeLists.txt
Normal file
16
Code/GraphMol/RascalMCES/CMakeLists.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
# The RascalMCES library and the source files it is built from.
rdkit_library(RascalMCES
        RascalMCES.cpp
        RascalCluster.cpp
        RascalButinaCluster.cpp
        lap_a_la_scipy.cpp
        PartitionSet.cpp
        RascalResult.cpp
        LINK_LIBRARIES
        SmilesParse
        FileParsers
        ChemTransforms
        SubstructMatch
        GraphMol)
target_compile_definitions(RascalMCES PRIVATE RDKIT_RASCALMCES_BUILD)

# Headers that are installed for users of the library.
rdkit_headers(RascalMCES.h
        RascalOptions.h
        RascalClusterOptions.h
        RascalResult.h
        DEST GraphMol/RascalMCES)

# Catch-based tests for the MCES search and for the clustering code.
rdkit_catch_test(testRascalMCES
        mces_catch.cpp
        LINK_LIBRARIES RascalMCES)
rdkit_catch_test(testRascalCluster
        mces_cluster_catch.cpp
        LINK_LIBRARIES RascalMCES)

# The Wrap subdirectory is only added when the Python wrappers are requested.
if (RDK_BUILD_PYTHON_WRAPPERS)
    add_subdirectory(Wrap)
endif ()
|
||||
220
Code/GraphMol/RascalMCES/PartitionSet.cpp
Normal file
220
Code/GraphMol/RascalMCES/PartitionSet.cpp
Normal file
@@ -0,0 +1,220 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
||||
#include "PartitionSet.h"
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace RascalMCES {
|
||||
// Build the initial partition set.
//
// modProd    - adjacency of the modular product graph; modProd[i][j] is set
//              when vertex pairs i and j are connected.
// vtxPairs   - each entry is a (vertex in graph 1, vertex in graph 2) pair
//              and corresponds to a row/column of modProd.
// vtx1Labels - label of each vertex of the first graph.
// vtx2Labels - label of each vertex of the second graph.
// lowerBound - index of the last partition that may receive vertices moved
//              down from the partitions above it (Pex in Raymond's paper).
//
// Private copies of all four input vectors are taken.
PartitionSet::PartitionSet(const std::vector<boost::dynamic_bitset<>> &modProd,
                           const std::vector<std::pair<int, int>> &vtxPairs,
                           const std::vector<unsigned int> &vtx1Labels,
                           const std::vector<unsigned int> &vtx2Labels,
                           unsigned int lowerBound)
    : d_ModProd(
          std::make_shared<std::vector<boost::dynamic_bitset<>>>(modProd)),
      d_VtxPairs(std::make_shared<std::vector<std::pair<int, int>>>(vtxPairs)),
      d_vtx1Labels(std::make_shared<std::vector<unsigned int>>(vtx1Labels)),
      d_vtx2Labels(std::make_shared<std::vector<unsigned int>>(vtx2Labels)) {
  d_vtx1Counts.assign(d_vtx1Labels->size(), 0);
  d_vtx2Counts.assign(d_vtx2Labels->size(), 0);

  // A vertex in one line graph can only match one vertex in the other, so
  // the starting partitions group together consecutive vertex pairs that
  // share the same vertex of the first line graph.
  int currentFirst = -1;
  for (size_t pairIdx = 0; pairIdx < vtxPairs.size(); ++pairIdx) {
    const auto &vtxPair = vtxPairs[pairIdx];
    if (vtxPair.first != currentFirst) {
      d_parts.emplace_back();
      currentFirst = vtxPair.first;
    }
    d_parts.back().push_back(pairIdx);
    ++d_vtx1Counts[vtxPair.first];
    ++d_vtx2Counts[vtxPair.second];
  }
  if (d_parts.empty()) {
    return;
  }

  // Sorting by size puts the vertices that match the fewest vertices in the
  // other line graph at the top of the partition set, above lowerBound.
  // This has a dramatic effect on the speed compared with other things
  // tried, and is probably what Raymond means by "Perform an initial
  // partitioning of the vertices... using the labeled edge projection
  // procedure."
  sortPartitions();

  // Move vertices from partitions above lowerBound into the first partition
  // at or below it that holds nothing they are connected to in the modular
  // product.  This also helps to find a large clique early.
  constexpr auto MOVED = std::numeric_limits<unsigned int>::max();
  for (size_t upper = d_parts.size() - 1; upper > lowerBound; --upper) {
    auto &source = d_parts[upper];
    bool anyMoved = false;
    for (auto &vtx : source) {
      for (size_t lower = 0; lower <= lowerBound; ++lower) {
        auto &dest = d_parts[lower];
        const bool connected =
            std::any_of(dest.begin(), dest.end(),
                        [&](unsigned int other) -> bool {
                          return modProd[vtx][other];
                        });
        if (connected) {
          continue;
        }
        dest.push_back(vtx);
        // Flag the slot so that it can be swept out after the scan.
        vtx = MOVED;
        anyMoved = true;
        break;
      }
    }
    if (anyMoved) {
      source.erase(std::remove(source.begin(), source.end(), MOVED),
                   source.end());
    }
  }

  // Discard any partitions that were emptied by the reassignment.
  const auto firstEmpty = std::remove_if(
      d_parts.begin(), d_parts.end(),
      [](const std::vector<unsigned int> &part) { return part.empty(); });
  d_parts.erase(firstEmpty, d_parts.end());

  // Sort again, so the large partitions are dealt with as late as possible.
  sortPartitions();

  // Gather the information needed by the upper bound calculation.
  calcVtxTypeCounts();
}
|
||||
|
||||
int PartitionSet::upperBound() {
|
||||
int upperBound = 0;
|
||||
for (size_t i = 0; i < d_vtx1TypeCounts.size(); ++i) {
|
||||
upperBound += std::min(d_vtx1TypeCounts[i], d_vtx2TypeCounts[i]);
|
||||
}
|
||||
return upperBound;
|
||||
}
|
||||
|
||||
// Remove and return the last vertex of the last partition, dropping that
// partition if it becomes empty and updating the vertex counts.
// Throws std::runtime_error if there are no partitions left.
unsigned int PartitionSet::popLastVertex() {
  if (d_parts.empty()) {
    throw std::runtime_error("PartitionSet set is empty.");
  }
  auto &lastPart = d_parts.back();
  const unsigned int vtx = lastPart.back();
  lastPart.pop_back();
  if (lastPart.empty()) {
    d_parts.pop_back();
  }
  decrementVertexCounts(vtx);
  return vtx;
}
|
||||
|
||||
void PartitionSet::pruneVertices(unsigned int vtx_num) {
|
||||
for (auto &part : d_parts) {
|
||||
size_t i = 0;
|
||||
while (i < part.size()) {
|
||||
if (!(*d_ModProd)[part[i]][vtx_num]) {
|
||||
decrementVertexCounts(part[i]);
|
||||
part[i] = part.back();
|
||||
part.pop_back();
|
||||
} else {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
d_parts.erase(std::remove_if(d_parts.begin(), d_parts.end(),
|
||||
[](const std::vector<unsigned int> &v) {
|
||||
return v.empty();
|
||||
}),
|
||||
d_parts.end());
|
||||
sortPartitions();
|
||||
}
|
||||
|
||||
// Sort the partitions into descending order of size.
void PartitionSet::sortPartitions() {
  // std::sort leaves the relative order of equivalent elements unspecified
  // and different compilers do it differently.  When more than one MCES is
  // possible that changes the partition order, hence the search tree
  // traversal, hence which of the equivalent answers is returned.  To keep
  // things consistent, partitions of the same size are ordered by their
  // first value.  It doesn't slow things down very much on average.
  const auto bySizeThenFront = [](const std::vector<unsigned int> &lhs,
                                  const std::vector<unsigned int> &rhs) {
    const bool sameNonZeroSize = lhs.size() == rhs.size() && !lhs.empty();
    return sameNonZeroSize ? lhs.front() < rhs.front()
                           : lhs.size() > rhs.size();
  };
  std::sort(d_parts.begin(), d_parts.end(), bySizeThenFront);
}
|
||||
|
||||
// Fill d_vtx1TypeCounts and d_vtx2TypeCounts: for each label value, the
// number of vertices with that label that currently have a non-zero count.
// Both vectors are sized by the largest label seen in either graph so that
// they can be compared element by element.
void PartitionSet::calcVtxTypeCounts() {
  const unsigned int largestLabel =
      std::max(*std::max_element(d_vtx1Labels->begin(), d_vtx1Labels->end()),
               *std::max_element(d_vtx2Labels->begin(), d_vtx2Labels->end()));

  const auto tally = [largestLabel](const std::vector<int> &counts,
                                    const std::vector<unsigned int> &labels,
                                    std::vector<int> &typeCounts) {
    typeCounts.assign(largestLabel + 1, 0);
    for (size_t vtx = 0; vtx < counts.size(); ++vtx) {
      if (counts[vtx] != 0) {
        ++typeCounts[labels[vtx]];
      }
    }
  };

  tally(d_vtx1Counts, *d_vtx1Labels, d_vtx1TypeCounts);
  tally(d_vtx2Counts, *d_vtx2Labels, d_vtx2TypeCounts);
}
|
||||
|
||||
void PartitionSet::decrementVertexCounts(int vtxNum) {
|
||||
--d_vtx1Counts[(*d_VtxPairs)[vtxNum].first];
|
||||
if (!d_vtx1Counts[(*d_VtxPairs)[vtxNum].first]) {
|
||||
--d_vtx1TypeCounts[(*d_vtx1Labels)[(*d_VtxPairs)[vtxNum].first]];
|
||||
}
|
||||
--d_vtx2Counts[(*d_VtxPairs)[vtxNum].second];
|
||||
if (!d_vtx2Counts[(*d_VtxPairs)[vtxNum].second]) {
|
||||
--d_vtx2TypeCounts[(*d_vtx2Labels)[(*d_VtxPairs)[vtxNum].second]];
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const PartitionSet &pt) {
|
||||
for (size_t i = 0; i < pt.d_parts.size(); ++i) {
|
||||
os << i << " :: " << pt.d_parts[i].size() << " ::";
|
||||
for (auto &mem : pt.d_parts[i]) {
|
||||
os << " " << mem << " (" << (*pt.d_VtxPairs)[mem].first << ","
|
||||
<< (*pt.d_VtxPairs)[mem].second << ")";
|
||||
}
|
||||
os << std::endl;
|
||||
}
|
||||
os << "vtx1_counts :";
|
||||
for (auto vc : pt.d_vtx1Counts) {
|
||||
os << " " << vc;
|
||||
}
|
||||
os << std::endl;
|
||||
os << "vtx2_counts :";
|
||||
for (auto vc : pt.d_vtx2Counts) {
|
||||
os << " " << vc;
|
||||
}
|
||||
os << std::endl;
|
||||
return os;
|
||||
}
|
||||
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
73
Code/GraphMol/RascalMCES/PartitionSet.h
Normal file
73
Code/GraphMol/RascalMCES/PartitionSet.h
Normal file
@@ -0,0 +1,73 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#ifndef RASCALMCES_PARTITION_SET_H
#define RASCALMCES_PARTITION_SET_H

// <iosfwd>, <memory> and <utility> are included so that this header is
// self-contained: it names std::ostream, std::shared_ptr and std::pair.
#include <iosfwd>
#include <map>
#include <memory>
#include <utility>
#include <vector>

#include <boost/dynamic_bitset.hpp>

namespace RDKit {

namespace RascalMCES {

// The set of partitions of modular product vertices used by the RASCAL
// clique search.
class PartitionSet {
 public:
  // Make a partition set from the modular product and the labels
  // of the vertices of the two graphs.  Each element in vtxPairs
  // has a row/column in modProd.  The partitions are sorted
  // into descending order of sizes.
  PartitionSet(const std::vector<boost::dynamic_bitset<>> &modProd,
               const std::vector<std::pair<int, int>> &vtxPairs,
               const std::vector<unsigned int> &vtx1Labels,
               const std::vector<unsigned int> &vtx2Labels,
               unsigned int lowerBound);

  // True if there are no partitions left.
  bool isEmpty() const { return d_parts.empty(); }

  // The number of partitions currently in the set.
  size_t numParts() const { return d_parts.size(); }

  // Compute the upper bound on the clique that can be extracted from
  // the current partition.
  int upperBound();

  // Writes the partitions and the vertex counts to the stream.
  friend std::ostream &operator<<(std::ostream &os, const PartitionSet &pt);

  // removes the last element of the last partition and returns
  // its value.  Throws a runtime_error if empty.
  unsigned int popLastVertex();

  // remove from the partitions any vertex not connected to the given
  // vertex
  void pruneVertices(unsigned int vtx_num);

 private:
  // Copies of the constructor arguments, held through shared pointers.
  std::shared_ptr<const std::vector<boost::dynamic_bitset<>>> d_ModProd;
  std::shared_ptr<const std::vector<std::pair<int, int>>> d_VtxPairs;
  std::shared_ptr<const std::vector<unsigned int>> d_vtx1Labels;
  std::shared_ptr<const std::vector<unsigned int>> d_vtx2Labels;
  // Each partition is a list of indices into d_VtxPairs.
  std::vector<std::vector<unsigned int>> d_parts;
  // counts of the number of times each vertex appears in the partitions
  std::vector<int> d_vtx1Counts, d_vtx2Counts;
  // counts of the number of times the d_vtx[12]Labels appear in the partitions
  std::vector<int> d_vtx1TypeCounts, d_vtx2TypeCounts;

  // Sorts d_parts into descending order of size.
  void sortPartitions();

  // Fills d_vtx1TypeCounts and d_vtx2TypeCounts from the vertex counts.
  void calcVtxTypeCounts();

  // Updates the vertex and type counts when vtxNum leaves the partitions.
  void decrementVertexCounts(int vtxNum);
};
}  // namespace RascalMCES
}  // namespace RDKit

#endif  // RASCALMCES_PARTITION_SET_H
|
||||
118
Code/GraphMol/RascalMCES/RascalButinaCluster.cpp
Normal file
118
Code/GraphMol/RascalMCES/RascalButinaCluster.cpp
Normal file
@@ -0,0 +1,118 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
// This file contains an implementation of Butina clustering
|
||||
// (Butina JCICS 39 747-750 (1999)) using the RascalMCES
|
||||
// Johnson similarity metric. It is largely a transliteration
|
||||
// of $RDBASE/rdkit/ML/Cluster/Butina.py.
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
#include <GraphMol/ROMol.h>
|
||||
#include <GraphMol/RascalMCES/RascalMCES.h>
|
||||
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalDetails.h>
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace RascalMCES {
|
||||
namespace details {
|
||||
std::vector<std::vector<unsigned int>> buildNborLists(
|
||||
const std::vector<std::vector<ClusNode>> &proxGraph) {
|
||||
std::vector<std::vector<unsigned int>> nborLists;
|
||||
for (size_t i = 0; i < proxGraph.size(); ++i) {
|
||||
std::vector<std::pair<unsigned int, double>> tmpList;
|
||||
for (const auto &cn : proxGraph[i]) {
|
||||
if (cn.d_res) {
|
||||
if (i == cn.d_mol1Num) {
|
||||
tmpList.push_back({cn.d_mol2Num, cn.d_sim});
|
||||
} else {
|
||||
tmpList.push_back({cn.d_mol1Num, cn.d_sim});
|
||||
}
|
||||
}
|
||||
}
|
||||
std::sort(tmpList.begin(), tmpList.end(),
|
||||
[](const std::pair<unsigned int, double> &p1,
|
||||
const std::pair<unsigned int, double> &p2) -> bool {
|
||||
return p1.second > p2.second;
|
||||
});
|
||||
std::vector<unsigned int> nborList(tmpList.size() + 1, 0);
|
||||
nborList[0] = i;
|
||||
std::transform(
|
||||
tmpList.begin(), tmpList.end(), nborList.begin() + 1,
|
||||
[](const std::pair<unsigned int, double> &p) -> unsigned int {
|
||||
return p.first;
|
||||
});
|
||||
nborLists.push_back(nborList);
|
||||
}
|
||||
std::sort(nborLists.begin(), nborLists.end(),
|
||||
[](const std::vector<unsigned int> &nl1,
|
||||
const std::vector<unsigned int> &nl2) -> bool {
|
||||
if (nl1.size() == nl2.size()) {
|
||||
return nl1 > nl2;
|
||||
} else {
|
||||
return nl1.size() > nl2.size();
|
||||
}
|
||||
});
|
||||
return nborLists;
|
||||
}
|
||||
|
||||
// Repeatedly take the neighbour list at the front as the next cluster,
// strike its members from all the remaining lists, discard the lists that
// become empty and re-sort the rest (largest first, ties broken by
// descending lexicographic order).
// Note that nborLists is consumed: it is empty on return.
std::vector<std::vector<unsigned int>> formClusters(
    std::vector<std::vector<unsigned int>> &nborLists) {
  constexpr auto TAKEN = std::numeric_limits<unsigned int>::max();
  const auto bySizeThenContent = [](const std::vector<unsigned int> &lhs,
                                    const std::vector<unsigned int> &rhs)
      -> bool {
    return lhs.size() == rhs.size() ? lhs > rhs : lhs.size() > rhs.size();
  };

  std::vector<std::vector<unsigned int>> clusters;
  while (!nborLists.empty()) {
    auto &seed = nborLists.front();
    clusters.push_back(seed);
    const std::set<unsigned int> clustered(seed.begin(), seed.end());
    seed.clear();

    for (auto &candidates : nborLists) {
      // Flag the members that have just been clustered, then sweep the
      // flagged slots out.
      std::replace_if(
          candidates.begin(), candidates.end(),
          [&clustered](unsigned int mol) -> bool {
            return clustered.find(mol) != clustered.end();
          },
          TAKEN);
      candidates.erase(
          std::remove(candidates.begin(), candidates.end(), TAKEN),
          candidates.end());
    }

    const auto firstEmpty = std::remove_if(
        nborLists.begin(), nborLists.end(),
        [](const std::vector<unsigned int> &nl) -> bool {
          return nl.empty();
        });
    nborLists.erase(firstEmpty, nborLists.end());
    std::sort(nborLists.begin(), nborLists.end(), bySizeThenContent);
  }
  return clusters;
}
|
||||
|
||||
} // namespace details
|
||||
std::vector<std::vector<unsigned int>> rascalButinaCluster(
|
||||
const std::vector<std::shared_ptr<ROMol>> &mols,
|
||||
const RascalClusterOptions &clusOpts) {
|
||||
auto proxGraph = details::buildProximityGraph(mols, clusOpts);
|
||||
auto nborLists = details::buildNborLists(proxGraph);
|
||||
auto clusters = details::formClusters(nborLists);
|
||||
return clusters;
|
||||
}
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
382
Code/GraphMol/RascalMCES/RascalCluster.cpp
Normal file
382
Code/GraphMol/RascalMCES/RascalCluster.cpp
Normal file
@@ -0,0 +1,382 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
// This file contains an implementation of the clustering algorithm
|
||||
// described in
|
||||
// 'A Line Graph Algorithm for Clustering Chemical Structures Based
|
||||
// on Common Substructural Cores', JW Raymond, PW Willett.
|
||||
// https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf
|
||||
// https://eprints.whiterose.ac.uk/77598/
|
||||
// It uses the RASCAL MCES algorithm to perform a fuzzy clustering
|
||||
// of a set of molecules.
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <list>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include <RDGeneral/RDThreads.h>
|
||||
#include <GraphMol/ROMol.h>
|
||||
#include <GraphMol/MolOps.h>
|
||||
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalDetails.h>
|
||||
#include <GraphMol/RascalMCES/RascalMCES.h>
|
||||
#include <GraphMol/RascalMCES/RascalResult.h>
|
||||
|
||||
namespace RDKit {
|
||||
namespace RascalMCES {
|
||||
namespace details {
|
||||
// Run the RASCAL MCES on one pair of molecules and package the outcome as
// a ClusNode.  toDo holds the two molecule indices, a pointer to the
// molecules, the MCES options and the clustering options.
// The node's similarity is 0.0 if no MCES with bond matches was found.
// Otherwise it is the similarity of the first result after small fragments
// have been trimmed and only the largest maxNumFrags fragments kept.  The
// result itself is only stored when that similarity reaches the threshold.
ClusNode calcMolMolSimilarity(
    const std::tuple<
        size_t, size_t, const std::vector<std::shared_ptr<ROMol>> *,
        const RascalOptions *, const RascalClusterOptions *> &toDo) {
  const auto &[mol1Num, mol2Num, molsPtr, mcesOpts, clusterOpts] = toDo;

  auto results =
      rascalMCES(*(*molsPtr)[mol1Num], *(*molsPtr)[mol2Num], *mcesOpts);

  ClusNode node;
  node.d_mol1Num = mol1Num;
  node.d_mol2Num = mol2Num;

  // Either the tier1Sim and tier2Sim were above the threshold but no MCES
  // was found, or the one that was found matched no bonds.
  if (results.empty() || results.front().getBondMatches().empty()) {
    node.d_sim = 0.0;
    return node;
  }

  auto &best = results.front();
  best.trimSmallFrags();
  best.largestFragsOnly(clusterOpts->maxNumFrags);
  node.d_sim = best.getSimilarity();
  if (node.d_sim >= mcesOpts->similarityThreshold) {
    node.d_res = std::make_shared<RascalResult>(best);
  }
  return node;
}
|
||||
|
||||
std::vector<std::vector<ClusNode>> buildProximityGraph(
|
||||
const std::vector<std::shared_ptr<ROMol>> &mols,
|
||||
const RascalClusterOptions &clusOpts) {
|
||||
if (mols.size() < 2) {
|
||||
return std::vector<std::vector<ClusNode>>();
|
||||
}
|
||||
std::vector<std::vector<ClusNode>> proxGraph =
|
||||
std::vector<std::vector<ClusNode>>(
|
||||
mols.size(), std::vector<ClusNode>(mols.size(), ClusNode()));
|
||||
std::vector<
|
||||
std::tuple<size_t, size_t, const std::vector<std::shared_ptr<ROMol>> *,
|
||||
const RascalOptions *, const RascalClusterOptions *>>
|
||||
toDo;
|
||||
|
||||
RascalOptions opts;
|
||||
opts.similarityThreshold = clusOpts.similarityCutoff;
|
||||
for (size_t i = 0; i < mols.size() - 1; ++i) {
|
||||
for (size_t j = i + 1; j < mols.size(); ++j) {
|
||||
toDo.push_back({i, j, &mols, &opts, &clusOpts});
|
||||
}
|
||||
}
|
||||
|
||||
auto buildProxGraphPart =
|
||||
[](const std::vector<std::tuple<
|
||||
size_t, size_t, const std::vector<std::shared_ptr<ROMol>> *,
|
||||
const RascalOptions *, const RascalClusterOptions *>> &toDo,
|
||||
std::vector<ClusNode> &molSims, size_t start, size_t finish) -> void {
|
||||
if (start > toDo.size()) {
|
||||
return;
|
||||
}
|
||||
if (finish > toDo.size()) {
|
||||
finish = toDo.size();
|
||||
}
|
||||
std::transform(toDo.begin() + start, toDo.begin() + finish,
|
||||
molSims.begin() + start, calcMolMolSimilarity);
|
||||
};
|
||||
|
||||
std::vector<ClusNode> molSims(toDo.size());
|
||||
#if RDK_BUILD_THREADSAFE_SSS
|
||||
auto numThreads = getNumThreadsToUse(clusOpts.numThreads);
|
||||
if (numThreads > 1) {
|
||||
size_t eachThread = 1 + (toDo.size() / numThreads);
|
||||
size_t start = 0;
|
||||
std::vector<std::thread> threads;
|
||||
for (unsigned int i = 0U; i < numThreads; ++i, start += eachThread) {
|
||||
threads.push_back(std::thread(buildProxGraphPart, std::ref(toDo),
|
||||
std::ref(molSims), start,
|
||||
start + eachThread));
|
||||
}
|
||||
for (auto &t : threads) {
|
||||
t.join();
|
||||
}
|
||||
} else {
|
||||
std::transform(toDo.begin(), toDo.end(), molSims.begin(),
|
||||
calcMolMolSimilarity);
|
||||
}
|
||||
#else
|
||||
std::transform(toDo.begin(), toDo.end(), molSims.begin(),
|
||||
calcMolMolSimilarity);
|
||||
#endif
|
||||
for (const auto &cn : molSims) {
|
||||
proxGraph[cn.d_mol1Num][cn.d_mol2Num] =
|
||||
proxGraph[cn.d_mol2Num][cn.d_mol1Num] = cn;
|
||||
}
|
||||
return proxGraph;
|
||||
}
|
||||
|
||||
// Split the proximity graph into its disconnected components,
|
||||
// returning vectors of the molecule numbers of the disconnected
|
||||
// graphs.
|
||||
std::vector<std::vector<unsigned int>> disconnectProximityGraphs(
|
||||
std::vector<std::vector<ClusNode>> &proxGraph) {
|
||||
std::vector<std::vector<unsigned int>> subGraphs;
|
||||
std::vector<bool> done(proxGraph.size(), false);
|
||||
auto nextStart = std::find(done.begin(), done.end(), false);
|
||||
while (nextStart != done.end()) {
|
||||
std::list<unsigned int> nodes;
|
||||
std::list<unsigned int> toDo(1, std::distance(done.begin(), nextStart));
|
||||
while (!toDo.empty()) {
|
||||
auto nextNode = toDo.front();
|
||||
toDo.pop_front();
|
||||
if (!done[nextNode]) {
|
||||
nodes.push_back(nextNode);
|
||||
}
|
||||
done[nextNode] = true;
|
||||
for (size_t i = 0; i < proxGraph.size(); ++i) {
|
||||
if (!done[i] && proxGraph[nextNode][i].d_res) {
|
||||
toDo.push_back(i);
|
||||
nodes.push_back(i);
|
||||
done[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
nodes.sort();
|
||||
subGraphs.push_back(std::vector(nodes.begin(), nodes.end()));
|
||||
nextStart = std::find(done.begin(), done.end(), false);
|
||||
}
|
||||
return subGraphs;
|
||||
}
|
||||
|
||||
// Calculate G_{ij} for the molecule. p is the number of bonds that
|
||||
// a fragment must exceed for it to be counted in the formula.
|
||||
double g_ij(const std::shared_ptr<ROMol> &mol, double a, double b,
|
||||
unsigned int p) {
|
||||
auto molFrags = MolOps::getMolFrags(*mol, false);
|
||||
int numBigFrags = 0;
|
||||
for (const auto &mf : molFrags) {
|
||||
if (mf->getNumBonds() > p) {
|
||||
++numBigFrags;
|
||||
}
|
||||
}
|
||||
numBigFrags = numBigFrags == 0 ? molFrags.size() : numBigFrags;
|
||||
double g = mol->getNumAtoms();
|
||||
g += b * (1.0 - a * (numBigFrags - 1)) * mol->getNumBonds();
|
||||
return g;
|
||||
}
|
||||
|
||||
std::vector<std::vector<unsigned int>> makeSubClusters(
|
||||
const std::vector<ClusNode> &nbors, const RascalClusterOptions &clusOpts) {
|
||||
std::vector<std::vector<unsigned int>> subClusters;
|
||||
|
||||
std::vector<const ClusNode *> tmpNbors;
|
||||
for (const auto &n : nbors) {
|
||||
tmpNbors.push_back(&n);
|
||||
}
|
||||
|
||||
while (!tmpNbors.empty()) {
|
||||
subClusters.push_back(std::vector<unsigned int>{
|
||||
tmpNbors.front()->d_mol1Num, tmpNbors.front()->d_mol2Num});
|
||||
auto m1 = tmpNbors.front()->d_res->getMcesMol();
|
||||
auto g_12 = g_ij(m1, clusOpts.a, clusOpts.b, clusOpts.minFragSize);
|
||||
for (size_t i = 1; i < tmpNbors.size(); ++i) {
|
||||
auto m2 = tmpNbors[i]->d_res->getMcesMol();
|
||||
auto g_13 = g_ij(m2, clusOpts.a, clusOpts.b, clusOpts.minFragSize);
|
||||
|
||||
auto results = RDKit::RascalMCES::rascalMCES(*m1, *m2);
|
||||
if (results.empty() || results.front().getBondMatches().empty()) {
|
||||
continue;
|
||||
}
|
||||
auto res = results.front();
|
||||
auto g_12_13 =
|
||||
g_ij(res.getMcesMol(), clusOpts.a, clusOpts.b, clusOpts.minFragSize);
|
||||
double sim = g_12_13 / std::min(g_12, g_13);
|
||||
if (sim > clusOpts.minIntraClusterSim) {
|
||||
subClusters.back().push_back(tmpNbors[i]->d_mol2Num);
|
||||
subClusters.back().push_back(tmpNbors[i]->d_mol1Num);
|
||||
tmpNbors[i] = nullptr;
|
||||
}
|
||||
}
|
||||
tmpNbors.front() = nullptr;
|
||||
tmpNbors.erase(std::remove(tmpNbors.begin(), tmpNbors.end(), nullptr),
|
||||
tmpNbors.end());
|
||||
std::sort(subClusters.back().begin(), subClusters.back().end());
|
||||
subClusters.back().erase(
|
||||
std::unique(subClusters.back().begin(), subClusters.back().end()),
|
||||
subClusters.back().end());
|
||||
}
|
||||
return subClusters;
|
||||
}
|
||||
|
||||
std::vector<std::vector<unsigned int>> formInitialClusters(
|
||||
const std::vector<unsigned int> &subGraph,
|
||||
const std::vector<std::vector<ClusNode>> &proxGraph,
|
||||
const RascalClusterOptions &clusOpts) {
|
||||
std::vector<std::vector<unsigned int>> clusters;
|
||||
if (subGraph.size() < 2) {
|
||||
return clusters;
|
||||
}
|
||||
for (auto i : subGraph) {
|
||||
std::vector<ClusNode> nbors;
|
||||
for (auto j : subGraph) {
|
||||
if (proxGraph[i][j].d_res) {
|
||||
nbors.push_back(proxGraph[i][j]);
|
||||
}
|
||||
}
|
||||
std::sort(nbors.begin(), nbors.end(),
|
||||
[](const ClusNode &c1, const ClusNode &c2) -> bool {
|
||||
return c1.d_sim > c2.d_sim;
|
||||
});
|
||||
if (!nbors.empty()) {
|
||||
auto subClusters = makeSubClusters(nbors, clusOpts);
|
||||
clusters.insert(clusters.end(), subClusters.begin(), subClusters.end());
|
||||
}
|
||||
}
|
||||
std::sort(clusters.begin(), clusters.end(),
|
||||
[](const std::vector<unsigned int> &c1,
|
||||
const std::vector<unsigned int> &c2) -> bool {
|
||||
if (c1.size() == c2.size()) {
|
||||
return c1.front() < c2.front();
|
||||
} else {
|
||||
return c1.size() > c2.size();
|
||||
}
|
||||
});
|
||||
clusters.erase(std::unique(clusters.begin(), clusters.end()), clusters.end());
|
||||
return clusters;
|
||||
}
|
||||
|
||||
std::vector<std::vector<unsigned int>> mergeClusters(
|
||||
const std::vector<std::vector<unsigned int>> &clusters,
|
||||
const RascalClusterOptions &clusOpts) {
|
||||
std::vector<std::vector<unsigned int>> outClusters(clusters);
|
||||
|
||||
if (outClusters.size() < 2) {
|
||||
return outClusters;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < outClusters.size() - 1; ++i) {
|
||||
for (size_t j = i + 1; j < outClusters.size(); ++j) {
|
||||
std::vector<int> inCommon;
|
||||
std::set_intersection(outClusters[i].begin(), outClusters[i].end(),
|
||||
outClusters[j].begin(), outClusters[j].end(),
|
||||
std::back_inserter(inCommon));
|
||||
double s =
|
||||
double(inCommon.size()) / std::min(double(outClusters[i].size()),
|
||||
double(outClusters[j].size()));
|
||||
if (s > clusOpts.clusterMergeSim) {
|
||||
outClusters[i].insert(outClusters[i].end(), outClusters[j].begin(),
|
||||
outClusters[j].end());
|
||||
outClusters[j].clear();
|
||||
std::sort(outClusters[i].begin(), outClusters[i].end());
|
||||
outClusters[i].erase(
|
||||
std::unique(outClusters[i].begin(), outClusters[i].end()),
|
||||
outClusters[i].end());
|
||||
}
|
||||
}
|
||||
outClusters.erase(
|
||||
std::remove_if(outClusters.begin(), outClusters.end(),
|
||||
[](const std::vector<unsigned int> &c) -> bool {
|
||||
return c.empty();
|
||||
}),
|
||||
outClusters.end());
|
||||
}
|
||||
|
||||
return outClusters;
|
||||
}
|
||||
|
||||
void sortClusterMembersByMeanSim(
|
||||
const std::vector<std::vector<ClusNode>> &proxGraph,
|
||||
std::vector<std::vector<unsigned int>> &clusters) {
|
||||
for (auto &clus : clusters) {
|
||||
std::vector<std::pair<unsigned int, double>> clusSims;
|
||||
for (unsigned int i = 0U; i < clus.size(); ++i) {
|
||||
double totSim = 0.0;
|
||||
for (unsigned int j = 0U; j < clus.size(); ++j) {
|
||||
if (i != j) {
|
||||
totSim += proxGraph[clus[i]][clus[j]].d_sim;
|
||||
}
|
||||
}
|
||||
clusSims.push_back({clus[i], totSim / (clus.size() - 1)});
|
||||
}
|
||||
std::sort(clusSims.begin(), clusSims.end(),
|
||||
[](const std::pair<unsigned int, double> &p1,
|
||||
const std::pair<unsigned int, double> &p2) -> bool {
|
||||
return p1.second > p2.second;
|
||||
});
|
||||
std::transform(
|
||||
clusSims.begin(), clusSims.end(), clus.begin(),
|
||||
[](const std::pair<unsigned int, double> &p) -> unsigned int {
|
||||
return p.first;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<unsigned int>> makeClusters(
|
||||
const std::vector<std::vector<unsigned int>> &subGraphs,
|
||||
const std::vector<std::vector<ClusNode>> &proxGraph,
|
||||
const RascalClusterOptions &clusOpts) {
|
||||
std::vector<std::vector<unsigned int>> clusters;
|
||||
for (const auto &sg : subGraphs) {
|
||||
auto theseClusters = formInitialClusters(sg, proxGraph, clusOpts);
|
||||
auto mergedClusters = mergeClusters(theseClusters, clusOpts);
|
||||
clusters.insert(clusters.end(), mergedClusters.begin(),
|
||||
mergedClusters.end());
|
||||
}
|
||||
std::sort(clusters.begin(), clusters.end(),
|
||||
[](const std::vector<unsigned int> &c1,
|
||||
const std::vector<unsigned int> &c2) -> bool {
|
||||
return c1.size() > c2.size();
|
||||
});
|
||||
return clusters;
|
||||
}
|
||||
|
||||
std::vector<unsigned int> collectSingletons(
|
||||
const std::vector<std::vector<ClusNode>> &proxGraph) {
|
||||
std::vector<unsigned int> singletons;
|
||||
for (size_t i = 0; i < proxGraph.size(); ++i) {
|
||||
bool single = true;
|
||||
for (const auto &cn : proxGraph[i]) {
|
||||
if (cn.d_res) {
|
||||
single = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (single) {
|
||||
singletons.push_back(i);
|
||||
}
|
||||
}
|
||||
return singletons;
|
||||
}
|
||||
} // namespace details
|
||||
|
||||
std::vector<std::vector<unsigned int>> rascalCluster(
|
||||
const std::vector<std::shared_ptr<ROMol>> &mols,
|
||||
const RascalClusterOptions &clusOpts) {
|
||||
auto proxGraph = details::buildProximityGraph(mols, clusOpts);
|
||||
auto subGraphs = details::disconnectProximityGraphs(proxGraph);
|
||||
auto clusters = details::makeClusters(subGraphs, proxGraph, clusOpts);
|
||||
auto singletons = details::collectSingletons(proxGraph);
|
||||
clusters.push_back(singletons);
|
||||
details::sortClusterMembersByMeanSim(proxGraph, clusters);
|
||||
return clusters;
|
||||
}
|
||||
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
53
Code/GraphMol/RascalMCES/RascalClusterOptions.h
Normal file
53
Code/GraphMol/RascalMCES/RascalClusterOptions.h
Normal file
@@ -0,0 +1,53 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
// Options for Rascal Clustering. In general, the option names and defaults
|
||||
// are taken from the paper:
|
||||
// 'A Line Graph Algorithm for Clustering Chemical Structures Based
|
||||
// on Common Substructural Cores', JW Raymond, PW Willett.
|
||||
// https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf
|
||||
// https://eprints.whiterose.ac.uk/77598/
|
||||
|
||||
#include <RDGeneral/export.h>
|
||||
|
||||
#ifndef RASCALCLUSTEROPTIONS_H
|
||||
#define RASCALCLUSTEROPTIONS_H
|
||||
|
||||
namespace RDKit {
|
||||
namespace RascalMCES {
|
||||
|
||||
// Options controlling Rascal clustering.  Each default is given next to the
// member it applies to.
struct RDKIT_RASCALMCES_EXPORT RascalClusterOptions {
  // Similarity cutoff for clustering. Initial clusters will have molecule
  // pairs of at least this similarity.
  double similarityCutoff = 0.7;

  // Penalty score for each unconnected component in MCES.
  double a = 0.05;

  // Weight of matched bonds over matched atoms.
  double b = 2.0;

  // Minimum number of atoms in a fragment for it to be included in the
  // MCES. Also p in the paper.
  unsigned int minFragSize = 3;

  // Two pairs of molecules are included in the same cluster if the
  // similarity between their MCESs is greater than this. S_a in the paper.
  double minIntraClusterSim = 0.9;

  // Two clusters are merged if the fraction of molecules they have in common
  // is greater than this. S_b in the paper.
  double clusterMergeSim = 0.6;

  // The maximum number of fragments in any MCES. Otherwise the MCES can be a
  // lot of small fragments scattered across the molecule - it tries too hard
  // to find a match, sometimes.
  unsigned int maxNumFrags = 2;

  // The number of threads to use. If > 0, will use that number. If <= 0,
  // will use the number of hardware threads plus this number. So if the
  // number of hardware threads is 8, and numThreads is -1, it will use 7
  // threads.
  int numThreads = -1;
};
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
#endif // RASCALCLUSTEROPTIONS_H
|
||||
94
Code/GraphMol/RascalMCES/RascalDetails.h
Normal file
94
Code/GraphMol/RascalMCES/RascalDetails.h
Normal file
@@ -0,0 +1,94 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <RDGeneral/export.h>
|
||||
#ifndef RDKIT_RASCAL_DETAILS_H
|
||||
#define RDKIT_RASCAL_DETAILS_H
|
||||
|
||||
#include <map>
|
||||
|
||||
#include <GraphMol/RascalMCES/RascalOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalResult.h>
|
||||
namespace RDKit {
|
||||
class ROMol;
|
||||
|
||||
namespace RascalMCES {
|
||||
|
||||
// Forward declaration; the class-key matches the definition, which is a
// struct (a mismatched key triggers MSVC warning C4099).
struct RascalClusterOptions;
|
||||
|
||||
namespace details {
|
||||
|
||||
struct ClusNode {
|
||||
std::shared_ptr<RascalResult> d_res;
|
||||
double d_sim;
|
||||
unsigned int d_mol1Num, d_mol2Num;
|
||||
};
|
||||
|
||||
// Tier 1 similarity between the two molecules (definition not in this
// header).  degSeqs1/degSeqs2 are taken by non-const reference, so
// presumably they are filled in by the call - confirm against the definition.
RDKIT_RASCALMCES_EXPORT double tier1Sim(
    const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
    std::map<int, std::vector<std::pair<int, int>>> &degSeqs1,
    std::map<int, std::vector<std::pair<int, int>>> &degSeqs2);
|
||||
|
||||
// Tier 2 similarity between the two molecules (definition not in this
// header).  Takes the degree-sequence maps and the per-molecule bond labels
// as read-only inputs.
RDKIT_RASCALMCES_EXPORT double tier2Sim(
    const ROMol &mol1, const ROMol &mol2,
    const std::map<int, std::vector<std::pair<int, int>>> &degSeqs1,
    const std::map<int, std::vector<std::pair<int, int>>> &degSeqs2,
    const std::vector<unsigned int> &bondLabels1,
    const std::vector<unsigned int> &bondLabels2);
|
||||
|
||||
RDKIT_RASCALMCES_EXPORT void getBondLabels(
|
||||
const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
|
||||
const RascalOptions &opts, std::vector<unsigned int> &bondLabels1,
|
||||
std::vector<unsigned int> &bondLabels2);
|
||||
|
||||
std::vector<std::vector<ClusNode>> buildProximityGraph(
|
||||
const std::vector<std::shared_ptr<ROMol>> &mols,
|
||||
const RascalClusterOptions &clusOpts);
|
||||
|
||||
RDKIT_RASCALMCES_EXPORT bool resultCompare(const RascalResult &res1,
|
||||
const RascalResult &res2);
|
||||
|
||||
RDKIT_RASCALMCES_EXPORT void extractClique(
|
||||
const std::vector<unsigned int> &clique,
|
||||
const std::vector<std::pair<int, int>> &vtxPairs, bool swapped,
|
||||
std::vector<std::pair<int, int>> &bondMatches);
|
||||
|
||||
// do some simple cleaning of the SMARTS, to make it more user-friendly.
|
||||
RDKIT_RASCALMCES_EXPORT void cleanSmarts(std::string &smarts);
|
||||
|
||||
// Primarily for debugging, these write out the corresponding bonds/atoms
|
||||
// in Python list format, for ease of cut/paste into a highlighted image
|
||||
// creation.
|
||||
RDKIT_RASCALMCES_EXPORT void printBondMatches(const RascalResult &res,
|
||||
std::ostream &os);
|
||||
|
||||
RDKIT_RASCALMCES_EXPORT void printAtomMatches(const RascalResult &res,
|
||||
std::ostream &os);
|
||||
|
||||
// This prints out the scores in the order they are used in resultCompare.
|
||||
RDKIT_RASCALMCES_EXPORT void printScores(const RascalResult &res,
|
||||
std::ostream &os);
|
||||
|
||||
// Calculate the Johnson similarity between the two molecules using the given
|
||||
// bondMatches. It's the fraction of the 2 molecules that are in common,
|
||||
// somewhat akin to the tanimoto - the square of the number of atoms plus
|
||||
// number of bonds in the MCES divided by the product of the sums of the number
|
||||
// of atoms and bonds in the 2 molecules.
|
||||
// It has nothing to do with lying UK politicians.
|
||||
RDKIT_RASCALMCES_EXPORT double johnsonSimilarity(
|
||||
const std::vector<std::pair<int, int>> &bondMatches,
|
||||
const std::vector<std::pair<int, int>> &atomMatches,
|
||||
const RDKit::ROMol &mol1, const RDKit::ROMol &mol2);
|
||||
|
||||
} // namespace details
|
||||
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
#endif  // RDKIT_RASCAL_DETAILS_H
|
||||
1093
Code/GraphMol/RascalMCES/RascalMCES.cpp
Normal file
1093
Code/GraphMol/RascalMCES/RascalMCES.cpp
Normal file
File diff suppressed because it is too large
Load Diff
73
Code/GraphMol/RascalMCES/RascalMCES.h
Normal file
73
Code/GraphMol/RascalMCES/RascalMCES.h
Normal file
@@ -0,0 +1,73 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <RDGeneral/export.h>
|
||||
#ifndef RDKIT_RASCAL_MCES_H
|
||||
#define RDKIT_RASCAL_MCES_H
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalResult.h>
|
||||
namespace RDKit {
|
||||
class ROMol;
|
||||
|
||||
namespace RascalMCES {
|
||||
|
||||
// Find one or more MCESs between the two molecules. The MCES is the
|
||||
// Maximum Common Edge Substructure, and is the largest set of bonds
|
||||
// common to the 2 molecules.
|
||||
/*!
|
||||
*
|
||||
* @param mol1 : first molecule
|
||||
* @param mol2 : second molecule for MCES determination.
|
||||
* @param opts : (optional) set of options controlling the MCES determination
|
||||
* @return : vector of RascalResult objects.
|
||||
*/
|
||||
RDKIT_RASCALMCES_EXPORT std::vector<RascalResult> rascalMCES(
|
||||
const ROMol &mol1, const ROMol &mol2,
|
||||
const RascalOptions &opts = RascalOptions());
|
||||
|
||||
// Cluster the molecules using the Johnson similarity from rascalMCES
|
||||
// and the algorithm of
|
||||
// 'A Line Graph Algorithm for Clustering Chemical Structures Based
|
||||
// on Common Substructural Cores', JW Raymond, PW Willett.
|
||||
// https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf
|
||||
// https://eprints.whiterose.ac.uk/77598/
|
||||
// This is a fuzzy clustering algorithm, so a molecule may appear in more than
|
||||
// one cluster. The final cluster is all the molecules that didn't fit into
|
||||
// another cluster (the singletons).
|
||||
/*!
|
||||
*
|
||||
* @param mols : molecules to cluster
|
||||
* @param clusOpts : (optional) cluster options
|
||||
* @return clusters as vector of vectors of unsigned ints - indices into the
|
||||
* input mols vector
|
||||
*/
|
||||
RDKIT_RASCALMCES_EXPORT std::vector<std::vector<unsigned int>> rascalCluster(
|
||||
const std::vector<std::shared_ptr<ROMol>> &mols,
|
||||
const RascalClusterOptions &clusOpts = RascalClusterOptions());
|
||||
// Cluster the molecules using the Johnson similarity from rascalMCES and
|
||||
// the Butina algorithm. Butina JCICS 39 747-750 (1999).
|
||||
/*!
|
||||
*
|
||||
* @param mols : molecules to cluster
|
||||
* @param clusOpts : (optional) cluster options
|
||||
* @return clusters as vector of vectors of unsigned ints - indices into the
|
||||
* input mols vector
|
||||
*/
|
||||
RDKIT_RASCALMCES_EXPORT std::vector<std::vector<unsigned int>>
|
||||
rascalButinaCluster(
|
||||
const std::vector<std::shared_ptr<ROMol>> &mols,
|
||||
const RascalClusterOptions &clusOpts = RascalClusterOptions());
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
#endif // RDKIT_RASCAL_MCES_H
|
||||
50
Code/GraphMol/RascalMCES/RascalOptions.h
Normal file
50
Code/GraphMol/RascalMCES/RascalOptions.h
Normal file
@@ -0,0 +1,50 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
#include <RDGeneral/export.h>
|
||||
|
||||
#ifndef RASCALOPTIONS_H
|
||||
#define RASCALOPTIONS_H
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace RascalMCES {
|
||||
|
||||
// Options controlling the MCES determination.  Each default is given next to
// the member it applies to.
struct RDKIT_RASCALMCES_EXPORT RascalOptions {
  // If calculated below this, no MCES will be evaluated.
  double similarityThreshold = 0.7;

  // If true, partial aromatic rings won't be returned.
  bool completeAromaticRings = true;

  // If true, ring bonds won't match non-ring bonds.
  bool ringMatchesRingOnly = false;

  // If true, only return a single fragment for the MCES. Default is to
  // produce multiple matching fragments if necessary.
  bool singleLargestFrag = false;

  // Minimum number of atoms in any fragment - -1 means no minimum.
  int minFragSize = -1;

  // Biggest through-bond distance that bonds can match. -1 means no limit.
  int maxFragSeparation = -1;

  // If true, all MCESs are returned, in order of diminishing score. This is
  // likely to result in higher run times.
  bool allBestMCESs = false;

  // Max run time, in seconds. -1 means no max.
  int timeout = 60;

  // This might make the code run a bit faster in some circumstances, but on
  // average it is very marginal.
  bool doEquivBondPruning = false;

  // If true, if the similarity thresholds aren't matched still return a
  // RascalResult with the tier1 and tier2 sims filled in.
  bool returnEmptyMCES = false;

  // Too many matching bond (vertex) pairs can cause it to run out of memory.
  // This is a reasonable default for my Mac.
  int maxBondMatchPairs = 1000;
};
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
|
||||
#endif // RASCALOPTIONS_H
|
||||
815
Code/GraphMol/RascalMCES/RascalResult.cpp
Normal file
815
Code/GraphMol/RascalMCES/RascalResult.cpp
Normal file
@@ -0,0 +1,815 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <regex>
|
||||
#include <set>
|
||||
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
|
||||
#include <GraphMol/MolOps.h>
|
||||
#include <GraphMol/QueryAtom.h>
|
||||
#include <GraphMol/QueryBond.h>
|
||||
#include <GraphMol/QueryOps.h>
|
||||
#include <GraphMol/SmilesParse/SmartsWrite.h>
|
||||
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
||||
|
||||
#include <GraphMol/RascalMCES/RascalDetails.h>
|
||||
#include <GraphMol/RascalMCES/RascalResult.h>
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace RascalMCES {
|
||||
|
||||
// Build a result from a clique.  Private copies of both molecules are kept;
// when `swapped` is true the roles of mol1/mol2 (and of their adjacency
// matrices) are exchanged before the clique is turned into bond and atom
// matches.  The optional fragment-separation and single-fragment filters are
// applied last, in that order.
RascalResult::RascalResult(const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
                           const std::vector<std::vector<int>> &adjMatrix1,
                           const std::vector<std::vector<int>> &adjMatrix2,
                           const std::vector<unsigned int> &clique,
                           const std::vector<std::pair<int, int>> &vtx_pairs,
                           bool timedOut, bool swapped, double tier1Sim,
                           double tier2Sim, bool ringMatchesRingOnly,
                           bool singleLargestFrag, int maxFragSep)
    : d_timedOut(timedOut),
      d_tier1Sim(tier1Sim),
      d_tier2Sim(tier2Sim),
      d_ringMatchesRingOnly(ringMatchesRingOnly),
      d_maxFragSep(maxFragSep) {
  const RDKit::ROMol &firstMol = swapped ? mol2 : mol1;
  const RDKit::ROMol &secondMol = swapped ? mol1 : mol2;
  d_mol1.reset(new RDKit::ROMol(firstMol));
  d_mol2.reset(new RDKit::ROMol(secondMol));
  // The adjacency matrix has to follow whichever molecule became d_mol1.
  const auto &firstAdjMatrix = swapped ? adjMatrix2 : adjMatrix1;

  details::extractClique(clique, vtx_pairs, swapped, d_bondMatches);
  matchCliqueAtoms(firstAdjMatrix);
  if (d_maxFragSep != -1) {
    applyMaxFragSep();
  }
  if (singleLargestFrag) {
    largestFragOnly();
  }
}
|
||||
|
||||
// Build a result that carries only the two screening similarities; no
// molecules or matches are stored.
RascalResult::RascalResult(double tier1Sim, double tier2Sim)
    : d_tier1Sim(tier1Sim), d_tier2Sim(tier2Sim) {
}
|
||||
|
||||
// Deep-copying constructor: the matches, scores and option flags are copied
// and each molecule that `other` holds is cloned, so the two results never
// share a molecule.
RascalResult::RascalResult(const RascalResult &other)
    : d_bondMatches(other.d_bondMatches),
      d_atomMatches(other.d_atomMatches),
      d_smarts(other.d_smarts),
      d_timedOut(other.d_timedOut),
      d_tier1Sim(other.d_tier1Sim),
      d_tier2Sim(other.d_tier2Sim),
      d_numFrags(other.d_numFrags),
      d_ringNonRingBondScore(other.d_ringNonRingBondScore),
      d_atomMatchScore(other.d_atomMatchScore),
      d_maxDeltaAtomAtomDist(other.d_maxDeltaAtomAtomDist),
      d_largestFragSize(other.d_largestFragSize) {
  // These two were previously dropped by the copy, so a copied result could
  // produce a different SMARTS from its source.  They are assigned here
  // rather than in the initialiser list so that this block does not depend
  // on the member declaration order.
  d_ringMatchesRingOnly = other.d_ringMatchesRingOnly;
  d_maxFragSep = other.d_maxFragSep;

  if (other.d_mol1) {
    d_mol1.reset(new ROMol(*other.d_mol1));
  }
  if (other.d_mol2) {
    d_mol2.reset(new ROMol(*other.d_mol2));
  }
  if (other.d_mcesMol) {
    d_mcesMol.reset(new ROMol(*other.d_mcesMol));
  }
}
|
||||
|
||||
// Copy assignment.  Mirrors the copy constructor: every value member is
// copied (including the tier similarities and the option flags, which used
// to be left untouched) and each molecule is either cloned from `other` or
// cleared when `other` has none, so no stale molecule survives the
// assignment.
RascalResult &RascalResult::operator=(const RascalResult &other) {
  if (this == &other) {
    return *this;
  }
  d_bondMatches = other.d_bondMatches;
  d_atomMatches = other.d_atomMatches;
  d_smarts = other.d_smarts;
  d_timedOut = other.d_timedOut;
  d_tier1Sim = other.d_tier1Sim;
  d_tier2Sim = other.d_tier2Sim;
  d_ringMatchesRingOnly = other.d_ringMatchesRingOnly;
  d_maxFragSep = other.d_maxFragSep;
  d_numFrags = other.d_numFrags;
  d_ringNonRingBondScore = other.d_ringNonRingBondScore;
  d_atomMatchScore = other.d_atomMatchScore;
  d_maxDeltaAtomAtomDist = other.d_maxDeltaAtomAtomDist;
  d_largestFragSize = other.d_largestFragSize;

  if (other.d_mol1) {
    d_mol1.reset(new ROMol(*other.d_mol1));
  } else {
    d_mol1.reset();
  }
  if (other.d_mol2) {
    d_mol2.reset(new ROMol(*other.d_mol2));
  } else {
    d_mol2.reset();
  }
  if (other.d_mcesMol) {
    d_mcesMol.reset(new ROMol(*other.d_mcesMol));
  } else {
    d_mcesMol.reset();
  }
  return *this;
}
|
||||
|
||||
// Reduce the result to its single largest fragment.
void RascalResult::largestFragOnly() {
  largestFragsOnly(1);
}
|
||||
|
||||
void RascalResult::largestFragsOnly(unsigned int numFrags) {
|
||||
std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
|
||||
// getMolFrags() returns boost::shared_ptr. Ho-hum.
|
||||
auto frags = RDKit::MolOps::getMolFrags(*mol1_frags, false);
|
||||
if (numFrags < 1 || frags.size() < numFrags) {
|
||||
return;
|
||||
}
|
||||
std::sort(frags.begin(), frags.end(),
|
||||
[](const boost::shared_ptr<ROMol> &f1,
|
||||
const boost::shared_ptr<ROMol> &f2) -> bool {
|
||||
return f1->getNumAtoms() > f2->getNumAtoms();
|
||||
});
|
||||
frags.erase(frags.begin() + numFrags, frags.end());
|
||||
rebuildFromFrags(frags);
|
||||
}
|
||||
|
||||
void RascalResult::trimSmallFrags(unsigned int minFragSize) {
|
||||
std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
|
||||
// getMolFrags() returns boost::shared_ptr. Ho-hum.
|
||||
auto frags = RDKit::MolOps::getMolFrags(*mol1_frags, false);
|
||||
frags.erase(std::remove_if(frags.begin(), frags.end(),
|
||||
[&](const boost::shared_ptr<ROMol> &f) -> bool {
|
||||
return f->getNumAtoms() < minFragSize;
|
||||
}),
|
||||
frags.end());
|
||||
rebuildFromFrags(frags);
|
||||
}
|
||||
|
||||
double RascalResult::getSimilarity() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0.0;
|
||||
}
|
||||
return details::johnsonSimilarity(d_bondMatches, d_atomMatches, *d_mol1,
|
||||
*d_mol2);
|
||||
}
|
||||
|
||||
void RascalResult::rebuildFromFrags(
|
||||
const std::vector<boost::shared_ptr<ROMol>> &frags) {
|
||||
// Force the re-creation of the SMARTS and other properties next time
|
||||
// they-re needed.
|
||||
d_smarts = "";
|
||||
d_maxFragSep = -1;
|
||||
d_ringNonRingBondScore = -1;
|
||||
d_maxDeltaAtomAtomDist = -1;
|
||||
d_largestFragSize = -1;
|
||||
|
||||
// for now, this is always called after fragmenting d_mol1, but just for
|
||||
// safety, protect against the frags coming from d_mol2 in some future
|
||||
// use.
|
||||
boost::dynamic_bitset<> fragAtoms(
|
||||
std::max(d_mol1->getNumAtoms(), d_mol2->getNumAtoms()));
|
||||
boost::dynamic_bitset<> fragBonds(
|
||||
std::max(d_mol1->getNumBonds(), d_mol2->getNumBonds()));
|
||||
for (const auto &f : frags) {
|
||||
for (auto atom : f->atoms()) {
|
||||
if (atom->hasProp("ORIG_INDEX")) {
|
||||
fragAtoms.set(atom->getProp<int>("ORIG_INDEX"));
|
||||
}
|
||||
}
|
||||
for (auto bond : f->bonds()) {
|
||||
if (bond->hasProp("ORIG_INDEX")) {
|
||||
fragBonds.set(bond->getProp<int>("ORIG_INDEX"));
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<std::pair<int, int>> newAtomMatches;
|
||||
for (const auto &am : d_atomMatches) {
|
||||
if (fragAtoms[am.first]) {
|
||||
newAtomMatches.push_back(am);
|
||||
}
|
||||
}
|
||||
d_atomMatches = newAtomMatches;
|
||||
std::vector<std::pair<int, int>> new_bond_matches;
|
||||
for (const auto &bm : d_bondMatches) {
|
||||
if (fragBonds[bm.first]) {
|
||||
new_bond_matches.push_back(bm);
|
||||
}
|
||||
}
|
||||
d_bondMatches = new_bond_matches;
|
||||
d_numFrags = frags.size();
|
||||
d_largestFragSize = frags.empty() ? 0 : frags.front()->getNumAtoms();
|
||||
}
|
||||
|
||||
std::string RascalResult::createSmartsString() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return "";
|
||||
}
|
||||
RWMol smartsMol;
|
||||
std::map<int, unsigned int> atomMap;
|
||||
auto mol1Rings = d_mol1->getRingInfo();
|
||||
auto mol2Rings = d_mol2->getRingInfo();
|
||||
for (const auto &am : d_atomMatches) {
|
||||
RDKit::QueryAtom a;
|
||||
auto mol1Atom = d_mol1->getAtomWithIdx(am.first);
|
||||
a.setQuery(RDKit::makeAtomNumQuery(mol1Atom->getAtomicNum()));
|
||||
auto mol2Atom = d_mol2->getAtomWithIdx(am.second);
|
||||
if (mol1Atom->getAtomicNum() != mol2Atom->getAtomicNum()) {
|
||||
a.expandQuery(RDKit::makeAtomNumQuery(mol2Atom->getAtomicNum()),
|
||||
Queries::COMPOSITE_OR);
|
||||
}
|
||||
if (mol1Atom->getIsAromatic() && mol2Atom->getIsAromatic()) {
|
||||
a.expandQuery(RDKit::makeAtomAromaticQuery(), Queries::COMPOSITE_AND,
|
||||
true);
|
||||
} else if (!mol1Atom->getIsAromatic() && !mol2Atom->getIsAromatic()) {
|
||||
a.expandQuery(RDKit::makeAtomAliphaticQuery(), Queries::COMPOSITE_AND,
|
||||
true);
|
||||
}
|
||||
if (d_ringMatchesRingOnly && !mol1Atom->getIsAromatic() &&
|
||||
!mol2Atom->getIsAromatic() &&
|
||||
mol1Rings->numAtomRings(mol1Atom->getIdx()) &&
|
||||
mol2Rings->numAtomRings(mol2Atom->getIdx())) {
|
||||
a.expandQuery(RDKit::makeAtomInRingQuery(), Queries::COMPOSITE_AND, true);
|
||||
}
|
||||
auto ai = smartsMol.addAtom(&a);
|
||||
atomMap.insert(std::make_pair(am.first, ai));
|
||||
}
|
||||
|
||||
for (const auto &bm : d_bondMatches) {
|
||||
RDKit::QueryBond b;
|
||||
auto mol1Bond = d_mol1->getBondWithIdx(bm.first);
|
||||
b.setBeginAtomIdx(atomMap[mol1Bond->getBeginAtomIdx()]);
|
||||
b.setEndAtomIdx(atomMap[mol1Bond->getEndAtomIdx()]);
|
||||
b.setQuery(makeBondOrderEqualsQuery(mol1Bond->getBondType()));
|
||||
auto mol2Bond = d_mol2->getBondWithIdx(bm.second);
|
||||
if (mol1Bond->getBondType() != mol2Bond->getBondType()) {
|
||||
b.expandQuery(makeBondOrderEqualsQuery(mol2Bond->getBondType()),
|
||||
Queries::COMPOSITE_OR);
|
||||
}
|
||||
if (d_ringMatchesRingOnly && !mol1Bond->getIsAromatic() &&
|
||||
!mol2Bond->getIsAromatic() &&
|
||||
mol1Rings->numBondRings(mol1Bond->getIdx()) &&
|
||||
mol2Rings->numBondRings(mol2Bond->getIdx())) {
|
||||
b.expandQuery(RDKit::makeBondIsInRingQuery(), Queries::COMPOSITE_AND,
|
||||
true);
|
||||
}
|
||||
smartsMol.addBond(&b, false);
|
||||
}
|
||||
std::string smt = RDKit::MolToSmarts(smartsMol, true);
|
||||
details::cleanSmarts(smt);
|
||||
return smt;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Return the atom common to the two bonds, -1 if there isn't one.
|
||||
int common_atom_in_bonds(const RDKit::Bond *bond1, const RDKit::Bond *bond2) {
|
||||
int commonAtom = -1;
|
||||
if (bond1->getBeginAtomIdx() == bond2->getBeginAtomIdx()) {
|
||||
commonAtom = bond1->getBeginAtomIdx();
|
||||
} else if (bond1->getEndAtomIdx() == bond2->getBeginAtomIdx()) {
|
||||
commonAtom = bond1->getEndAtomIdx();
|
||||
} else if (bond1->getBeginAtomIdx() == bond2->getEndAtomIdx()) {
|
||||
commonAtom = bond1->getBeginAtomIdx();
|
||||
} else if (bond1->getEndAtomIdx() == bond2->getEndAtomIdx()) {
|
||||
commonAtom = bond1->getEndAtomIdx();
|
||||
}
|
||||
return commonAtom;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void RascalResult::matchCliqueAtoms(
|
||||
const std::vector<std::vector<int>> &mol1_adj_matrix) {
|
||||
if (d_bondMatches.empty()) {
|
||||
return;
|
||||
}
|
||||
std::vector<int> mol1Matches(d_mol1->getNumAtoms(), -1);
|
||||
// set the clique atoms to -2 in mol1Matches, to mark them as yet undecided.
|
||||
for (const auto &bm : d_bondMatches) {
|
||||
auto bond1 = d_mol1->getBondWithIdx(bm.first);
|
||||
mol1Matches[bond1->getBeginAtomIdx()] = -2;
|
||||
mol1Matches[bond1->getEndAtomIdx()] = -2;
|
||||
}
|
||||
|
||||
// First, use the line graphs to match atoms that have 2 matching bonds
|
||||
// incident on them.
|
||||
for (size_t i = 0; i < d_bondMatches.size() - 1; ++i) {
|
||||
const auto &pair1 = d_bondMatches[i];
|
||||
auto bond1_1 = d_mol1->getBondWithIdx(pair1.first);
|
||||
auto bond2_1 = d_mol2->getBondWithIdx(pair1.second);
|
||||
for (size_t j = i + 1; j < d_bondMatches.size(); ++j) {
|
||||
const auto &pair2 = d_bondMatches[j];
|
||||
if (mol1_adj_matrix[pair1.first][pair2.first]) {
|
||||
// the 2 bonds are incident on the same atom, so the 2 atoms must match
|
||||
auto bond1_2 = d_mol1->getBondWithIdx(pair2.first);
|
||||
auto bond2_2 = d_mol2->getBondWithIdx(pair2.second);
|
||||
auto mol1Atom = common_atom_in_bonds(bond1_1, bond1_2);
|
||||
auto mol2Atom = common_atom_in_bonds(bond2_1, bond2_2);
|
||||
if (mol1Atom != -1) {
|
||||
mol1Matches[mol1Atom] = mol2Atom;
|
||||
auto omol1Atom = bond1_1->getOtherAtomIdx(mol1Atom);
|
||||
auto omol2Atom = bond2_1->getOtherAtomIdx(mol2Atom);
|
||||
mol1Matches[omol1Atom] = omol2Atom;
|
||||
omol1Atom = bond1_2->getOtherAtomIdx(mol1Atom);
|
||||
omol2Atom = bond2_2->getOtherAtomIdx(mol2Atom);
|
||||
mol1Matches[omol1Atom] = omol2Atom;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// if there are -2 entries in mol1Matches there's more to do.
|
||||
if (std::count(mol1Matches.begin(), mol1Matches.end(), -2)) {
|
||||
// Any -2 entries in mol1Matches are down to isolated bonds, which are a bit
|
||||
// tricky.
|
||||
for (const auto &pair1 : d_bondMatches) {
|
||||
auto bond1_1 = d_mol1->getBondWithIdx(pair1.first);
|
||||
if (mol1Matches[bond1_1->getBeginAtomIdx()] == -2 &&
|
||||
mol1Matches[bond1_1->getEndAtomIdx()] == -2) {
|
||||
auto bond2_1 = d_mol2->getBondWithIdx(pair1.second);
|
||||
if (bond1_1->getBeginAtom()->getAtomicNum() !=
|
||||
bond1_1->getEndAtom()->getAtomicNum()) {
|
||||
// it's fairly straightforward:
|
||||
if (bond1_1->getBeginAtom()->getAtomicNum() ==
|
||||
bond2_1->getBeginAtom()->getAtomicNum()) {
|
||||
mol1Matches[bond1_1->getBeginAtomIdx()] =
|
||||
bond2_1->getBeginAtomIdx();
|
||||
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getEndAtomIdx();
|
||||
} else {
|
||||
mol1Matches[bond1_1->getBeginAtomIdx()] = bond2_1->getEndAtomIdx();
|
||||
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getBeginAtomIdx();
|
||||
}
|
||||
} else if (bond1_1->getBeginAtom()->getTotalNumHs() !=
|
||||
bond1_1->getEndAtom()->getTotalNumHs()) {
|
||||
// try it on number of hydrogens
|
||||
if (bond1_1->getBeginAtom()->getTotalNumHs() >
|
||||
bond1_1->getEndAtom()->getTotalNumHs()) {
|
||||
mol1Matches[bond1_1->getBeginAtomIdx()] =
|
||||
bond2_1->getBeginAtomIdx();
|
||||
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getEndAtomIdx();
|
||||
} else {
|
||||
mol1Matches[bond1_1->getBeginAtomIdx()] = bond2_1->getEndAtomIdx();
|
||||
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getBeginAtomIdx();
|
||||
}
|
||||
} else {
|
||||
// it probably doesn't matter
|
||||
mol1Matches[bond1_1->getBeginAtomIdx()] = bond2_1->getBeginAtomIdx();
|
||||
mol1Matches[bond1_1->getEndAtomIdx()] = bond2_1->getEndAtomIdx();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0u; i < d_mol1->getNumAtoms(); ++i) {
|
||||
if (mol1Matches[i] >= 0) {
|
||||
d_atomMatches.push_back(std::make_pair(i, mol1Matches[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RascalResult::applyMaxFragSep() {
|
||||
std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
|
||||
auto frags1 = RDKit::MolOps::getMolFrags(*mol1_frags, false);
|
||||
if (frags1.size() < 2) {
|
||||
return;
|
||||
}
|
||||
auto fragFragDist = [](const boost::shared_ptr<RDKit::ROMol> &frag1,
|
||||
const boost::shared_ptr<RDKit::ROMol> &frag2,
|
||||
const double *pathMatrix, int num_atoms) -> double {
|
||||
int minDist = std::numeric_limits<int>::max();
|
||||
for (auto at1 : frag1->atoms()) {
|
||||
int at1Idx = at1->getProp<int>("ORIG_INDEX");
|
||||
for (auto at2 : frag2->atoms()) {
|
||||
int at2Idx = at2->getProp<int>("ORIG_INDEX");
|
||||
int dist = std::nearbyint(pathMatrix[at1Idx * num_atoms + at2Idx]);
|
||||
if (dist < minDist) {
|
||||
minDist = dist;
|
||||
}
|
||||
}
|
||||
}
|
||||
return minDist;
|
||||
};
|
||||
|
||||
std::unique_ptr<RDKit::ROMol> mol2Frags(makeMolFrags(2));
|
||||
auto frags2 = RDKit::MolOps::getMolFrags(*mol2Frags, false);
|
||||
// These arrays must not be deleted - they are cached in the molecule and
|
||||
// deleted when it is. The distance matrix will be re-calculated in case
|
||||
// something's been copied over somewhere.
|
||||
auto mol1Dists = RDKit::MolOps::getDistanceMat(*d_mol1, false, false, true);
|
||||
auto mol2Dists = RDKit::MolOps::getDistanceMat(*d_mol2, false, false, true);
|
||||
|
||||
bool deletedFrag = false;
|
||||
for (size_t i = 0; i < frags1.size() - 1; ++i) {
|
||||
if (!frags1[i]) {
|
||||
continue;
|
||||
}
|
||||
for (size_t j = i + 1; j < frags1.size(); ++j) {
|
||||
if (!frags1[j]) {
|
||||
continue;
|
||||
}
|
||||
int mol1Dist =
|
||||
fragFragDist(frags1[i], frags1[j], mol1Dists, d_mol1->getNumAtoms());
|
||||
int mol2Dist =
|
||||
fragFragDist(frags2[i], frags2[j], mol2Dists, d_mol2->getNumAtoms());
|
||||
if (mol1Dist > d_maxFragSep || mol2Dist > d_maxFragSep) {
|
||||
deletedFrag = true;
|
||||
if (frags1[i]->getNumAtoms() < frags1[j]->getNumAtoms()) {
|
||||
frags1[i].reset();
|
||||
frags2[i].reset();
|
||||
} else {
|
||||
frags1[j].reset();
|
||||
frags2[j].reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (deletedFrag) {
|
||||
// rebuild the d_bondMatches
|
||||
std::vector<std::pair<int, int>> new_bond_matches;
|
||||
for (auto &frag : frags1) {
|
||||
if (!frag) {
|
||||
continue;
|
||||
}
|
||||
for (auto b : frag->bonds()) {
|
||||
int b_idx = b->getProp<int>("ORIG_INDEX");
|
||||
for (auto &bm : d_bondMatches) {
|
||||
if (b_idx == bm.first) {
|
||||
new_bond_matches.push_back(bm);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
d_bondMatches = new_bond_matches;
|
||||
// and the d_atomMatches
|
||||
std::vector<std::pair<int, int>> new_atom_matches;
|
||||
for (auto &frag : frags1) {
|
||||
if (!frag) {
|
||||
continue;
|
||||
}
|
||||
for (auto a : frag->atoms()) {
|
||||
int a_idx = a->getProp<int>("ORIG_INDEX");
|
||||
for (auto &am : d_atomMatches) {
|
||||
if (a_idx == am.first) {
|
||||
new_atom_matches.push_back(am);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
d_atomMatches = new_atom_matches;
|
||||
}
|
||||
}
|
||||
|
||||
// Return a molecule with the clique in it. Each atom will have the property
|
||||
// ORIG_INDEX giving its index in the original molecule.
|
||||
RDKit::ROMol *RascalResult::makeMolFrags(int molNum) const {
|
||||
std::shared_ptr<RDKit::ROMol> theMol;
|
||||
if (molNum == 1) {
|
||||
theMol = d_mol1;
|
||||
} else if (molNum == 2) {
|
||||
theMol = d_mol2;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
if (!theMol) {
|
||||
return nullptr;
|
||||
}
|
||||
auto *molFrags = new RDKit::RWMol(*theMol);
|
||||
std::vector<char> ainClique(theMol->getNumAtoms(), 0);
|
||||
for (const auto &am : d_atomMatches) {
|
||||
if (molNum == 1) {
|
||||
ainClique[am.first] = 1;
|
||||
} else {
|
||||
ainClique[am.second] = 1;
|
||||
}
|
||||
}
|
||||
std::vector<char> binClique(theMol->getNumBonds(), 0);
|
||||
for (const auto &bm : d_bondMatches) {
|
||||
if (molNum == 1) {
|
||||
binClique[bm.first] = 1;
|
||||
} else {
|
||||
binClique[bm.second] = 1;
|
||||
}
|
||||
}
|
||||
molFrags->beginBatchEdit();
|
||||
for (auto &a : molFrags->atoms()) {
|
||||
if (!ainClique[a->getIdx()]) {
|
||||
molFrags->removeAtom(a);
|
||||
} else {
|
||||
a->setProp<int>("ORIG_INDEX", a->getIdx());
|
||||
}
|
||||
}
|
||||
for (auto &b : molFrags->bonds()) {
|
||||
if (!binClique[b->getIdx()]) {
|
||||
molFrags->removeBond(b->getBeginAtomIdx(), b->getEndAtomIdx());
|
||||
} else {
|
||||
b->setProp<int>("ORIG_INDEX", b->getIdx());
|
||||
}
|
||||
}
|
||||
molFrags->commitBatchEdit();
|
||||
return molFrags;
|
||||
}
|
||||
|
||||
// Calculate a score for how many bonds in the clique don't match
|
||||
// cyclic/non-cyclic
|
||||
int RascalResult::calcRingNonRingScore() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int score = 0;
|
||||
for (const auto &bm : d_bondMatches) {
|
||||
auto nbr1 = d_mol1->getRingInfo()->numBondRings(bm.first);
|
||||
auto nbr2 = d_mol2->getRingInfo()->numBondRings(bm.second);
|
||||
|
||||
if ((nbr1 && !nbr2) || (!nbr1 && nbr2)) {
|
||||
++score;
|
||||
}
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
||||
// Calculate a score for how well the atoms in the clique from mol1 match the
|
||||
// atoms for the clique in mol2. The atom scores are made up of H count and
|
||||
// summed for the molecule. Its so that, for example, an OH in mol1 that could
|
||||
// match an OH or OMe matches the OH for preference.
|
||||
int RascalResult::calcAtomMatchScore() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
int score = 0;
|
||||
for (const auto &am : d_atomMatches) {
|
||||
int num_h_1 = d_mol1->getAtomWithIdx(am.first)->getTotalNumHs();
|
||||
int num_h_2 = d_mol2->getAtomWithIdx(am.second)->getTotalNumHs();
|
||||
score += std::abs(num_h_1 - num_h_2);
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
||||
int RascalResult::calcMaxDeltaAtomAtomDistScore() const {
|
||||
// Possibly this could be improved, to be the total of the minimum distances
|
||||
// between each fragment.
|
||||
if (d_atomMatches.empty()) {
|
||||
return 0;
|
||||
}
|
||||
// These arrays are cached so shouldn't be deleted. The final 'true' in the
|
||||
// call is to force recalculation, just in case there's some other type copied
|
||||
// over from the input molecule.
|
||||
const auto *mol1Dists =
|
||||
RDKit::MolOps::getDistanceMat(*d_mol1, false, false, true);
|
||||
const auto *mol2Dists =
|
||||
RDKit::MolOps::getDistanceMat(*d_mol2, false, false, true);
|
||||
|
||||
int score = 0;
|
||||
auto dist = [](int idx1, int idx2, const double *dists,
|
||||
int num_atoms) -> int {
|
||||
return int(std::nearbyint(dists[idx1 * num_atoms + idx2]));
|
||||
};
|
||||
for (size_t i = 0; i < d_atomMatches.size() - 1; ++i) {
|
||||
for (size_t j = i + 1; j < d_atomMatches.size(); ++j) {
|
||||
auto d1 = dist(d_atomMatches[i].first, d_atomMatches[j].first, mol1Dists,
|
||||
d_mol1->getNumAtoms());
|
||||
auto d2 = dist(d_atomMatches[i].second, d_atomMatches[j].second,
|
||||
mol2Dists, d_mol2->getNumAtoms());
|
||||
auto deltaDist = abs(d1 - d2);
|
||||
if (deltaDist > score) {
|
||||
score = deltaDist;
|
||||
}
|
||||
}
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
||||
int RascalResult::calcLargestFragSize() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
|
||||
std::vector<int> mapping;
|
||||
auto numFrags = RDKit::MolOps::getMolFrags(*mol1_frags, mapping);
|
||||
auto lfs = std::count(mapping.begin(), mapping.end(), 0);
|
||||
for (unsigned int i = 1; i < numFrags; ++i) {
|
||||
auto fragSize = std::count(mapping.begin(), mapping.end(), i);
|
||||
lfs = std::max(lfs, fragSize);
|
||||
}
|
||||
return lfs;
|
||||
}
|
||||
|
||||
int RascalResult::getNumFrags() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
if (d_numFrags == -1) {
|
||||
std::unique_ptr<RDKit::ROMol> mol1_frags(makeMolFrags(1));
|
||||
std::vector<int> mol1_frag_mapping;
|
||||
d_numFrags = RDKit::MolOps::getMolFrags(*mol1_frags, mol1_frag_mapping);
|
||||
}
|
||||
return d_numFrags;
|
||||
}
|
||||
|
||||
int RascalResult::getRingNonRingBondScore() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
if (d_ringNonRingBondScore == -1) {
|
||||
d_ringNonRingBondScore = calcRingNonRingScore();
|
||||
}
|
||||
return d_ringNonRingBondScore;
|
||||
}
|
||||
|
||||
int RascalResult::getAtomMatchScore() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
if (d_atomMatchScore == -1) {
|
||||
d_atomMatchScore = calcAtomMatchScore();
|
||||
}
|
||||
return d_atomMatchScore;
|
||||
}
|
||||
|
||||
int RascalResult::getMaxDeltaAtomAtomDist() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
if (d_maxDeltaAtomAtomDist == -1) {
|
||||
d_maxDeltaAtomAtomDist = calcMaxDeltaAtomAtomDistScore();
|
||||
}
|
||||
return d_maxDeltaAtomAtomDist;
|
||||
}
|
||||
|
||||
int RascalResult::getLargestFragSize() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return 0;
|
||||
}
|
||||
if (d_largestFragSize == -1) {
|
||||
d_largestFragSize = calcLargestFragSize();
|
||||
}
|
||||
return d_largestFragSize;
|
||||
}
|
||||
|
||||
std::string RascalResult::getSmarts() const {
|
||||
if (!d_mol1 || !d_mol2) {
|
||||
return "";
|
||||
}
|
||||
if (d_smarts.empty()) {
|
||||
d_smarts = createSmartsString();
|
||||
}
|
||||
return d_smarts;
|
||||
}
|
||||
|
||||
const std::shared_ptr<ROMol> RascalResult::getMcesMol() const {
|
||||
if (d_mcesMol || !d_mol1) {
|
||||
return d_mcesMol;
|
||||
}
|
||||
|
||||
boost::dynamic_bitset<> mol1Bonds(d_mol1->getNumBonds());
|
||||
for (const auto &bm : d_bondMatches) {
|
||||
mol1Bonds.set(bm.first);
|
||||
}
|
||||
boost::dynamic_bitset<> mol1Atoms(d_mol1->getNumAtoms());
|
||||
for (const auto &am : d_atomMatches) {
|
||||
mol1Atoms.set(am.first);
|
||||
}
|
||||
std::shared_ptr<RWMol> tmpMol(new RWMol(*d_mol1));
|
||||
MolOps::KekulizeIfPossible(*tmpMol);
|
||||
tmpMol->beginBatchEdit();
|
||||
for (auto &bond : tmpMol->bonds()) {
|
||||
if (!mol1Bonds[bond->getIdx()]) {
|
||||
auto bo = bond->getBondType();
|
||||
if (bond->getBeginAtom()->getNoImplicit() ||
|
||||
(bond->getBeginAtom()->getIsAromatic() &&
|
||||
bond->getBeginAtom()->getAtomicNum() != 6)) {
|
||||
bond->getBeginAtom()->setNumExplicitHs(
|
||||
bond->getBeginAtom()->getNumExplicitHs() + bo);
|
||||
}
|
||||
if (bond->getEndAtom()->getNoImplicit() ||
|
||||
(bond->getEndAtom()->getIsAromatic() &&
|
||||
bond->getEndAtom()->getAtomicNum() != 6)) {
|
||||
bond->getEndAtom()->setNumExplicitHs(
|
||||
bond->getEndAtom()->getNumExplicitHs() + bo);
|
||||
}
|
||||
tmpMol->removeBond(bond->getBeginAtomIdx(), bond->getEndAtomIdx());
|
||||
}
|
||||
}
|
||||
for (auto atom : tmpMol->atoms()) {
|
||||
if (!mol1Atoms[atom->getIdx()]) {
|
||||
tmpMol->removeAtom(atom);
|
||||
}
|
||||
}
|
||||
tmpMol->commitBatchEdit();
|
||||
MolOps::removeHs(*tmpMol);
|
||||
MolOps::sanitizeMol(*tmpMol);
|
||||
d_mcesMol = tmpMol;
|
||||
return d_mcesMol;
|
||||
}
|
||||
|
||||
namespace details {
|
||||
bool resultCompare(const RascalResult &res1, const RascalResult &res2) {
|
||||
if (res1.getBondMatches().size() != res2.getBondMatches().size()) {
|
||||
return res1.getBondMatches().size() > res2.getBondMatches().size();
|
||||
}
|
||||
if (res1.getNumFrags() != res2.getNumFrags()) {
|
||||
return res1.getNumFrags() < res2.getNumFrags();
|
||||
}
|
||||
if (res1.getLargestFragSize() != res2.getLargestFragSize()) {
|
||||
return res1.getLargestFragSize() > res2.getLargestFragSize();
|
||||
}
|
||||
if (res1.getRingNonRingBondScore() != res2.getRingNonRingBondScore()) {
|
||||
return res1.getRingNonRingBondScore() < res2.getRingNonRingBondScore();
|
||||
}
|
||||
if (res1.getAtomMatchScore() != res2.getAtomMatchScore()) {
|
||||
return res1.getAtomMatchScore() < res2.getAtomMatchScore();
|
||||
}
|
||||
if (res1.getMaxDeltaAtomAtomDist() != res2.getMaxDeltaAtomAtomDist()) {
|
||||
return res1.getMaxDeltaAtomAtomDist() < res2.getMaxDeltaAtomAtomDist();
|
||||
}
|
||||
return res1.getSmarts() < res2.getSmarts();
|
||||
}
|
||||
|
||||
// Converts a clique, given as indices into vtxPairs, into a sorted list of
// bond matches.
//   clique      - indices into vtxPairs of the members of the clique
//   vtxPairs    - the pairs of values that each clique member refers to
//   swapped     - if true, each pair is written out with first and second
//                 exchanged
//   bondMatches - output; any existing contents are discarded and the result
//                 is sorted in ascending order
void extractClique(const std::vector<unsigned int> &clique,
                   const std::vector<std::pair<int, int>> &vtxPairs,
                   bool swapped,
                   std::vector<std::pair<int, int>> &bondMatches) {
  bondMatches.clear();
  // The final size is known up front, so avoid repeated reallocation.
  bondMatches.reserve(clique.size());
  for (const auto mem : clique) {
    const auto &vtxPair = vtxPairs[mem];
    if (swapped) {
      bondMatches.emplace_back(vtxPair.second, vtxPair.first);
    } else {
      bondMatches.push_back(vtxPair);
    }
  }
  std::sort(bondMatches.begin(), bondMatches.end());
}
|
||||
|
||||
// Tidies a SMARTS string in place by repeatedly applying a fixed table of
// regex replacements: bracketed atomic-number primitives for common elements
// are replaced by their element symbols, and '-' bond symbols are removed
// between certain adjacent atom / ring-closure-digit patterns.
void cleanSmarts(std::string &smarts) {
  static const std::vector<std::pair<std::regex, std::string>> repls{
      {std::regex(R"(\[#6&A\])"), "C"},
      {std::regex(R"(\[#6&A&R\])"), "[C&R]"},
      {std::regex(R"(\[#6&a\])"), "c"},
      {std::regex(R"(\[#7&A\])"), "N"},
      {std::regex(R"(\[#7&A&R\])"), "[N&R]"},
      {std::regex(R"(\[#7&a\])"), "n"},
      {std::regex(R"(\[#8&A\])"), "O"},
      {std::regex(R"(\[#8&A&R\])"), "[O&R]"},
      {std::regex(R"(\[#8&a\])"), "o"},
      {std::regex(R"(\[#9&A\])"), "F"},
      {std::regex(R"(\[#16&A\])"), "S"},
      {std::regex(R"(\[#16&a\])"), "s"},
      {std::regex(R"(\[#17&A\])"), "Cl"},
      {std::regex(R"(\[#35&A\])"), "Br"},
      {std::regex(R"(\[#53&A\])"), "I"},
      {std::regex(R"(([A-Z])-([cnops]))"), "$1$2"},
      {std::regex(R"(([cnops][1-9]*)-([A-Z]))"), "$1$2"},
      {std::regex(R"(([A-Z][1-9]*)-([A-Z]))"), "$1$2"},
      {std::regex(R"(([A-Z])-([1-9]))"), "$1$2"}};
  // Sometimes it needs more than 1 pass through, because regex_replace only
  // handles non-overlapping matches (e.g. C-C-O becomes CC-O, then CCO), so
  // keep going until a full pass leaves the string unchanged.
  std::string start_smt = "";
  while (start_smt != smarts) {
    start_smt = smarts;
    // Bind by const reference: copying a std::regex for every table entry on
    // every pass is needlessly expensive.
    for (const auto &[patt, repl] : repls) {
      smarts = std::regex_replace(smarts, patt, repl);
    }
  }
}
|
||||
|
||||
// Writes the bond matches of res to os as 2 lines: the matched bond indices
// from the first molecule, then those from the second. Each line also
// reports the number of matches.
void printBondMatches(const RascalResult &res, std::ostream &os) {
  const auto matchedBonds = res.getBondMatches();

  os << "Bond 1 matches : " << matchedBonds.size() << " : [";
  for (const auto &bondPair : matchedBonds) {
    os << bondPair.first << ",";
  }
  os << "]" << std::endl;

  os << "Bond 2 matches : " << matchedBonds.size() << " : [";
  for (const auto &bondPair : matchedBonds) {
    os << bondPair.second << ",";
  }
  os << "]" << std::endl;
}
|
||||
|
||||
// Writes the atom matches of res to os as 2 lines: the matched atom indices
// from the first molecule, then those from the second. Each line also
// reports the number of matches.
void printAtomMatches(const RascalResult &res, std::ostream &os) {
  const auto matchedAtoms = res.getAtomMatches();

  os << "Atom 1 matches : " << matchedAtoms.size() << " : [";
  for (const auto &atomPair : matchedAtoms) {
    os << atomPair.first << ",";
  }
  os << "]" << std::endl;

  os << "Atom 2 matches : " << matchedAtoms.size() << " : [";
  for (const auto &atomPair : matchedAtoms) {
    os << atomPair.second << ",";
  }
  os << "]" << std::endl;
}
|
||||
|
||||
// Writes a single line to os holding the values that resultCompare ranks on,
// separated by " : " - number of bond matches, number of fragments, largest
// fragment size, ring/non-ring bond score, atom match score, max delta
// atom-atom distance and the SMARTS string.
void printScores(const RascalResult &res, std::ostream &os) {
  const char *sep = " : ";
  os << res.getBondMatches().size() << sep;
  os << res.getNumFrags() << sep;
  os << res.getLargestFragSize() << sep;
  os << res.getRingNonRingBondScore() << sep;
  os << res.getAtomMatchScore() << sep;
  os << res.getMaxDeltaAtomAtomDist() << sep;
  os << res.getSmarts() << std::endl;
}
|
||||
|
||||
double johnsonSimilarity(const std::vector<std::pair<int, int>> &bondMatches,
|
||||
const std::vector<std::pair<int, int>> &atomMatches,
|
||||
const RDKit::ROMol &mol1, const RDKit::ROMol &mol2) {
|
||||
double num = (bondMatches.size() + atomMatches.size()) *
|
||||
(bondMatches.size() + atomMatches.size());
|
||||
double denom = (mol1.getNumAtoms() + mol1.getNumBonds()) *
|
||||
(mol2.getNumAtoms() + mol2.getNumBonds());
|
||||
return num / denom;
|
||||
}
|
||||
} // namespace details
|
||||
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
153
Code/GraphMol/RascalMCES/RascalResult.h
Normal file
153
Code/GraphMol/RascalMCES/RascalResult.h
Normal file
@@ -0,0 +1,153 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
|
||||
// A class to hold the results of a RASCAL MCES determination
|
||||
// between 2 molecules. Contains the bonds and atoms that
|
||||
// correspond between the molecules, and also a SMARTS pattern
|
||||
// defining the MCES.
|
||||
//
|
||||
#include <RDGeneral/export.h>
|
||||
|
||||
#ifndef RASCALRESULT_H
|
||||
#define RASCALRESULT_H
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <GraphMol/ROMol.h>
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace RascalMCES {
|
||||
|
||||
class RDKIT_RASCALMCES_EXPORT RascalResult {
|
||||
public:
|
||||
RascalResult(const RDKit::ROMol &mol1, const RDKit::ROMol &mol2,
|
||||
const std::vector<std::vector<int>> &adjMatrix1,
|
||||
const std::vector<std::vector<int>> &adjMatrix2,
|
||||
const std::vector<unsigned int> &clique,
|
||||
const std::vector<std::pair<int, int>> &vtx_pairs, bool timedOut,
|
||||
bool swapped, double tier1Sim, double tier2Sim,
|
||||
bool ringMatchesRingOnly, bool singleLargestFrag,
|
||||
int minFragSep);
|
||||
// For when the tier[12]Sim didn't hit the threshold, but it
|
||||
// might be of interest what the estimates of similarity were.
|
||||
RascalResult(double tier1Sim, double tier2Sim);
|
||||
|
||||
RascalResult(const RascalResult &other);
|
||||
|
||||
RascalResult(RascalResult &&other) = default;
|
||||
|
||||
~RascalResult() = default;
|
||||
|
||||
RascalResult &operator=(const RascalResult &other);
|
||||
|
||||
RascalResult &operator=(RascalResult &&other) = default;
|
||||
|
||||
// Cut the result down to the single largest fragment. This is
|
||||
// irrecoverably destructive.
|
||||
void largestFragOnly();
|
||||
void largestFragsOnly(unsigned int numFrags = 2);
|
||||
void trimSmallFrags(unsigned int minFragSize = 3);
|
||||
|
||||
std::vector<std::pair<int, int>> getBondMatches() const {
|
||||
return d_bondMatches;
|
||||
}
|
||||
|
||||
std::vector<std::pair<int, int>> getAtomMatches() const {
|
||||
return d_atomMatches;
|
||||
}
|
||||
|
||||
// The following 5 functions are used in resultCompare to rank
|
||||
// 2 MCES of the same size for the same pair of molecules.
|
||||
// returns the number of contiguous fragments in the MCES.
|
||||
int getNumFrags() const;
|
||||
|
||||
// returns how many bonds in the clique don't match
|
||||
// cyclic/non-cyclic i.e. count as a matche in the MCES but
|
||||
// are ring bonds in one of the molecules and not in the other.
|
||||
int getRingNonRingBondScore() const;
|
||||
|
||||
// returns a score for how well the atoms in the clique from mol1 match the
|
||||
// atoms for the clique in mol2. Currently, the atom scores are the
|
||||
// difference in H count for matching atoms, and summed for the molecule. Its
|
||||
// so that, for example, an OH in mol1 that could match an OH or OMe matches
|
||||
// the OH for preference.
|
||||
int getAtomMatchScore() const;
|
||||
|
||||
// returns a score for the maximum difference in through-bond distance for
|
||||
// pairs of matching atoms in the 2 molecules. An MCES where 2 atoms
|
||||
// are far apart in one molecule and the corresponding atoms are close
|
||||
// together in the other will get a high score by this measure.
|
||||
int getMaxDeltaAtomAtomDist() const;
|
||||
|
||||
// returns the number of atoms in the largest contiguous fragment
|
||||
// in the MCES.
|
||||
int getLargestFragSize() const;
|
||||
|
||||
std::string getSmarts() const;
|
||||
const std::shared_ptr<ROMol> getMcesMol() const;
|
||||
bool getTimedOut() const { return d_timedOut; };
|
||||
|
||||
double getTier1Sim() const { return d_tier1Sim; }
|
||||
double getTier2Sim() const { return d_tier2Sim; }
|
||||
double getSimilarity() const;
|
||||
|
||||
private:
|
||||
std::shared_ptr<ROMol> d_mol1;
|
||||
std::shared_ptr<ROMol> d_mol2;
|
||||
mutable std::shared_ptr<ROMol> d_mcesMol;
|
||||
std::vector<std::pair<int, int>> d_bondMatches;
|
||||
std::vector<std::pair<int, int>> d_atomMatches;
|
||||
|
||||
mutable std::string d_smarts;
|
||||
bool d_timedOut{false};
|
||||
double d_tier1Sim;
|
||||
double d_tier2Sim;
|
||||
bool d_ringMatchesRingOnly{false};
|
||||
int d_maxFragSep{-1};
|
||||
|
||||
// These are used for sorting the results.
|
||||
mutable int d_numFrags{-1};
|
||||
mutable int d_ringNonRingBondScore{-1};
|
||||
mutable int d_atomMatchScore{-1};
|
||||
mutable int d_maxDeltaAtomAtomDist{-1};
|
||||
mutable int d_largestFragSize{-1};
|
||||
|
||||
// Assuming the frags are all part of the original MCES, just cut it
|
||||
// down to what's in the frags.
|
||||
void rebuildFromFrags(const std::vector<boost::shared_ptr<ROMol>> &frags);
|
||||
|
||||
std::string createSmartsString() const;
|
||||
|
||||
void matchCliqueAtoms(const std::vector<std::vector<int>> &mol1_adj_matrix);
|
||||
|
||||
// If the clique involves a fragment that is more than d_maxFragSep from
|
||||
// any other frag in either molecule, discard the smaller frag.
|
||||
void applyMaxFragSep();
|
||||
|
||||
// Make the fragments for either mol1 or mol2. If molNum is not 1 or 2,
|
||||
// returns nullptr.
|
||||
RDKit::ROMol *makeMolFrags(int molNum) const;
|
||||
|
||||
int calcRingNonRingScore() const;
|
||||
|
||||
int calcAtomMatchScore() const;
|
||||
|
||||
int calcLargestFragSize() const;
|
||||
|
||||
// If there are multiple fragments, can be helpful as a tie-breaker. It's the
|
||||
// maximum difference between through-bond distances between matching atoms in
|
||||
// the 2 molecules.
|
||||
int calcMaxDeltaAtomAtomDistScore() const;
|
||||
};
|
||||
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
|
||||
#endif // RASCALRESULT_H
|
||||
8
Code/GraphMol/RascalMCES/Wrap/CMakeLists.txt
Normal file
8
Code/GraphMol/RascalMCES/Wrap/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
remove_definitions(-DRDKIT_RASCALMCES_BUILD)

rdkit_python_extension(rdRascalMCES
        rdRascalMCES.cpp
        DEST Chem
        LINK_LIBRARIES RascalMCES)

# The test is named after this module. The previous name, pyMolDraw2D, did not
# match what is being tested and was presumably copied from another wrapper.
add_pytest(pyRascalMCES ${CMAKE_CURRENT_SOURCE_DIR}/testRascalMCES.py)
|
||||
217
Code/GraphMol/RascalMCES/Wrap/rdRascalMCES.cpp
Normal file
217
Code/GraphMol/RascalMCES/Wrap/rdRascalMCES.cpp
Normal file
@@ -0,0 +1,217 @@
|
||||
//
|
||||
// Copyright (C) David Cosgrove 2023
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <RDBoost/python.h>
|
||||
#include <RDBoost/Wrap.h>
|
||||
|
||||
#include <GraphMol/ROMol.h>
|
||||
#include <GraphMol/RascalMCES/RascalMCES.h>
|
||||
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalResult.h>
|
||||
|
||||
namespace python = boost::python;
|
||||
|
||||
namespace {
|
||||
|
||||
// Converts a vector of int pairs into a Python list of 2-tuples, keeping the
// order of the input.
python::list convertVecPairInt(const std::vector<std::pair<int, int>> &vec) {
  python::list pyPairs;
  for (const auto &entry : vec) {
    pyPairs.append(python::make_tuple(entry.first, entry.second));
  }
  return pyPairs;
}
|
||||
|
||||
// Python-facing accessor: the matching bonds of res as a list of tuples.
python::list bondMatches(const RDKit::RascalMCES::RascalResult &res) {
  const auto matchedBonds = res.getBondMatches();
  return convertVecPairInt(matchedBonds);
}
|
||||
// Python-facing accessor: the matching atoms of res as a list of tuples.
python::list atomMatches(const RDKit::RascalMCES::RascalResult &res) {
  const auto matchedAtoms = res.getAtomMatches();
  return convertVecPairInt(matchedAtoms);
}
|
||||
|
||||
// Python-facing wrapper that cuts res down to its single largest fragment.
// It forwards to RascalResult::largestFragOnly(), which modifies res in
// place.
void largestFragmentOnly(RDKit::RascalMCES::RascalResult &res) {
  auto &result = res;
  result.largestFragOnly();
}
|
||||
|
||||
struct RascalResult_wrapper {
|
||||
static void wrap() {
|
||||
std::string docString = "Used to return RASCAL MCES results.";
|
||||
python::class_<RDKit::RascalMCES::RascalResult>(
|
||||
"RascalResult", docString.c_str(), python::no_init)
|
||||
.def_readonly("smartsString",
|
||||
&RDKit::RascalMCES::RascalResult::getSmarts,
|
||||
"SMARTS string defining the MCES.")
|
||||
.def("bondMatches", &bondMatches,
|
||||
"A function returning a list of list "
|
||||
"of tuples, each inner list containing the matching bonds in the "
|
||||
"MCES as tuples of bond indices from mol1 and mol2")
|
||||
.def("atomMatches", &atomMatches, "Likewise for atoms.")
|
||||
.def(
|
||||
"largestFragmentOnly", &largestFragmentOnly,
|
||||
"Function that cuts the MCES down to the single largest frag. This cannot be undone.")
|
||||
.def_readonly("similarity",
|
||||
&RDKit::RascalMCES::RascalResult::getSimilarity,
|
||||
"Johnson similarity between 2 molecules.")
|
||||
.def_readonly("numFragments",
|
||||
&RDKit::RascalMCES::RascalResult::getNumFrags,
|
||||
"Number of fragments in MCES.")
|
||||
.def_readonly("largestFragmentSize",
|
||||
&RDKit::RascalMCES::RascalResult::getLargestFragSize,
|
||||
"Number of atoms in largest fragment.")
|
||||
.def_readonly("timedOut", &RDKit::RascalMCES::RascalResult::getTimedOut,
|
||||
"Whether it timed out.");
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
// Python entry point for rascalMCES. Uses default options if py_opts is
// None, otherwise the RascalOptions extracted from it. The search runs while
// a NOGIL object is in scope, and the results are returned as a Python list.
python::list findMCESWrapper(const ROMol &mol1, const ROMol &mol2,
                             python::object py_opts) {
  RascalMCES::RascalOptions opts;
  if (!py_opts.is_none()) {
    opts = python::extract<RascalMCES::RascalOptions>(py_opts);
  }

  // The lambda scopes the NOGIL object to the search itself, so it is
  // destroyed before any Python objects are built below.
  auto results = [&]() -> std::vector<RDKit::RascalMCES::RascalResult> {
    NOGIL gil;
    return RascalMCES::rascalMCES(mol1, mol2, opts);
  }();

  python::list pyResults;
  for (auto &result : results) {
    pyResults.append(result);
  }
  return pyResults;
}
|
||||
|
||||
// Converts a Python sequence of molecules into a vector of shared pointers.
// Raises a Python ValueError via throw_value_error if an element of the
// sequence evaluates as false (e.g. None).
std::vector<std::shared_ptr<ROMol>> extractMols(python::object mols) {
  const unsigned int numMols =
      python::extract<unsigned int>(mols.attr("__len__")());

  std::vector<std::shared_ptr<ROMol>> cmols;
  cmols.reserve(numMols);
  for (unsigned int molIdx = 0; molIdx < numMols; ++molIdx) {
    if (!mols[molIdx]) {
      throw_value_error("molecule is None");
    }
    std::shared_ptr<ROMol> mol =
        python::extract<std::shared_ptr<ROMol>>(mols[molIdx]);
    cmols.push_back(mol);
  }
  return cmols;
}
|
||||
|
||||
// Converts clusters, given as vectors of unsigned ints, into a Python list
// of lists holding the same values in the same order.
python::list packOutputMols(
    const std::vector<std::vector<unsigned int>> &clusters) {
  python::list pyClusters;
  for (const auto &cluster : clusters) {
    python::list pyCluster;
    for (const auto &member : cluster) {
      pyCluster.append(member);
    }
    pyClusters.append(pyCluster);
  }
  return pyClusters;
}
|
||||
|
||||
// Python entry point for rascalCluster. Uses default options if py_opts is
// None, otherwise the RascalClusterOptions extracted from it. The clustering
// runs while a NOGIL object is in scope, and the clusters are returned as a
// Python list of lists.
python::list rascalClusterWrapper(python::object mols, python::object py_opts) {
  RascalMCES::RascalClusterOptions opts;
  if (!py_opts.is_none()) {
    opts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
  }
  auto cmols = extractMols(mols);

  // The lambda scopes the NOGIL object to the clustering itself, so it is
  // destroyed before any Python objects are built.
  const auto clusters = [&]() -> std::vector<RDKit::UINT_VECT> {
    NOGIL gil;
    return RascalMCES::rascalCluster(cmols, opts);
  }();
  return packOutputMols(clusters);
}
|
||||
|
||||
// Python entry point for rascalButinaCluster. Uses default options if
// py_opts is None, otherwise the RascalClusterOptions extracted from it. The
// clustering runs while a NOGIL object is in scope, and the clusters are
// returned as a Python list of lists.
python::list rascalButinaClusterWrapper(python::object mols,
                                        python::object py_opts) {
  RascalMCES::RascalClusterOptions opts;
  if (!py_opts.is_none()) {
    opts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
  }
  auto cmols = extractMols(mols);

  // The lambda scopes the NOGIL object to the clustering itself, so it is
  // destroyed before any Python objects are built.
  const auto clusters = [&]() -> std::vector<RDKit::UINT_VECT> {
    NOGIL gil;
    return RascalMCES::rascalButinaCluster(cmols, opts);
  }();
  return packOutputMols(clusters);
}
|
||||
|
||||
// Defines the rdRascalMCES Python module: the RascalResult and RascalOptions
// classes plus the FindMCES, RascalCluster and RascalButinaCluster functions.
BOOST_PYTHON_MODULE(rdRascalMCES) {
  python::scope().attr("__doc__") =
      "Module containing implementation of RASCAL Maximum Common Edge Substructure algorithm.";
  RascalResult_wrapper::wrap();

  std::string docString = "RASCAL Options";
  python::class_<RDKit::RascalMCES::RascalOptions, boost::noncopyable>(
      "RascalOptions", docString.c_str())
      .def_readwrite(
          "similarityThreshold",
          &RDKit::RascalMCES::RascalOptions::similarityThreshold,
          "Threshold below which MCES won't be run.  Between 0.0 and 1.0, default=0.7.")
      .def_readwrite(
          "completeAromaticRings",
          &RDKit::RascalMCES::RascalOptions::completeAromaticRings,
          "If True (default), partial aromatic rings won't be returned.")
      .def_readwrite("ringMatchesRingOnly",
                     &RDKit::RascalMCES::RascalOptions::ringMatchesRingOnly,
                     "If True, ring bonds won't match non-ring bonds.")
      .def_readwrite(
          "minFragSize", &RDKit::RascalMCES::RascalOptions::minFragSize,
          "Imposes a minimum on the number of atoms in a fragment that may be part of the MCES.  Default -1 means no minimum.")
      .def_readwrite(
          "maxFragSeparation",
          &RDKit::RascalMCES::RascalOptions::maxFragSeparation,
          "Maximum number of bonds between fragments in the MCES for both to be reported.  Default -1 means no maximum.  If exceeded, the smaller fragment will be removed.")
      .def_readwrite(
          "allBestMCESs", &RDKit::RascalMCES::RascalOptions::allBestMCESs,
          "If True, reports all MCESs found of the same maximum size.  Default False means just report the first found.")
      .def_readwrite(
          "timeout", &RDKit::RascalMCES::RascalOptions::timeout,
          "Maximum time (in seconds) to spend on an individual MCESs determination.  Default 60, -1 means no limit.");

  // Each argument description ends in a newline so that the items show up on
  // separate lines in Python's help() rather than run together.
  docString =
      "Find one or more MCESs between the 2 molecules given.  Returns a list of "
      "RascalResult objects.\n\n"
      "- mol1\n"
      "- mol2 The two molecules for which to find the MCES\n"
      "- opts Optional RascalOptions object changing the default run mode.\n";
  python::def("FindMCES", &RDKit::findMCESWrapper,
              (python::arg("mol1"), python::arg("mol2"),
               python::arg("opts") = python::object()),
              docString.c_str());

  // NOTE(review): the cluster wrappers extract a RascalClusterOptions from
  // opts, but no Python class for RascalClusterOptions is registered in this
  // block - confirm that it is exposed somewhere, otherwise only opts=None
  // can work.
  docString =
      "Use the RASCAL MCES similarity metric to do fuzzy clustering.  Returns a list of lists "
      "of molecule indices, each inner list being a cluster.  The last cluster is all the "
      "molecules that didn't fit into another cluster (the singletons).\n\n"
      "- mols List of molecules to be clustered\n"
      "- opts Optional RascalClusterOptions object changing the default run mode.\n";
  python::def("RascalCluster", &RDKit::rascalClusterWrapper,
              (python::arg("mols"), python::arg("opts") = python::object()),
              docString.c_str());
  docString =
      "Use the RASCAL MCES similarity metric to do Butina clustering"
      " (Butina JCICS 39 747-750 (1999)).  Returns a list of lists of molecule indices,"
      " each inner list being a cluster.  The last cluster is all the"
      " molecules that didn't fit into another cluster (the singletons).\n\n"
      "- mols List of molecules to be clustered\n"
      "- opts Optional RascalClusterOptions object changing the default run mode.\n";
  python::def("RascalButinaCluster", &RDKit::rascalButinaClusterWrapper,
              (python::arg("mols"), python::arg("opts") = python::object()),
              docString.c_str());
}
|
||||
|
||||
} // namespace RDKit
|
||||
119
Code/GraphMol/RascalMCES/Wrap/testRascalMCES.py
Normal file
119
Code/GraphMol/RascalMCES/Wrap/testRascalMCES.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# Copyright (c) 2023 David Cosgrove and other RDKit contributors
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials provided
|
||||
# with the distribution.
|
||||
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
||||
# nor the names of its contributors may be used to endorse or promote
|
||||
# products derived from this software without specific prior written
|
||||
# permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
|
||||
# These tests are just to check that the Python wrappers are working
|
||||
# ok. The bulk of the tests are in the C++ code.
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import rdRascalMCES
|
||||
|
||||
|
||||
class TestCase(unittest.TestCase):
  """Smoke tests checking that the rdRascalMCES Python wrappers work.

  The bulk of the algorithm tests are in the C++ code.
  """

  def _checkBenzeneMces(self, results):
    # Chlorobenzene and fluorobenzene share only the benzene ring.
    self.assertEqual(len(results), 1)
    best = results[0]
    self.assertEqual(best.smartsString, 'c1:c:c:c:c:c:1')
    self.assertEqual(len(best.bondMatches()), 6)
    self.assertEqual(len(best.atomMatches()), 6)

  @staticmethod
  def _loadCdk2Mols():
    # Read the cdk2 set that the clustering tests work on.
    cdk2_file = Path(os.environ['RDBASE']) / 'Contrib' / 'Fastcluster' / 'cdk2.smi'
    suppl = Chem.SmilesMolSupplier(str(cdk2_file), '\t', 1, 0, False)
    return [mol for mol in suppl]

  def _checkClusterSizes(self, clusters, expNumClusters, expClusters):
    self.assertEqual(len(clusters), expNumClusters)
    for expClusSize, clus in zip(expClusters, clusters):
      self.assertEqual(expClusSize, len(clus))

  def test1(self):
    # Test passing a default options object
    mol1 = Chem.MolFromSmiles("c1ccccc1Cl")
    mol2 = Chem.MolFromSmiles("c1ccccc1F")
    opts = rdRascalMCES.RascalOptions()
    self._checkBenzeneMces(rdRascalMCES.FindMCES(mol1, mol2, opts))

  def test2(self):
    # Test single largest fragment extraction
    ad1 = Chem.MolFromSmiles("CN(C)c1ccc(CC(=O)NCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL153934")
    ad2 = Chem.MolFromSmiles("N(C)c1ccc(CC(=O)NCCCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL157336")

    results = rdRascalMCES.FindMCES(ad1, ad2, rdRascalMCES.RascalOptions())
    self.assertEqual(len(results), 1)
    res = results[0]
    self.assertEqual(res.smartsString,
                     'N(-C)-c1:c:c:c(-CC(=O)-NCCCCCCCCCC):c:c:1.NC12CC3CC(-C1)-CC(-C2)-C3')
    res.largestFragmentOnly()
    self.assertEqual(res.smartsString, 'N(-C)-c1:c:c:c(-CC(=O)-NCCCCCCCCCC):c:c:1')

  def test3(self):
    # Test not specifying options
    mol1 = Chem.MolFromSmiles("c1ccccc1Cl")
    mol2 = Chem.MolFromSmiles("c1ccccc1F")
    self._checkBenzeneMces(rdRascalMCES.FindMCES(mol1, mol2))

  def test4(self):
    # Test setting non-default option
    mol1 = Chem.MolFromSmiles('Oc1cccc2C(=O)C=CC(=O)c12')
    mol2 = Chem.MolFromSmiles('O1C(=O)C=Cc2cc(OC)c(O)cc12')
    # With the default options nothing is returned for this pair.
    self.assertEqual(len(rdRascalMCES.FindMCES(mol1, mol2)), 0)

    opts = rdRascalMCES.RascalOptions()
    opts.similarityThreshold = 0.5
    self.assertEqual(len(rdRascalMCES.FindMCES(mol1, mol2, opts)), 1)

  def testRascalCluster(self):
    clusters = rdRascalMCES.RascalCluster(self._loadCdk2Mols())
    self._checkClusterSizes(clusters, 8, [7, 7, 6, 2, 2, 2, 2, 20])

  def testRascalButinaCluster(self):
    clusters = rdRascalMCES.RascalButinaCluster(self._loadCdk2Mols())
    self._checkClusterSizes(clusters, 29, [6, 6, 6, 2, 2, 2] + [1] * 23)


if __name__ == "__main__":
  unittest.main()
|
||||
529
Code/GraphMol/RascalMCES/data/chembl_1907596.smi
Normal file
529
Code/GraphMol/RascalMCES/data/chembl_1907596.smi
Normal file
@@ -0,0 +1,529 @@
|
||||
CHEMBL1907596_1 CN1CCC[C@H]1COc2cccnc2
|
||||
CHEMBL1907596_2 C(Oc1cncnc1)[C@@H]2CCN2
|
||||
CHEMBL1907596_3 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_4 Fc1ncccc1OC[C@@H]2CCN2
|
||||
CHEMBL1907596_5 Fc1ncc(\C=C\c2cc(OC[C@@H]3CCN3)cnc2Cl)cc1Br
|
||||
CHEMBL1907596_6 Clc1ncc(OC[C@@H]2CCCN2)cc1\C=C\c3ccnc(Br)c3
|
||||
CHEMBL1907596_7 Fc1cc(\C=C\c2cc(OC[C@@H]3CCCN3)cnc2Cl)ccn1
|
||||
CHEMBL1907596_8 Clc1ncc(OC[C@@H]2CCN2)cc1\C=C\c3ccnc(Br)c3
|
||||
CHEMBL1907596_9 Clc1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_10 Clc1ncc(OC[C@@H]2CCCN2)cc1c3cccnc3
|
||||
CHEMBL1907596_11 Fc1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
|
||||
CHEMBL1907596_12 Brc1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
|
||||
CHEMBL1907596_13 Ic1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
|
||||
CHEMBL1907596_14 Clc1ccc(cn1)c2cc(OC[C@@H]3CCN3)cnc2Cl
|
||||
CHEMBL1907596_15 Fc1ccc(cn1)c2cc(OC[C@@H]3CCN3)cnc2Cl
|
||||
CHEMBL1907596_16 Clc1ncc(OC[C@@H]2CCN2)cc1c3cccnc3
|
||||
CHEMBL1907596_17 Clc1ncc(OC[C@@H]2CCN2)cc1\C=C\c3ccncc3
|
||||
CHEMBL1907596_18 Fc1cc(ccn1)c2cc(OC[C@@H]3CCN3)cnc2Cl
|
||||
CHEMBL1907596_19 Fc1cc(\C=C\c2cc(OC[C@@H]3CCN3)cnc2Cl)ccn1
|
||||
CHEMBL1907596_20 Fc1ccnc[n+]1c2cc(OC[C@@H]3CCN3)cnc2Cl
|
||||
CHEMBL1907596_21 Clc1ncc(OC[C@@H]2CCN2)cc1c3ccnc(Br)c3
|
||||
CHEMBL1907596_22 Clc1ncc(OC[C@@H]2CCCN2)cc1c3ccc(Br)nc3
|
||||
CHEMBL1907596_23 Fc1ccc(cn1)c2cc(OC[C@@H]3CCCN3)cnc2Cl
|
||||
CHEMBL1907596_24 Fc1ncc(cc1Br)c2cc(OC[C@@H]3CCN3)cnc2Cl
|
||||
CHEMBL1907596_25 Clc1ncc(OC[C@@H]2CCN2)cc1c3ccc(Br)nc3
|
||||
CHEMBL1907596_26 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_27 Clc1ncc(OCC2CCCN2)cc1\C=C\c3ccccn3
|
||||
CHEMBL1907596_28 CN1CCC1COc2cnc(Cl)c(\C=C\c3ccncc3)c2
|
||||
CHEMBL1907596_29 CN1CCCC1COc2cnc(Cl)c(\C=C\c3ccccn3)c2
|
||||
CHEMBL1907596_30 CN1CCCC1COc2cnc(Cl)c(\C=C\c3cccnc3)c2
|
||||
CHEMBL1907596_31 CN1CCCC1COc2cnc(Cl)c(\C=C\c3ccncc3)c2
|
||||
CHEMBL1907596_32 CN1CCCC1COc2cnc(Cl)c(CCc3ccncc3)c2
|
||||
CHEMBL1907596_33 Clc1ncc(OCC2CCN2)cc1\C=C\c3ccncc3
|
||||
CHEMBL1907596_34 Clc1ncc(OCC2CCCN2)cc1CCc3ccncc3
|
||||
CHEMBL1907596_35 Clc1ncc(OCC2CCCN2)cc1\C=C\c3ccncc3
|
||||
CHEMBL1907596_36 Clc1ncc(OCC2CCCN2)cc1\C=C\c3cccnc3
|
||||
CHEMBL1907596_37 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_38 CN1CCCC1c2ccc(Br)nc2
|
||||
CHEMBL1907596_39 CN1CCCC1c2ccc(Cl)nc2
|
||||
CHEMBL1907596_40 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_41 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_42 C1C[C@H]2CCC(N2)C(=C1)c3cccnc3
|
||||
CHEMBL1907596_43 Clc1ccc(cn1)C2=CCC[C@H]3CCC2N3
|
||||
CHEMBL1907596_44 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_45 CO[C@@H]1CC=C2CCN3CCC4=C(CC(=O)OC4)[C@@]23C1
|
||||
CHEMBL1907596_46 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_47 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_48 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_49 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_50 Clc1ncc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_51 C1CC2CCC(N2)C(=C1)c3cccnc3
|
||||
CHEMBL1907596_52 Clc1ccc(cn1)C2=CCCC3CCC2N3
|
||||
CHEMBL1907596_53 CN1CCC[C@H]1COc2cccnc2
|
||||
CHEMBL1907596_54 CN1CCC[C@H]1COc2cncc(CCc3ccccc3)c2
|
||||
CHEMBL1907596_55 CN1CCC[C@H]1COc2cncc(\C=C\c3ccccc3)c2
|
||||
CHEMBL1907596_56 CN1CCC[C@H]1COc2cncc(c2)c3oc4ccccc4c3
|
||||
CHEMBL1907596_57 CN1CCC[C@H]1COc2cncc(c2)C#Cc3ccccc3
|
||||
CHEMBL1907596_58 CN1CCC[C@H]1COc2cncc(c2)c3cncnc3
|
||||
CHEMBL1907596_59 CN1CCC[C@H]1COc2cncc(c2)c3ccc(F)c(Cl)c3
|
||||
CHEMBL1907596_60 CN1CCC[C@H]1COc2cncc(c2)c3ccc(Cl)cc3Cl
|
||||
CHEMBL1907596_61 CN1CCC[C@H]1COc2cncc(c2)c3ccc(Cl)cc3
|
||||
CHEMBL1907596_62 CN1CCC[C@H]1COc2cncc(c2)c3ccc(C)cc3
|
||||
CHEMBL1907596_63 CN1CCC[C@H]1COc2cncc(c2)c3ccc(F)cc3
|
||||
CHEMBL1907596_64 CN1CCC[C@H]1COc2cncc(c2)c3cccc(N)c3
|
||||
CHEMBL1907596_65 CN1CCC[C@H]1COc2cncc(c2)c3cccc(c3)[N+](=O)[O-]
|
||||
CHEMBL1907596_66 CN1CCC[C@H]1COc2cncc(c2)c3ccc(cc3)C(F)(F)F
|
||||
CHEMBL1907596_67 COc1ccc(cc1)c2cncc(OC[C@@H]3CCCN3C)c2
|
||||
CHEMBL1907596_68 CN1CCC[C@H]1COc2cncc(c2)c3ccccc3C=O
|
||||
CHEMBL1907596_69 CN1CCC[C@H]1COc2cncc(CCc3ccncc3)c2
|
||||
CHEMBL1907596_70 CN1CCC[C@H]1COc2cncc(c2)c3ccccc3
|
||||
CHEMBL1907596_71 CN1CCC[C@H]1COc2cncc(c2)C#Cc3ccc(C)cc3
|
||||
CHEMBL1907596_72 CN1CCC[C@H]1COc2cncc(\C=C\c3ccncc3)c2
|
||||
CHEMBL1907596_73 CN1CCC[C@H]1COc2cncc(c2)c3ccc4ccccc4c3
|
||||
CHEMBL1907596_74 CN1CCC[C@H]1COc2cncc(c2)c3cccs3
|
||||
CHEMBL1907596_75 CN1CCC[C@H]1COc2cncc(c2)c3occc3
|
||||
CHEMBL1907596_76 CN1CCC[C@H]1COc2cncc(c2)c3cccnc3
|
||||
CHEMBL1907596_77 CN1CCC[C@H]1COc2cncc(c2)c3cc4ccccc4n3C
|
||||
CHEMBL1907596_78 CN1CCC[C@H]1COc2cncc(c2)c3cnc4ccccc4c3
|
||||
CHEMBL1907596_79 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_80 Brc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_81 Ic1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_82 Fc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_83 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_84 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_85 CN1CCCC1c2ccc(Br)nc2
|
||||
CHEMBL1907596_86 CN1CCCC1c2ccc(Cl)nc2
|
||||
CHEMBL1907596_87 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_88 Ic1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_89 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_90 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_91 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_92 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_93 C1C[C@H]2CCC(N2)C(=C1)c3cccnc3
|
||||
CHEMBL1907596_94 Clc1ccc(cn1)C2=CCC[C@H]3CCC2N3
|
||||
CHEMBL1907596_95 C1C[C@H]2CCC(N2)C(=C1)c3cncnc3
|
||||
CHEMBL1907596_96 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_97 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_98 Ic1cncc(O[C@H]2CCN2)c1
|
||||
CHEMBL1907596_99 C(Oc1cccnc1)[C@@H]2CCN2
|
||||
CHEMBL1907596_100 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_101 Brc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_102 Fc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_103 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_104 Nc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_105 C(Oc1cccnc1)[C@@H]2CCN2
|
||||
CHEMBL1907596_106 Fc1ncccc1OC[C@@H]2CCN2
|
||||
CHEMBL1907596_107 Clc1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_108 [O-][N+](=O)c1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_109 CCOc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_110 CCCc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_111 Fc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_112 C(Oc1cncc(c1)c2ccccc2)[C@@H]3CCN3
|
||||
CHEMBL1907596_113 CC(=O)NCc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_114 Cc1cncc(OC[C@H]2CCN2)c1
|
||||
CHEMBL1907596_115 Oc1cncc(OC[C@H]2CCN2)c1
|
||||
CHEMBL1907596_116 Clc1cncc(OC[C@H]2CCN2)c1
|
||||
CHEMBL1907596_117 COc1ccc(OC[C@H]2CCN2)cn1
|
||||
CHEMBL1907596_118 Clc1ccc(OC[C@H]2CCN2)cn1
|
||||
CHEMBL1907596_119 Cc1ccc(OC[C@H]2CCN2)cn1
|
||||
CHEMBL1907596_120 Cc1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_121 Brc1ccc(OC[C@H]2CCN2)cn1
|
||||
CHEMBL1907596_122 Fc1ccc(OC[C@H]2CCN2)cn1
|
||||
CHEMBL1907596_123 Fc1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_124 Brc1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_125 CCc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_126 Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_127 Nc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_128 FC(F)(F)c1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_129 Brc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_130 Clc1cc(OC[C@@H]2CCN2)cnc1Cl
|
||||
CHEMBL1907596_131 Clc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_132 Fc1ncccc1OC[C@H]2CCN2
|
||||
CHEMBL1907596_133 Clc1ncc(OC[C@@H]2CCN2)cc1c3ccccc3
|
||||
CHEMBL1907596_134 Clc1ncc(OC[C@@H]2CCN2)cc1Br
|
||||
CHEMBL1907596_135 N#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_136 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_137 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_138 Ic1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_139 Ic1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_140 Nc1cc(cnc1Cl)C2CC3CCC2N3
|
||||
CHEMBL1907596_141 Clc1ncc(cc1I)C2CC3CCC2N3
|
||||
CHEMBL1907596_142 Clc1ncc(cc1Br)C2CC3CCC2N3
|
||||
CHEMBL1907596_143 Clc1ncc(cc1N=[N+]=[N-])C2CC3CCC2N3
|
||||
CHEMBL1907596_144 Clc1ncc(cc1c2ccccc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_145 Fc1cc(cnc1Cl)C2CC3CCC2N3
|
||||
CHEMBL1907596_146 Clc1cc(cnc1Cl)C2CC3CCC2N3
|
||||
CHEMBL1907596_147 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_148 Fc1ncc(cc1c2ccccc2)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_149 Fc1ncc(cc1c2ccccc2)[C@@H]3C[C@H]4CC[C@@H]3N4
|
||||
CHEMBL1907596_150 Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_151 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_152 Brc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_153 Fc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_154 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_155 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_156 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_157 CN1[C@@H]2CC[C@H]1[C@@H](C2)c3ccc(Cl)nc3
|
||||
CHEMBL1907596_158 Cc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_159 CN1[C@H]2CC[C@@H]1[C@H](C2)c3ccc(Cl)nc3
|
||||
CHEMBL1907596_160 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_161 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_162 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_163 CN1C2CCC1C(C2)c3ccc(Cl)nc3
|
||||
CHEMBL1907596_164 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_165 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_166 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_167 C(Oc1cccnc1)[C@@H]2CCN2
|
||||
CHEMBL1907596_168 Fc1ncccc1OC[C@@H]2CCN2
|
||||
CHEMBL1907596_169 Fc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_170 Fc1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_171 Brc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_172 Clc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_173 Clc1ncccc1OC[C@@H]2CCN2
|
||||
CHEMBL1907596_174 Ic1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_175 Ic1ccc(OC[C@@H]2CCN2)cn1
|
||||
CHEMBL1907596_176 Brc1ncccc1OC[C@@H]2CCN2
|
||||
CHEMBL1907596_177 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_178 CCCCCCc1cncc(OC[C@@H]2CCCN2C)c1
|
||||
CHEMBL1907596_179 CCCCc1cncc(OC[C@@H]2CCCN2C)c1
|
||||
CHEMBL1907596_180 CC(C)Cc1cncc(OC[C@@H]2CCCN2C)c1
|
||||
CHEMBL1907596_181 CCCc1cncc(OC[C@@H]2CCCN2C)c1
|
||||
CHEMBL1907596_182 CN1CCC[C@H]1COc2cncc(N)c2
|
||||
CHEMBL1907596_183 CN1CCC[C@H]1COc2cncc(Br)c2
|
||||
CHEMBL1907596_184 CN1CCC[C@H]1COc2cncc(Cl)c2
|
||||
CHEMBL1907596_185 CCc1cncc(OC[C@@H]2CCCN2C)c1
|
||||
CHEMBL1907596_186 CN1CCC[C@H]1COc2cncc(C)c2
|
||||
CHEMBL1907596_187 CN1CCC[C@H]1COc2ccc(Cl)nc2
|
||||
CHEMBL1907596_188 CN1CCC[C@H]1COc2cccnc2
|
||||
CHEMBL1907596_189 CN1CCC[C@H]1COc2cncc(F)c2
|
||||
CHEMBL1907596_190 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_191 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_192 Brc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_193 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_194 Fc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_195 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_196 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_197 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_198 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_199 Cl.Cl.C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_200 Cl.[O-][N+](=O)c1ccc2C3CNCC(C3)c2c1
|
||||
CHEMBL1907596_201 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_202 Fc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_203 Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_204 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_205 COc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_206 Fc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_207 Fc1ncc(cc1c2cccc(Cl)c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_208 Fc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_209 Fc1ncc(cc1c2ccc(Cl)cc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_210 [O-][N+](=O)c1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_211 COc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_212 [O-][N+](=O)c1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_213 Nc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_214 Nc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_215 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_216 CN1CCC[C@H]1COc2cncc(c2)C#Cc3ccccc3
|
||||
CHEMBL1907596_217 CN1CCC[C@H]1COc2cncc(c2)C#CCO
|
||||
CHEMBL1907596_218 CN1CCC[C@H]1COc2cncc(c2)C#CCCCCO
|
||||
CHEMBL1907596_219 CN1CCC[C@H]1COc2cncc(c2)C#CCCCCF
|
||||
CHEMBL1907596_220 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_221 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_222 Ic1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_223 Nc1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_224 C=Cc1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_225 Fc1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_226 Clc1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_227 Brc1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_228 C#Cc1cncc(c1)C2CC3CCC2N3
|
||||
CHEMBL1907596_229 CN1C2CCC1C(C2)c3cncc(I)c3
|
||||
CHEMBL1907596_230 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_231 Clc1ccc(cn1)C2CC3CCCCC2N3
|
||||
CHEMBL1907596_232 Clc1ccc(cn1)[C@@H]2CC3CNC2C3
|
||||
CHEMBL1907596_233 Clc1ccc(cn1)[C@@H]2CC3CC2CN3
|
||||
CHEMBL1907596_234 Clc1ccc(cn1)C2CC3CCC2CN3
|
||||
CHEMBL1907596_235 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_236 Clc1ccc(cn1)C2CC3CCC2NC3
|
||||
CHEMBL1907596_237 Clc1ccc(cn1)[C@@H]2CC3CNC2C3
|
||||
CHEMBL1907596_238 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_239 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_240 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_241 C(Oc1cccnc1)C2CCN2
|
||||
CHEMBL1907596_242 CN1CCCC1COc2cccnc2
|
||||
CHEMBL1907596_243 C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_244 Clc1ccc2cc3C4CNCC(C4)c3cc2n1
|
||||
CHEMBL1907596_245 Cc1ccc2cc3C4CNCC(C4)c3cc2n1
|
||||
CHEMBL1907596_246 Cc1cnc2cc3C4CNCC(C4)c3cc2c1
|
||||
CHEMBL1907596_247 [O-][N+](=O)c1ccc2C3CNCC(C3)c2c1
|
||||
CHEMBL1907596_248 Fc1cc2C3CNCC(C3)c2cc1F
|
||||
CHEMBL1907596_249 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_250 Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_251 Clc1ncc(cc1c2ccccc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_252 Fc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_253 Fc1ncc(cc1c2ccc(Cl)cc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_254 Fc1ncc(cc1c2ccc(cc2)C#N)C3CC4CCC3N4
|
||||
CHEMBL1907596_255 Fc1ncc(cc1c2ccc(Cl)c(Cl)c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_256 CN1C2CCC1C(C2)c3cnc(Cl)c(c3)c4ccccc4
|
||||
CHEMBL1907596_257 Cc1ccc(cc1)c2cc(cnc2Cl)C3CC4CCC3N4
|
||||
CHEMBL1907596_258 COc1ccc(cc1)c2cc(cnc2Cl)C3CC4CCC3N4
|
||||
CHEMBL1907596_259 Cc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_260 Cc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_261 CN1CCCC1c2ccc(Br)nc2
|
||||
CHEMBL1907596_262 CN1CCCC1c2ccc(Cl)nc2
|
||||
CHEMBL1907596_263 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_264 C=CC1=CC=C2C3CNCC(C3)CN2C1=O
|
||||
CHEMBL1907596_265 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_266 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_267 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_268 C1CN(C[C@@H]2NC[C@H]12)c3cccnc3
|
||||
CHEMBL1907596_269 Clc1ccc(cn1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_270 N#Cc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_271 COc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_272 Brc1ncc(cc1C#N)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_273 N\C(=N\O)\c1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_274 C1C[C@H]2CN([C@H]2CN1)c3cccnc3
|
||||
CHEMBL1907596_275 Brc1ncc(cc1C#N)N2C[C@@H]3CCNC[C@H]23
|
||||
CHEMBL1907596_276 C1CN(C[C@@H]2NC[C@H]12)c3cncnc3
|
||||
CHEMBL1907596_277 Clc1ccc(nn1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_278 Brc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_279 Clc1cc(cnc1Cl)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_280 COc1ccc(cn1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_281 CCOc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_282 Clc1ccc(cn1)N2C[C@@H]3CCNC[C@H]23
|
||||
CHEMBL1907596_283 Clc1cc(cnc1Cl)N2C[C@@H]3CCNC[C@H]23
|
||||
CHEMBL1907596_284 Brc1cc(cnc1Br)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_285 Cc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_286 Cc1cc(cnc1Cl)N2C[C@@H]3CCNC[C@H]23
|
||||
CHEMBL1907596_287 Cc1cc(cnc1Cl)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_288 COc1cc(cnc1Br)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_289 Clc1ccc(cn1)N2C[C@H]3CCNC[C@@H]23
|
||||
CHEMBL1907596_290 Clc1cc(cnc1Cl)N2C[C@H]3CCNC[C@@H]23
|
||||
CHEMBL1907596_291 N#Cc1cncc(c1)N2C[C@@H]3CCNC[C@H]23
|
||||
CHEMBL1907596_292 Cc1cc(cnc1Cl)N2C[C@H]3CCNC[C@@H]23
|
||||
CHEMBL1907596_293 COc1cc(cnc1Br)N2C[C@H]3CCNC[C@@H]23
|
||||
CHEMBL1907596_294 Brc1ncc(cc1C#N)N2C[C@H]3CCNC[C@@H]23
|
||||
CHEMBL1907596_295 COc1cncc(c1)N2C[C@H]3CCNC[C@@H]23
|
||||
CHEMBL1907596_296 CCOc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_297 C1CN(C[C@H]2NC[C@@H]12)c3cccnc3
|
||||
CHEMBL1907596_298 Clc1ccc(cn1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_299 Brc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_300 Clc1cc(cnc1Cl)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_301 Brc1cc(cnc1Br)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_302 COc1cncc(c1)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_303 Cc1cc(cnc1Cl)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_304 N#Cc1cncc(c1)N2CC[C@H]3CN[C@H]3C2
|
||||
CHEMBL1907596_305 COc1cc(cnc1Br)N2CC[C@@H]3CN[C@@H]3C2
|
||||
CHEMBL1907596_306 C1CN(C[C@H]2NC[C@@H]12)c3cncnc3
|
||||
CHEMBL1907596_307 Clc1ccc(cn1)[C@H]2C[C@@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_308 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_309 Cl.Cl.Clc1ccc(cn1)C2CC3CCCC2N3
|
||||
CHEMBL1907596_310 Cl.Cl.Cl.Clc1ccc(cn1)C2CC3CC2CN3
|
||||
CHEMBL1907596_311 Cl.Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_312 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_313 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_314 Clc1ccc(cn1)N2CC3CC2CN3
|
||||
CHEMBL1907596_315 C1CC2CNC1CN2c3cccnc3
|
||||
CHEMBL1907596_316 C1NC2CC1CN(C2)c3cccnc3
|
||||
CHEMBL1907596_317 C1NCC2CC1CN2c3cccnc3
|
||||
CHEMBL1907596_318 C1NC2CC1N(C2)c3cccnc3
|
||||
CHEMBL1907596_319 [I-].C[N+]1(C)CC2CC1CN2c3ccc(Cl)nc3
|
||||
CHEMBL1907596_320 Fc1ccc(cn1)N2CC3CC2CN3
|
||||
CHEMBL1907596_321 COc1ccc(cn1)N2CC3CC2CN3
|
||||
CHEMBL1907596_322 N#Cc1cncc(c1)N2CC3CC2CN3
|
||||
CHEMBL1907596_323 Oc1cc(cnc1Cl)N2CC3CC2CN3
|
||||
CHEMBL1907596_324 COc1cncc(c1)N2CC3CC2CN3
|
||||
CHEMBL1907596_325 Cc1cc(cnc1Cl)N2CC3CC2CN3
|
||||
CHEMBL1907596_326 Clc1cc(cnc1Cl)N2CC3CC2CN3
|
||||
CHEMBL1907596_327 COc1cc(cnc1Cl)N2CC3CC2CN3
|
||||
CHEMBL1907596_328 C1NC2CC1N(C2)c3cncnc3
|
||||
CHEMBL1907596_329 C1NC2CC1N(C2)c3cc4ncccc4s3
|
||||
CHEMBL1907596_330 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_331 Fc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_332 C1CC2NC1CC2c3cccnc3
|
||||
CHEMBL1907596_333 C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_334 Fc1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_335 Clc1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_336 Cl.Fc1ncc(cc1c2ccccc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_337 Cl.Fc1ncc(cc1c2ccc(Cl)cc2)C3CC4CCC3N4
|
||||
CHEMBL1907596_338 Cl.Fc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_339 Cl.Nc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_340 Cl.Nc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_341 Cl.[O-][N+](=O)c1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_342 Cl.Fc1ncc(cc1c2cccc(Cl)c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_343 Cl.Fc1ccc(cc1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_344 Cl.Nc1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_345 Cl.COc1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_346 Cl.[O-][N+](=O)c1cccc(c1)c2cc(cnc2F)C3CC4CCC3N4
|
||||
CHEMBL1907596_347 Cl.Cl.C1CC2NC1CC2c3cncc(c3)c4ccccc4
|
||||
CHEMBL1907596_348 Cl.Cl.Fc1cccc(c1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_349 Cl.Cl.[O-][N+](=O)c1cccc(c1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_350 Cl.Cl.Cl.Clc1cccc(c1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_351 Cl.Cl.[O-][N+](=O)c1ccc(cc1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_352 Cl.Cl.COc1cccc(c1)c2cncc(c2)C3CC4CCC3N4
|
||||
CHEMBL1907596_353 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_354 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_355 CN1[C@@H]2CC[C@@H]1[C@H](C2)c3cncc(c3)c4ccnc(F)c4
|
||||
CHEMBL1907596_356 CN1[C@@H]2CC[C@@H]1[C@H](C2)c3cncc(c3)c4cccnc4F
|
||||
CHEMBL1907596_357 CN1[C@@H]2CC[C@@H]1[C@H](C2)c3cncc(c3)c4cccc(F)n4
|
||||
CHEMBL1907596_358 CN1C2CCC1C(C2)c3cncc(c3)c4ccc(F)nc4
|
||||
CHEMBL1907596_359 Clc1ccc(cn1)N2CC3CC(C2)N3
|
||||
CHEMBL1907596_360 C1C2CN(CC1N2)c3cccnc3
|
||||
CHEMBL1907596_361 Brc1ccc(cn1)N2CC3CC(C2)N3
|
||||
CHEMBL1907596_362 Clc1ccc(nn1)N2CC3CC(C2)N3
|
||||
CHEMBL1907596_363 CN1C2CC1CN(C2)c3ccc(Cl)nc3
|
||||
CHEMBL1907596_364 Clc1ccc(cn1)N2CC3CC2CN3
|
||||
CHEMBL1907596_365 C[C@H](CCOC(=O)N1CC(C)C1)N(C)C.OC(=O)C(=O)O
|
||||
CHEMBL1907596_366 Cc1cc(on1)[C@H]2C[C@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_367 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_368 Clc1ccc(cn1)C2=CCCC3CCC2N3
|
||||
CHEMBL1907596_369 Cc1cc(on1)[C@H]2C[C@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_370 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_371 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_372 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_373 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_374 C(Oc1cncc(c1)N2C[C@H]3CNC[C@H]3C2)c4ccccc4
|
||||
CHEMBL1907596_375 CC(C)Oc1cncc(c1)N2C[C@H]3CNC[C@H]3C2.OC(=O)\C=C\C(=O)O
|
||||
CHEMBL1907596_376 OC(=O)C(F)(F)F.C1NC[C@H]2CN(C[C@@H]12)c3cncc(c3)c4ccccc4
|
||||
CHEMBL1907596_377 Cl.Clc1ccc(cn1)N2C[C@H]3CNC[C@H]3C2
|
||||
CHEMBL1907596_378 OC(=O)C(F)(F)F.Brc1ccc(cn1)N2C[C@H]3CNC[C@H]3C2
|
||||
CHEMBL1907596_379 OC(=O)C(F)(F)F.Brc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
|
||||
CHEMBL1907596_380 CCCOc1cncc(c1)N2C[C@H]3CNC[C@H]3C2.OC(=O)\C=C\C(=O)O
|
||||
CHEMBL1907596_381 Cl.Cl.C1NC[C@H]2CN(C[C@@H]12)c3cccnc3
|
||||
CHEMBL1907596_382 Cl.Cl.Oc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
|
||||
CHEMBL1907596_383 Cl.Cl.COc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
|
||||
CHEMBL1907596_384 Cl.Cl.CCOc1cncc(c1)N2C[C@H]3CNC[C@H]3C2
|
||||
CHEMBL1907596_385 CN1CCC[C@H]1c2cccnc2
|
||||
CHEMBL1907596_386 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_387 Clc1ccc(OC[C@H]2CCN2)cn1
|
||||
CHEMBL1907596_388 C1NC2CC1N(C2)c3cccnc3
|
||||
CHEMBL1907596_389 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_390 CN1CCC[C@H]1COc2cnc(Cl)c(OCc3ccc(Cl)nc3)c2
|
||||
CHEMBL1907596_391 CN1CCC[C@H]1COc2cnc(Cl)c(OCc3ccnc(F)c3)c2
|
||||
CHEMBL1907596_392 Fc1cc(COc2cc(OC[C@@H]3CCCN3)cnc2Cl)ccn1
|
||||
CHEMBL1907596_393 CN1CCC[C@H]1COc2cnc(Cl)c(c2)c3ccnc(F)c3
|
||||
CHEMBL1907596_394 Clc1ccc(Oc2cc(OC[C@@H]3CCCN3)cnc2Cl)cn1
|
||||
CHEMBL1907596_395 [11CH3]N1CCC[C@H]1COc2cnc(Cl)c(\C=C\c3ccncc3)c2
|
||||
CHEMBL1907596_396 Clc1ccc(cn1)[C@@H]2C[C@H]3CC[C@@H]2N3
|
||||
CHEMBL1907596_397 Clc1ncc(cc1c2ccccc2)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_398 Fc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_399 Fc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_400 Clc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_401 Clc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_402 Clc1ncc(cc1c2cccc(Br)c2)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_403 [O-][N+](=O)c1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_404 [O-][N+](=O)c1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_405 Nc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_406 Nc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_407 COc1ccc(cc1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_408 COc1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_409 CN(C)c1cccc(c1)c2cc(cnc2Cl)[C@H]3C[C@@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_410 Clc1ccc(nn1)N2C[C@H]3C[C@@H]2CN3
|
||||
CHEMBL1907596_411 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_412 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_413 BrC1=CC=C2[C@H]3CNC[C@H](C3)CN2C1=O
|
||||
CHEMBL1907596_414 BrN1C[C@H]2C[C@H](C1)C3=CC=CC(=O)N3C2
|
||||
CHEMBL1907596_415 FC(F)(F)C1=CC=C2[C@H]3CNC[C@H](C3)CN2C1=O
|
||||
CHEMBL1907596_416 CC1=CC=C2[C@H]3CNC[C@H](C3)CN2C1=O
|
||||
CHEMBL1907596_417 IN1C[C@H]2C[C@H](C1)C3=CC=CC(=O)N3C2
|
||||
CHEMBL1907596_418 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_419 Cl.OCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_420 Cl.OCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_421 Cl.OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_422 Cl.OCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_423 Cl.OCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_424 Cl.OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_425 C1NC[C@H]2CN(C[C@@H]12)c3cccnc3
|
||||
CHEMBL1907596_426 C1C[C@@H]2CN(C[C@@H]2N1)c3cccnc3
|
||||
CHEMBL1907596_427 Clc1cccc(NC(=O)c2cncc(n2)N3C[C@H]4CNC[C@H]4C3)c1
|
||||
CHEMBL1907596_428 FC(F)(F)c1ccccc1CNC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3
|
||||
CHEMBL1907596_429 O=C(N1CCc2ccccc2C1)c3cncc(c3)N4C[C@H]5CNC[C@H]5C4
|
||||
CHEMBL1907596_430 Fc1ccccc1CCNC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3
|
||||
CHEMBL1907596_431 Ic1cccc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
|
||||
CHEMBL1907596_432 Clc1ccc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)cc1
|
||||
CHEMBL1907596_433 Cc1cccc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
|
||||
CHEMBL1907596_434 Cc1cc(C)cc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
|
||||
CHEMBL1907596_435 COc1cc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)cc(OC)c1
|
||||
CHEMBL1907596_436 Fc1cc(F)cc(NC(=O)c2cncc(c2)N3C[C@H]4CNC[C@H]4C3)c1
|
||||
CHEMBL1907596_437 Clc1ccc(cn1)C2CC3CCC2N3
|
||||
CHEMBL1907596_438 Cc1cc(on1)C2CC3CCC2N3
|
||||
CHEMBL1907596_439 Cl.Clc1ccc(cn1)C2=CC3CCC2N3
|
||||
CHEMBL1907596_440 Cl.Fc1ccc(cn1)C2=CC3CCC2N3
|
||||
CHEMBL1907596_441 Cl.C1CC2NC1C=C2c3cccnc3
|
||||
CHEMBL1907596_442 OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_443 OCC[C@@H]1C[C@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_444 CNC(=O)OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_445 CN(C)C(=O)OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_446 O=C(NC1CC1)OCC[C@H]2C[C@@H]2c3cncc(OC[C@@H]4CCN4)c3
|
||||
CHEMBL1907596_447 O=C(OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2)N4CCCC4
|
||||
CHEMBL1907596_448 O=C(Nc1ccccc1)OCC[C@H]2C[C@@H]2c3cncc(OC[C@@H]4CCN4)c3
|
||||
CHEMBL1907596_449 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_450 COCC[C@@H]1C[C@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_451 OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_452 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_453 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_454 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_455 C(Oc1cccnc1)[C@@H]2CCN2
|
||||
CHEMBL1907596_456 C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_457 O=C(C1CC1)N2CC3CNC(C3)C2
|
||||
CHEMBL1907596_458 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_459 OC(=O)C(F)(F)F.FC(F)C[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_460 OC(=O)C(F)(F)F.FCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_461 CC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_462 OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_463 FC(F)CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_464 FCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_465 CCNC(=O)OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_466 O=C(Nc1ccccc1)OCCc2cc(on2)c3cncc(OC[C@@H]4CCN4)c3
|
||||
CHEMBL1907596_467 CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_468 FC(F)(F)CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_469 OCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_470 FC(F)CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_471 FCCCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_472 O=C(Nc1ccccc1)OCCc2cc(on2)c3cncc(OC[C@@H]4CCN4)c3
|
||||
CHEMBL1907596_473 CCc1cc(on1)c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_474 C(Oc1cccnc1)[C@@H]2CCN2
|
||||
CHEMBL1907596_475 Ic1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_476 Fc1ncccc1OC[C@H]2NCC=C2
|
||||
CHEMBL1907596_477 Ic1cncc(OC[C@H]2NCC=C2)c1
|
||||
CHEMBL1907596_478 Ic1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_479 C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_480 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_481 CCOC(=O)NCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_482 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCCN3C)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_483 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCNC3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_484 OC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_485 OCCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_486 CCOC(=O)NCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_487 CC(C)OC(=O)NCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_488 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_489 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCCN3C)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_490 COCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCNC3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_491 OC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_492 OCCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_493 OC(=O)CC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_494 OC(=O)C(F)(F)F.FC(F)(F)OCC[C@H]1C[C@@H]1c2cncc(OC[C@@H]3CCN3)c2
|
||||
CHEMBL1907596_495 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_496 O=C1N2C[C@@H]3CNC[C@@H](C3)C2=CC=C1c4cccnc4
|
||||
CHEMBL1907596_497 COc1ccc(cc1Cl)C#CC(=O)N2C[C@@H]3CNC[C@@H](C3)C2
|
||||
CHEMBL1907596_498 O=C1C=CC=C2[C@H]3CNC[C@H](C3)CN12
|
||||
CHEMBL1907596_499 O=C1N2C[C@@H]3CNC[C@@H](C3)C2=CC=C1c4cccnc4
|
||||
CHEMBL1907596_500 CCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_501 CCCCC#Cc1cncc(OC[C@H]2CCN2)c1
|
||||
CHEMBL1907596_502 CCCCC#Cc1cncc(OC[C@H]2CCCN2)c1
|
||||
CHEMBL1907596_503 [N-]=[N+]=NCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_504 C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_505 C1NC[C@H]2CN(C[C@@H]12)c3cccnc3
|
||||
CHEMBL1907596_506 OCCCCC#Cc1cncc(OC[C@@H]2CCN2)c1
|
||||
CHEMBL1907596_507 CCOCC[C@H]1C[C@@H]1c2cncc(c2)N3C[C@H]4CNC[C@H]4C3.OC(=O)C(F)(F)F
|
||||
CHEMBL1907596_508 CN(C)CCOc1cncc(c1)N2CC3CNCC(C3)C2
|
||||
CHEMBL1907596_509 C1NCC2CC1CN(C2)c3cccnc3
|
||||
CHEMBL1907596_510 C1NCC2CC1c3cc4nccnc4cc23
|
||||
CHEMBL1907596_511 Fc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_512 Clc1ccc(cn1)[C@H]2C[C@H]3CC[C@H]2N3
|
||||
CHEMBL1907596_513 C1C[C@H]2N[C@H]1C[C@@H]2c3cncc(c3)c4ccncc4
|
||||
CHEMBL1907596_514 Fc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_515 Clc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_516 Nc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_517 COc1cc(ccn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_518 C1C[C@H]2N[C@H]1C[C@@H]2c3cncc(c3)c4cccnc4
|
||||
CHEMBL1907596_519 Clc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_520 Nc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_521 COc1ccc(cn1)c2cncc(c2)[C@H]3C[C@H]4CC[C@H]3N4
|
||||
CHEMBL1907596_522 C1CNCCN(C1)c2cccnc2
|
||||
CHEMBL1907596_523 CN1CCC[C@H]1COc2cncc(Br)c2
|
||||
CHEMBL1907596_524 CN1CCC[C@H]1COc2cccnc2
|
||||
CHEMBL1907596_525 CN1CCC[C@H]1COc2cncc(c2)c3ccccc3
|
||||
CHEMBL1907596_526 C1NC2CC1N(C2)c3cccnc3
|
||||
CHEMBL1907596_527 COCC[C@@H]1C[C@H]1c2cncc(c2)N3CCCNCC3
|
||||
CHEMBL1907596_528 COCC[C@H]1C[C@@H]1c2cncc(c2)N3CCCNCC3
|
||||
CHEMBL1907596_529 OCCc1cc(on1)c2cncc(c2)N3CCCNCC3
|
||||
55
Code/GraphMol/RascalMCES/data/test_cluster1.smi
Normal file
55
Code/GraphMol/RascalMCES/data/test_cluster1.smi
Normal file
@@ -0,0 +1,55 @@
|
||||
CHEMBL214_1 CCCN(CCC)[C@@H]1CCc2ccc3[nH]c(cc3c2C1)C#N
|
||||
CHEMBL214_2 Oc1cccc2CC[C@@H]3[C@@H](CN3CC=C)c12
|
||||
CHEMBL214_3 COc1ccccc1N2CCN(CCN3C(=O)CC4(CCCC4)CC3=O)CC2
|
||||
CHEMBL214_4 C[C@H]1C[C@@H](CCN1C[C@H](O)COc2cccc3[nH]c(C)cc23)c4cc5c(F)cccc5s4
|
||||
CHEMBL214_5 NCCc1c[nH]c2ccc(OCc3cccc(COc4ccc5[nH]cc(CCN)c5c4)c3)cc12
|
||||
CHEMBL214_6 COc1ccccc1N2CCN(CCCCNS(=O)(=O)c3ccc(C)cc3)CC2
|
||||
CHEMBL214_7 CCCN(CCCc1c[nH]c2ccc(F)cc12)C3COc4c(F)ccc(C(=O)NC)c4C3
|
||||
CHEMBL214_8 O=C1NCc2ccc(OCCCCN3CCN(CC3)c4cccc5ccccc45)cc12
|
||||
CHEMBL214_9 Fc1ccc2cccc(N3CCN(CCCOc4ccc5CNC(=O)c5c4)CC3)c2c1
|
||||
CHEMBL214_10 Fc1cccc2cccc(N3CCN(CCCOc4ccc5CNC(=O)c5c4)CC3)c12
|
||||
CHEMBL214_11 O=C1NCc2ccc(OCCCCN3CCN(CC3)c4cccc5CCCc45)cc12
|
||||
CHEMBL214_12 Fc1cc2CNC(=O)c2cc1OCCCN3CCN(CC3)c4cccc5ccccc45
|
||||
CHEMBL214_13 COc1ccccc1N2CCN(CCN(C(=O)C34CCC(I)(CC3)C4)c5ccccn5)CC2
|
||||
CHEMBL214_14 Cl.Cl.Cl.COc1ccccc1N2CCN(CCN(C(=O)C3CCCCC3)c4ccccn4)CC2
|
||||
CHEMBL214_15 COc1ccccc1N2CCN(CCN(C(=O)C34C5C6C3C7C4C5C67CF)c8ccccn8)CC2
|
||||
CHEMBL214_16 Oc1ccccc1N2CCN(CCN(C(=O)C34C5C6C3C7C4C5C67CF)c8ccccn8)CC2
|
||||
CHEMBL214_17 O=C(Nc1cccnc1)Nc2cccc(CCN3CCN(CC3)c4ccccc4)c2
|
||||
CHEMBL214_18 O=C(N1CCC(CCN2CCN(CC2)c3nsc4ccccc34)CC1)c5occc5
|
||||
CHEMBL339_2 CNc1cc(OC)c(cc1Cl)C(=O)N[C@H]2CCN(C2)C3C4CCCC3CCC4
|
||||
CHEMBL339_3 CCN1CCC[C@H]1CNC(=O)c2c(O)c(CCF)cc(OC)c2OC
|
||||
CHEMBL339_4 CCN1CCC[C@H]1CNC(=O)c2cc(I)cc(OC)c2OC
|
||||
CHEMBL339_5 CC(C)Oc1ccccc1N2CCN(Cc3ccc(CN4CCCCC4=O)n3C)CC2
|
||||
CHEMBL339_6 CCN(CC)C(=O)N[C@@H]1C[C@H]2[C@@H](CC3=CCc4cccc2c34)N(C)C1
|
||||
CHEMBL339_7 OC(=O)C(=O)O.Oc1ccc2CC[C@H](CN3CCc4ccccc4C3)Oc2c1
|
||||
CHEMBL339_8 COc1ccccc1N2CCN(Cc3ccc([nH]3)c4ccccc4)CC2
|
||||
CHEMBL339_9 Fc1ccc(CN2CN(c3ccccc3)C4(CCN(CCCC(=O)c5ccc(F)cc5)CC4)C2=O)cc1
|
||||
CHEMBL339_10 CCCN1CCc2cccc3c2[C@H]1Cc4ccc(O)c(O)c34
|
||||
CHEMBL339_11 O=C1CCc2ccc(OCCCN3CCN(CC3)c4cccc5sccc45)cc2N1
|
||||
CHEMBL339_12 O=C1Nc2cc(OCCCN3CCN(CC3)c4cccc5sccc45)ccc2C=C1
|
||||
CHEMBL1946_1 CCCC(=O)NCCC1CCc2c(OC)ccc3ccc(OC)c1c23
|
||||
CHEMBL1946_3 COc1cccc(Cc2oc3ccc(OC)cc3c2CCNC(=O)C)c1
|
||||
CHEMBL1946_4 COc1ccc2oc(Cc3ccccc3OC)c(CCNC(=O)C)c2c1
|
||||
CHEMBL1946_5 CCC(=O)NC[C@@H]1C[C@H]1c2cccc3nc(CCCCc4ccccc4)oc23
|
||||
CHEMBL1946_6 COc1ccc2[nH]cc(CCNC(=O)C)c2c1
|
||||
CHEMBL1946_7 COc1cc2c(CCNC(=O)C)c(I)[nH]c2cc1[N+](=O)[O-]
|
||||
CHEMBL1946_8 COc1ccc2[nH]cc(CCNC(=O)C)c2c1
|
||||
CHEMBL1946_9 COc1ccc2cc(cc(CCNC(=O)C)c2c1)c3cccc(CBr)c3
|
||||
CHEMBL1946_10 CCCC(=O)NCCCc1cc(OC)ccc1OCc2ccccc2
|
||||
CHEMBL1946_11 CC(C)C1=C(CCNC(=O)C)c2c(C1)ccc3OCCc23
|
||||
CHEMBL1946_12 CCNC(=O)NCCC1=C(Cc2ccc3OCCc3c12)C(C)C
|
||||
CHEMBL1946_13 COc1ccc2cccc(\C=C\NC(=O)C)c2c1
|
||||
CHEMBL1946_14 COc1ccc2cccc(CCC(=O)NS(=O)(=O)C)c2n1
|
||||
CHEMBL1946_15 CCC(=O)NC[C@@H]1CCc2ccccc2[C@@H]1c3ccccc3
|
||||
CHEMBL273_1 CCCN(CCC)[C@H]1CCc2cccc(C(=O)C)c2C1
|
||||
CHEMBL273_2 COc1ccc2CCC(CCN3CCN(CC3)c4ccccn4)Cc2c1
|
||||
CHEMBL273_3 CCCCN(CCCC)C(=O)c1cccc(CN2CCN(CC2)c3ccccc3OC(C)C)c1
|
||||
CHEMBL273_4 CCCN(CCC)C1CCc2cccc(O)c2C1
|
||||
CHEMBL273_5 COc1ccccc1N2CCN(CCCCN3C(=O)c4ccccc4C3=O)CC2
|
||||
CHEMBL273_6 Cl.Cl.COc1ccccc1N2CCN(CCNC(=O)C34C[C@H]5CC(C[C@@H]3C5)C4)CC2
|
||||
CHEMBL273_7 Cl.Cl.COc1ccccc1N2CCN(CCNC(=O)C34C[C@@H]5C[C@@H](C[C@@H](C5)C3)C4)CC2
|
||||
CHEMBL273_8 Cl.C(Cc1ccccc1)N2CCN(CC2)c3cccc4ccoc34
|
||||
CHEMBL273_9 CCCNC1CCc2ccc3[nH]cc(C=O)c3c2C1
|
||||
CHEMBL273_10 CCCN(CCC)C1CCc2ccc3[nH]cc(C=O)c3c2C1
|
||||
CHEMBL273_11 COc1ccccc1N2CCN(CCCCN3C(=O)c4ccccc4C3=O)CC2
|
||||
CHEMBL273_12 CCCN(CCC)C1CCc2cccc(O)c2C1
|
||||
227
Code/GraphMol/RascalMCES/lap_a_la_scipy.cpp
Normal file
227
Code/GraphMol/RascalMCES/lap_a_la_scipy.cpp
Normal file
@@ -0,0 +1,227 @@
|
||||
// This is a mildly modified version of the code in SciPy's
|
||||
// scipy.optimize.linear_sum_assignment, extracted from
|
||||
// rectangular_lsap.cpp.
|
||||
// https://github.com/scipy/scipy/blob/main/scipy/optimize/rectangular_lsap/rectangular_lsap.cpp
|
||||
// As such it is subject to the following notice:
|
||||
/*
|
||||
Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following
|
||||
disclaimer in the documentation and/or other materials provided
|
||||
with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
This code implements the shortest augmenting path algorithm for the
|
||||
rectangular assignment problem. This implementation is based on the
|
||||
pseudocode described in pages 1685-1686 of:
|
||||
|
||||
DF Crouse. On implementing 2D rectangular assignment algorithms.
|
||||
IEEE Transactions on Aerospace and Electronic Systems
|
||||
52(4):1679-1696, August 2016
|
||||
doi: 10.1109/TAES.2016.140952
|
||||
|
||||
Author: PM Larsen
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
namespace RDKit {
|
||||
namespace RascalMCES {
|
||||
// Returns the permutation of indices that puts `v` into ascending order,
// i.e. result[k] is the position in `v` of its k-th smallest element.
// The relative order of equal elements is whatever std::sort produces.
template <typename T>
std::vector<size_t> argsort_iter(const std::vector<T> &v) {
  std::vector<size_t> order(v.size());
  std::iota(order.begin(), order.end(), 0);
  const auto byValue = [&v](size_t lhs, size_t rhs) {
    return v[lhs] < v[rhs];
  };
  std::sort(order.begin(), order.end(), byValue);
  return order;
}
|
||||
|
||||
// Searches for the shortest augmenting path that starts at row `i` of the
// row-major `cost` matrix (which has `nc` columns).
//
// On entry `u` and `v` are the current dual variables and `row4col[j]` is the
// row assigned to column j, or size_t(-1) if the column is free.
// On exit:
//   - `path[j]` is the row from which column j was most cheaply reached,
//   - `shortestPathCosts[j]` is the corresponding reduced cost,
//   - `SR` / `SC` flag the rows / columns visited during the search,
//   - `*p_minVal` is the cost of the path to the returned sink.
// `remaining` is scratch space for the columns that have not been visited.
//
// Returns the index of the free column (the "sink") that ends the path, or -1
// if no column is reachable (infeasible cost matrix); in the latter case
// `*p_minVal` is not written.
static int augmenting_path(size_t nc, std::vector<int> &cost,
                           std::vector<double> &u, std::vector<double> &v,
                           std::vector<size_t> &path,
                           std::vector<size_t> &row4col,
                           std::vector<double> &shortestPathCosts, size_t i,
                           std::vector<bool> &SR, std::vector<bool> &SC,
                           std::vector<size_t> &remaining, double *p_minVal) {
  const double unreachable = std::numeric_limits<double>::max();
  // Converting -1 to size_t is well-defined and yields the largest size_t
  // value; it is used throughout as the "no row / no position" marker.
  const size_t unassigned = static_cast<size_t>(-1);

  // Crouse's pseudocode tracks the unvisited columns with set complements;
  // a plain vector is used here instead.  It is filled in descending order so
  // that a constant cost matrix is solved by the identity assignment
  // (c.f. #11602 in the upstream SciPy code).
  size_t numLeft = nc;
  for (size_t pos = 0; pos < nc; ++pos) {
    remaining[pos] = nc - pos - 1;
  }

  std::fill(SR.begin(), SR.end(), false);
  std::fill(SC.begin(), SC.end(), false);
  std::fill(shortestPathCosts.begin(), shortestPathCosts.end(), unreachable);

  double pathCost = 0;
  int sink = -1;
  do {
    SR[i] = true;
    size_t bestPos = unassigned;
    double bestCost = unreachable;

    for (size_t pos = 0; pos < numLeft; ++pos) {
      const size_t col = remaining[pos];

      // Reduced cost of reaching `col` by extending the path through row i.
      const double reduced = pathCost + cost[i * nc + col] - u[i] - v[col];
      if (reduced < shortestPathCosts[col]) {
        path[col] = i;
        shortestPathCosts[col] = reduced;
      }

      // Among columns of equal minimum cost prefer one that is still free,
      // since it immediately gives a sink.  This matters for integer cost
      // matrices with small coefficients, where ties are common.
      const double candidate = shortestPathCosts[col];
      const bool cheaper = candidate < bestCost;
      const bool tieWithFreeColumn =
          candidate == bestCost && row4col[col] == unassigned;
      if (cheaper || tieWithFreeColumn) {
        bestCost = candidate;
        bestPos = pos;
      }
    }

    pathCost = bestCost;
    if (pathCost == unreachable) {
      // Nothing could be reached: infeasible cost matrix.
      return -1;
    }

    const size_t chosen = remaining[bestPos];
    if (row4col[chosen] == unassigned) {
      sink = chosen;
    } else {
      // The column is taken: continue the search from the row that owns it.
      i = row4col[chosen];
    }

    // Mark the column as visited and drop it from the unvisited list by
    // overwriting it with the last live entry.
    SC[chosen] = true;
    remaining[bestPos] = remaining[--numLeft];
  } while (sink == -1);

  *p_minVal = pathCost;
  return sink;
}
|
||||
|
||||
int lap_maximize(const std::vector<std::vector<int>> &costsMat,
|
||||
std::vector<size_t> &a, std::vector<size_t> &b) {
|
||||
if (costsMat.empty() || costsMat.front().empty()) {
|
||||
return 0;
|
||||
}
|
||||
size_t nr = costsMat.size();
|
||||
size_t nc = costsMat.front().size();
|
||||
bool transpose = nc < nr;
|
||||
std::vector<int> cost(nc * nr);
|
||||
// for maximization, take -ve of costs.
|
||||
for (size_t i = 0; i < nr; ++i) {
|
||||
for (size_t j = 0; j < nc; ++j) {
|
||||
if (transpose) {
|
||||
cost[j * nr + i] = -costsMat[i][j];
|
||||
} else {
|
||||
cost[i * nc + j] = -costsMat[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (transpose) {
|
||||
std::swap(nc, nr);
|
||||
}
|
||||
// initialize variables
|
||||
std::vector<double> u(nr, 0);
|
||||
std::vector<double> v(nc, 0);
|
||||
std::vector<double> shortestPathCosts(nc);
|
||||
std::vector<size_t> path(nc, -1);
|
||||
std::vector<size_t> col4row(nr, -1);
|
||||
std::vector<size_t> row4col(nc, -1);
|
||||
std::vector<bool> SR(nr);
|
||||
std::vector<bool> SC(nc);
|
||||
std::vector<size_t> remaining(nc);
|
||||
|
||||
// iteratively build the solution
|
||||
for (size_t curRow = 0; curRow < nr; curRow++) {
|
||||
double minVal;
|
||||
int sink = augmenting_path(nc, cost, u, v, path, row4col, shortestPathCosts,
|
||||
curRow, SR, SC, remaining, &minVal);
|
||||
if (sink < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// update dual variables
|
||||
u[curRow] += minVal;
|
||||
for (size_t i = 0; i < nr; i++) {
|
||||
if (SR[i] && i != curRow) {
|
||||
u[i] += minVal - shortestPathCosts[col4row[i]];
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t j = 0; j < nc; j++) {
|
||||
if (SC[j]) {
|
||||
v[j] -= minVal - shortestPathCosts[j];
|
||||
}
|
||||
}
|
||||
|
||||
// augment previous solution
|
||||
size_t j = sink;
|
||||
while (1) {
|
||||
size_t i = path[j];
|
||||
row4col[j] = i;
|
||||
std::swap(col4row[i], j);
|
||||
if (i == curRow) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (transpose) {
|
||||
size_t i = 0;
|
||||
for (auto v : argsort_iter(col4row)) {
|
||||
a[i] = col4row[v];
|
||||
b[i] = v;
|
||||
i++;
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < nr; i++) {
|
||||
a[i] = i;
|
||||
b[i] = col4row[i];
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
} // namespace RascalMCES
|
||||
} // namespace RDKit
|
||||
1205
Code/GraphMol/RascalMCES/mces_catch.cpp
Normal file
1205
Code/GraphMol/RascalMCES/mces_catch.cpp
Normal file
File diff suppressed because it is too large
Load Diff
152
Code/GraphMol/RascalMCES/mces_cluster_catch.cpp
Normal file
152
Code/GraphMol/RascalMCES/mces_cluster_catch.cpp
Normal file
@@ -0,0 +1,152 @@
|
||||
//
|
||||
// Copyright (C) 2023 David Cosgrove
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <chrono>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include <GraphMol/FileParsers/MolSupplier.h>
|
||||
#include <GraphMol/SmilesParse/SmilesParse.h>
|
||||
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
||||
#include <GraphMol/Substruct/SubstructMatch.h>
|
||||
|
||||
#include "catch.hpp"
|
||||
|
||||
#include <GraphMol/RascalMCES/RascalMCES.h>
|
||||
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
|
||||
#include <GraphMol/RascalMCES/RascalResult.h>
|
||||
|
||||
TEST_CASE("Small test", "[basics]") {
|
||||
std::string fName = getenv("RDBASE");
|
||||
fName += "/Contrib/Fastcluster/cdk2.smi";
|
||||
RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
|
||||
std::vector<std::shared_ptr<RDKit::ROMol>> mols;
|
||||
while (!suppl.atEnd()) {
|
||||
std::shared_ptr<RDKit::ROMol> mol(suppl.next());
|
||||
if (!mol) {
|
||||
continue;
|
||||
}
|
||||
mols.push_back(mol);
|
||||
}
|
||||
RDKit::RascalMCES::RascalClusterOptions clusOpts;
|
||||
auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
|
||||
REQUIRE(clusters.size() == 8);
|
||||
std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
|
||||
for (size_t i = 0; i < 8; ++i) {
|
||||
REQUIRE(clusters[i].size() == expSizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("BLSets subset", "[basics]") {
|
||||
std::string fName = getenv("RDBASE");
|
||||
fName += "/Code/GraphMol/RascalMCES/data/test_cluster1.smi";
|
||||
RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
|
||||
std::vector<std::shared_ptr<RDKit::ROMol>> mols;
|
||||
while (!suppl.atEnd()) {
|
||||
std::shared_ptr<RDKit::ROMol> mol(suppl.next());
|
||||
if (!mol) {
|
||||
continue;
|
||||
}
|
||||
mols.push_back(mol);
|
||||
}
|
||||
auto clusters = RDKit::RascalMCES::rascalCluster(mols);
|
||||
REQUIRE(clusters.size() == 12);
|
||||
std::vector<size_t> expSizes{8, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 21};
|
||||
for (size_t i = 0; i < 12; ++i) {
|
||||
REQUIRE(clusters[i].size() == expSizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("ChEMBL 1907596") {
|
||||
std::string fName = getenv("RDBASE");
|
||||
fName += "/Code/GraphMol/RascalMCES/data/chembl_1907596.smi";
|
||||
std::cout << fName << std::endl;
|
||||
RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
|
||||
std::vector<std::shared_ptr<RDKit::ROMol>> mols;
|
||||
while (!suppl.atEnd()) {
|
||||
std::shared_ptr<RDKit::ROMol> mol(suppl.next());
|
||||
if (!mol) {
|
||||
continue;
|
||||
}
|
||||
mols.push_back(mol);
|
||||
}
|
||||
RDKit::RascalMCES::RascalClusterOptions clusOpts;
|
||||
clusOpts.similarityCutoff = 0.7;
|
||||
auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
|
||||
REQUIRE(clusters.size() == 21);
|
||||
std::vector<size_t> expSizes{342, 71, 64, 33, 23, 11, 10, 6, 6, 5, 5,
|
||||
4, 3, 3, 3, 3, 3, 2, 2, 2, 14};
|
||||
for (size_t i = 0; i < 21; ++i) {
|
||||
REQUIRE(clusters[i].size() == expSizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Small Butina test", "[basics]") {
|
||||
std::string fName = getenv("RDBASE");
|
||||
fName += "/Contrib/Fastcluster/cdk2.smi";
|
||||
RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
|
||||
std::vector<std::shared_ptr<RDKit::ROMol>> mols;
|
||||
while (!suppl.atEnd()) {
|
||||
std::shared_ptr<RDKit::ROMol> mol(suppl.next());
|
||||
if (!mol) {
|
||||
continue;
|
||||
}
|
||||
mols.push_back(mol);
|
||||
}
|
||||
RDKit::RascalMCES::RascalClusterOptions clusOpts;
|
||||
auto clusters = RDKit::RascalMCES::rascalButinaCluster(mols, clusOpts);
|
||||
int numMols = 0;
|
||||
for (const auto &cl : clusters) {
|
||||
numMols += cl.size();
|
||||
}
|
||||
REQUIRE(numMols == mols.size());
|
||||
REQUIRE(clusters.size() == 29);
|
||||
std::vector<size_t> expSizes{6, 6, 6, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
for (size_t i = 0; i < 29; ++i) {
|
||||
REQUIRE(clusters[i].size() == expSizes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Small test, smaller number of threads", "[basics]") {
|
||||
// I'm not sure how to test whether this has had the desired effect,
|
||||
// but at least we'll know that it runs ok.
|
||||
std::string fName = getenv("RDBASE");
|
||||
fName += "/Contrib/Fastcluster/cdk2.smi";
|
||||
RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
|
||||
std::vector<std::shared_ptr<RDKit::ROMol>> mols;
|
||||
while (!suppl.atEnd()) {
|
||||
std::shared_ptr<RDKit::ROMol> mol(suppl.next());
|
||||
if (!mol) {
|
||||
continue;
|
||||
}
|
||||
mols.push_back(mol);
|
||||
}
|
||||
{
|
||||
RDKit::RascalMCES::RascalClusterOptions clusOpts;
|
||||
clusOpts.numThreads = 2;
|
||||
auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
|
||||
REQUIRE(clusters.size() == 8);
|
||||
std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
|
||||
for (size_t i = 0; i < 8; ++i) {
|
||||
REQUIRE(clusters[i].size() == expSizes[i]);
|
||||
}
|
||||
}
|
||||
{
|
||||
RDKit::RascalMCES::RascalClusterOptions clusOpts;
|
||||
clusOpts.numThreads = -2;
|
||||
auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
|
||||
REQUIRE(clusters.size() == 8);
|
||||
std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
|
||||
for (size_t i = 0; i < 8; ++i) {
|
||||
REQUIRE(clusters[i].size() == expSizes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1417,8 +1417,38 @@ or into a generic framework:
|
||||
Maximum Common Substructure
|
||||
***************************
|
||||
|
||||
The FindMCS function find a maximum common substructure (MCS) of two
|
||||
or more molecules:
|
||||
There are 2 methods for finding maximum common substructures. The first, FindMCS,
|
||||
finds a single fragment maximum common substructure (MCS) of two or more molecules:
|
||||
The second, RascalMCES, finds the maximum common edge substructure (MCES) between two
|
||||
molecules and can return a multi-fragment MCES. The difference is demonstrated with the
|
||||
following pair of molecules:
|
||||
|
||||
+-------------------------------------+
|
||||
| .. image:: images/mcs_example_1.png |
|
||||
+-------------------------------------+
|
||||
| .. image:: images/mcs_example_2.png |
|
||||
+-------------------------------------+
|
||||
|
||||
FMCS gives this maximum common substructure:
|
||||
|
||||
+-------------------------------------+
|
||||
| .. image:: images/mcs_example_3.png |
|
||||
+-------------------------------------+
|
||||
| .. image:: images/mcs_example_4.png |
|
||||
+-------------------------------------+
|
||||
|
||||
Whereas RascalMCES gives:
|
||||
|
||||
+-------------------------------------+
|
||||
| .. image:: images/mcs_example_5.png |
|
||||
+-------------------------------------+
|
||||
| .. image:: images/mcs_example_6.png |
|
||||
+-------------------------------------+
|
||||
|
||||
FindMCS
|
||||
=======
|
||||
|
||||
FindMCS operates on 2 or more molecules:
|
||||
|
||||
.. doctest::
|
||||
|
||||
@@ -1555,6 +1585,135 @@ return the best match found in that time. If timeout is reached then the
|
||||
|
||||
(The MCS after 50 seconds contained 511 atoms.)
|
||||
|
||||
RascalMCES
|
||||
==========
|
||||
|
||||
RascalMCES can only work on 2 molecules at a time:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> from rdkit.Chem import rdRascalMCES
|
||||
>>> mol1 = Chem.MolFromSmiles("CN(C)c1ccc(CC(=O)NCCCCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL153934")
|
||||
>>> mol2 = Chem.MolFromSmiles("CN(C)c1ccc(CC(=O)NCCCCCCCNC23CC4CC(C2)CC(C3)C4)cc1 CHEMBL152361")
|
||||
>>> res = rdRascalMCES.FindMCES(mol1, mol2)
|
||||
>>> res[0].smartsString
|
||||
'CN(-C)-c1:c:c:c(-CC(=O)-NCCCCCCC):c:c:1.NC12CC3CC(-C1)-CC(-C2)-C3'
|
||||
>>> len(res[0].bondMatches())
|
||||
33
|
||||
|
||||
It returns a list of RascalResult objects. Each RascalResult contains the 2 molecules that
|
||||
the result pertains to, the SMARTS string of the MCES, the lists of atoms and bonds in the
|
||||
two molecules that match, the Johnson similarity between the 2 molecules, the number of
|
||||
fragments in the MCES, the number of atoms in the largest fragment and whether the run
|
||||
timed out or not. There is also the method largestFragmentOnly(), which cuts the MCES
|
||||
down to the largest single fragment. This is a non-reversible change, so if you want both
|
||||
results, take a copy first.
|
||||
|
||||
By default, the MCES algorithm returns the first result it finds of maximum size. Because of
|
||||
symmetry, there may be other equivalent solutions with the same number of atoms and bonds,
|
||||
but with different equivalent bonds matched to each other. If you want to see all MCESs of
|
||||
maximum size, you can use the option allBestMCESs = True. This will increase the run time,
|
||||
partly because more branches in the search tree must be examined, but mostly because sorting
|
||||
the multiple results is quite time-consuming. The results are returned in a consistent order
|
||||
sorted by number of bond matches, then number of fragments (fewer first), then largest
|
||||
fragment size and so on. Some of these aren't trivial to compute. The adamantane example
|
||||
above is particularly extreme because not only is there extensive symmetry about the
|
||||
adamantane end and 2-fold symmetry at the phenyl end but also several points of breaking the
|
||||
matching alkyl chain all of which give rise to valid MCESs of the same size. In this case,
|
||||
sorting into a consistent order takes significantly longer than determining the MCESs in the
|
||||
first place.
|
||||
|
||||
The MCES differs from a conventional MCS in that it is the maximum common substructure based
|
||||
on bonds rather than atoms. Often the result is the same, but not always.
|
||||
|
||||
The Johnson similarity is akin to a Tanimoto similarity, but expressed in terms of the
|
||||
atoms and bonds in the MCES. It is the square of the sum of the number of atoms and bonds
|
||||
in the MCES divided by the product of the sums of the numbers of atoms and bonds in the
|
||||
2 input molecules. It has values between 0.0 (no MCES between the molecules) and 1.0 (the
|
||||
molecules are identical). A key source of efficiency in the RASCAL algorithm is a fast and
|
||||
correct prediction of a maximum value for the Johnson similarity between 2 molecules and
|
||||
hence the maximum size of the MCES. The first step in the algorithm is then a screening,
|
||||
whereby the full MCES determination is not performed if the predicted similarity is less
|
||||
than some desired threshold. The final similarity between the 2 molecules may be less
|
||||
than the threshold, but it will never be higher than the predicted upper bound. RASCAL
|
||||
stems from RApid Similarity CALculation.
|
||||
|
||||
The default settings for RascalMCES are good for general use, but they may be altered
|
||||
by passing an optional RascalOptions object:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> mol1 = Chem.MolFromSmiles('Oc1cccc2C(=O)C=CC(=O)c12')
|
||||
>>> mol2 = Chem.MolFromSmiles('O1C(=O)C=Cc2cc(OC)c(O)cc12')
|
||||
>>> results = rdRascalMCES.FindMCES(mol1, mol2)
|
||||
>>> len(results)
|
||||
0
|
||||
>>> opts = rdRascalMCES.RascalOptions()
|
||||
>>> opts.similarityThreshold = 0.5
|
||||
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
|
||||
>>> len(results)
|
||||
1
|
||||
>>> f'{results[0].similarity:.2f}'
|
||||
'0.37'
|
||||
>>> results[0].smartsString
|
||||
'Oc1:c:c:c:c:c:1.[#6]=O'
|
||||
>>> opts.minFragSize = 3
|
||||
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
|
||||
>>> len(results)
|
||||
1
|
||||
>>> f'{results[0].similarity:.2f}'
|
||||
'0.25'
|
||||
>>> results[0].smartsString
|
||||
'Oc1:c:c:c:c:c:1'
|
||||
|
||||
In this case, the upper bound on the similarity score is below the default threshold
|
||||
of 0.7, so no results are returned. Setting the threshold to 0.5 produces the second
|
||||
result although, as can be seen, the final similarity is substantially below the
|
||||
threshold. This example also shows a disadvantage of the MCES method, which is that
|
||||
it can produce small fragments in the MCES which are rarely helpful. The option
|
||||
minFragSize can be used to over-ride the default value of -1, which means no minimum
|
||||
size.
|
||||
|
||||
Like FindMCS, there is a ringMatchesRingOnly option, and also there's
|
||||
completeAromaticRings, which is True by default, and means that MCESs won't be returned
|
||||
with partial aromatic rings matching:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> mol1 = Chem.MolFromSmiles('C1CCCC1c1ccncc1')
|
||||
>>> mol2 = Chem.MolFromSmiles('C1CCCC1c1ccccc1')
|
||||
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
|
||||
>>> f'{results[0].similarity:.2f}'
|
||||
'0.27'
|
||||
>>> results[0].smartsString
|
||||
'C1CCCC1-c'
|
||||
>>> opts.completeAromaticRings = False
|
||||
>>> results = rdRascalMCES.FindMCES(mol1, mol2, opts)
|
||||
>>> f'{results[0].similarity:.2f}'
|
||||
'0.76'
|
||||
>>> results[0].smartsString
|
||||
'C1CCCC1-c(:c:c):c:c'
|
||||
|
||||
This result may look a bit odd, with a single aromatic carbon in the first SMARTS
|
||||
string. This is a consequence of the fact that the MCES works on matching bonds.
|
||||
A better, atom-centric, representation might be C1CCC[$(C-c)]1. When the
|
||||
completeAromaticRings option is set to False, a larger MCES is found, with just
|
||||
the pyridine nitrogen atom not matching the corresponding phenyl carbon atom.
|
||||
|
||||
Clustering with Rascal
|
||||
======================
|
||||
|
||||
There are 2 clustering methods available using the Johnson metric. The first,
|
||||
RascalCluster, is a fuzzy method described in 'A Line Graph Algorithm for
|
||||
Clustering Chemical Structures Based on Common Substructural Cores', JW Raymond,
|
||||
PW Willett
|
||||
(https://match.pmf.kg.ac.rs/electronic_versions/Match48/match48_197-207.pdf also
|
||||
available at https://eprints.whiterose.ac.uk/77598/).
|
||||
The second, RascalButinaCluster, uses the Butina sphere-exclusion algorithm
|
||||
(Butina JCICS 39 747-750 (1999)). Because of the time-consuming nature of the MCES
|
||||
determination, these clustering methods can be slow to run, so are best used
|
||||
on small sets (no more than a few hundred molecules) of small molecules.
|
||||
|
||||
|
||||
Fingerprinting and Molecular Similarity
|
||||
***************************************
|
||||
|
||||
BIN
Docs/Book/images/mcs_example_1.png
Normal file
BIN
Docs/Book/images/mcs_example_1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 14 KiB |
BIN
Docs/Book/images/mcs_example_2.png
Normal file
BIN
Docs/Book/images/mcs_example_2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 14 KiB |
BIN
Docs/Book/images/mcs_example_3.png
Normal file
BIN
Docs/Book/images/mcs_example_3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 19 KiB |
BIN
Docs/Book/images/mcs_example_4.png
Normal file
BIN
Docs/Book/images/mcs_example_4.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 20 KiB |
BIN
Docs/Book/images/mcs_example_5.png
Normal file
BIN
Docs/Book/images/mcs_example_5.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 21 KiB |
BIN
Docs/Book/images/mcs_example_6.png
Normal file
BIN
Docs/Book/images/mcs_example_6.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 22 KiB |
Reference in New Issue
Block a user