// // Copyright (C) David Cosgrove 2023 // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // // This file contains the driving functions for the determination of // the Maximum Common Edge Substructure (MCES) between 2 molecules. // It uses the RASCAL algorithm of John Raymond: // RASCAL: Calculation of Graph Similarity using Maximum Common // Edge Subgraphs, John W. Raymond, Eleanor J. Gardiner, Peter Willett // 'The Computer Journal', 45, 631-644 (2002). // https://eprints.whiterose.ac.uk/3568/1/willets3.pdf #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace RDKit { namespace RascalMCES { class TimedOutException : public std::exception { public: TimedOutException(long long int run_time, std::vector> &bestCliques) : d_cliques(bestCliques) { d_message = "Timed out after " + std::to_string(run_time) + " second"; if (run_time == 1) { d_message += "."; } else { d_message += "s."; } } const char *what() const throw() override { return d_message.c_str(); } std::vector> d_cliques; private: std::string d_message; }; // This is in lap_a_la_scipy.cpp and solves the linear assignment problem. int lapMaximize(const std::vector> &costsMat, std::vector &a, std::vector &b); // Contains the information used to start off a Rascal job. struct RascalStartPoint { // The tier one and tier two similarities. The initial PartitionSet // will only be constructed if they are both above the RascalOptions // threshold. double d_tier1Sim{-1.0}; double d_tier2Sim{-1.0}; // The lower bound on the maximum clique size. Depends, amongst other things, // on opts.similarityThreshold. unsigned int d_lowerBound{0}; // a Delta-Y exchange requires extra treatment. They're rare, though. bool d_deltaYPoss{false}; // Some parts require mol2 to be the larger molecule. This records if they // have been swapped with respect to the input molecules. bool d_swapped{false}; // Pointers to copies of the input molecules, swapped if necessary, // so that d_mol1 is always the smaller molecule. std::unique_ptr d_mol1; std::unique_ptr d_mol2; std::vector> d_adjMatrix1, d_adjMatrix2; std::vector> d_vtxPairs; std::vector> d_modProd; // We might need to know which bonds are symmetrical equivalent. std::vector d_equivBonds1, d_equivBonds2; // The initial partition set. May be empty if the thresholds weren't met. std::shared_ptr d_partSet; }; // Get the sorted degree sequences for the molecule, one sequence for each // atomic number in the molecule. Each element in the degree sequence is // the degree of the atom and its index. void sortedDegreeSeqs( const ROMol &mol, std::map>> °Seqs) { for (const auto &a : mol.atoms()) { degSeqs[a->getAtomicNum()].push_back( std::make_pair(a->getDegree(), a->getIdx())); } for (auto &it : degSeqs) { std::sort(it.second.begin(), it.second.end(), [](const std::pair &p1, const std::pair &p2) -> bool { return p1.first > p2.first; }); } } // Make labels for the atoms - by default the atomic symbol. void getAtomLabels(const ROMol &mol, const RascalOptions &opts, std::vector &atomLabels) { atomLabels.resize(mol.getNumAtoms()); for (const auto &a : mol.atoms()) { std::string label = a->getSymbol(); if (opts.exactConnectionsMatch) { label += "X" + std::to_string(a->getDegree()); } if (!opts.ignoreAtomAromaticity && a->getIsAromatic()) { label = "a" + label; } atomLabels[a->getIdx()] = label; } } int calcCost(const std::vector &atomiBLs, const std::vector &atomjBLs) { std::unordered_set uniqAtomiBLs; for (const auto bl : atomiBLs) { uniqAtomiBLs.insert(bl); } int cost = 0; for (const auto &uai : uniqAtomiBLs) { int numAtomi = std::count(atomiBLs.begin(), atomiBLs.end(), uai); int numAtomj = std::count(atomjBLs.begin(), atomjBLs.end(), uai); cost += std::min(numAtomi, numAtomj); } return cost; } // assign the costs of matching each atom in atomDegrees1 to each atom in // atomDegrees2. void assignCosts(const std::vector> &atomDegrees1, const std::vector> &atomDegrees2, const std::vector &bondLabels1, const std::vector &bondLabels2, const ROMol &mol1, const ROMol &mol2, std::vector> &costsMat) { std::vector atomiBLs, atomjBLs; for (auto i = 0u; i < atomDegrees1.size(); ++i) { atomiBLs.clear(); const auto atomi = mol1.getAtomWithIdx(atomDegrees1[i].second); for (const auto b : mol1.atomBonds(atomi)) { atomiBLs.push_back(bondLabels1[b->getIdx()]); } for (auto j = 0u; j < atomDegrees2.size(); ++j) { const auto atomj = mol2.getAtomWithIdx(atomDegrees2[j].second); atomjBLs.clear(); for (const auto b : mol2.atomBonds(atomj)) { atomjBLs.push_back(bondLabels2[b->getIdx()]); } costsMat[i][j] = calcCost(atomiBLs, atomjBLs); } } } // Return the assignment score for the best match of the atoms and bonds in mol1 // to the atoms and bonds in mol2. int getAssignmentScore(const std::vector> &atomDegrees1, const std::vector> &atomDegrees2, const std::vector &bondLabels1, const std::vector &bondLabels2, const ROMol &mol1, const ROMol &mol2) { constexpr int bigScore(9999); constexpr size_t unassignedValue(99999999); std::vector> costsMat( atomDegrees1.size(), std::vector(atomDegrees2.size(), bigScore)); assignCosts(atomDegrees1, atomDegrees2, bondLabels1, bondLabels2, mol1, mol2, costsMat); std::vector a(std::min(atomDegrees1.size(), atomDegrees2.size()), unassignedValue); std::vector b(std::min(atomDegrees1.size(), atomDegrees2.size()), unassignedValue); int retVal = lapMaximize(costsMat, a, b); if (retVal < 0) { // no solution for the LAP was possible. return 0; } int totalCost = 0; for (auto i = 0u; i < a.size(); ++i) { totalCost += costsMat[a[i]][b[i]]; } return totalCost; } namespace details { // calculate the tier 1 similarity between the 2 mols. double tier1Sim(const ROMol &mol1, const ROMol &mol2, std::map>> °Seqs1, std::map>> °Seqs2) { sortedDegreeSeqs(mol1, degSeqs1); sortedDegreeSeqs(mol2, degSeqs2); int vg1g2 = 0; int eg1g2 = 0; for (const auto &it1 : degSeqs1) { const auto &seq2 = degSeqs2.find(it1.first); if (seq2 != degSeqs2.end()) { vg1g2 += std::min(it1.second.size(), seq2->second.size()); auto numToDo = std::min(it1.second.size(), seq2->second.size()); for (auto i = 0U; i < numToDo; ++i) { eg1g2 += std::min(it1.second[i].first, seq2->second[i].first); } } } eg1g2 /= 2; double sim = double((vg1g2 + eg1g2) * (vg1g2 + eg1g2)) / double((mol1.getNumAtoms() + mol1.getNumBonds()) * (mol2.getNumAtoms() + mol2.getNumBonds())); return sim; } // Calculate the tier 2 similarity between the 2 mols. double tier2Sim(const ROMol &mol1, const ROMol &mol2, const std::map>> °Seqs1, const std::map>> °Seqs2, const std::vector &bondLabels1, const std::vector &bondLabels2) { int vg1g2 = 0; int eg1g2 = 0; for (const auto &it1 : degSeqs1) { const auto &seq2 = degSeqs2.find(it1.first); if (seq2 != degSeqs2.end()) { vg1g2 += std::min(it1.second.size(), seq2->second.size()); eg1g2 += getAssignmentScore(it1.second, seq2->second, bondLabels1, bondLabels2, mol1, mol2); } } eg1g2 /= 2; double sim = double((vg1g2 + eg1g2) * (vg1g2 + eg1g2)) / double((mol1.getNumAtoms() + mol1.getNumBonds()) * (mol2.getNumAtoms() + mol2.getNumBonds())); return sim; } // For each bond in the molecule, encode it with its type and the labels of the // two end atoms, returning the results as strings. // Note that the molecule should not be in kekulized form. void getBondLabels(const ROMol &mol, const RascalOptions &opts, std::vector &bondLabels) { std::vector atomLabels; getAtomLabels(mol, opts, atomLabels); bondLabels = std::vector(mol.getNumBonds()); for (const auto &b : mol.bonds()) { std::string bondType; if (opts.ignoreBondOrders) { bondType = "0"; } else { bondType = std::to_string(b->getBondType()); } // The atom labels need to be in consistent order irrespective // of input order. auto lbl1 = atomLabels[b->getBeginAtomIdx()]; auto lbl2 = atomLabels[b->getEndAtomIdx()]; if (lbl1 < lbl2) { std::swap(lbl1, lbl2); } bondLabels[b->getIdx()] = lbl1 + bondType + lbl2; } } // Fills bondLabels[12] with a small integer denoting the type of the bond and // the types of the atoms at each end. Both molecules need to be done at the // same time so that the labels are consistent across both. void getBondLabels(const ROMol &mol1, const ROMol &mol2, const RascalOptions &opts, std::vector &bondLabels1, std::vector &bondLabels2) { std::vector tmpBondLabels1, tmpBondLabels2; getBondLabels(mol1, opts, tmpBondLabels1); getBondLabels(mol2, opts, tmpBondLabels2); // convert the bond labels, which are currently encoding the atoms and // bond type to a small set of sequential integers for ease of use later. // This results in loss of information, but that information is not currently // used anywhere. std::set allLabels; for (auto bl : tmpBondLabels1) { allLabels.insert(bl); } for (auto bl : tmpBondLabels2) { allLabels.insert(bl); } auto recodeBondLabels = [&](std::vector &strBondLabels, std::vector &bondLabels) { for (auto &bl : strBondLabels) { auto it = allLabels.find(bl); bondLabels.push_back(std::distance(allLabels.begin(), it)); } }; recodeBondLabels(tmpBondLabels1, bondLabels1); recodeBondLabels(tmpBondLabels2, bondLabels2); } } // namespace details // make the line graph for the molecule, as an adjacency matrix. Each // row/column is a bond, with a connection between 2 bonds if they share an // atom. The adjacency matrix is 0 for no bond, the atomic number of the // connecting atom otherwise. void makeLineGraph(const ROMol &mol, std::vector> &adjMatrix) { adjMatrix = std::vector>( mol.getNumBonds(), std::vector(mol.getNumBonds(), 0)); for (const auto &a : mol.atoms()) { for (const auto &b1 : mol.atomBonds(a)) { for (const auto &b2 : mol.atomBonds(a)) { if (b1 != b2) { adjMatrix[b1->getIdx()][b2->getIdx()] = a->getAtomicNum(); } } } } } // make sure that mol1_bond in mol1 and mol2_bond in mol2 are, if aromatic, in // at least one ring that is the same. bool checkAromaticRings(const ROMol &mol1, std::vector &mol1RingSmiles, int mol1BondIdx, const ROMol &mol2, std::vector &mol2RingSmiles, int mol2BondIdx) { auto mol1Bond = mol1.getBondWithIdx(mol1BondIdx); auto mol2Bond = mol2.getBondWithIdx(mol2BondIdx); if (!mol1Bond->getIsAromatic() || !mol2Bond->getIsAromatic()) { return true; } // If neither bond was in a ring, but they were marked aromatic, then // the two mols are fragments so it's ok to match these bonds. const auto &mol1BondRings = mol1.getRingInfo()->bondRings(); const auto &mol2BondRings = mol2.getRingInfo()->bondRings(); bool mol1BondInRing = mol1.getRingInfo()->numBondRings(mol1BondIdx); bool mol2BondInRing = mol2.getRingInfo()->numBondRings(mol2BondIdx); if (!mol1BondInRing && !mol2BondInRing) { return true; } for (size_t i = 0u; i < mol1BondRings.size(); ++i) { if (std::find(mol1BondRings[i].begin(), mol1BondRings[i].end(), mol1BondIdx) == mol1BondRings[i].end()) { continue; } for (size_t j = 0u; j < mol2BondRings.size(); ++j) { if (std::find(mol2BondRings[j].begin(), mol2BondRings[j].end(), mol2BondIdx) == mol2BondRings[j].end()) { continue; } if (mol1RingSmiles[i] == mol2RingSmiles[j]) { return true; } } } return false; } // Extract the rings from the given molecule, both as mol objects and SMILES // strings. The mol objects will have the original bond indices stored in the // property ORIG_INDEX. void extractRings(const ROMol &mol, std::vector> &molRings, std::vector &molRingSmiles) { const auto &molBondRings = mol.getRingInfo()->bondRings(); for (size_t i = 0u; i < molBondRings.size(); ++i) { std::unique_ptr ringMol(new RWMol(mol)); const auto &molAtomRings = mol.getRingInfo()->atomRings(); boost::dynamic_bitset<> atomsInRing(mol.getNumAtoms()); for (auto a : molAtomRings[i]) { atomsInRing.set(a); } for (auto ringBondIdx : molBondRings[i]) { auto ringBond = ringMol->getBondWithIdx(ringBondIdx); ringBond->setProp("ORIG_INDEX", ringBond->getIdx()); } ringMol->beginBatchEdit(); for (auto b : ringMol->bonds()) { if (!b->hasProp("ORIG_INDEX)")) { if (!atomsInRing[b->getBeginAtomIdx()]) { ringMol->removeAtom(b->getBeginAtom()); } if (!atomsInRing[b->getEndAtomIdx()]) { ringMol->removeAtom(b->getEndAtom()); } } } ringMol->commitBatchEdit(); molRingSmiles.push_back(MolToSmiles(*ringMol)); molRings.push_back(std::move(ringMol)); } } bool checkRingMatchesRing(const ROMol &mol1, int mol1BondIdx, const ROMol &mol2, int mol2BondIdx) { if (mol1.getRingInfo()->numBondRings(mol1BondIdx) && !mol2.getRingInfo()->numBondRings(mol2BondIdx)) { return false; } if (!mol1.getRingInfo()->numBondRings(mol1BondIdx) && mol2.getRingInfo()->numBondRings(mol2BondIdx)) { return false; } return true; } // Make the set of pairs of vertices, where they're a pair if the labels // match. void buildPairs(const ROMol &mol1, const std::vector &vtxLabels1, const ROMol &mol2, const std::vector &vtxLabels2, const RascalOptions &opts, std::vector> &vtxPairs) { std::vector mol1RingSmiles, mol2RingSmiles; std::vector> mol1Rings, mol2Rings; // For these purposes, it is correct that n1cccc1 and [nH]1cccc1 match - the // former would be from an N-substituted pyrrole, the latter from a plain // one. static const std::regex reg(R"(\[([np])H\])"); if (opts.completeAromaticRings) { extractRings(mol1, mol1Rings, mol1RingSmiles); for (auto &mrs : mol1RingSmiles) { mrs = std::regex_replace(mrs, reg, "$1"); } extractRings(mol2, mol2Rings, mol2RingSmiles); for (auto &mrs : mol2RingSmiles) { mrs = std::regex_replace(mrs, reg, "$1"); } } for (auto i = 0u; i < vtxLabels1.size(); ++i) { for (auto j = 0u; j < vtxLabels2.size(); ++j) { if (vtxLabels1[i] == vtxLabels2[j]) { if (opts.completeAromaticRings && !checkAromaticRings(mol1, mol1RingSmiles, i, mol2, mol2RingSmiles, j)) { continue; } if (opts.ringMatchesRingOnly && !checkRingMatchesRing(mol1, i, mol2, j)) { continue; } vtxPairs.push_back(std::make_pair(i, j)); } } } } // Make the modular product between the 2 graphs passed in. Each node in the // graph is a pair of vertices, one from the first graph, the other from the // second, whose labels match. Two vertices are connected in the modular // product if either the 2 matching vertices in the 2 input vertices are // connected by edges with the same label, or neither is connected. void makeModularProduct(const ROMol &mol1, const std::vector> &adjMatrix1, const std::vector &vtxLabels1, const std::vector> &distMatrix1, const ROMol &mol2, const std::vector> &adjMatrix2, const std::vector &vtxLabels2, const std::vector> &distMatrix2, const RascalOptions &opts, std::vector> &vtxPairs, std::vector> &modProd) { buildPairs(mol1, vtxLabels1, mol2, vtxLabels2, opts, vtxPairs); if (vtxPairs.empty()) { // There was nothing in common at all. But, what was the screening doing? modProd.clear(); return; } if (vtxPairs.size() > opts.maxBondMatchPairs) { BOOST_LOG(rdErrorLog) << "Too many matching bond pairs (" << vtxPairs.size() << ") so can't continue." << std::endl; modProd.clear(); return; } modProd = std::vector>( vtxPairs.size(), boost::dynamic_bitset<>(vtxPairs.size())); for (auto i = 0u; i < vtxPairs.size() - 1; ++i) { for (auto j = i + 1; j < vtxPairs.size(); ++j) { if (vtxPairs[i].first == vtxPairs[j].first || vtxPairs[i].second == vtxPairs[j].second) { continue; } bool distsOk = true; if (opts.maxFragSeparation != -1) { if (std::abs(distMatrix1[vtxPairs[i].first][vtxPairs[j].first] - distMatrix2[vtxPairs[i].second][vtxPairs[j].second]) > opts.maxFragSeparation) { distsOk = false; } } if (opts.singleLargestFrag && distMatrix1[vtxPairs[i].first][vtxPairs[j].first] != distMatrix2[vtxPairs[i].second][vtxPairs[j].second]) { distsOk = false; } if (distsOk && adjMatrix1[vtxPairs[i].first][vtxPairs[j].first] == adjMatrix2[vtxPairs[i].second][vtxPairs[j].second]) { modProd[i][j] = modProd[j][i] = 1; } } } } // Calculate the lower bound on the size of the MCES. This requires that mol1 // has more atoms than mol2 which is not checked. Returns a minimum of 1. unsigned int calcLowerBound(const ROMol &mol1, const ROMol &mol2, double simThresh) { std::unordered_set mol1AtNos; int maxAtNo = 0; for (const auto &a : mol1.atoms()) { mol1AtNos.insert(a->getAtomicNum()); maxAtNo = std::max(a->getAtomicNum(), maxAtNo); } boost::dynamic_bitset<> mol2AtNos(maxAtNo + 1); for (const auto &a : mol2.atoms()) { // since we're interested in the atoms that match in the 2 molecules, // it doesn't matter if mol2 has an atomic number higher than anything // in mol1 - that can't be a match. if (a->getAtomicNum() < maxAtNo) { mol2AtNos.set(a->getAtomicNum()); } } int deltaVg1 = 0; for (auto mol1AtNo : mol1AtNos) { if (!mol2AtNos[mol1AtNo]) { ++deltaVg1; } } double lb = sqrt((mol1.getNumAtoms() + mol1.getNumBonds()) * (mol2.getNumAtoms() + mol2.getNumBonds())); lb = lb * simThresh - mol1.getNumAtoms() + deltaVg1; lb = lb < 0 ? 0 : lb; unsigned int ilb(lb); if (ilb < 1) { ilb = 1; } return ilb; } void printClique(const std::vector &clique, const std::vector> &vtxPairs, bool swapped, std::ostream &os) { os << "Clique : " << clique.size() << " :"; for (auto mem : clique) { os << " " << mem; } os << std::endl; for (auto mem : clique) { if (swapped) { os << "{" << vtxPairs[mem].second << ", " << vtxPairs[mem].first << "},"; } else { os << "{" << vtxPairs[mem].first << ", " << vtxPairs[mem].second << "},"; } } std::cout << std::endl; std::cout << "mol 1 bonds : ["; for (auto mem : clique) { if (swapped) { os << vtxPairs[mem].second << ", "; } else { os << vtxPairs[mem].first << ", "; } } std::cout << "]" << std::endl; std::cout << "mol 2 bonds : ["; for (auto mem : clique) { if (swapped) { os << vtxPairs[mem].first << ", "; } else { os << vtxPairs[mem].second << ", "; } } std::cout << "]" << std::endl; } // if the clique involves a delta-y exchange, returns true. Should only be // called if it's a possibility. bool deltaYInClique(const std::vector &clique, const ROMol &mol1, const ROMol &mol2, const std::vector> &vtxPairs) { if (clique.size() < 3) { // there must be 3 bonds for a delta-y exchange, obs. return false; } // Map the clique onto the 2 molecules, counting the degrees of the atoms // if they are involved in the clique. When sorted, they will be the same // if no delta-y exchange has occurred. std::vector> bondMatches; for (auto mem : clique) { bondMatches.push_back( std::make_pair(vtxPairs[mem].first, vtxPairs[mem].second)); } std::vector cliqueDegs1(mol1.getNumAtoms(), 0); std::vector cliqueDegs2(mol2.getNumAtoms(), 0); for (const auto &bm : bondMatches) { const auto b1 = mol1.getBondWithIdx(bm.first); cliqueDegs1[b1->getBeginAtomIdx()]++; cliqueDegs1[b1->getEndAtomIdx()]++; const auto b2 = mol2.getBondWithIdx(bm.second); cliqueDegs2[b2->getBeginAtomIdx()]++; cliqueDegs2[b2->getEndAtomIdx()]++; } cliqueDegs1.erase(std::remove(cliqueDegs1.begin(), cliqueDegs1.end(), 0), cliqueDegs1.end()); std::sort(cliqueDegs1.begin(), cliqueDegs1.end()); cliqueDegs2.erase(std::remove(cliqueDegs2.begin(), cliqueDegs2.end(), 0), cliqueDegs2.end()); std::sort(cliqueDegs2.begin(), cliqueDegs2.end()); return cliqueDegs1 != cliqueDegs2; } // Return a molecule with the clique in it. Each atom will have the property // ORIG_INDEX giving its index in the original molecule. RWMol *makeCliqueFrags(const ROMol &mol, const std::vector &clique, const std::vector> &vtxPairs, int pairNum) { auto *molFrags = new RWMol(mol); boost::dynamic_bitset<> aInClique(mol.getNumAtoms()); boost::dynamic_bitset<> bInClique(mol.getNumBonds()); for (auto mem : clique) { const Bond *bond = nullptr; if (pairNum == 1) { bond = molFrags->getBondWithIdx(vtxPairs[mem].first); } else { bond = molFrags->getBondWithIdx(vtxPairs[mem].second); } bInClique[bond->getIdx()] = 1; aInClique.set(bond->getBeginAtomIdx()); bond->getBeginAtom()->setProp("ORIG_INDEX", bond->getBeginAtomIdx()); aInClique.set(bond->getEndAtomIdx()); bond->getEndAtom()->setProp("ORIG_INDEX", bond->getEndAtomIdx()); } molFrags->beginBatchEdit(); for (auto &a : molFrags->atoms()) { if (!aInClique[a->getIdx()]) { molFrags->removeAtom(a); } } for (auto &b : molFrags->bonds()) { if (!bInClique[b->getIdx()]) { molFrags->removeBond(b->getBeginAtomIdx(), b->getEndAtomIdx()); } } molFrags->commitBatchEdit(); return molFrags; } // Calculate the shortest bond distance between the 2 fragments in the // molecule. int minFragSeparation(const ROMol &mol, const ROMol &molFrags, std::vector &fragMapping, int frag1, int frag2) { auto extractFragAtoms = [&](int fragNum, std::vector &fragAtoms) { for (size_t i = 0u; i < fragMapping.size(); ++i) { if (fragMapping[i] == fragNum) { int origIdx = molFrags.getAtomWithIdx(i)->getProp("ORIG_INDEX"); fragAtoms.push_back(origIdx); } } }; std::vector frag1Atoms, frag2Atoms; extractFragAtoms(frag1, frag1Atoms); extractFragAtoms(frag2, frag2Atoms); auto pathMatrix = MolOps::getDistanceMat(mol); double minDist = std::numeric_limits::max(); for (const auto &at1 : frag1Atoms) { for (const auto &at2 : frag2Atoms) { auto dist = pathMatrix[mol.getNumAtoms() * at1 + at2]; minDist = std::min(dist, minDist); } } return std::nearbyint(minDist); } // Assess the clique in terms of opts, returning true if it satisfies them all bool cliqueOk(const std::vector clique, const RascalOptions &opts, const ROMol &mol1, const ROMol &mol2, const std::vector> &vtxPairs) { std::unique_ptr mol1Frags, mol2Frags; std::vector mol1FragMapping, mol2FragMapping; int numMol1Frags = 0, numMol2Frags = 0; auto buildFrags = [&]() -> void { if (mol1Frags) { return; } mol1Frags.reset(makeCliqueFrags(mol1, clique, vtxPairs, 1)); mol2Frags.reset(makeCliqueFrags(mol2, clique, vtxPairs, 2)); numMol1Frags = MolOps::getMolFrags(*mol1Frags, mol1FragMapping); numMol2Frags = MolOps::getMolFrags(*mol2Frags, mol2FragMapping); }; if (opts.minFragSize > 0) { buildFrags(); // only need to do it for mol1, as the fragments should match. for (int i = 0; i < numMol1Frags; ++i) { auto fragSize = std::count(mol1FragMapping.begin(), mol1FragMapping.end(), i); if (fragSize < opts.minFragSize) { return false; } } } return true; } // If this clique warrants it, update maxCliques. void updateMaxClique(const std::vector &clique, bool deltaYPoss, const RascalOptions &opts, const ROMol &mol1, const ROMol &mol2, const std::vector> &vtxPairs, std::vector> &maxCliques, unsigned int &lowerBound) { if (!maxCliques.empty() && clique.size() < maxCliques.front().size()) { return; } bool didDeltaY = !deltaYPoss ? false : deltaYInClique(clique, mol1, mol2, vtxPairs); if (!didDeltaY) { if (maxCliques.empty()) { if (cliqueOk(clique, opts, mol1, mol2, vtxPairs)) { maxCliques.push_back((clique)); } } else { bool goodClique = false, didCliqueOk = false; if (clique.size() > maxCliques.front().size()) { goodClique = cliqueOk(clique, opts, mol1, mol2, vtxPairs); didCliqueOk = true; if (goodClique) { maxCliques.clear(); } } if (!didCliqueOk) { goodClique = cliqueOk(clique, opts, mol1, mol2, vtxPairs); } if (goodClique && (maxCliques.empty() || clique.size() == maxCliques.front().size())) { maxCliques.push_back(clique); } } if (!maxCliques.empty() && maxCliques.front().size() > lowerBound) { lowerBound = maxCliques.front().size(); } } } // If the current time is beyond the timeout limit, throws a // TimedOutException. void checkTimeout( const std::chrono::time_point &startTime, const RascalOptions &opts, const std::vector &clique, std::vector> &maxCliques, unsigned long long &numSteps) { ++numSteps; if (numSteps == 100) { // This clock is very convenient, but seems quite expensive. Calling it // every step added 10% to the runtime. auto currTime = std::chrono::high_resolution_clock::now(); auto runTime = std::chrono::duration_cast(currTime - startTime) .count(); if (runTime > opts.timeout) { if (maxCliques.empty()) { maxCliques.push_back(clique); } else { if (clique.size() > maxCliques.front().size()) { maxCliques.clear(); } if (clique.size() >= maxCliques.front().size()) { maxCliques.push_back(clique); } } throw TimedOutException(runTime, maxCliques); } numSteps = 0ULL; } } bool equivalentRootAlreadyDone(unsigned int rootVtx, const std::vector> &vtxPairs, const std::vector &equivBonds1, const std::vector &equivBonds2, std::set> &rootClasses) { std::pair newClasses{equivBonds1[vtxPairs[rootVtx].first], equivBonds2[vtxPairs[rootVtx].second]}; if (newClasses.first == -1) { return false; } if (!rootClasses.empty() && rootClasses.find(newClasses) != rootClasses.end()) { return true; } rootClasses.insert(newClasses); return false; } namespace { bool hasSubstructMatch(const ROMol &mol, const ROMol &query) { SubstructMatchParameters ps; ps.maxMatches = 1; return !SubstructMatch(mol, query, ps).empty(); } } // namespace // There are some simple substructures for which equivalent bond pruning isn't // allowed. bool checkEquivalentsAllowed(const ROMol &mol) { const static std::vector notSmarts{ "*~*", "*~*1~*~*~1", "*12~*~*~2~*~1", "*14~*(~*~2~3~4)~*~2~*~3~1"}; static std::vector> notStructs; if (notStructs.empty()) { for (const auto &smt : notSmarts) { notStructs.emplace_back(SmartsToMol(smt)); } } const static std::vector> notStats{ {2, 1}, {4, 4}, {4, 5}, {5, 8}}; for (size_t i = 0; i < notStructs.size(); ++i) { if (mol.getNumAtoms() == notStats[i].first && mol.getNumBonds() == notStats[i].second && hasSubstructMatch(mol, *notStructs[i])) { return false; } } return true; } void explorePartitions( RascalStartPoint &starter, const std::chrono::time_point &startTime, const RascalOptions &opts, std::vector> &maxCliques) { unsigned long long numSteps = 0ULL; std::vector> parts(1, starter.d_partSet); std::vector clique; std::set> rootClasses; bool canDoEquivs = false; if (opts.doEquivBondPruning) { canDoEquivs = checkEquivalentsAllowed(*starter.d_mol1) && checkEquivalentsAllowed(*starter.d_mol2); } while (!parts.empty()) { if (opts.timeout != -1) { checkTimeout(startTime, opts, clique, maxCliques, numSteps); } auto part = parts.back(); bool goDeeper = false; bool backtrack = false; if (opts.allBestMCESs) { if (clique.size() + part->numParts() < starter.d_lowerBound) { backtrack = true; } } else { if (clique.size() + part->numParts() <= starter.d_lowerBound) { backtrack = true; } } if (!backtrack) { if (opts.allBestMCESs) { goDeeper = clique.size() + part->upperBound() >= starter.d_lowerBound; } else { goDeeper = clique.size() + part->upperBound() > starter.d_lowerBound; } if (goDeeper) { if (!part->isEmpty()) { std::shared_ptr nextPart(new PartitionSet(*part)); clique.push_back(nextPart->popLastVertex()); if (clique.size() == 1 && canDoEquivs && equivalentRootAlreadyDone(clique.front(), starter.d_vtxPairs, starter.d_equivBonds1, starter.d_equivBonds2, rootClasses)) { clique.pop_back(); backtrack = true; } else { nextPart->pruneVertices(clique.back()); updateMaxClique(clique, starter.d_deltaYPoss, opts, *starter.d_mol1, *starter.d_mol2, starter.d_vtxPairs, maxCliques, starter.d_lowerBound); parts.push_back(nextPart); } } else { backtrack = true; } } else { backtrack = true; } } if (backtrack || (!parts.empty() && parts.back()->isEmpty())) { while (!parts.empty()) { if (parts.back()->isEmpty()) { parts.pop_back(); if (!clique.empty()) { clique.pop_back(); } } else { parts.back()->popLastVertex(); if (!parts.back()->isEmpty()) { break; } } } } if (parts.empty()) { break; } } } bool deltaYExchangePossible(const ROMol &mol1, const ROMol &mol2) { // A Delta-y exchange is an incorrect match when a cyclopropyl ring (the // delta) is matched to a C(C)(C) group (the y) because they both have // isomorphic line graphs. This checks to see if that's something we need // to worry about for these molecules. const static std::unique_ptr delta(SmartsToMol("C1CC1")); const static std::unique_ptr y(SmartsToMol("C(C)C")); return (hasSubstructMatch(mol1, *delta) && hasSubstructMatch(mol2, *y)) || (hasSubstructMatch(mol2, *delta) && hasSubstructMatch(mol1, *y)); } void findEquivalentBonds(const ROMol &mol, std::vector &equivBonds) { equivBonds = std::vector(mol.getNumBonds(), -1); std::vector ranks(mol.getNumAtoms()); bool breakTies = false; Canon::rankMolAtoms(mol, ranks, breakTies); int nextClass = 0; for (const auto &b1 : mol.bonds()) { for (const auto &b2 : mol.bonds()) { if (b1->getIdx() != b2->getIdx()) { if ((ranks[b1->getBeginAtomIdx()] == ranks[b2->getBeginAtomIdx()] && ranks[b1->getEndAtomIdx()] == ranks[b2->getEndAtomIdx()]) || (ranks[b1->getBeginAtomIdx()] == ranks[b2->getEndAtomIdx()] && ranks[b1->getEndAtomIdx()] == ranks[b2->getBeginAtomIdx()])) { if (equivBonds[b1->getIdx()] == -1 && equivBonds[b2->getIdx()] == -1) { equivBonds[b1->getIdx()] = nextClass; equivBonds[b2->getIdx()] = nextClass; ++nextClass; } else if (equivBonds[b1->getIdx()] == -1) { equivBonds[b1->getIdx()] = equivBonds[b2->getIdx()]; } else if (equivBonds[b2->getIdx()] == -1) { equivBonds[b2->getIdx()] = equivBonds[b1->getIdx()]; } } } } } } // Use the Floyd-Warshall algorithm to compute the distance matrix from the // adjacency matrix. // Adapted from https://en.wikipedia.org/wiki/Floyd–Warshall_algorithm void calcDistMatrix(const std::vector> &adjMatrix, std::vector> &distMatrix) { distMatrix = std::vector>( adjMatrix.size(), std::vector(adjMatrix.size(), adjMatrix.size() + 1)); for (size_t i = 0u; i < adjMatrix.size(); ++i) { distMatrix[i][i] = 0; for (size_t j = 0u; j < adjMatrix.size(); ++j) { if (i != j && adjMatrix[i][j]) { distMatrix[i][j] = 1; } } } for (size_t k = 0u; k < adjMatrix.size(); ++k) { for (size_t i = 0u; i < adjMatrix.size(); ++i) { for (size_t j = 0u; j < adjMatrix.size(); ++j) { if (distMatrix[i][j] > distMatrix[i][k] + distMatrix[k][j]) { distMatrix[i][j] = distMatrix[i][k] + distMatrix[k][j]; } } } } } // Set the atomic number of the atoms that match the SMARTS in // RascalOptions.EquivalentAtoms to 110, 111 etc. These will // be mapped back at the end. void assignEquivalentAtoms(ROMol &mol, const std::string &equivalentAtoms) { if (equivalentAtoms.empty()) { return; } std::vector classSmarts; boost::split(classSmarts, equivalentAtoms, boost::is_any_of(" "), boost::token_compress_on); if (classSmarts.size() > 9) { throw ValueErrorException( "Too many classes of equivalent atoms. Maximum is 9."); } int atNum = 110; for (auto &smt : classSmarts) { if (smt.empty()) { continue; } auto qmol = v2::SmilesParse::MolFromSmarts(smt); std::vector hits_vect; if (RDKit::SubstructMatch(mol, *qmol, hits_vect)) { for (const auto &hv : hits_vect) { for (const auto &h : hv) { auto a = mol.getAtomWithIdx(h.second); a->setAtomicNum(atNum); } } } ++atNum; } } RascalStartPoint makeInitialPartitionSet(const ROMol *mol1, const ROMol *mol2, const RascalOptions &opts) { RascalStartPoint starter; if (mol1->getNumAtoms() <= mol2->getNumAtoms()) { starter.d_swapped = false; starter.d_mol1.reset(new ROMol(*mol1)); starter.d_mol2.reset(new ROMol(*mol2)); } else { starter.d_swapped = true; starter.d_mol1.reset(new ROMol(*mol2)); starter.d_mol2.reset(new ROMol(*mol1)); } assignEquivalentAtoms(*starter.d_mol1, opts.equivalentAtoms); assignEquivalentAtoms(*starter.d_mol2, opts.equivalentAtoms); std::map>> degSeqs1, degSeqs2; starter.d_tier1Sim = details::tier1Sim(*starter.d_mol1, *starter.d_mol2, degSeqs1, degSeqs2); if (starter.d_tier1Sim < opts.similarityThreshold) { return starter; } std::vector bondLabels1, bondLabels2; details::getBondLabels(*starter.d_mol1, *starter.d_mol2, opts, bondLabels1, bondLabels2); starter.d_tier2Sim = details::tier2Sim(*starter.d_mol1, *starter.d_mol2, degSeqs1, degSeqs2, bondLabels1, bondLabels2); if (starter.d_tier2Sim < opts.similarityThreshold) { return starter; } // Get the line graphs for the two molecules as adjacency matrices. makeLineGraph(*starter.d_mol1, starter.d_adjMatrix1); makeLineGraph(*starter.d_mol2, starter.d_adjMatrix2); std::vector> distMat1, distMat2; if (opts.maxFragSeparation > -1 || opts.singleLargestFrag) { calcDistMatrix(starter.d_adjMatrix1, distMat1); calcDistMatrix(starter.d_adjMatrix2, distMat2); } // pairs are vertices in the 2 line graphs that are the same type. // d_modProd is the modular product/correspondence graph of the two // line graphs. makeModularProduct(*starter.d_mol1, starter.d_adjMatrix1, bondLabels1, distMat1, *starter.d_mol2, starter.d_adjMatrix2, bondLabels2, distMat2, opts, starter.d_vtxPairs, starter.d_modProd); if (starter.d_modProd.empty()) { return starter; } if (opts.minCliqueSize > 0) { starter.d_lowerBound = opts.minCliqueSize; } else { starter.d_lowerBound = calcLowerBound(*starter.d_mol1, *starter.d_mol2, opts.similarityThreshold); } starter.d_partSet.reset(new PartitionSet(starter.d_modProd, starter.d_vtxPairs, bondLabels1, bondLabels2, starter.d_lowerBound)); starter.d_deltaYPoss = deltaYExchangePossible(*starter.d_mol1, *starter.d_mol2); if (opts.doEquivBondPruning) { // if equiv_bonds1[i] and equiv_bonds1[j] are equal, the bonds are // equivalent. findEquivalentBonds(*starter.d_mol1, starter.d_equivBonds1); findEquivalentBonds(*starter.d_mol2, starter.d_equivBonds2); } else { starter.d_equivBonds1 = std::vector(starter.d_mol1->getNumBonds(), -1); starter.d_equivBonds2 = std::vector(starter.d_mol2->getNumBonds(), -1); } return starter; } std::vector findMCES(RascalStartPoint &starter, const RascalOptions &opts) { std::vector clique; std::vector> maxCliques; auto startTime = std::chrono::high_resolution_clock::now(); bool timedOut = false; RascalOptions tmpOpts{opts}; if (opts.singleLargestFrag) { tmpOpts.allBestMCESs = true; } try { explorePartitions(starter, startTime, tmpOpts, maxCliques); } catch (TimedOutException &e) { BOOST_LOG(rdWarningLog) << e.what() << std::endl; maxCliques = e.d_cliques; timedOut = true; } std::vector results; for (const auto &c : maxCliques) { results.push_back( RascalResult(*starter.d_mol1, *starter.d_mol2, starter.d_adjMatrix1, starter.d_adjMatrix2, c, starter.d_vtxPairs, timedOut, starter.d_swapped, starter.d_tier1Sim, starter.d_tier2Sim, opts.ringMatchesRingOnly, opts.singleLargestFrag, opts.maxFragSeparation, opts.exactConnectionsMatch, opts.equivalentAtoms, opts.ignoreBondOrders)); } if (opts.singleLargestFrag) { std::sort( results.begin(), results.end(), [](const RascalResult &r1, const RascalResult &r2) -> bool { if (r1.getAtomMatches().size() == r2.getAtomMatches().size()) { if (r1.getBondMatches().size() == r2.getBondMatches().size()) { if (r1.getAtomMatches() == r2.getAtomMatches()) { return (r1.getBondMatches() < r2.getBondMatches()); } return r1.getAtomMatches() < r2.getAtomMatches(); } return r1.getBondMatches().size() > r2.getBondMatches().size(); } return (r1.getAtomMatches().size() > r2.getAtomMatches().size()); }); // the singleLargestFrag method throws bits of solutions out, so there may // now be duplicates and results that are different sizes. results.erase( std::unique(results.begin(), results.end(), [](const RascalResult &r1, const RascalResult &r2) -> bool { return (r1.getAtomMatches() == r2.getAtomMatches() && r1.getBondMatches() == r2.getBondMatches()); }), results.end()); boost::dynamic_bitset<> want(results.size()); want.set(); for (size_t i = 1; i < results.size(); ++i) { if (results[i].getAtomMatches().size() < results[0].getAtomMatches().size() || results[i].getBondMatches().size() < results[0].getBondMatches().size()) { want[i] = false; } } if (want.count() < results.size()) { size_t j = 0; for (size_t i = 0; i < results.size(); ++i) { if (want[i]) { results[j++] = results[i]; } } results.erase(results.begin() + j, results.end()); } } else { // If 2 cliques are the same size, this sort puts the one with the smaller // number of fragments first, which may have fewer atoms. std::sort(results.begin(), results.end(), details::resultCompare); } return results; } // calculate the RASCAL MCES between the 2 molecules, provided it is within // the similarity threshold given. std::vector rascalMCES(const ROMol &mol1, const ROMol &mol2, const RascalOptions &opts) { auto starter = makeInitialPartitionSet(&mol1, &mol2, opts); if (!starter.d_partSet) { if (opts.returnEmptyMCES) { return std::vector( 1, RascalResult(starter.d_tier1Sim, starter.d_tier2Sim)); } return std::vector(); } auto results = findMCES(starter, opts); if (results.empty() && opts.returnEmptyMCES) { return std::vector( 1, RascalResult(starter.d_tier1Sim, starter.d_tier2Sim)); } if (!opts.allBestMCESs && results.size() > 1) { results.erase(results.begin() + 1, results.end()); } return results; } } // namespace RascalMCES } // namespace RDKit