Files
rdkit/Code/GraphMol/FMCS/MaximumCommonSubgraph.cpp
Greg Landrum 24a9689f45 dos2unix run
2014-04-07 09:27:07 +02:00

892 lines
36 KiB
C++

#include <list>
#include <algorithm>
#include <math.h>
#include "../QueryAtom.h"
#include "../QueryBond.h"
#include "../SmilesParse/SmilesWrite.h"
#include "../SmilesParse/SmartsWrite.h"
//#include "../SmilesParse/SmilesParse.h"
//#include "../Substruct/SubstructMatch.h"
#include "SubstructMatchCustom.h"
#include "MaximumCommonSubgraph.h"
#ifdef VERBOSE_STATISTICS_ON
ExecStatistics stat;
#endif
namespace RDKit
{
namespace FMCS
{
bool ConsoleOutputEnabled = false;
}}
namespace RDKit
{
namespace FMCS
{
struct LabelDefinition
{
unsigned ItemIndex; // item with this label value
unsigned Value;
LabelDefinition() : ItemIndex(-1), Value(-1) {}
LabelDefinition(unsigned i, unsigned value) : ItemIndex(i), Value(value) {}
};
MaximumCommonSubgraph::MaximumCommonSubgraph(const MCSParameters* params)
{
Parameters = ( 0 != params ? *params : MCSParameters());
if (Parameters.ProgressCallback == MCSProgressCallbackTimeout)
Parameters.ProgressCallbackUserData = &To;
To = time(0);
}
static
bool molPtr_NumBondLess (const ROMol* l, const ROMol* r) // need for sorting the source molecules by size
{
return l->getNumBonds() < r->getNumBonds();
}
void MaximumCommonSubgraph::init()
{
QueryMolecule = Molecules.front();
void* userData = Parameters.CompareFunctionsUserData;
if(Parameters.BondCompareParameters.CompleteRingsOnly || Parameters.BondCompareParameters.RingMatchesRingOnly)
{
#ifdef FAST_SUBSTRUCT_CACHE
RingMatchTables.init(QueryMolecule);
Parameters.CompareFunctionsUserData = &RingMatchTables;
#endif
}
#ifdef FAST_SUBSTRUCT_CACHE
#ifdef PRECOMPUTED_TABLES_MATCH
// fill out RingMatchTables for check cache Hash collision by checking match a part of Query to Query
if(!userData // predefined functor - compute RingMatchTable for all targets
&& (Parameters.BondCompareParameters.CompleteRingsOnly || Parameters.BondCompareParameters.RingMatchesRingOnly))
RingMatchTables.computeRingMatchTable(QueryMolecule, QueryMolecule, Parameters);
// fill out match tables
size_t nq = QueryMolecule->getNumAtoms();
QueryAtomMatchTable.resize(nq, nq);
for(size_t aj = 0; aj < nq; aj++)
for(size_t ai = 0; ai < nq; ai++)
QueryAtomMatchTable.set(ai, aj, Parameters.AtomTyper(Parameters.AtomCompareParameters,
*QueryMolecule, ai, *QueryMolecule, aj, Parameters.CompareFunctionsUserData));
nq = QueryMolecule->getNumBonds();
QueryBondMatchTable.resize(nq, nq);
for(size_t aj = 0; aj < nq; aj++)
for(size_t ai = 0; ai < nq; ai++)
QueryBondMatchTable.set(ai, aj, Parameters.BondTyper(Parameters.BondCompareParameters,
*QueryMolecule, ai, *QueryMolecule, aj, Parameters.CompareFunctionsUserData));
#endif // PRECOMPUTED_TABLES_MATCH
// Compute label values based on current functor and parameters for code Morgan correct computation.
unsigned currentLabelValue = 1;
std::vector<LabelDefinition> labels;
nq = QueryMolecule->getNumAtoms();
QueryAtomLabels.resize(nq);
for(size_t ai = 0; ai < nq; ai++)
{
if(MCSAtomCompareAny == Parameters.AtomTyper) // predefined functor without atom compare parameters
QueryAtomLabels[ai] = 1;
else
{
const Atom* atom = QueryMolecule->getAtomWithIdx(ai);
if(MCSAtomCompareElements == Parameters.AtomTyper) // predefined functor without atom compare parameters
QueryAtomLabels[ai] = atom->getAtomicNum()|(Parameters.AtomCompareParameters.MatchValences ? (atom->getTotalValence()>>8):0);
else if(MCSAtomCompareIsotopes == Parameters.AtomTyper) // predefined functor without atom compare parameters
QueryAtomLabels[ai] = atom->getAtomicNum()|(atom->getIsotope()>>8)|(Parameters.AtomCompareParameters.MatchValences ? (atom->getTotalValence()>>16):0);
else // custom user defined functor
{
QueryAtomLabels[ai] = -1;
for(size_t i = 0; i < labels.size(); i++)
if(Parameters.AtomTyper(Parameters.AtomCompareParameters,
*QueryMolecule, labels[i].ItemIndex, *QueryMolecule, ai, userData)) // equal itoms
{
QueryAtomLabels[ai] = labels[i].Value;
break;
}
if(-1 == QueryAtomLabels[ai]) // not found -> create new label
{
QueryAtomLabels[ai] = ++currentLabelValue;
labels.push_back(LabelDefinition(ai, currentLabelValue));
}
}
}
}
labels.clear();
currentLabelValue = 1;
nq = QueryMolecule->getNumBonds();
QueryBondLabels.resize(nq);
for(size_t aj = 0; aj < nq; aj++)
{
const Bond* bond = QueryMolecule->getBondWithIdx(aj);
unsigned ring = 0;
if(Parameters.BondCompareParameters.CompleteRingsOnly || Parameters.BondCompareParameters.RingMatchesRingOnly)
{
ring = RingMatchTables.isQueryBondInRing(aj) ? 0 : 1; // is bond in ring
}
if(MCSBondCompareAny == Parameters.BondTyper) // predefined functor without atom compare parameters
QueryBondLabels[aj] = 1 | (ring>>8);
else if(MCSBondCompareOrderExact == Parameters.BondTyper) // predefined functor without compare parameters
QueryBondLabels[aj] = (bond->getBondType() + 1) | (ring>>8);
else if(MCSBondCompareOrder == Parameters.BondTyper) // predefined functor, ignore Aromatization
{
unsigned order = bond->getBondType();
if(Bond::AROMATIC == order || Bond::ONEANDAHALF == order) // ignore Aromatization
order = Bond::SINGLE;
else if(Bond::TWOANDAHALF == order)
order = Bond::DOUBLE;
else if(Bond::THREEANDAHALF == order)
order = Bond::TRIPLE;
else if(Bond::FOURANDAHALF == order)
order = Bond::QUADRUPLE;
else if(Bond::FIVEANDAHALF == order)
order = Bond::QUINTUPLE;
QueryBondLabels[aj] = (order + 1) | (ring>>8);
}
else // custom user defined functor
{
QueryBondLabels[aj] = -1;
for(size_t i = 0; i < labels.size(); i++)
if(Parameters.BondTyper(Parameters.BondCompareParameters,
*QueryMolecule, labels[i].ItemIndex, *QueryMolecule, aj, userData)) // equal bonds + ring ...
{
QueryBondLabels[aj] = labels[i].Value;
break;
}
if(-1 == QueryAtomLabels[aj]) // not found -> create new label
{
QueryBondLabels[aj] = ++currentLabelValue;
labels.push_back( LabelDefinition(aj, currentLabelValue));
}
}
}
#endif
Targets.resize(Molecules.size()-1);
size_t i=0;
for(std::vector<const ROMol*>::iterator it = Molecules.begin()+1; it != Molecules.end(); it++, i++)
{
Targets[i].Molecule = *it;
// build Target Topology ADD ATOMs
size_t j=0; // current item
for(ROMol::ConstAtomIterator a = Targets[i].Molecule->beginAtoms(); a != Targets[i].Molecule->endAtoms(); a++, j++)
{
Targets[i].Topology.addAtom((*a)->getIdx());
#ifdef FAST_INCREMENTAL_MATCH
Targets[i].AtomAdjacency.resize((*it)->getNumAtoms());
ROMol::OEDGE_ITER beg,end;
for(boost::tie(beg,end) = (*it)->getAtomBonds(*a); beg!=end; beg++) // all bonds from the atom
{
AtomAdjacency aa;
aa.Bond = (*(*it))[*beg].get();
aa.BondIdx = aa.Bond->getIdx();
aa.ConnectedAtomIdx = aa.Bond->getEndAtomIdx();
if((*a)->getIdx() == aa.ConnectedAtomIdx)
aa.ConnectedAtomIdx = aa.Bond->getBeginAtomIdx();
Targets[i].AtomAdjacency[j].push_back(aa);
}
#endif // FAST_INCREMENTAL_MATCH
}
// build Target Topology ADD BONDs
for(ROMol::ConstBondIterator b = Targets[i].Molecule->beginBonds(); b != Targets[i].Molecule->endBonds(); b++)
{
const Bond* bond = *b;
unsigned ii = bond->getBeginAtomIdx();
unsigned jj = bond->getEndAtomIdx();
Targets[i].Topology.addBond((*b)->getIdx(), ii, jj);
}
// fill out RingMatchTables
if(!userData // predefined functor - compute RingMatchTable for all targets
&& (Parameters.BondCompareParameters.CompleteRingsOnly || Parameters.BondCompareParameters.RingMatchesRingOnly))
{
#ifdef FAST_SUBSTRUCT_CACHE
RingMatchTables.addTargetBondRingsIndeces(Targets[i].Molecule);
RingMatchTables.computeRingMatchTable(QueryMolecule, Targets[i].Molecule, Parameters);
#endif
}
#ifdef PRECOMPUTED_TABLES_MATCH
// fill out match tables
size_t nq = QueryMolecule->getNumAtoms();
size_t nt = (*it)->getNumAtoms();
Targets[i].AtomMatchTable.resize(nq, nt);
for(size_t aj = 0; aj < nt; aj++)
for(size_t ai = 0; ai < nq; ai++)
Targets[i].AtomMatchTable.set(ai, aj, Parameters.AtomTyper(Parameters.AtomCompareParameters,
*QueryMolecule, ai, *Targets[i].Molecule, aj, Parameters.CompareFunctionsUserData));
nq = QueryMolecule->getNumBonds();
nt = (*it)->getNumBonds();
Targets[i].BondMatchTable.resize(nq, nt);
for(size_t aj = 0; aj < nt; aj++)
for(size_t ai = 0; ai < nq; ai++)
Targets[i].BondMatchTable.set(ai, aj, Parameters.BondTyper(Parameters.BondCompareParameters,
*QueryMolecule, ai, *Targets[i].Molecule, aj, Parameters.CompareFunctionsUserData));
#endif
}
Parameters.CompareFunctionsUserData = userData; // restore
}
void MaximumCommonSubgraph::makeInitialSeeds()
{
// build a set of initial seeds as "all" single bonds from query molecule
std::vector<bool> excludedBonds(QueryMolecule->getNumBonds());
for(size_t i = 0; i < excludedBonds.size(); i++)
excludedBonds[i] = false;
Seeds.clear();
//R1 additional performance OPTIMISATION
//if(Parameters.BondCompareParameters.CompleteRingsOnly)
// disable all mismatched rings, and do not generate initial seeds from such disabled bonds
// for( rings .....) for(i......)
// if(mismatched) excludedBonds[i.......] = true;
for(RWMol::ConstBondIterator bi = QueryMolecule->beginBonds(); bi != QueryMolecule->endBonds(); bi++)
{
//R1 additional performance OPTIMISATION
//if(excludedBonds[(*bi)->getIdx()])
// continue;
Seed seed;
#ifdef FAST_INCREMENTAL_MATCH
seed.MatchResult.resize(Targets.size());
#endif
#ifdef VERBOSE_STATISTICS_ON
++stat.Seed;
++stat.InitialSeed;
#endif
seed.addAtom((*bi)->getBeginAtom());
seed.addAtom((*bi)->getEndAtom());
seed.ExcludedBonds = excludedBonds; // all bonds from first to current
seed.addBond (*bi);
excludedBonds[(*bi)->getIdx()] = true;
seed.computeRemainingSize(*QueryMolecule);
if( ! checkIfMatchAndAppend(seed))
{
#ifdef VERBOSE_STATISTICS_ON
++stat.MismatchedInitialSeed;
#endif
// optionally remove all such bonds from all targets TOPOLOGY where it exists.
//..........
// disable (mark as already processed) mismatched bond in all seeds
for(SeedSet::iterator si = Seeds.begin(); si != Seeds.end(); si++)
si->ExcludedBonds[(*bi)->getIdx()] = true;
}
}
Seeds.MaxBonds = 1;
Seeds.MaxAtoms = 2;
}
void MaximumCommonSubgraph::growSeeds(MolFragment& mcsIdx, MCSResult& res)
{
unsigned steps = 99999; // steps from last progress callback call. call it immediately in the begining
// Find MCS -- SDF Seed growing OPTIMISATION (it works in 3 times faster)
while(!Seeds.empty())
{
++steps;
stat.TotalSteps++;
SeedSet::iterator si = Seeds.begin();
si->grow(*this, *QueryMolecule);
const Seed& fs = Seeds.front();
// bigger substructure found
if((!Parameters.MaximizeBonds && (fs.getNumAtoms() > res.NumAtoms || (fs.getNumAtoms() == res.NumAtoms && fs.getNumBonds() > res.NumBonds)))
||( Parameters.MaximizeBonds && (fs.getNumBonds() > res.NumBonds || (fs.getNumBonds() == res.NumBonds && fs.getNumAtoms() > res.NumAtoms)))
)
{
stat.MCSFoundStep = stat.TotalSteps;
res.NumAtoms = fs.getNumAtoms();
res.NumBonds = fs.getNumBonds();
mcsIdx.Atoms = fs.MoleculeFragment.Atoms;
mcsIdx.Bonds = fs.MoleculeFragment.Bonds;
mcsIdx.AtomsIdx = fs.MoleculeFragment.AtomsIdx;
mcsIdx.BondsIdx = fs.MoleculeFragment.BondsIdx;
//TMP DEBUG
/* std::cout<<"MCS atoms=(";
for(unsigned seedAtomIdx = 0; seedAtomIdx < mcs.getNumAtoms(); seedAtomIdx++)
{
const Atom* atom = mcs.MoleculeFragment.Atoms[seedAtomIdx];
std::cout<<atom->getIdx()<<", ";
}
std::cout<<") new atoms=[";
for(unsigned seedAtomIdx = mcs.LastAddedAtomsBeginIdx; seedAtomIdx < mcs.getNumAtoms(); seedAtomIdx++)
{
const Atom* atom = mcs.MoleculeFragment.Atoms[seedAtomIdx];
std::cout<<atom->getIdx()<<", ";
}
std::cout<<"] bonds=(";
for(int i=0; i<mcsIdx.BondsIdx.size(); i++)
std::cout<<mcsIdx.BondsIdx[i]<<", ";
std::cout<<") Size="<< mcs.getNumAtoms() <<", "<< mcs.getNumBonds() <<" Remain=" << mcs.RemainingAtoms <<", "<<mcs.RemainingBonds<<" = ";
std::cout<< MolFragmentToSmiles(*QueryMolecule, *(const std::vector<int>*) &mcs.MoleculeFragment.AtomsIdx, (const std::vector<int>*) &mcs.MoleculeFragment.BondsIdx) <<"\n"; // unsigned
*/
/*
if(0==si->MoleculeFragment.BondsIdx[0])
{
std::cout<<"\n"
<<" -----------------"<<(void*)&*si
<<"Remaining "<<si->RemainingBonds<< ", " << si->RemainingAtoms
<<": LastAddedAtomsBeginIdx = "<<si->LastAddedAtomsBeginIdx<<", LastAddedBondsBeginIdx = "<<si->LastAddedBondsBeginIdx<<"\n";
for(size_t i = 0; i < si->MoleculeFragment.Bonds.size(); i++)
std::cout << i << " "<<si->MoleculeFragment.Bonds[i]->getIdx()<<" : "
<<" "<<si->MoleculeFragment.Bonds[i]->getBeginAtom()->getIdx()
<<" "<<si->MoleculeFragment.Bonds[i]->getEndAtom()->getIdx()
<<"\n";
}
*/
/*
#ifdef xxxWIN32 // TEMP DEBUG !!!!!!!!!!!!!
if(30 == res.NumAtoms)
{
std::cout<<"-------------\n";
for(int i=0; i<mcs.getNumBonds(); i++)
std::cout<<i<<" bIdx=" << mcs.MoleculeFragment.BondsIdx[i]//<<"\n";
<<" =("<< QueryMolecule->getBondWithIdx(mcs.MoleculeFragment.BondsIdx[i])->getBeginAtomIdx()
<<" , "<< QueryMolecule->getBondWithIdx(mcs.MoleculeFragment.BondsIdx[i])->getEndAtomIdx()<<")\n";
std::cout<<"-------------\n";
for(int i=0; i<30; i++)
{
const Atom* a = QueryMolecule->getAtomWithIdx(mcs.MoleculeFragment.AtomsIdx[i]);
std::cout<<i<<" aIdx=" << mcs.MoleculeFragment.AtomsIdx[i]<<" : "<<a->getAtomicNum()<<"\n";
if(a->getAtomicNum()==7) // N
{
ROMol::OEDGE_ITER beg,end;
for(boost::tie(beg,end) = QueryMolecule->getAtomBonds(a); beg!=end; beg++)
{
const Bond* bond = &*((*QueryMolecule)[*beg]);
std::cout<<" bond="<<bond->getIdx()<<" =("<< bond->getBeginAtomIdx()
<<" , "<< bond->getEndAtomIdx()<<")\n";
}
}
}
std::cout<<"-------------\n";
}
#endif
*/
}
if(-1 == si->GrowingStage) //finished
Seeds.erase(si);
if(Parameters.ProgressCallback && (steps > 777))// || res.NumAtoms > Stat.NumAtoms))
{
steps = 0;
Stat.NumAtoms = res.NumAtoms;
Stat.NumBonds = res.NumBonds;
#ifdef VERBOSE_STATISTICS_ON
Stat.SeedProcessed = stat.Seed;
#endif
if(!Parameters.ProgressCallback(Stat, Parameters, Parameters.ProgressCallbackUserData))
{
res.Canceled = true;
break;
}
}
}
}
struct AtomMatch // for each seed atom (matched)
{
unsigned QueryAtomIdx;
unsigned TargetAtomIdx;
AtomMatch() : QueryAtomIdx(-1), TargetAtomIdx(-1) {}
};
typedef std::vector<AtomMatch> AtomMatchSet;
std::string MaximumCommonSubgraph::generateSMARTS(const MolFragment& mcsIdx)
{
// match the result MCS with all targets to check if it is exact match or template
Seed seed; // result MCS
seed.ExcludedBonds.resize(QueryMolecule->getNumBonds());
for(size_t i = 0; i < seed.ExcludedBonds.size(); i++)
seed.ExcludedBonds[i] = false;
std::vector<AtomMatchSet> atomMatchResult(Targets.size());
std::vector<unsigned> atomIdxMap(QueryMolecule->getNumAtoms());
std::vector<std::map<unsigned, const Bond*> > bondMatchSet (mcsIdx.Bonds.size()); //key is unique BondType
std::vector<std::map<unsigned, const Atom*> > atomMatchSet (mcsIdx.Atoms.size()); //key is unique atomic number
//isotope, mass, charge, Hs std::vector<std::map<unsigned, const Atom*> > atomMatchSet (mcsIdx.Atoms.size()); //key is unique atomic number
for(std::vector<const Atom*>::const_iterator atom = mcsIdx.Atoms.begin(); atom != mcsIdx.Atoms.end(); atom++)
{
atomIdxMap[(*atom)->getIdx()] = seed.getNumAtoms();
seed.addAtom((*atom));
}
for(std::vector<const Bond*>::const_iterator bond = mcsIdx.Bonds.begin(); bond != mcsIdx.Bonds.end(); bond++)
seed.addBond((*bond));
unsigned itarget = 0;
for(std::vector<Target>::const_iterator tag = Targets.begin(); tag != Targets.end(); tag++, itarget++)
{
match_V_t match; // THERE IS NO Bonds match INFO !!!!
bool target_matched =
#ifdef PRECOMPUTED_TABLES_MATCH
SubstructMatchCustomTable(tag->Topology, seed.Topology, tag->AtomMatchTable, tag->BondMatchTable, &match);
#else //noticable slowly:
SubstructMatchCustom(
tag->Topology.GraphTopology
, *tag->Molecule
, seed.GraphTopology
, *QueryMolecule
, Parameters.AtomTyper, Parameters.BondTyper
, Parameters.AtomCompareParameters, Parameters.BondCompareParameters, Parameters.CompareFunctionsUserData
);
#endif
atomMatchResult[itarget].resize(seed.getNumAtoms());
for(match_V_t::const_iterator mit = match.begin(); target_matched && mit != match.end(); mit++)
{
unsigned ai = mit->first; // SeedAtomIdx
atomMatchResult[itarget][ai].QueryAtomIdx = seed.Topology[mit->first];
atomMatchResult[itarget][ai].TargetAtomIdx = tag->Topology[mit->second];
const Atom* ta = tag->Molecule->getAtomWithIdx(tag->Topology[mit->second]);
if(ta && ta->getAtomicNum() != seed.MoleculeFragment.Atoms[ai]->getAtomicNum())
atomMatchSet[ai][ta->getAtomicNum()] = ta; // add
}
// AND BUILD BOND MATCH INFO
unsigned bi=0;
for(std::vector<const Bond*>::const_iterator bond = mcsIdx.Bonds.begin(); target_matched && bond != mcsIdx.Bonds.end(); bond++, bi++)
{
unsigned i = atomIdxMap[(*bond)->getBeginAtomIdx()];
unsigned j = atomIdxMap[(*bond)->getEndAtomIdx()];
unsigned ti= atomMatchResult[itarget][i].TargetAtomIdx;
unsigned tj= atomMatchResult[itarget][j].TargetAtomIdx;
const Bond* tBond = tag->Molecule->getBondBetweenAtoms(ti, tj);
if(tBond && (*bond)->getBondType() != tBond->getBondType())
bondMatchSet[bi] [tBond->getBondType()] = tBond; // add
}
}
// Generate result's SMARTS
RWMol mol; // create molecule from MCS for MolToSmarts()
unsigned ai = 0; // SeedAtomIdx
for(std::vector<const Atom*>::const_iterator atom = mcsIdx.Atoms.begin(); atom != mcsIdx.Atoms.end(); atom++, ai++)
{
QueryAtom* a = new QueryAtom(*(*atom)); // generate [#6] instead of C or c !
//for all atomMatchSet[ai] items add atom query to template
for(std::map<unsigned, const Atom*>::const_iterator am = atomMatchSet[ai].begin(); am != atomMatchSet[ai].end(); am++)
{
ATOM_OR_QUERY* a2 = new ATOM_OR_QUERY();
a2->addChild(QueryAtom::QUERYATOM_QUERY::CHILD_TYPE(makeAtomNumQuery((*atom)->getAtomicNum())));
a2->addChild(QueryAtom::QUERYATOM_QUERY::CHILD_TYPE(makeAtomNumQuery(am->second->getAtomicNum())));
a2->setDescription("AtomOr");
//ATOM_EQUALS_QUERY *makeAtomIsotopeQuery(int what)
//....
a->setQuery(a2);
}
mol.addAtom(a, true);
}
unsigned bi = 0; // Seed Idx
for(std::vector<const Bond*>::const_iterator bond = mcsIdx.Bonds.begin(); bond != mcsIdx.Bonds.end(); bond++, bi++)
{
unsigned i = atomIdxMap[(*bond)->getBeginAtomIdx()];
unsigned j = atomIdxMap[(*bond)->getEndAtomIdx()];
QueryBond* b = new QueryBond(*(*bond));
// add OR template if need
for(std::map<unsigned, const Bond*>::const_iterator bm = bondMatchSet[bi].begin(); bm != bondMatchSet[bi].end(); bm++)
{
BOND_OR_QUERY* b2 = new BOND_OR_QUERY();
b2->addChild(QueryBond::QUERYBOND_QUERY::CHILD_TYPE(makeBondOrderEqualsQuery((*bond)->getBondType())));
b2->addChild(QueryBond::QUERYBOND_QUERY::CHILD_TYPE(makeBondOrderEqualsQuery(bm->second->getBondType())));
b2->setDescription("BondOr");
b->setQuery(b2);
}
b->setBeginAtomIdx(i);
b->setEndAtomIdx(j);
mol.addBond(b, true);
}
return MolToSmarts(mol, true);
}
MCSResult MaximumCommonSubgraph::find(const std::vector<ROMOL_SPTR>& src_mols)
{
clear();
MCSResult res;
if(src_mols.size() < 2)
throw std::runtime_error("FMCS. Invalid argument. mols.size() must be at least 2");
if (Parameters.Threshold > 1.0)
throw std::runtime_error("FMCS. Invalid argument. Parameter Threshold must be 1.0 or less.");
ThresholdCount = (unsigned) ceil((src_mols.size()-1) * Parameters.Threshold); // minimal required number of matched targets
if (ThresholdCount < 1) // at least one target
ThresholdCount = 1;
if (ThresholdCount > src_mols.size()-1) // max all targets
ThresholdCount = src_mols.size()-1;
// Selecting CompleteRingsOnly option also enables --ring-matches-ring-only. ring--ring and chain bonds only match chain bonds.
if(Parameters.BondCompareParameters.CompleteRingsOnly)
Parameters.BondCompareParameters.RingMatchesRingOnly = true;
for(std::vector<ROMOL_SPTR>::const_iterator it = src_mols.begin(); it != src_mols.end(); it++)
{
Molecules.push_back((*it).get());
if(!Molecules.back()->getRingInfo()->isInitialized())
Molecules.back()->getRingInfo()->initialize(); // but do not fill out !!!
}
// sort source set of molecules by their 'size' and assume the smallest molecule as a query
std::stable_sort(Molecules.begin(), Molecules.end(), molPtr_NumBondLess);
init();
makeInitialSeeds();
/* // TEMP DEBUG !!!!!!!!!!!!!
{
std::cout<<"---Bonds----------\n";
for(int i=0; i<QueryMolecule->getNumBonds(); i++)
std::cout<<i<<" "
<<" =("<< QueryMolecule->getBondWithIdx(i)->getBeginAtomIdx()
<<" , "<< QueryMolecule->getBondWithIdx(i)->getEndAtomIdx()<<")\n";
std::cout<<"---Atoms----------\n";
for(int i=0; i<QueryMolecule->getNumAtoms(); i++)
{
const Atom* a = QueryMolecule->getAtomWithIdx(i);
std::cout<<i<<" : elem="<<a->getAtomicNum()<<" bonds: ";
ROMol::OEDGE_ITER beg,end;
for(boost::tie(beg,end) = QueryMolecule->getAtomBonds(a); beg!=end; beg++)
{
const Bond* bond = &*((*QueryMolecule)[*beg]);
std::cout<<bond->getIdx()<<"=("<< bond->getBeginAtomIdx()<<", "<<bond->getEndAtomIdx()<<") ";
}
std::cout<<"\n";
}
std::cout<<"-------------\n";
}
// !!!!!!!!!!!!!!!!!!!!!!
*/
MolFragment mcsIdx; // current MCS
growSeeds(mcsIdx, res);
res.SmartsString = generateSMARTS(mcsIdx);
/* // TMP DEBUG
std::cout<<"---MCS Bonds------\n";
for(int i=0; i<mcsIdx.BondsIdx.size(); i++)
std::cout<<i<<" bIdx=" << mcsIdx.BondsIdx[i]
<<" =("<< QueryMolecule->getBondWithIdx(mcsIdx.BondsIdx[i])->getBeginAtomIdx()
<<" , "<< QueryMolecule->getBondWithIdx(mcsIdx.BondsIdx[i])->getEndAtomIdx()<<")\n";
// TMP DEBUG
//---------------------------
std::cout<<"Query "<< MolToSmiles(*QueryMolecule) <<"\n";
std::cout<<"MCS Smiles "<< MolFragmentToSmiles(*QueryMolecule, *(const std::vector<int>*) &mcsIdx.AtomsIdx, (const std::vector<int>*) &mcsIdx.BondsIdx) <<"\n"; // unsigned
*/
/*
unsigned itarget = 0;
for(std::vector<Target>::const_iterator tag = Targets.begin(); tag != Targets.end(); tag++, itarget++)
{
MatchVectType match;
bool target_matched = SubstructMatch(*tag->Molecule, *QueryMolecule, match);
std::cout<<"Target "<< itarget+1 << (target_matched ? " matched" : " MISMATCHED") <<"\n";
}
*/
//---------------------------
#ifdef VERBOSE_STATISTICS_ON
if(ConsoleOutputEnabled)
{
std::cout << "STATISTICS:\n";
std::cout << "Total Growing Steps = " << stat.TotalSteps<<", MCS found on "<<stat.MCSFoundStep<<" step\n";
std::cout << "Initial Seeds = " << stat.InitialSeed << ", Mismatched " << stat.MismatchedInitialSeed<<"\n";
std::cout << "Inspected Seeds = " << stat.Seed<<"\n";
std::cout << "Rejected by BestSize = " << stat.RemainingSizeRejected << "\n";
#ifdef EXCLUDE_WRONG_COMPOSITION
std::cout << "Rejected by WrongComposition = " << stat.WrongCompositionRejected
<< " [ " << stat.WrongCompositionDetected << " Detected ]\n";
#endif
std::cout << "MatchCheck Seeds = " << stat.SeedCheck <<"\n";
std::cout //<< "\n"
<< " MatchCalls = " << stat.MatchCall <<"\n"
<< " MatchFound = " << stat.MatchCallTrue <<"\n";
#ifdef FAST_INCREMENTAL_MATCH
std::cout << " fastMatchCalls = " << stat.FastMatchCall <<"\n"
<< " fastMatchFound = " << stat.FastMatchCallTrue <<"\n";
std::cout << " slowMatchCalls = " << stat.MatchCall - stat.FastMatchCallTrue <<"\n"
<< " slowMatchFound = " << stat.MatchCallTrue - stat.FastMatchCallTrue <<"\n";
#endif
#ifdef PRECOMPUTED_TABLES_MATCH
std::cout << "--- USED PreComputed Match TABLES ! ---\n";
#endif
#ifdef VERBOSE_STATISTICS_FASTCALLS_ON
std::cout << "AtomFunctorCalls = " << stat.AtomFunctorCalls << "\n";
std::cout << "BondCompareCalls = " << stat.BondCompareCalls << "\n";
#endif
// std::cout << "\n";
std::cout << " DupCacheFound = " << stat.DupCacheFound
<<" "<< stat.DupCacheFoundMatch<<" matched, "
<<stat.DupCacheFound - stat.DupCacheFoundMatch <<" mismatched\n";
#ifdef FAST_SUBSTRUCT_CACHE
std::cout << "HashCache size = " << HashCache.keyssize() << " keys\n";
std::cout << "HashCache size = " << HashCache.fullsize() << " entries\n";
std::cout << "FindHashInCache = " << stat.FindHashInCache << "\n";
std::cout << "HashFoundInCache= " << stat.HashKeyFoundInCache << "\n";
std::cout << "ExactMatchCalls = " << stat.ExactMatchCall <<"\n"
<< "ExactMatchFound = " << stat.ExactMatchCallTrue <<"\n";
#endif
}
#endif
//---------------------
clear();
return res;
}
bool MaximumCommonSubgraph::checkIfMatchAndAppend(Seed& seed)
{
#ifdef TRACE_ON
TRACE() << "CHECK "; // print out time
for(std::vector<const Bond*>::const_iterator bi = seed.MoleculeFragment.Bonds.begin(); bi != seed.MoleculeFragment.Bonds.end(); bi++)
TRACE(0) << (*bi)->getIdx() << " ";
TRACE(0) << "\n";
#endif
#ifdef VERBOSE_STATISTICS_ON
++stat.SeedCheck;
#endif
bool foundInCache = false;
bool foundInDupCache = false;
#ifdef DUP_SUBSTRUCT_CACHE
if(DuplicateCache.find(seed.DupCacheKey, foundInCache))
{
// duplicate found. skip match() but store both seeds, because they will grow by different paths !!!
#ifdef VERBOSE_STATISTICS_ON
stat.DupCacheFound++;
stat.DupCacheFoundMatch += foundInCache ? 1 : 0;
#endif
if(!foundInCache) // mismatched !!!
return false;
}
foundInDupCache = foundInCache;
#endif
#ifdef FAST_SUBSTRUCT_CACHE
SubstructureCache::HashKey cacheKey;
SubstructureCache::TIndexEntry* cacheEntry = 0;
bool cacheEntryIsValid = false;
if(!foundInCache)
{
#ifdef VERBOSE_STATISTICS_ON
++stat.FindHashInCache;
#endif
cacheEntry = HashCache.find(seed, QueryAtomLabels, QueryBondLabels, cacheKey);
cacheEntryIsValid = true;
if(cacheEntry) // possibly found. check for hash collision
{
#ifdef VERBOSE_STATISTICS_ON
++stat.HashKeyFoundInCache;
#endif
// check hash collisions (time +3%):
for(SubstructureCache::TIndexEntry::const_iterator g = cacheEntry->begin(); !foundInCache && g != cacheEntry->end(); g++)
{
if(g->m_vertices.size() != seed.getNumAtoms() || g->m_edges.size() != seed.getNumBonds())
continue;
#ifdef VERBOSE_STATISTICS_ON
++stat.ExactMatchCall;
#endif
// EXACT MATCH
#ifdef PRECOMPUTED_TABLES_MATCH
foundInCache = SubstructMatchCustomTable((*g), seed.Topology, QueryAtomMatchTable, QueryBondMatchTable);
#else //..................
#endif
#ifdef VERBOSE_STATISTICS_ON
if(foundInCache)
++stat.ExactMatchCallTrue;
#endif
}
}
}
#endif
bool found = foundInCache;
if(!found)
{
found = match(seed);
}
if(found) // Store new generated seed, if found in cache or in all(- threshold) targets
{
Seed& new_seed = Seeds.add(seed);
#ifdef DUP_SUBSTRUCT_CACHE
if(!foundInDupCache && seed.getNumBonds() >= 3) // only seed with a ring can be duplicated - do not store very small seed in cache
DuplicateCache.add(seed.DupCacheKey, true);
#endif
#ifdef FAST_SUBSTRUCT_CACHE
if(!foundInCache)
HashCache.add(seed, cacheKey, cacheEntry);
#endif
}
else
{
#ifdef DUP_SUBSTRUCT_CACHE
if(seed.getNumBonds() > 3)
DuplicateCache.add(seed.DupCacheKey, false); //opt. cache mismatched duplicates too
#endif
}
return found; // new matched seed has been actualy added
}
bool MaximumCommonSubgraph::match(Seed& seed)
{
unsigned max_miss = Targets.size() - ThresholdCount;
unsigned missing = 0;
unsigned passed = 0;
unsigned itarget = 0;
for(std::vector<Target>::const_iterator tag = Targets.begin(); tag != Targets.end(); tag++, itarget++)
{
#ifdef VERBOSE_STATISTICS_ON
++stat.MatchCall;
#endif
bool target_matched = false;
#ifdef FAST_INCREMENTAL_MATCH
if(!seed.MatchResult[itarget].empty())
target_matched = matchIncrementalFast(seed, itarget);
#endif
if(!target_matched) // slow full match
{
match_V_t match; // THERE IS NO Bonds match INFO !!!!
target_matched =
#ifdef PRECOMPUTED_TABLES_MATCH
SubstructMatchCustomTable(tag->Topology, seed.Topology, tag->AtomMatchTable, tag->BondMatchTable, &match);
#else //noticable slowly:
SubstructMatchCustom(
tag->Topology
, *tag->Molecule
, seed.GraphTopology
, *QueryMolecule
, Parameters.AtomTyper, Parameters.BondTyper
, Parameters.AtomCompareParameters, Parameters.BondCompareParameters, Parameters.CompareFunctionsUserData
);
#endif
#ifdef FAST_INCREMENTAL_MATCH // save current match info
seed.MatchResult[itarget].clear();//resize(target_matched ? match.size() : 0);
for(match_V_t::const_iterator mit = match.begin(); target_matched && mit != match.end(); mit++)
{
BondMatch m;
m.SeedAtomIdx = mit->first;
m.QueryAtomIdx = seed.GraphTopology[mit->first];
m.TargetAtomIdx = mit->second; //==target.Topology[it->second]);
seed.MatchResult[itarget].push_back(m); // add for each matched bond
}
//AND RESTORE BOND MATCH INFO !!!!!!!!!!!!!!!!!!!!!
#endif
}
if(target_matched)
{
if(++passed >= ThresholdCount) // it's enought
break;
}
else // mismatched
{
if(++missing > max_miss)
break;
}
}
if(missing <= max_miss)
{
#ifdef VERBOSE_STATISTICS_ON
++stat.MatchCallTrue;
#endif
return true;
}
return false;
}
#ifdef FAST_INCREMENTAL_MATCH
// call it by target, if fail perform full match check
bool MaximumCommonSubgraph::matchIncrementalFast(Seed& seed, unsigned itarget)
{
// use and update results of previous match stored in the seed
#ifdef VERBOSE_STATISTICS_ON
++stat.FastMatchCall;
#endif
Target& t = Targets[itarget];
BondMatchSet& match = seed.MatchResult[itarget];
size_t previousLen = match.size();
for(unsigned newBondSeedIdx = seed.LastAddedBondsBeginIdx; newBondSeedIdx < seed.getNumBonds(); newBondSeedIdx++)
{
bool found=false;
const Bond* bond = seed.MoleculeFragment.Bonds[newBondSeedIdx];
unsigned newBondQueryIdx = seed.MoleculeFragment.BondsIdx[newBondSeedIdx];
unsigned i = seed.MoleculeFragment.SeedAtomIdxMap[bond->getBeginAtomIdx()];
if(i >= seed.LastAddedAtomsBeginIdx) // old atom in the seed
i = seed.MoleculeFragment.SeedAtomIdxMap[bond->getEndAtomIdx()];
unsigned newBondSourceAtomSeedIdx = i; // seed's index of atom from which new bond was added
const BondMatch& ma = seed.MatchResult[itarget][i];
unsigned newBondSourceAtomTargetIdx = ma.TargetAtomIdx; // corresponding atom in the target
unsigned newBondSourceAtomQueryIdx = ma.QueryAtomIdx;
//const Atom* newBondSourceAtomTarget = t.Molecule->getAtomWithIdx(newBondSourceAtomTargetIdx);
// check all unvisited bonds from newBondSourceAtomTargetIdx
const AtomAdjacencyList& taAdj = t.AtomAdjacency[newBondSourceAtomTargetIdx];
for(size_t itaAdji = 0; itaAdji < taAdj.size(); itaAdji++)
{
unsigned tbi = taAdj[itaAdji].BondIdx;
if(t.BondMatchTable.at(newBondQueryIdx, tbi))
{
/*
// and check ending atom of the bond too
//..........
bool visited = false;
//match.find
//MULTIPLE various bonds for one atom !!!
for(size_t im = 0; !visited && im < match.size(); im++)
if(seed.MatchResult[itarget][im]. == )
visited = true;
if(visited) // visited == targer bons already presents in the match of this seed
continue;
// append to match result
seed.MatchResult[itarget][]
found = true;
break;
*/
}
}
}
/*/-----------------------
//Seed:
unsigned LastAddedBondBeginIdx;
//history for each tag
seedBond, tagBond,
both (seedAtom, tagAtom) of
Graph q = seed.GraphTopology;
for(unsigned ib = seed.LastAddedBondBeginIdx; ib < seed.getNumBonds(); ib++) // all added bonds
ib
endAtom // of ib
unsigned tagBeginAtomIdx = ...; // already verified. matched with seed.
for() all outgoing bonds except already matched to seed
check match of bond to ib and endAtom to ia if ia is not already verified (ring back to seed)
// for(unsigned srcAtomIdx = seed.LastAddedAtomsBeginIdx; srcAtomIdx < seed.getNumAtoms(); srcAtomIdx++) // all added atoms
//-----------------------*/
if(false // match_ok
)
{
#ifdef VERBOSE_STATISTICS_ON
++stat.FastMatchCallTrue;
#endif
return true;
}
match.resize(previousLen); // clean up - remove new items only
return false;
}
#endif // FAST_INCREMENTAL_MATCH
}} // namespace RDKit