// NOTE(review): in this copy the original line breaks have been collapsed and
// every '<...>' span is missing (#include targets, template arguments, cast
// target types, parts of several loop headers and stream statements), so the
// text below is not compilable as it stands. Only comments have been touched
// here; restore the file from version control before editing any code.
//
// What the surviving text shows, per definition (all inside namespace RDKit):
//
//  isComplexQuery(const Bond *)   [anonymous namespace]
//    Returns false for a bond with no query or with a "BondOrder" query, and
//    true for negated queries and for "BondAnd"/"BondXor". A "BondOr" with
//    exactly two children is inspected: a child that is not a non-negated
//    "BondOrder", or whose value is neither Bond::SINGLE nor Bond::AROMATIC,
//    yields true. Every remaining case yields true.
//    NOTE(review): the `return false` sits inside the child loop, so only
//    the first child is ever examined - confirm whether that is intended.
//
//  isComplexQuery(const Atom *)   [anonymous namespace]
//    Returns false for an atom with no query, for a non-negated
//    "AtomAtomicNum" query, and for a non-negated "AtomAnd" whose only two
//    children are "AtomAtomicNum" followed by "AtomIsAliphatic" or
//    "AtomIsAromatic". Returns true otherwise (every negated query included).
//    NOTE(review): the "AtomAnd" branch dereferences the first two children
//    before comparing against endChildren(); presumably such a query always
//    has at least two children - confirm.
//
//  isAtomAromatic(const Atom *)   [anonymous namespace]
//    Without a query, or with an "AtomAtomicNum" query, returns
//    getIsAromatic(). "AtomIsAromatic" gives true and "AtomIsAliphatic" gives
//    false, each inverted when the query is negated. For an "AtomAnd" whose
//    first child is "AtomAtomicNum" the result is false when the query is
//    negated, otherwise it follows the second child's description. All other
//    cases return false.
//
//  RDKFingerprintMol(...)
//    Returns a newly allocated ExplicitBitVect of fpSize bits; the caller
//    must delete it. When atomInvariants is null, a default per-atom
//    invariant ((atomicNum % 128) << 1) | isAromatic is built locally.
//    Paths (subgraphs when branchedPaths is set) of lengths minPath..maxPath
//    are enumerated for the whole molecule, or once per index in fromAtoms
//    and then merged into one map. Bond pointers are cached by bond index.
//    For every path, each bond's number of neighbours within the path (bonds
//    sharing an atom) is counted and combined with a bond hash in which
//    aromatic bonds use Bond::AROMATIC and others their bond type; the
//    remaining hashing and bit-setting code is only partly legible here.
//    The generator is constructed with the fixed seed 42u. At the end the
//    fingerprint is folded in half with FoldFingerprint(*res,2) while the
//    on-bit fraction is below tgtDensity and the size is at least 2*minSize.
//
//  LayeredFingerprintMol(...)
//    Same ownership rule as above. Ring information is computed with
//    MolOps::findSSSR if it is not yet initialized. Every bond gets a flag
//    word in isQueryBond: 0x1 complex bond query, 0x2 complex begin-atom
//    query, 0x4 complex end-atom query. Per-atom aromaticity (via
//    isAtomAromatic) and atomic numbers are cached. Paths are enumerated with
//    `false` passed where RDKFingerprintMol passes useHs. Per path, one hash
//    vector per layer is collected (only the code filling layers 1 and 5 is
//    legible); each non-empty layer is sorted, extended with the number of
//    distinct atoms in the path and the 1-based layer number, hashed with
//    gboost::hash_range, and mapped to bit seed % fpSize (or to a value drawn
//    from the re-seeded RNG when LAYEREDFP_USE_MT is defined). The handling
//    of setOnlyBits and atomCounts is not recoverable from this text. The
//    same density-driven folding loop as in RDKFingerprintMol follows.
//
//  pqs
//    Array of pattern strings, terminated by an empty string, that
//    LayeredFingerprintMol2 parses with SmartsToMol. Entries inside the
//    `#if 0` section are compiled out.
//
//  detail::getAtomNumbers(const Atom *, std::vector &)
//    Clears atomNums, then: no query -> the atom's atomic number; negated
//    query -> left empty; "AtomAtomicNum" -> the query value; "AtomXor" ->
//    left empty; "AtomAnd" of the (AtomAtomicNum, AtomIsAliphatic |
//    AtomIsAromatic) two-child form -> the atomic-number child's value;
//    "AtomOr" -> one value per child that is either "AtomAtomicNum" or such
//    an "AtomAnd", and any other kind of child empties the result and returns.
//
//  LayeredFingerprintMol2(...)
//    Same ownership rule as above. On the first call the pqs strings are
//    parsed into a function-static vector (the source itself flags the
//    missing mutex); a pattern that throws or parses to null is skipped.
//    Complex-query atoms and bonds of mol are recorded in two bitsets. For
//    every pattern and every substructure match, 0xBEEF is folded into a
//    running per-pattern hash mIdx and bit mIdx % fpSize is set. Then, unless
//    a matched atom is a complex query atom or a bond lookup hits
//    isQueryBond, the matched atoms' atomic numbers and the matched bonds'
//    types are hashed starting from the 1-based pattern index and bit
//    bitId % fpSize is set.
//    NOTE(review): isQueryBond is sized and filled from mol's bonds but is
//    indexed with pbond->getIdx(), the index of the *pattern* bond; this
//    looks like it should be the matched mol bond - confirm.
//    NOTE(review): in the surviving text minPath, maxPath, atomCounts and
//    setOnlyBits appear only in the precondition checks and layerFlags and
//    branchedPaths are not referenced at all - confirm against the original.
//
// $Id$ // // Copyright (C) 2003-2010 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include "Fingerprints.h" #include #include #include #include #include #include #include #include #include #include #include #include //#define LAYEREDFP_USE_MT //#define VERBOSE_FINGERPRINTING 1 namespace RDKit{ namespace { bool isComplexQuery(const Bond *b){ if( !b->hasQuery()) return false; // negated things are always complex: if( b->getQuery()->getNegation()) return true; std::string descr=b->getQuery()->getDescription(); if(descr=="BondOrder") return false; if(descr=="BondAnd" || descr=="BondXor") return true; if(descr=="BondOr") { // detect the types of queries that appear for unspecified bonds in SMARTS: if(b->getQuery()->endChildren()-b->getQuery()->beginChildren()==2){ for(Bond::QUERYBOND_QUERY::CHILD_VECT_CI child=b->getQuery()->beginChildren(); child!=b->getQuery()->endChildren();++child){ if((*child)->getDescription()!="BondOrder" || (*child)->getNegation()) return true; if(static_cast(child->get())->getVal()!=Bond::SINGLE && static_cast(child->get())->getVal()!=Bond::AROMATIC) return true; return false; } } } return true; } bool isComplexQuery(const Atom *a){ if( !a->hasQuery()) return false; // negated things are always complex: if( a->getQuery()->getNegation()) return true; std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum") return false; if(descr=="AtomOr" || descr=="AtomXor") return true; if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum" && ((*(childIt+1))->getDescription()=="AtomIsAliphatic" || (*(childIt+1))->getDescription()=="AtomIsAromatic") && (childIt+2)==a->getQuery()->endChildren()){ return 
false; } return true; } return true; } bool isAtomAromatic(const Atom *a){ bool res=false; if( !a->hasQuery()){ res=a->getIsAromatic(); } else { std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum"){ res = a->getIsAromatic(); } else if(descr=="AtomIsAromatic") { res=true; if( a->getQuery()->getNegation()) res = !res; } else if(descr=="AtomIsAliphatic") { res=false; if( a->getQuery()->getNegation()) res = !res; } else if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum"){ if( a->getQuery()->getNegation()){ res = false; } else if((*(childIt+1))->getDescription()=="AtomIsAliphatic"){ res=false; } else if((*(childIt+1))->getDescription()=="AtomIsAromatic") { res=true; } } } } return res; } } // end of anonymous namespace // caller owns the result, it must be deleted ExplicitBitVect *RDKFingerprintMol(const ROMol &mol,unsigned int minPath, unsigned int maxPath, unsigned int fpSize,unsigned int nBitsPerHash, bool useHs, double tgtDensity,unsigned int minSize, bool branchedPaths, bool useBondOrder, std::vector *atomInvariants, const std::vector *fromAtoms ){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomInvariants size"); typedef boost::mt19937 rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; rng_type generator(42u); // // if we generate arbitrarily sized ints then mod them down to the // appropriate size, we can guarantee that a fingerprint of // size x has the same bits set as one of size 2x that's been folded // in half. This is a nice guarantee to have. 
// distrib_type dist(0,INT_MAX); source_type randomSource(generator,dist); // build default atom invariants if need be: std::vector lAtomInvariants; if(!atomInvariants){ lAtomInvariants.reserve(mol.getNumAtoms()); for(ROMol::ConstAtomIterator atomIt=mol.beginAtoms(); atomIt!=mol.endAtoms(); ++atomIt){ unsigned int aHash = ((*atomIt)->getAtomicNum()%128)<<1 | (*atomIt)->getIsAromatic(); lAtomInvariants.push_back(aHash); } atomInvariants=&lAtomInvariants; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); INT_PATH_LIST_MAP allPaths; if(!fromAtoms){ if(branchedPaths){ allPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, useHs); } else { allPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, useHs); } } else { BOOST_FOREACH(boost::uint32_t aidx,*fromAtoms){ INT_PATH_LIST_MAP tPaths; if(branchedPaths){ tPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, useHs,aidx); } else { tPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, true,useHs,aidx); } for(INT_PATH_LIST_MAP::const_iterator tpit=tPaths.begin(); tpit!=tPaths.end();++tpit){ #ifdef VERBOSE_FINGERPRINTING std::cerr<<"paths from "<first<second){ std::cerr<<" path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr<first].insert(allPaths[tpit->first].begin(), tpit->second.begin(),tpit->second.end()); } } } std::vector bondCache; bondCache.resize(mol.getNumBonds()); ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ BOND_SPTR bond = mol[*firstB]; bondCache[bond->getIdx()]=bond.get(); ++firstB; } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" n path sets: "<first<<" "<second.size()< atomsInPath(mol.getNumAtoms()); for(INT_PATH_LIST_MAP_CI paths=allPaths.begin();paths!=allPaths.end();paths++){ BOOST_FOREACH(const PATH_TYPE &path,paths->second){ #ifdef VERBOSE_FINGERPRINTING std::cerr<<"Path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr< bondNbrs(path.size()); 
std::fill(bondNbrs.begin(),bondNbrs.end(),0); atomsInPath.reset(); std::vector bondHashes; bondHashes.reserve(path.size()+1); for(unsigned int i=0;igetBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); for(unsigned int j=i+1;jgetBeginAtomIdx()==bj->getBeginAtomIdx() || bi->getBeginAtomIdx()==bj->getEndAtomIdx() || bi->getEndAtomIdx()==bj->getBeginAtomIdx() || bi->getEndAtomIdx()==bj->getEndAtomIdx() ){ ++bondNbrs[i]; ++bondNbrs[j]; } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bond("<getBeginAtomIdx()]; a2Hash = (*atomInvariants)[bi->getEndAtomIdx()]; if(a1HashgetIsAromatic()){ // makes sure aromatic bonds always hash the same: bondHash = Bond::AROMATIC; } else { bondHash = bi->getBondType(); } } boost::uint32_t nBitsInHash=0; boost::uint32_t ourHash=bondNbrs[i]%8; // 3 bits here nBitsInHash+=3; ourHash |= (bondHash%16)<(seed)); for(unsigned int i=0;isetBit(bit); #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bit: "<0.0){ while( static_cast(res->getNumOnBits())/res->getNumBits() < tgtDensity && res->getNumBits() >= 2*minSize ){ ExplicitBitVect *tmpV=FoldFingerprint(*res,2); delete res; res = tmpV; } } return res; } // caller owns the result, it must be deleted ExplicitBitVect *LayeredFingerprintMol(const ROMol &mol, unsigned int layerFlags, unsigned int minPath, unsigned int maxPath, unsigned int fpSize, double tgtDensity,unsigned int minSize, std::vector *atomCounts, ExplicitBitVect *setOnlyBits, bool branchedPaths, const std::vector *fromAtoms ){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomCounts size"); PRECONDITION(!setOnlyBits || setOnlyBits->getNumBits()==fpSize,"bad setOnlyBits size"); if(!mol.getRingInfo()->isInitialized()){ MolOps::findSSSR(mol); } #ifdef LAYEREDFP_USE_MT // create a mersenne twister with customized parameters. // The standard parameters (used to create boost::mt19937) // result in an RNG that's much too computationally intensive // to seed. 
typedef boost::random::mersenne_twister rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; rng_type generator(42u); // // if we generate arbitrarily sized ints then mod them down to the // appropriate size, we can guarantee that a fingerprint of // size x has the same bits set as one of size 2x that's been folded // in half. This is a nice guarantee to have. // distrib_type dist(0,INT_MAX); source_type randomSource(generator,dist); #endif std::vector bondCache; bondCache.resize(mol.getNumBonds()); std::vector isQueryBond(mol.getNumBonds(),0); ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ const Bond *bond = mol[*firstB].get(); isQueryBond[bond->getIdx()] = 0x0; bondCache[bond->getIdx()]=bond; if(isComplexQuery(bond)){ isQueryBond[bond->getIdx()] = 0x1; } if(isComplexQuery(bond->getBeginAtom())){ isQueryBond[bond->getIdx()] |= 0x2; } if(isComplexQuery(bond->getEndAtom())){ isQueryBond[bond->getIdx()] |= 0x4; } ++firstB; } std::vector aromaticAtoms(mol.getNumAtoms(),false); std::vector anums(mol.getNumAtoms(),0); ROMol::VERTEX_ITER firstA,lastA; boost::tie(firstA,lastA) = mol.getVertices(); while(firstA!=lastA){ const Atom *atom = mol[*firstA].get(); if(isAtomAromatic(atom)) aromaticAtoms[atom->getIdx()]=true; anums[atom->getIdx()]=atom->getAtomicNum(); ++firstA; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); INT_PATH_LIST_MAP allPaths; if(!fromAtoms){ if(branchedPaths){ allPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath,false); } else { allPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath,false); } } else { BOOST_FOREACH(boost::uint32_t aidx,*fromAtoms){ INT_PATH_LIST_MAP tPaths; if(branchedPaths){ tPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, false,aidx); } else { tPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, true,false,aidx); } for(INT_PATH_LIST_MAP::const_iterator tpit=tPaths.begin(); tpit!=tPaths.end();++tpit){ 
allPaths[tpit->first].insert(allPaths[tpit->first].begin(), tpit->second.begin(),tpit->second.end()); } } } boost::dynamic_bitset<> atomsInPath(mol.getNumAtoms()); boost::dynamic_bitset<> bondsInPath(mol.getNumBonds()); for(INT_PATH_LIST_MAP_CI paths=allPaths.begin();paths!=allPaths.end();++paths){ for( PATH_LIST_CI pathIt=paths->second.begin(); pathIt!=paths->second.end(); ++pathIt ){ const PATH_TYPE &path=*pathIt; #ifdef VERBOSE_FINGERPRINTING std::cerr<<"Path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr< > hashLayers(maxFingerprintLayers); for(unsigned int i=0;i bondNbrs(path.size(),0); atomsInPath.reset(); for(unsigned int i=0;igetBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); for(unsigned int j=i+1;jgetBeginAtomIdx()==bj->getBeginAtomIdx() || bi->getBeginAtomIdx()==bj->getEndAtomIdx() || bi->getEndAtomIdx()==bj->getBeginAtomIdx() || bi->getEndAtomIdx()==bj->getEndAtomIdx() ){ ++bondNbrs[i]; ++bondNbrs[j]; } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bond("<getIsAromatic() && bi->getBondType()!=Bond::SINGLE && bi->getBondType()!=Bond::AROMATIC){ bondHash = bi->getBondType(); } else { bondHash = Bond::SINGLE; } ourHash = bondHash%8; ourHash |= (bondNbrs[i]%8)<<6; hashLayers[1].push_back(ourHash); } if(layerFlags & 0x4 && !(pathQueries&0x6) ){ //std::cerr<<" consider: "<getBeginAtomIdx()<<" - " <getEndAtomIdx()<getBeginAtomIdx()]%128); a2Hash = (anums[bi->getEndAtomIdx()]%128); if(a1HashgetBeginAtomIdx()<<" - " <getEndAtomIdx()<getBeginAtomIdx()]; bool a2Hash = aromaticAtoms[bi->getEndAtomIdx()]; if((!a1Hash) && a2Hash) std::swap(a1Hash,a2Hash); ourHash = a1Hash; ourHash |= a2Hash<<1; ourHash |= (bondNbrs[i]%8)<<5; hashLayers[5].push_back(ourHash); } } unsigned int l=0; bool flaggedPath=false; for(std::vector< std::vector >::iterator layerIt=hashLayers.begin(); layerIt!=hashLayers.end();++layerIt,++l){ if(!layerIt->size()) continue; // ---- std::sort(layerIt->begin(),layerIt->end()); // finally, we will add the 
number of distinct atoms in the path at the end // of the vect. This allows us to distinguish C1CC1 from CC(C)C layerIt->push_back(atomsInPath.count()); layerIt->push_back(l+1); // hash the path to generate a seed: unsigned long seed = gboost::hash_range(layerIt->begin(),layerIt->end()); #ifdef VERBOSE_FINGERPRINTING std::cerr<<" hash: "<(seed)); unsigned int bitId=randomSource()%fpSize; #else unsigned int bitId=seed%fpSize; #endif #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bit: "<setBit(bitId); if(atomCounts && !flaggedPath){ for(unsigned int aIdx=0;aIdx0.0){ while( static_cast(res->getNumOnBits())/res->getNumBits() < tgtDensity && res->getNumBits() >= 2*minSize ){ ExplicitBitVect *tmpV=FoldFingerprint(*res,2); delete res; res = tmpV; } } } return res; } const char *pqs[]={ "[*]~[*]", "[*]~[*]~[*]", "[R]~1~[R]~[R]~1", "[*]~[*]~[*]~[*]", "[*]~[*](~[*])~[*]", "[*]~[R]~1[R]~[R]~1", "[R]~1[R]~[R]~[R]~1", "[*]~[*]~[*]~[*]~[*]", "[*]~[*]~[*](~[*])~[*]", "[*]~[R]~1[R]~[R]~1~[*]", "[R]~1~[R]~[R]~[R]~[R]~1", "[R]~1~[R]~[R]~[R]~[R]~[R]~1", #if 0 "[*]~[*](~[*])(~[*])~[*]", "[*]~[*]~[*]~[*]~[*]~[*]", "[*]~[*]~[*]~[*](~[*])~[*]", "[*]~[*]~[*](~[*])~[*]~[*]", "[*]~[*]~[*](~[*])(~[*])~[*]", "[*]~[*](~[*])~[*](~[*])~[*]", "[*]~[R]~1[R]~[R]~1(~[*])~[*]", "[*]~[R]~1[R](~[*])~[R]~1[*]", "[*]~[R]~1[R]~[R](~[*])~[R]~1", "[*]~[R]~1[R]~[R]~[R]~1[*]", "[*]~[R]~1[R]~[R]~[R]~[R]~1", "[*]~[R]~1(~[*])~[R]~[R]~[R]~1", "[*]~[*]~[*]~[*]~[*]~[*]~[*]", "[*]~[*]~[*]~[*]~[*](~[*])~[*]", "[*]~[*]~[*]~[*](~[*])~[*]~[*]", "[*]~[*]~[*]~[*](~[*])(~[*])~[*]", "[*]~[*]~[*](~[*])~[*](~[*])~[*]", "[*]~[*](~[*])~[*]~[*](~[*])~[*]", "[*]~[*](~[*])~[*](~[*])(~[*])~[*]", #endif ""}; namespace detail { void getAtomNumbers(const Atom *a,std::vector &atomNums){ atomNums.clear(); if( !a->hasQuery()){ atomNums.push_back(a->getAtomicNum()); return; } // negated things are always complex: if( a->getQuery()->getNegation()) return; std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum"){ 
atomNums.push_back(static_cast(a->getQuery())->getVal()); } else if(descr=="AtomXor"){ return; } else if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum" && ((*(childIt+1))->getDescription()=="AtomIsAliphatic" || (*(childIt+1))->getDescription()=="AtomIsAromatic") && (childIt+2)==a->getQuery()->endChildren()){ atomNums.push_back(static_cast((*childIt).get())->getVal()); return; } } else if(descr=="AtomOr"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); while(childIt !=a->getQuery()->endChildren()){ if( (*childIt)->getDescription()=="AtomAtomicNum" ){ atomNums.push_back(static_cast((*childIt).get())->getVal()); } else if((*childIt)->getDescription()=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt2=(*childIt)->beginChildren(); if( (*childIt2)->getDescription()=="AtomAtomicNum" && ((*(childIt2+1))->getDescription()=="AtomIsAliphatic" || (*(childIt2+1))->getDescription()=="AtomIsAromatic") && (childIt2+2)==(*childIt)->endChildren()){ atomNums.push_back(static_cast((*childIt2).get())->getVal()); } else { atomNums.clear(); return; } } else { atomNums.clear(); return; } ++childIt; } } return; } } // caller owns the result, it must be deleted ExplicitBitVect *LayeredFingerprintMol2(const ROMol &mol, unsigned int layerFlags, unsigned int minPath, unsigned int maxPath, unsigned int fpSize, std::vector *atomCounts, ExplicitBitVect *setOnlyBits, bool branchedPaths){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomCounts size"); PRECONDITION(!setOnlyBits || setOnlyBits->getNumBits()==fpSize,"bad setOnlyBits size"); static std::vector patts; // FIX: need a mutex here to be threadsafe if(patts.size()==0){ unsigned int idx=0; while(1){ std::string pq=pqs[idx]; if(pq=="") break; idx++; RWMol *tm; try { tm = SmartsToMol(pq); }catch (...) 
{ tm=NULL; } if(!tm) continue; patts.push_back(ROMOL_SPTR(static_cast(tm))); } } if(!mol.getRingInfo()->isInitialized()){ MolOps::findSSSR(mol); } boost::dynamic_bitset<> isQueryAtom(mol.getNumAtoms()),isQueryBond(mol.getNumBonds()); ROMol::VERTEX_ITER firstA,lastA; boost::tie(firstA,lastA) = mol.getVertices(); while(firstA!=lastA){ const Atom *at=mol[*firstA].get(); if(isComplexQuery(at)) isQueryAtom.set(at->getIdx()); ++firstA; } ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ const Bond *bond = mol[*firstB].get(); if( isComplexQuery(bond) ){ isQueryBond.set(bond->getIdx()); } ++firstB; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); unsigned int pIdx=0; BOOST_FOREACH(ROMOL_SPTR patt,patts){ ++pIdx; //if(patt->getNumBonds()getNumBonds()>maxPath){ // continue; //} std::vector matches; SubstructMatch(mol,*(patt.get()),matches,false); boost::uint32_t mIdx=pIdx+patt->getNumAtoms()+patt->getNumBonds(); #if 0 // this was an effort to tune the composition of the fingerprint, // particularly when queries are used. 
It hasn't proved successful BOOST_FOREACH(MatchVectType &mv,matches){ // collect bits counting the number of occurances of the pattern: gboost::hash_combine(mIdx,0xBEEF); res->setBit(mIdx%fpSize); bool isQuery=false; boost::uint32_t bitId=pIdx; std::vector amap(mv.size(),0); BOOST_FOREACH(MatchVectType::value_type &p,mv){ if(isQueryAtom[p.second]){ isQuery=true; break; } gboost::hash_combine(bitId,mol.getAtomWithIdx(p.second)->getAtomicNum()); amap[p.first]=p.second; } if(!isQuery) res->setBit(bitId%(fpSize/2)); isQuery=false; bitId=pIdx; ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = patt->getEdges(); while(firstB!=lastB){ BOND_SPTR pbond = (*patt.get())[*firstB]; ++firstB; if(isQueryBond[pbond->getIdx()]){ isQuery=true; break; } const Bond *mbond=mol.getBondBetweenAtoms(amap[pbond->getBeginAtomIdx()], amap[pbond->getEndAtomIdx()]); gboost::hash_combine(bitId,(boost::uint32_t)mbond->getBondType()); } if(!isQuery) res->setBit((fpSize/2) + bitId%(fpSize/2)); } #else BOOST_FOREACH(MatchVectType &mv,matches){ // collect bits counting the number of occurances of the pattern: gboost::hash_combine(mIdx,0xBEEF); res->setBit(mIdx%fpSize); bool isQuery=false; boost::uint32_t bitId=pIdx; std::vector amap(mv.size(),0); BOOST_FOREACH(MatchVectType::value_type &p,mv){ if(isQueryAtom[p.second]){ isQuery=true; break; } gboost::hash_combine(bitId,mol.getAtomWithIdx(p.second)->getAtomicNum()); amap[p.first]=p.second; } if(isQuery) continue; ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = patt->getEdges(); while(firstB!=lastB){ BOND_SPTR pbond = (*patt.get())[*firstB]; ++firstB; if(isQueryBond[pbond->getIdx()]){ isQuery=true; break; } const Bond *mbond=mol.getBondBetweenAtoms(amap[pbond->getBeginAtomIdx()], amap[pbond->getEndAtomIdx()]); gboost::hash_combine(bitId,(boost::uint32_t)mbond->getBondType()); } if(!isQuery) res->setBit(bitId%fpSize); } #endif } return res; } }