// $Id$ // // Copyright (C) 2003-2013 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include "Fingerprints.h" #include #include #include #include #include #include #include #include #include #include #include #include //#define VERBOSE_FINGERPRINTING 1 //#define REPORT_FP_STATS 1 #ifdef REPORT_FP_STATS #include #endif namespace RDKit{ namespace Fingerprints { namespace detail { bool isComplexQuery(const Bond *b){ if( !b->hasQuery()) return false; // negated things are always complex: if( b->getQuery()->getNegation()) return true; std::string descr=b->getQuery()->getDescription(); if(descr=="BondOrder") return false; if(descr=="BondAnd" || descr=="BondXor") return true; if(descr=="BondOr") { // detect the types of queries that appear for unspecified bonds in SMARTS: if(b->getQuery()->endChildren()-b->getQuery()->beginChildren()==2){ for(Bond::QUERYBOND_QUERY::CHILD_VECT_CI child=b->getQuery()->beginChildren(); child!=b->getQuery()->endChildren();++child){ if((*child)->getDescription()!="BondOrder" || (*child)->getNegation()) return true; if(static_cast(child->get())->getVal()!=Bond::SINGLE && static_cast(child->get())->getVal()!=Bond::AROMATIC) return true; return false; } } } return true; } bool isComplexQuery(const Atom *a){ if( !a->hasQuery()) return false; // negated things are always complex: if( a->getQuery()->getNegation()) return true; std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum") return false; if(descr=="AtomOr" || descr=="AtomXor") return true; if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum" && ((*(childIt+1))->getDescription()=="AtomIsAliphatic" || (*(childIt+1))->getDescription()=="AtomIsAromatic") && (childIt+2)==a->getQuery()->endChildren()){ return false; } return true; } return true; } bool isAtomAromatic(const Atom *a){ bool res=false; if( !a->hasQuery()){ res=a->getIsAromatic(); } else { std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum"){ res = a->getIsAromatic(); } else if(descr=="AtomIsAromatic") { res=true; if( a->getQuery()->getNegation()) res = !res; } else if(descr=="AtomIsAliphatic") { res=false; if( a->getQuery()->getNegation()) res = !res; } else if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum"){ if( a->getQuery()->getNegation()){ res = false; } else if((*(childIt+1))->getDescription()=="AtomIsAliphatic"){ res=false; } else if((*(childIt+1))->getDescription()=="AtomIsAromatic") { res=true; } } } } return res; } } //end of detail namespace } // end of Fingerprint namespace namespace { uint32_t hashBond(const Bond *bnd,const std::vector &atomInvariants, const std::vector &atomDegrees,uint32_t bondDegree, bool useBondOrder){ PRECONDITION(bnd,"bad bond"); uint32_t res; if(useBondOrder) { if(bnd->getIsAromatic()){ res = Bond::AROMATIC; } else { res=bnd->getBondType(); } } else { res = 1; } uint32_t iv1=atomInvariants[bnd->getBeginAtomIdx()]; uint32_t iv2=atomInvariants[bnd->getEndAtomIdx()]; uint32_t deg1=atomDegrees[bnd->getBeginAtomIdx()]; uint32_t deg2=atomDegrees[bnd->getEndAtomIdx()]; if(iv1>iv2){ std::swap(iv1,iv2); std::swap(deg1,deg2); } else if(iv1==iv2){ if(deg1>deg2){ std::swap(deg1,deg2); } } res = (res%8) | (iv1%128)<<3 | (iv2%128)<<10 | (deg1%8)<<17 | (deg2%8)<<20 | (bondDegree%8)<<23 ; //std::cerr<<"---->("<getIdx()<<") "<getBeginAtomIdx()<<"-"<getEndAtomIdx()<<" "< &bondCache, const std::vector &bondHashes){ std::deque< std::pair > > stack; uint32_t best; //std::cerr<<" hash: "; //std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); for(unsigned int i=0;igetBeginAtomIdx()<<"-"<getEndAtomIdx()<<" "< bs(mol.getNumBonds()); bs.set(path[i]); stack.push_back(std::make_pair(i,bs)); best=bondHashes[i]; } else { if(bondHashes[i]<=best){ if(bondHashes[i] bs(mol.getNumBonds()); bs.set(path[i]); stack.push_back(std::make_pair(i,bs)); } } } //std::cerr<::max(); std::deque< std::pair > > newStack; while(!stack.empty()){ // assumption: each element of the stack corresponds to // the last point of a traversal of the path // res has been updated with all elements already traversed unsigned int i; boost::dynamic_bitset<> bondsThere; boost::tie(i,bondsThere)=stack.front(); //std::cerr<<" "<best) continue; if(obnd->getBeginAtomIdx()==bnd->getBeginAtomIdx() || obnd->getBeginAtomIdx()==bnd->getEndAtomIdx() || obnd->getEndAtomIdx()==bnd->getBeginAtomIdx() || obnd->getEndAtomIdx()==bnd->getEndAtomIdx() ){ // it's a neighbor and the hash is at least as good as what we've seen so far if(bondHashes[j] bs(bondsThere); bs.set(path[j]); newStack.push_back(std::make_pair(j,bs)); //std::cerr<<" "<::max(); newStack.clear(); } } gboost::hash_combine(res,path.size()); return res; } } // end of anonymous namespace // caller owns the result, it must be deleted ExplicitBitVect *RDKFingerprintMol(const ROMol &mol,unsigned int minPath, unsigned int maxPath, unsigned int fpSize,unsigned int nBitsPerHash, bool useHs, double tgtDensity,unsigned int minSize, bool branchedPaths, bool useBondOrder, std::vector *atomInvariants, const std::vector *fromAtoms, std::vector > *atomBits ){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomInvariants size"); PRECONDITION(!atomBits||atomBits->size()>=mol.getNumAtoms(),"bad atomBits size"); // create a mersenne twister with customized parameters. // The standard parameters (used to create boost::mt19937) // result in an RNG that's much too computationally intensive // to seed. typedef boost::random::mersenne_twister rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; rng_type generator(42u); // // if we generate arbitrarily sized ints then mod them down to the // appropriate size, we can guarantee that a fingerprint of // size x has the same bits set as one of size 2x that's been folded // in half. This is a nice guarantee to have. // distrib_type dist(0,INT_MAX); source_type randomSource(generator,dist); // build default atom invariants if need be: std::vector lAtomInvariants; if(!atomInvariants){ lAtomInvariants.reserve(mol.getNumAtoms()); for(ROMol::ConstAtomIterator atomIt=mol.beginAtoms(); atomIt!=mol.endAtoms(); ++atomIt){ unsigned int aHash = ((*atomIt)->getAtomicNum()%128)<<1 | (*atomIt)->getIsAromatic(); lAtomInvariants.push_back(aHash); } atomInvariants=&lAtomInvariants; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); INT_PATH_LIST_MAP allPaths; if(!fromAtoms){ if(branchedPaths){ allPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, useHs); } else { allPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, useHs); } } else { BOOST_FOREACH(boost::uint32_t aidx,*fromAtoms){ INT_PATH_LIST_MAP tPaths; if(branchedPaths){ tPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, useHs,aidx); } else { tPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, true,useHs,aidx); } for(INT_PATH_LIST_MAP::const_iterator tpit=tPaths.begin(); tpit!=tPaths.end();++tpit){ #ifdef VERBOSE_FINGERPRINTING std::cerr<<"paths from "<first<second){ std::cerr<<" path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr<first].insert(allPaths[tpit->first].begin(), tpit->second.begin(),tpit->second.end()); } } } std::vector bondInvariants(mol.getNumBonds()); std::vector bondCache; bondCache.resize(mol.getNumBonds()); std::vector isQueryBond(mol.getNumBonds(),0); ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ const Bond *bond = mol[*firstB].get(); isQueryBond[bond->getIdx()] = 0x0; bondCache[bond->getIdx()]=bond; if(Fingerprints::detail::isComplexQuery(bond)){ isQueryBond[bond->getIdx()] = 0x1; } if(Fingerprints::detail::isComplexQuery(bond->getBeginAtom())){ isQueryBond[bond->getIdx()] |= 0x2; } if(Fingerprints::detail::isComplexQuery(bond->getEndAtom())){ isQueryBond[bond->getIdx()] |= 0x4; } ++firstB; } if(atomBits){ for(unsigned int i=0;ifirst<<" "<second.size()< > bitSmiles; #endif boost::dynamic_bitset<> atomsInPath(mol.getNumAtoms()); for(INT_PATH_LIST_MAP_CI paths=allPaths.begin();paths!=allPaths.end();paths++){ BOOST_FOREACH(const PATH_TYPE &path,paths->second){ #ifdef REPORT_FP_STATS std::vector atomsToUse; #endif #ifdef VERBOSE_FINGERPRINTING std::cerr<<"Path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr< atomDegrees(mol.getNumAtoms(),0); for(unsigned int i=0;igetBeginAtomIdx()]++; atomDegrees[bi->getEndAtomIdx()]++; atomsInPath.set(bi->getBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); if(isQueryBond[path[i]]) queryInPath=true; } if(queryInPath) continue; // ----------------- // calculate the bond hashes: std::vector bondNbrs(path.size(),0); std::vector bondHashes; bondHashes.reserve(path.size()+1); for(unsigned int i=0;igetBeginAtomIdx())==atomsToUse.end()){ atomsToUse.push_back(bi->getBeginAtomIdx()); } if(std::find(atomsToUse.begin(),atomsToUse.end(),bi->getEndAtomIdx())==atomsToUse.end()){ atomsToUse.push_back(bi->getEndAtomIdx()); } #endif for(unsigned int j=i+1;jgetBeginAtomIdx()==bj->getBeginAtomIdx() || bi->getBeginAtomIdx()==bj->getEndAtomIdx() || bi->getEndAtomIdx()==bj->getBeginAtomIdx() || bi->getEndAtomIdx()==bj->getEndAtomIdx() ){ ++bondNbrs[i]; ++bondNbrs[j]; } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bond("<getBeginAtomIdx()]; unsigned int a2Hash = (*atomInvariants)[bi->getEndAtomIdx()]; unsigned int deg1=atomDegrees[bi->getBeginAtomIdx()]; unsigned int deg2=atomDegrees[bi->getEndAtomIdx()]; if(a1HashgetIsAromatic() || bi->getBondType()==Bond::AROMATIC){ // makes sure aromatic bonds always hash as aromatic bondHash = Bond::AROMATIC; } else { bondHash = bi->getBondType(); } } boost::uint32_t nBitsInHash=0; // boost::uint32_t ourHash=bondNbrs[i]%8; // 3 bits here // nBitsInHash+=3; // ourHash |= (bondHash%16)<getIdx()<<" "< "<1){ std::sort(bondHashes.begin(),bondHashes.end()); // finally, we will add the number of distinct atoms in the path at the end // of the vect. This allows us to distinguish C1CC1 from CC(C)C bondHashes.push_back(atomsInPath.count()); seed= gboost::hash_range(bondHashes.begin(),bondHashes.end()); } else { seed = bondHashes[0]; } #else if(atomBits){ atomsInPath.reset(); for(unsigned int i=0;igetBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); } } std::vector bondInvariants(path.size()); std::vector bondDegrees(path.size(),0); std::vector atomDegrees(mol.getNumAtoms(),0); for(unsigned int i=0;igetBeginAtomIdx()]++; atomDegrees[bi->getEndAtomIdx()]++; for(unsigned int j=i;jgetBeginAtomIdx()==bj->getBeginAtomIdx()|| bi->getBeginAtomIdx()==bj->getEndAtomIdx()|| bi->getEndAtomIdx()==bj->getBeginAtomIdx()|| bi->getEndAtomIdx()==bj->getEndAtomIdx()){ bondDegrees[i]++; bondDegrees[j]++; } } #ifdef REPORT_FP_STATS if(std::find(atomsToUse.begin(),atomsToUse.end(),bi->getBeginAtomIdx())==atomsToUse.end()){ atomsToUse.push_back(bi->getBeginAtomIdx()); } if(std::find(atomsToUse.begin(),atomsToUse.end(),bi->getEndAtomIdx())==atomsToUse.end()){ atomsToUse.push_back(bi->getEndAtomIdx()); } #endif } for(unsigned int i=0;i(std::cerr,", ")); // std::cerr<<" || "; // std::copy(atomsToUse.begin(),atomsToUse.end(),std::ostream_iterator(std::cerr,", ")); // std::cerr<1){ // std::cerr<<" DUPE: "<(std::cerr,", ")); // std::cerr<<" || "; // std::copy(atomsToUse.begin(),atomsToUse.end(),std::ostream_iterator(std::cerr,", ")); // std::cerr<setBit(bit); if(atomBits){ boost::dynamic_bitset<>::size_type aIdx=atomsInPath.find_first(); while(aIdx!=boost::dynamic_bitset<>::npos){ if(std::find((*atomBits)[aIdx].begin(),(*atomBits)[aIdx].end(),bit)==(*atomBits)[aIdx].end()){ (*atomBits)[aIdx].push_back(bit); } aIdx = atomsInPath.find_next(aIdx); } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bit: "<<0<<" "<1){ generator.seed(static_cast(seed)); for(unsigned int i=1;isetBit(bit); if(atomBits){ boost::dynamic_bitset<>::size_type aIdx=atomsInPath.find_first(); while(aIdx!=boost::dynamic_bitset<>::npos){ if(std::find((*atomBits)[aIdx].begin(),(*atomBits)[aIdx].end(),bit)==(*atomBits)[aIdx].end()){ (*atomBits)[aIdx].push_back(bit); } aIdx = atomsInPath.find_next(aIdx); } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bit: "<0.0){ while( static_cast(res->getNumOnBits())/res->getNumBits() < tgtDensity && res->getNumBits() >= 2*minSize ){ ExplicitBitVect *tmpV=FoldFingerprint(*res,2); delete res; res = tmpV; } } #ifdef REPORT_FP_STATS std::cerr<<"BIT STATS"<size()){ for(unsigned int i=0;i1)){ std::cerr< *atomCounts, ExplicitBitVect *setOnlyBits, bool branchedPaths, const std::vector *fromAtoms ){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomCounts size"); PRECONDITION(!setOnlyBits || setOnlyBits->getNumBits()==fpSize,"bad setOnlyBits size"); if(!mol.getRingInfo()->isInitialized()){ MolOps::findSSSR(mol); } std::vector bondCache; bondCache.resize(mol.getNumBonds()); std::vector isQueryBond(mol.getNumBonds(),0); ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ const Bond *bond = mol[*firstB].get(); isQueryBond[bond->getIdx()] = 0x0; bondCache[bond->getIdx()]=bond; if(Fingerprints::detail::isComplexQuery(bond)){ isQueryBond[bond->getIdx()] = 0x1; } if(Fingerprints::detail::isComplexQuery(bond->getBeginAtom())){ isQueryBond[bond->getIdx()] |= 0x2; } if(Fingerprints::detail::isComplexQuery(bond->getEndAtom())){ isQueryBond[bond->getIdx()] |= 0x4; } ++firstB; } std::vector aromaticAtoms(mol.getNumAtoms(),false); std::vector anums(mol.getNumAtoms(),0); ROMol::VERTEX_ITER firstA,lastA; boost::tie(firstA,lastA) = mol.getVertices(); while(firstA!=lastA){ const Atom *atom = mol[*firstA].get(); if(Fingerprints::detail::isAtomAromatic(atom)) aromaticAtoms[atom->getIdx()]=true; anums[atom->getIdx()]=atom->getAtomicNum(); ++firstA; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); INT_PATH_LIST_MAP allPaths; if(!fromAtoms){ if(branchedPaths){ allPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath,false); } else { allPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath,false); } } else { BOOST_FOREACH(boost::uint32_t aidx,*fromAtoms){ INT_PATH_LIST_MAP tPaths; if(branchedPaths){ tPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, false,aidx); } else { tPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, true,false,aidx); } for(INT_PATH_LIST_MAP::const_iterator tpit=tPaths.begin(); tpit!=tPaths.end();++tpit){ allPaths[tpit->first].insert(allPaths[tpit->first].begin(), tpit->second.begin(),tpit->second.end()); } } } boost::dynamic_bitset<> atomsInPath(mol.getNumAtoms()); boost::dynamic_bitset<> bondsInPath(mol.getNumBonds()); for(INT_PATH_LIST_MAP_CI paths=allPaths.begin();paths!=allPaths.end();++paths){ for( PATH_LIST_CI pathIt=paths->second.begin(); pathIt!=paths->second.end(); ++pathIt ){ const PATH_TYPE &path=*pathIt; #ifdef VERBOSE_FINGERPRINTING std::cerr<<"Path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr< > hashLayers(maxFingerprintLayers); for(unsigned int i=0;i bondNbrs(path.size(),0); atomsInPath.reset(); std::vector atomDegrees(mol.getNumAtoms(),0); for(unsigned int i=0;igetBeginAtomIdx()]++; atomDegrees[bi->getEndAtomIdx()]++; atomsInPath.set(bi->getBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); } for(unsigned int i=0;igetBeginAtomIdx()==bj->getBeginAtomIdx() || bi->getBeginAtomIdx()==bj->getEndAtomIdx() || bi->getEndAtomIdx()==bj->getBeginAtomIdx() || bi->getEndAtomIdx()==bj->getEndAtomIdx() ){ ++bondNbrs[i]; ++bondNbrs[j]; } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bond("<getBeginAtomIdx()]; a2Deg = atomDegrees[bi->getEndAtomIdx()]; if(a1DeggetIsAromatic() && bi->getBondType()!=Bond::SINGLE && bi->getBondType()!=Bond::AROMATIC){ bondHash = bi->getBondType(); } else { bondHash = Bond::SINGLE; } unsigned int a1Deg,a2Deg; a1Deg = atomDegrees[bi->getBeginAtomIdx()]; a2Deg = atomDegrees[bi->getEndAtomIdx()]; if(a1DeggetBeginAtomIdx()<<" - " <getEndAtomIdx()<getBeginAtomIdx()]%128); a2Hash = (anums[bi->getEndAtomIdx()]%128); unsigned int a1Deg,a2Deg; a1Deg = atomDegrees[bi->getBeginAtomIdx()]; a2Deg = atomDegrees[bi->getEndAtomIdx()]; if(a1HashgetBeginAtomIdx()<<" - " <getEndAtomIdx()<getBeginAtomIdx()]; bool a2Hash = aromaticAtoms[bi->getEndAtomIdx()]; if((!a1Hash) && a2Hash) std::swap(a1Hash,a2Hash); ourHash = a1Hash; ourHash |= a2Hash<<1; ourHash |= (bondNbrs[i]%8)<<5; hashLayers[5].push_back(ourHash); } } unsigned int l=0; bool flaggedPath=false; for(std::vector< std::vector >::iterator layerIt=hashLayers.begin(); layerIt!=hashLayers.end();++layerIt,++l){ if(!layerIt->size()) continue; // ---- std::sort(layerIt->begin(),layerIt->end()); // finally, we will add the number of distinct atoms in the path at the end // of the vect. This allows us to distinguish C1CC1 from CC(C)C layerIt->push_back(atomsInPath.count()); layerIt->push_back(l+1); // hash the path to generate a seed: unsigned long seed = gboost::hash_range(layerIt->begin(),layerIt->end()); #ifdef VERBOSE_FINGERPRINTING std::cerr<<" hash: "<setBit(bitId); if(atomCounts && !flaggedPath){ for(unsigned int aIdx=0;aIdx