// NOTE(review): this copy of the file appears to be damaged. The original line
// breaks are gone, so each `//` comment now runs to the end of its physical
// line and hides the code that follows it. Several spans that would sit between
// '<' and '>' are also absent: the #include directives have no targets,
// static_cast / std::vector / typedef lines have no type arguments, and some
// PRECONDITION checks and loop headers break off mid-expression. Restore the
// file from version control before building. The notes added here describe only
// what is still visible in the text.
//
// The next physical line holds the license header, the (empty) #include list,
// the commented-out LAYEREDFP_USE_MT define, and two helpers in an anonymous
// namespace inside namespace RDKit:
//  - isComplexQuery(const Bond *b): returns false when the bond has no query or
//    the query description is "BondOrder"; returns true when the query is
//    negated or its description is "BondAnd" / "BondXor". A "BondOr" with
//    exactly two children is inspected child by child: a child that is not a
//    non-negated "BondOrder", or whose value is neither Bond::SINGLE nor
//    Bond::AROMATIC, gives true. Anything else falls through to true.
//    NOTE(review): the `return false` is inside the for-loop body, so the loop
//    exits after the first child and the second child is never examined -
//    confirm whether that is intended.
//  - isComplexQuery(const Atom *a): returns false when the atom has no query or
//    the description is "AtomAtomicNum"; returns true when the query is negated
//    or the description is "AtomOr" / "AtomXor". For "AtomAnd" it returns false
//    only when the children are exactly "AtomAtomicNum" followed by
//    "AtomIsAliphatic" or "AtomIsAromatic"; every other case returns true.
//    NOTE(review): the first and second children are dereferenced before the
//    child count is compared, so if an "AtomAnd" query can have fewer than two
//    children this reads past the end of the child list - confirm.
// $Id$ // // Copyright (C) 2003-2010 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include "Fingerprints.h" #include #include #include #include #include #include #include #include #include #include #include #include //#define LAYEREDFP_USE_MT namespace RDKit{ namespace { bool isComplexQuery(const Bond *b){ if( !b->hasQuery()) return false; // negated things are always complex: if( b->getQuery()->getNegation()) return true; std::string descr=b->getQuery()->getDescription(); if(descr=="BondOrder") return false; if(descr=="BondAnd" || descr=="BondXor") return true; if(descr=="BondOr") { // detect the types of queries that appear for unspecified bonds in SMARTS: if(b->getQuery()->endChildren()-b->getQuery()->beginChildren()==2){ for(Bond::QUERYBOND_QUERY::CHILD_VECT_CI child=b->getQuery()->beginChildren(); child!=b->getQuery()->endChildren();++child){ if((*child)->getDescription()!="BondOrder" || (*child)->getNegation()) return true; if(static_cast(child->get())->getVal()!=Bond::SINGLE && static_cast(child->get())->getVal()!=Bond::AROMATIC) return true; return false; } } } return true; } bool isComplexQuery(const Atom *a){ if( !a->hasQuery()) return false; // negated things are always complex: if( a->getQuery()->getNegation()) return true; std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum") return false; if(descr=="AtomOr" || descr=="AtomXor") return true; if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum" && ((*(childIt+1))->getDescription()=="AtomIsAliphatic" || (*(childIt+1))->getDescription()=="AtomIsAromatic") && (childIt+2)==a->getQuery()->endChildren()){ return false; } return true; } return true; 
// The next physical line closes isComplexQuery(const Atom *), then holds
// isAtomAromatic() and the opening of RDKFingerprintMol():
//  - isAtomAromatic(const Atom *a): for an atom without a query, or with an
//    "AtomAtomicNum" query, returns a->getIsAromatic(). "AtomIsAromatic" gives
//    true and "AtomIsAliphatic" gives false, each inverted when the query is
//    negated. For an "AtomAnd" whose first child is "AtomAtomicNum" the result
//    is false when the query is negated, otherwise false / true when the second
//    child is "AtomIsAliphatic" / "AtomIsAromatic". All remaining cases return
//    false.
//    NOTE(review): the "AtomAnd" branch dereferences the first and second
//    children without checking how many children exist - confirm.
//  - RDKFingerprintMol(): returns a new ExplicitBitVect that the caller must
//    delete. It asserts minPath!=0 and maxPath>=minPath (the text of the
//    following checks is partly lost; a message "bad atomInvariants size"
//    survives) and creates a boost::mt19937 generator constructed with 42u.
} bool isAtomAromatic(const Atom *a){ bool res=false; if( !a->hasQuery()){ res=a->getIsAromatic(); } else { std::string descr=a->getQuery()->getDescription(); if(descr=="AtomAtomicNum"){ res = a->getIsAromatic(); } else if(descr=="AtomIsAromatic") { res=true; if( a->getQuery()->getNegation()) res = !res; } else if(descr=="AtomIsAliphatic") { res=false; if( a->getQuery()->getNegation()) res = !res; } else if(descr=="AtomAnd"){ Queries::Query::CHILD_VECT_CI childIt=a->getQuery()->beginChildren(); if( (*childIt)->getDescription()=="AtomAtomicNum"){ if( a->getQuery()->getNegation()){ res = false; } else if((*(childIt+1))->getDescription()=="AtomIsAliphatic"){ res=false; } else if((*(childIt+1))->getDescription()=="AtomIsAromatic") { res=true; } } } } return res; } } // end of anonymous namespace // caller owns the result, it must be deleted ExplicitBitVect *RDKFingerprintMol(const ROMol &mol,unsigned int minPath, unsigned int maxPath, unsigned int fpSize,unsigned int nBitsPerHash, bool useHs, double tgtDensity,unsigned int minSize, bool branchedPaths, bool useBondOrder, std::vector *atomInvariants){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomInvariants size"); typedef boost::mt19937 rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; rng_type generator(42u); // // if we generate arbitrarily sized ints then mod them down to the // appropriate size, we can guarantee that a fingerprint of // size x has the same bits set as one of size 2x that's been folded // in half. This is a nice guarantee to have. 
// Continuation of RDKFingerprintMol(). Visible steps on the next physical line:
//  - the random source draws from uniform_int(0,INT_MAX);
//  - when atomInvariants is null, a local vector is filled with
//    ((atomic number % 128) << 1) | aromatic flag for every atom and used
//    instead;
//  - the result vector is allocated with fpSize bits;
//  - paths come from findAllSubgraphsOfLengthsMtoN() when branchedPaths is set,
//    otherwise from findAllPathsOfLengthsMtoN();
//  - bondCache maps each bond index to its Bond pointer;
//  - for each path, shared-atom neighbour counts (bondNbrs) and the set of atoms
//    touched (atomsInPath) are accumulated.
// NOTE(review): the headers of the two bond loops and part of the per-bond hash
// setup are among the spans that are missing from this line.
// distrib_type dist(0,INT_MAX); source_type randomSource(generator,dist); // build default aotm invariants if need be: std::vector lAtomInvariants; if(!atomInvariants){ lAtomInvariants.reserve(mol.getNumAtoms()); for(ROMol::ConstAtomIterator atomIt=mol.beginAtoms(); atomIt!=mol.endAtoms(); ++atomIt){ unsigned int aHash = ((*atomIt)->getAtomicNum()%128)<<1 | (*atomIt)->getIsAromatic(); lAtomInvariants.push_back(aHash); } atomInvariants=&lAtomInvariants; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); INT_PATH_LIST_MAP allPaths; if(branchedPaths){ allPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath, useHs); } else { allPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath, useHs); } std::vector bondCache; bondCache.resize(mol.getNumBonds()); ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ BOND_SPTR bond = mol[*firstB]; bondCache[bond->getIdx()]=bond.get(); ++firstB; } boost::dynamic_bitset<> atomsInPath(mol.getNumAtoms()); for(INT_PATH_LIST_MAP_CI paths=allPaths.begin();paths!=allPaths.end();paths++){ for( PATH_LIST_CI pathIt=paths->second.begin(); pathIt!=paths->second.end(); pathIt++ ){ const PATH_TYPE &path=*pathIt; #ifdef VERBOSE_FINGERPRINTING std::cerr<<"Path: "; std::copy(path.begin(),path.end(),std::ostream_iterator(std::cerr,", ")); std::cerr< bondNbrs(path.size()); std::fill(bondNbrs.begin(),bondNbrs.end(),0); atomsInPath.reset(); std::vector bondHashes; bondHashes.reserve(path.size()+1); for(unsigned int i=0;igetBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); for(unsigned int j=i+1;jgetBeginAtomIdx()==bj->getBeginAtomIdx() || bi->getBeginAtomIdx()==bj->getEndAtomIdx() || bi->getEndAtomIdx()==bj->getBeginAtomIdx() || bi->getEndAtomIdx()==bj->getEndAtomIdx() ){ ++bondNbrs[i]; ++bondNbrs[j]; } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bond("<getBeginAtomIdx()]; a2Hash = (*atomInvariants)[bi->getEndAtomIdx()]; if(a1HashgetIsAromatic()){ // makes sure aromatic bonds always hash the same: 
// End of RDKFingerprintMol() and opening of LayeredFingerprintMol() on the next
// physical line:
//  - the per-bond hash starts with bondNbrs[i]%8 in its low 3 bits and then ORs
//    in bondHash%16.
//    NOTE(review): the shift amounts and the atom-hash terms that follow are
//    missing from this copy; do not re-derive them by guesswork, because they
//    determine which bits are set.
//  - the generator is re-seeded per path and res->setBit(bit) is called in a
//    loop (the loop bound is not visible here);
//  - the folding loop halves the vector with FoldFingerprint(*res,2) while the
//    on-bit density is below tgtDensity and getNumBits() >= 2*minSize; the old
//    vector is deleted on each fold.
//  - LayeredFingerprintMol(): returns a new ExplicitBitVect owned by the caller.
//    It asserts minPath!=0, maxPath>=minPath, and that setOnlyBits (when given)
//    has exactly fpSize bits; it calls MolOps::findSSSR(mol) when the ring info
//    is not initialized. The mersenne-twister code is compiled only when
//    LAYEREDFP_USE_MT is defined (its #define near the top is commented out).
bondHash = Bond::AROMATIC; } else { bondHash = bi->getBondType(); } } boost::uint32_t nBitsInHash=0; boost::uint32_t ourHash=bondNbrs[i]%8; // 3 bits here nBitsInHash+=3; ourHash |= (bondHash%16)<(seed)); for(unsigned int i=0;isetBit(bit); #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bit: "<0.0){ while( static_cast(res->getNumOnBits())/res->getNumBits() < tgtDensity && res->getNumBits() >= 2*minSize ){ ExplicitBitVect *tmpV=FoldFingerprint(*res,2); delete res; res = tmpV; } } return res; } // caller owns the result, it must be deleted ExplicitBitVect *LayeredFingerprintMol(const ROMol &mol, unsigned int layerFlags, unsigned int minPath, unsigned int maxPath, unsigned int fpSize, double tgtDensity,unsigned int minSize, std::vector *atomCounts, ExplicitBitVect *setOnlyBits, bool branchedPaths){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomCounts size"); PRECONDITION(!setOnlyBits || setOnlyBits->getNumBits()==fpSize,"bad setOnlyBits size"); if(!mol.getRingInfo()->isInitialized()){ MolOps::findSSSR(mol); } #ifdef LAYEREDFP_USE_MT // create a mersenne twister with customized parameters. // The standard parameters (used to create boost::mt19937) // result in an RNG that's much too computationally intensive // to seed. typedef boost::random::mersenne_twister rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; rng_type generator(42u); // // if we generate arbitrarily sized ints then mod them down to the // appropriate size, we can guarantee that a fingerprint of // size x has the same bits set as one of size 2x that's been folded // in half. This is a nice guarantee to have. 
// Continuation of LayeredFingerprintMol(). Visible steps on the next physical
// line:
//  - isQueryBond[idx] is a per-bond flag word: 0x1 when the bond itself is a
//    complex query, 0x2 when its begin atom is, 0x4 when its end atom is (all
//    decided by isComplexQuery());
//  - aromaticAtoms[idx] is set to true for atoms where isAtomAromatic() is true;
//  - paths come from findAllSubgraphsOfLengthsMtoN() or
//    findAllPathsOfLengthsMtoN() depending on branchedPaths (called here without
//    a useHs argument);
//  - for every path a vector of per-layer hash lists (hashLayers, sized
//    maxFingerprintLayers) is built, and bondNbrs / atomsInPath are accumulated.
//  - in the bond-order layer a bond contributes its own bond type only when it
//    is not aromatic and its type is neither SINGLE nor AROMATIC.
// NOTE(review): the layer-reservation loop, the pathQueries computation and the
// conditions guarding the first layers are among the missing spans.
// distrib_type dist(0,INT_MAX); source_type randomSource(generator,dist); #endif std::vector bondCache; bondCache.resize(mol.getNumBonds()); std::vector isQueryBond(mol.getNumBonds(),0); ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = mol.getEdges(); while(firstB!=lastB){ const Bond *bond = mol[*firstB].get(); isQueryBond[bond->getIdx()] = 0x0; bondCache[bond->getIdx()]=bond; if(isComplexQuery(bond)){ isQueryBond[bond->getIdx()] = 0x1; } if(isComplexQuery(bond->getBeginAtom())){ isQueryBond[bond->getIdx()] |= 0x2; } if(isComplexQuery(bond->getEndAtom())){ isQueryBond[bond->getIdx()] |= 0x4; } ++firstB; } std::vector aromaticAtoms(mol.getNumAtoms(),false); ROMol::VERTEX_ITER firstA,lastA; boost::tie(firstA,lastA) = mol.getVertices(); while(firstA!=lastA){ const Atom *atom = mol[*firstA].get(); if(isAtomAromatic(atom)) aromaticAtoms[atom->getIdx()]=true; ++firstA; } ExplicitBitVect *res = new ExplicitBitVect(fpSize); INT_PATH_LIST_MAP allPaths; if(branchedPaths){ allPaths = findAllSubgraphsOfLengthsMtoN(mol,minPath,maxPath); } else { allPaths = findAllPathsOfLengthsMtoN(mol,minPath,maxPath); } boost::dynamic_bitset<> atomsInPath(mol.getNumAtoms()); for(INT_PATH_LIST_MAP_CI paths=allPaths.begin();paths!=allPaths.end();++paths){ for( PATH_LIST_CI pathIt=paths->second.begin(); pathIt!=paths->second.end(); ++pathIt ){ const PATH_TYPE &path=*pathIt; std::vector< std::vector > hashLayers(maxFingerprintLayers); for(unsigned int i=0;i bondNbrs(path.size(),0); atomsInPath.reset(); for(unsigned int i=0;igetBeginAtomIdx()); atomsInPath.set(bi->getEndAtomIdx()); for(unsigned int j=i+1;jgetBeginAtomIdx()==bj->getBeginAtomIdx() || bi->getBeginAtomIdx()==bj->getEndAtomIdx() || bi->getEndAtomIdx()==bj->getBeginAtomIdx() || bi->getEndAtomIdx()==bj->getEndAtomIdx() ){ ++bondNbrs[i]; ++bondNbrs[j]; } } #ifdef VERBOSE_FINGERPRINTING std::cerr<<" bond("<getIsAromatic() && bi->getBondType()!=Bond::SINGLE && bi->getBondType()!=Bond::AROMATIC){ bondHash = bi->getBondType(); } 
// End of LayeredFingerprintMol() and start of the pqs table on the next
// physical line:
//  - hashLayers[1] entries hold bondHash%8 with (bondNbrs[i]%8) shifted left
//    by 6;
//  - the atom-type layer is used only when (layerFlags & 0x4) is set and
//    pathQueries has neither 0x2 nor 0x4 set;
//  - hashLayers[5] entries hold the two aromatic flags (ordered so a true flag
//    comes first) plus (bondNbrs[i]%8) shifted left by 5;
//  - each non-empty layer is sorted, then atomsInPath.count() and the 1-based
//    layer number are appended before gboost::hash_range() produces the seed;
//  - without LAYEREDFP_USE_MT the bit index is seed%fpSize; the bit is set only
//    when setOnlyBits is null or has that bit set.
//  - pqs is an array of pattern strings terminated by an empty string; the
//    entries are passed to SmartsToMol() further down.
// NOTE(review): the middle layers (between the atom-type layer and
// hashLayers[5]) and the atomCounts update loop are among the missing spans,
// and the placement of the density-folding block relative to the path loops
// cannot be determined from this copy.
else { bondHash = Bond::SINGLE; } ourHash = bondHash%8; ourHash |= (bondNbrs[i]%8)<<6; hashLayers[1].push_back(ourHash); } if(layerFlags & 0x4 && !(pathQueries&0x6) ){ //std::cerr<<" consider: "<getBeginAtomIdx()<<" - " <getEndAtomIdx()<getBeginAtom()->getAtomicNum()%128); a2Hash = (bi->getEndAtom()->getAtomicNum()%128); if(a1HashgetBeginAtomIdx()<<" - " <getEndAtomIdx()<getBeginAtom()->getIdx()]; bool a2Hash = aromaticAtoms[bi->getEndAtom()->getIdx()]; if((!a1Hash) && a2Hash) std::swap(a1Hash,a2Hash); ourHash = a1Hash; ourHash |= a2Hash<<1; ourHash |= (bondNbrs[i]%8)<<5; hashLayers[5].push_back(ourHash); } } unsigned int l=0; bool flaggedPath=false; for(std::vector< std::vector >::iterator layerIt=hashLayers.begin(); layerIt!=hashLayers.end();++layerIt,++l){ if(!layerIt->size()) continue; // ---- std::sort(layerIt->begin(),layerIt->end()); // finally, we will add the number of distinct atoms in the path at the end // of the vect. This allows us to distinguish C1CC1 from CC(C)C layerIt->push_back(atomsInPath.count()); layerIt->push_back(l+1); // hash the path to generate a seed: unsigned long seed = gboost::hash_range(layerIt->begin(),layerIt->end()); //std::cerr<<" "<(seed)); unsigned int bitId=randomSource()%fpSize; #else unsigned int bitId=seed%fpSize; #endif if(!setOnlyBits || (*setOnlyBits)[bitId]){ res->setBit(bitId); if(atomCounts && !flaggedPath){ for(unsigned int aIdx=0;aIdx0.0){ while( static_cast(res->getNumOnBits())/res->getNumBits() < tgtDensity && res->getNumBits() >= 2*minSize ){ ExplicitBitVect *tmpV=FoldFingerprint(*res,2); delete res; res = tmpV; } } } return res; } const char *pqs[]={ "[*]~[*]", "[*]~[*]~[*]", "[R]~1~[R]~[R]~1", "[*]~[*]~[*]~[*]", "[*]~[*](~[*])~[*]", "[*]~[R]~1[R]~[R]~1", "[*]~1[R]~[R]~[R]~1", "[*]~[*]~[*]~[*]~[*]", "[*]~[*]~[*](~[*])~[*]", "[*]~[R]~1[R]~[R]~1[*]", "[*]~[R]~1[R]~[R]~[R]~1", "[*]~[*](~[*])(~[*])~[*]", "[*]~[R]~1(~[*])~[R]~[R]~1", "[R]~1[R]~[R]~[R]~[R]~1", "[*]~[*]~[*]~[*]~[*]~[*]", "[*]~[*]~[*]~[*](~[*])~[*]", 
// Rest of the pqs table and the opening of LayeredFingerprintMol2() on the next
// physical line:
//  - LayeredFingerprintMol2(): returns a new ExplicitBitVect owned by the
//    caller. It asserts minPath!=0, maxPath>=minPath, and that setOnlyBits (when
//    given) has exactly fpSize bits.
//  - patts is a function-local static cache. On the first call (patts.size()==0)
//    it walks pqs until the empty-string sentinel and parses each entry with
//    SmartsToMol(); an entry whose parse throws or returns null is skipped.
//    NOTE(review): as the existing FIX comment says, this lazy fill is not
//    synchronised; concurrent first calls could both populate patts.
"[*]~[*]~[*](~[*])~[*]~[*]", "[*]~[*]~[*](~[*])(~[*])~[*]", "[*]~[*](~[*])~[*](~[*])~[*]", "[*]~[R]~1[R]~[R]~1(~[*])~[*]", "[*]~[R]~1[R](~[*])~[R]~1[*]", "[*]~[R]~1[R]~[R](~[*])~[R]~1", "[*]~[R]~1[R]~[R]~[R]~1[*]", "[*]~[R]~1[R]~[R]~[R]~[R]~1", "[*]~[R]~1(~[*])~[R]~[R]~[R]~1", "[R]~1[R]~[R]~[R]~[R]~[R]~1", "[*]~[*]~[*]~[*]~[*]~[*]~[*]", "[*]~[*]~[*]~[*]~[*](~[*])~[*]", "[*]~[*]~[*]~[*](~[*])~[*]~[*]", "[*]~[*]~[*]~[*](~[*])(~[*])~[*]", "[*]~[*]~[*](~[*])~[*](~[*])~[*]", "[*]~[*](~[*])~[*]~[*](~[*])~[*]", "[*]~[*](~[*])~[*](~[*])(~[*])~[*]", ""}; // caller owns the result, it must be deleted ExplicitBitVect *LayeredFingerprintMol2(const ROMol &mol, unsigned int layerFlags, unsigned int minPath, unsigned int maxPath, unsigned int fpSize, std::vector *atomCounts, ExplicitBitVect *setOnlyBits, bool branchedPaths){ PRECONDITION(minPath!=0,"minPath==0"); PRECONDITION(maxPath>=minPath,"maxPathsize()>=mol.getNumAtoms(),"bad atomCounts size"); PRECONDITION(!setOnlyBits || setOnlyBits->getNumBits()==fpSize,"bad setOnlyBits size"); static std::vector patts; // FIX: need a mutex here to be threadsafe if(patts.size()==0){ unsigned int idx=0; while(1){ std::string pq=pqs[idx]; if(pq=="") break; idx++; RWMol *tm; try { tm = SmartsToMol(pq); }catch (...) 
// Continuation of LayeredFingerprintMol2() and the end of namespace RDKit on
// the next physical line:
//  - MolOps::findSSSR(mol) is called when the ring info is not initialized;
//  - pIdx is incremented before use, so the first cached pattern has pIdx 1;
//  - patterns are filtered on their bond count against minPath / maxPath (the
//    lower-bound half of that condition is partly missing from this copy);
//  - matches come from SubstructMatch(mol,*patt,matches,false);
//  - per match, bitId starts at pIdx and is hash_combine'd with the atomic
//    number of every matched molecule atom and the bond type of every matched
//    bond, then bit bitId%fpSize is set;
//  - mIdx starts at pIdx + pattern atom count + pattern bond count and is
//    hash_combine'd with 0xBEEF once per match, then bit mIdx%fpSize is set, so
//    successive matches of one pattern set a chain of "count" bits.
// NOTE(review): mbond, the result of getBondBetweenAtoms(), is dereferenced
// without a null check - presumably a match guarantees the bond exists; confirm.
// NOTE(review): layerFlags, atomCounts, setOnlyBits and branchedPaths are not
// referenced in the visible body apart from the PRECONDITION checks - confirm
// against the undamaged source.
{ tm=NULL; } if(!tm) continue; patts.push_back(ROMOL_SPTR(static_cast(tm))); } } if(!mol.getRingInfo()->isInitialized()){ MolOps::findSSSR(mol); } ExplicitBitVect *res = new ExplicitBitVect(fpSize); unsigned int pIdx=0; BOOST_FOREACH(ROMOL_SPTR patt,patts){ ++pIdx; if(patt->getNumBonds()getNumBonds()>maxPath){ continue; } std::vector matches; SubstructMatch(mol,*(patt.get()),matches,false); boost::uint32_t mIdx=pIdx+patt->getNumAtoms()+patt->getNumBonds(); BOOST_FOREACH(MatchVectType &mv,matches){ boost::uint32_t bitId=pIdx; std::vector amap(mv.size(),0); BOOST_FOREACH(MatchVectType::value_type &p,mv){ gboost::hash_combine(bitId,mol.getAtomWithIdx(p.second)->getAtomicNum()); amap[p.first]=p.second; } ROMol::EDGE_ITER firstB,lastB; boost::tie(firstB,lastB) = patt->getEdges(); while(firstB!=lastB){ BOND_SPTR pbond = (*patt.get())[*firstB]; ++firstB; const Bond *mbond=mol.getBondBetweenAtoms(amap[pbond->getBeginAtomIdx()], amap[pbond->getEndAtomIdx()]); gboost::hash_combine(bitId,(boost::uint32_t)mbond->getBondType()); } res->setBit(bitId%fpSize); // collect bits counting the number of occurances of the pattern: gboost::hash_combine(mIdx,0xBEEF); res->setBit(mIdx%fpSize); } } return res; } }