mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
321 lines
11 KiB
C++
321 lines
11 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (C) 2007-2010 Greg Landrum
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
//
|
|
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/Fingerprints/AtomPairs.h>
|
|
#include <GraphMol/Subgraphs/Subgraphs.h>
|
|
#include <DataStructs/SparseIntVect.h>
|
|
#include <RDGeneral/hash/hash.hpp>
|
|
#include <boost/cstdint.hpp>
|
|
|
|
namespace RDKit{
|
|
namespace AtomPairs {
|
|
unsigned int numPiElectrons(const Atom *atom){
|
|
PRECONDITION(atom,"no atom");
|
|
unsigned int res=0;
|
|
if(atom->getIsAromatic()){
|
|
res=1;
|
|
} else if(atom->getHybridization() != Atom::SP3){
|
|
CHECK_INVARIANT(static_cast<unsigned int>(atom->getExplicitValence())>=atom->getDegree(),
|
|
"explicit valence exceeds atom degree");
|
|
res = atom->getExplicitValence()-atom->getDegree();
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// Builds the integer code for one atom out of three bit fields, lowest bits
// first:
//   - branch count: (degree - branchSubtract), floored at zero, taken modulo
//     maxNumBranches
//   - pi-electron count: numPiElectrons(atom) modulo maxNumPi, shifted left
//     by numBranchBits
//   - type index: position of the atomic number in atomNumberTypes, shifted
//     left by numBranchBits+numPiBits
boost::uint32_t getAtomCode(const Atom *atom,unsigned int branchSubtract){
  PRECONDITION(atom,"no atom");

  const unsigned int degree=atom->getDegree();
  const unsigned int branchCount=(degree>branchSubtract) ? degree-branchSubtract : 0;
  const unsigned int piCount=numPiElectrons(atom)%maxNumPi;

  // Look the atomic number up in atomNumberTypes. Atoms that are not listed
  // all share the last slot. The scan gives up at the first table entry that
  // is larger than the atomic number (presumably the table is sorted
  // ascending -- confirm against its definition).
  const unsigned int atomicNum=static_cast<unsigned int>(atom->getAtomicNum());
  const unsigned int typeCount=1<<numTypeBits;
  unsigned int typeSlot=typeCount-1;
  for(unsigned int idx=0;idx<typeCount;++idx){
    if(atomNumberTypes[idx]==atomicNum){
      typeSlot=idx;
      break;
    }
    if(atomNumberTypes[idx]>atomicNum){
      break;
    }
  }

  boost::uint32_t atomCode=branchCount%maxNumBranches;
  atomCode |= piCount<<numBranchBits;
  atomCode |= typeSlot<<(numBranchBits+numPiBits);
  return atomCode;
}
|
|
|
|
// Packs two atom codes and the distance between the atoms into a single
// value. The distance sits in the lowest bits, the smaller atom code follows
// at numPathBits and the larger one at numPathBits+codeSize, so the result
// does not depend on the order of codeI and codeJ.
// dist must be smaller than maxPathLen.
boost::uint32_t getAtomPairCode(boost::uint32_t codeI,boost::uint32_t codeJ,
                                unsigned int dist){
  PRECONDITION(dist<maxPathLen,"dist too long");
  const boost::uint32_t lowCode=(codeJ<codeI) ? codeJ : codeI;
  const boost::uint32_t highCode=(codeI<codeJ) ? codeJ : codeI;

  boost::uint32_t pairCode=dist;
  pairCode |= lowCode << numPathBits;
  pairCode |= highCode << (numPathBits+codeSize);
  return pairCode;
}
|
|
|
|
// Increments the fingerprint entry for the atom pair (i,j).
// dm is read as a flattened nAtoms x nAtoms matrix; the (floored) entry for
// the pair is the distance. Pairs whose distance falls outside
// [minLength,maxLength] are ignored.
void setAtomPairBit(boost::uint32_t i, boost::uint32_t j,boost::uint32_t nAtoms,
                    const std::vector<boost::uint32_t> &atomCodes,
                    const double *dm,SparseIntVect<boost::int32_t> *bv,
                    unsigned int minLength,unsigned int maxLength){
  const unsigned int pathLen=static_cast<unsigned int>(floor(dm[i*nAtoms+j]));
  if(pathLen<minLength || pathLen>maxLength){
    return;
  }
  const boost::uint32_t pairCode=getAtomPairCode(atomCodes[i],atomCodes[j],pathLen);
  bv->setVal(pairCode,(*bv)[pairCode]+1);
}
|
|
|
|
// Convenience overload: builds the atom-pair fingerprint over the full range
// of distances, from 1 up to and including maxPathLen-1.
SparseIntVect<boost::int32_t> *getAtomPairFingerprint(const ROMol &mol,
                                                      const std::vector<boost::uint32_t> *fromAtoms){
  const unsigned int shortestPath=1;
  const unsigned int longestPath=maxPathLen-1;
  return getAtomPairFingerprint(mol,shortestPath,longestPath,fromAtoms);
}
|
|
|
|
// Builds the (unhashed) atom-pair fingerprint of a molecule.
//
//  mol        the molecule of interest
//  minLength  smallest distance between two atoms for the pair to be counted
//  maxLength  largest distance between two atoms for the pair to be counted
//  fromAtoms  optional; when provided, only pairs that involve one of these
//             atom indices are counted. Every index must be smaller than
//             the number of atoms in mol.
//
// Returns a newly allocated SparseIntVect; the caller is responsible for
// deleting it.
//
// NOTE(review): when fromAtoms is provided and both atoms of a pair are in
// it, the pair is visited once from each side and so counted twice. That
// behavior is kept as is so existing fingerprints do not change -- confirm
// that it is intended.
SparseIntVect<boost::int32_t> *
getAtomPairFingerprint(const ROMol &mol,unsigned int minLength,unsigned int maxLength,
                       const std::vector<boost::uint32_t> *fromAtoms){
  PRECONDITION(minLength<=maxLength,"bad lengths provided");
  const unsigned int nAtoms=mol.getNumAtoms();

  // The fromAtoms entries are used below to index both the distance matrix
  // and atomCodes, so reject anything that is out of range up front.
  if(fromAtoms){
    for(std::vector<boost::uint32_t>::const_iterator faIt=fromAtoms->begin();
        faIt!=fromAtoms->end();++faIt){
      PRECONDITION(*faIt<nAtoms,"bad atom index in fromAtoms");
    }
  }

  const double *dm = MolOps::getDistanceMat(mol);

  std::vector<boost::uint32_t> atomCodes;
  atomCodes.reserve(nAtoms);
  for(ROMol::ConstAtomIterator atomItI=mol.beginAtoms();
      atomItI!=mol.endAtoms();++atomItI){
    atomCodes.push_back(getAtomCode(*atomItI));
  }

  // allocate the result only once the inputs have been processed, so that
  // nothing is held if one of the steps above fails
  SparseIntVect<boost::int32_t> *res=new SparseIntVect<boost::int32_t>(1<<numAtomPairFingerprintBits);
  for(ROMol::ConstAtomIterator atomItI=mol.beginAtoms();
      atomItI!=mol.endAtoms();++atomItI){
    unsigned int i=(*atomItI)->getIdx();
    if(!fromAtoms){
      // all pairs: only look "forward" so that each pair is seen once
      for(ROMol::ConstAtomIterator atomItJ=atomItI+1;
          atomItJ!=mol.endAtoms();++atomItJ){
        unsigned int j=(*atomItJ)->getIdx();
        setAtomPairBit(i,j,nAtoms,atomCodes,dm,res,minLength,maxLength);
      }
    } else {
      // restricted: pair atom i with each of the requested atoms
      for(std::vector<boost::uint32_t>::const_iterator bvIt=fromAtoms->begin();
          bvIt!=fromAtoms->end();++bvIt){
        if(*bvIt!=i){
          setAtomPairBit(i,*bvIt,nAtoms,atomCodes,dm,res,minLength,maxLength);
        }
      }
    }
  }
  return res;
}
|
|
|
|
// Builds the hashed atom-pair fingerprint of a molecule.
//
//  mol        the molecule of interest
//  nBits      length of the fingerprint; must be nonzero since every pair
//             hash is folded into the result with a modulo nBits
//  minLength  smallest distance between two atoms for the pair to be counted
//  maxLength  largest distance between two atoms for the pair to be counted
//
// Returns a newly allocated SparseIntVect; the caller is responsible for
// deleting it.
SparseIntVect<boost::int32_t> *
getHashedAtomPairFingerprint(const ROMol &mol,unsigned int nBits,
                             unsigned int minLength,unsigned int maxLength){
  PRECONDITION(minLength<=maxLength,"bad lengths provided");
  // bit%nBits below is undefined for nBits==0
  PRECONDITION(nBits!=0,"nBits must be nonzero");
  const double *dm = MolOps::getDistanceMat(mol);
  const unsigned int nAtoms=mol.getNumAtoms();

  std::vector<boost::uint32_t> atomCodes;
  atomCodes.reserve(nAtoms);
  for(ROMol::ConstAtomIterator atomItI=mol.beginAtoms();
      atomItI!=mol.endAtoms();++atomItI){
    atomCodes.push_back(getAtomCode(*atomItI));
  }

  // allocate the result only once the inputs have been processed, so that
  // nothing is held if one of the steps above fails
  SparseIntVect<boost::int32_t> *res=new SparseIntVect<boost::int32_t>(nBits);
  for(ROMol::ConstAtomIterator atomItI=mol.beginAtoms();
      atomItI!=mol.endAtoms();++atomItI){
    unsigned int i=(*atomItI)->getIdx();
    for(ROMol::ConstAtomIterator atomItJ=atomItI+1;
        atomItJ!=mol.endAtoms();++atomItJ){
      unsigned int j=(*atomItJ)->getIdx();
      unsigned int dist=static_cast<unsigned int>(floor(dm[i*nAtoms+j]));
      if(dist>=minLength && dist<=maxLength){
        // hash (smaller code, distance, larger code) so that the result does
        // not depend on the order of the two atoms
        boost::uint32_t bit=0;
        gboost::hash_combine(bit,std::min(atomCodes[i],atomCodes[j]));
        gboost::hash_combine(bit,dist);
        gboost::hash_combine(bit,std::max(atomCodes[i],atomCodes[j]));
        res->setVal(bit%nBits,(*res)[bit%nBits]+1);
      }
    }
  }
  return res;
}
|
|
|
|
// Packs the atom codes of a path into a single 64-bit value, codeSize bits
// per atom, with the first atom used in the lowest bits.
//
// The path is read in whichever direction gives the canonical ordering: the
// codes are compared pairwise from both ends towards the middle and, at the
// first difference, the end with the smaller code goes first. A path and its
// reverse therefore produce the same value.
//
// An empty pathCodes yields 0.
boost::uint64_t
getTopologicalTorsionCode(const std::vector<boost::uint32_t> &pathCodes){
  // nothing to encode; also keeps size()-1 below from wrapping around
  if(pathCodes.empty()){
    return 0;
  }

  bool reverseIt=false;
  unsigned int i=0;
  unsigned int j=pathCodes.size()-1;
  while(i<j){
    if(pathCodes[i]>pathCodes[j]){
      reverseIt=true;
      break;
    } else if( pathCodes[i]<pathCodes[j]){
      break;
    }
    ++i;
    --j;
  }

  const unsigned int nCodes=pathCodes.size();
  boost::uint64_t res=0;
  for(unsigned int pos=0;pos<nCodes;++pos){
    const boost::uint32_t code=reverseIt ? pathCodes[nCodes-pos-1] : pathCodes[pos];
    res |= static_cast<boost::uint64_t>(code)<<(codeSize*pos);
  }
  return res;
}
|
|
|
|
size_t
|
|
getTopologicalTorsionHash(const std::vector<boost::uint32_t> &pathCodes){
|
|
bool reverseIt=false;
|
|
unsigned int i=0;
|
|
unsigned int j=pathCodes.size()-1;
|
|
while(i<j){
|
|
if(pathCodes[i]>pathCodes[j]){
|
|
reverseIt=true;
|
|
break;
|
|
} else if( pathCodes[i]<pathCodes[j]){
|
|
break;
|
|
}
|
|
++i;
|
|
--j;
|
|
}
|
|
|
|
boost::uint32_t res=0;
|
|
if(reverseIt){
|
|
for(unsigned int i=0;i<pathCodes.size();++i){
|
|
gboost::hash_combine(res,pathCodes[pathCodes.size()-i-1]);
|
|
}
|
|
}else{
|
|
for(unsigned int i=0;i<pathCodes.size();++i){
|
|
gboost::hash_combine(res,pathCodes[i]);
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
|
|
// Builds the (unhashed) topological-torsion fingerprint of a molecule.
//
//  mol         the molecule of interest
//  targetSize  number of atoms in each path; targetSize*codeSize must fit in
//              63 bits because each path is packed into a 64-bit code
//  fromAtoms   optional; when provided, only paths that start or end at one
//              of these atom indices are counted
//
// Returns a newly allocated SparseIntVect; the caller is responsible for
// deleting it.
SparseIntVect<boost::int64_t> *
getTopologicalTorsionFingerprint(const ROMol &mol,unsigned int targetSize,
                                 const std::vector<boost::uint32_t> *fromAtoms){
  // shifting a 64-bit value by 64 or more bits is undefined, and the length
  // also has to fit the signed 64-bit index type of the result
  PRECONDITION(targetSize<=63/codeSize,"targetSize too large");
  boost::uint64_t sz=1;
  sz=(sz<<(targetSize*codeSize));
  // NOTE: this -1 is incorrect but it's needed for backwards compatibility.
  //  hopefully we'll never have a case with a torsion that hits this.
  //
  //  mmm, bug compatible.
  sz-=1;

  std::vector<boost::uint32_t> atomCodes;
  atomCodes.reserve(mol.getNumAtoms());
  for(ROMol::ConstAtomIterator atomItI=mol.beginAtoms();
      atomItI!=mol.endAtoms();++atomItI){
    atomCodes.push_back(getAtomCode(*atomItI));
  }

  PATH_LIST paths=findAllPathsOfLengthN(mol,targetSize,false);

  // allocate the result only once the inputs have been processed, so that
  // nothing is held if one of the steps above fails
  SparseIntVect<boost::int64_t> *res=new SparseIntVect<boost::int64_t>(sz);
  for(PATH_LIST::const_iterator pathIt=paths.begin();
      pathIt!=paths.end();++pathIt){
    const PATH_TYPE &path=*pathIt;
    if(fromAtoms){
      // keep the path only if one of its ends is a requested atom
      const bool frontWanted=std::find(fromAtoms->begin(),fromAtoms->end(),
                                       static_cast<boost::uint32_t>(path.front()))!=fromAtoms->end();
      const bool backWanted=frontWanted ||
        std::find(fromAtoms->begin(),fromAtoms->end(),
                  static_cast<boost::uint32_t>(path.back()))!=fromAtoms->end();
      if(!backWanted){
        continue;
      }
    }

    std::vector<boost::uint32_t> pathCodes;
    pathCodes.reserve(targetSize);
    for(unsigned int i=0;i<targetSize;++i){
      unsigned int code=atomCodes[path[i]];
      // subtract off the branching number: 1 for the two ends of the path,
      // 2 for the atoms in between
      if(i==0 || i==targetSize-1){
        code-=1;
      } else {
        code-=2;
      }
      pathCodes.push_back(code);
    }
    boost::int64_t code=getTopologicalTorsionCode(pathCodes);
    res->setVal(code,res->getVal(code)+1);
  }
  return res;
}
|
|
|
|
// Builds the hashed topological-torsion fingerprint of a molecule.
//
//  mol         the molecule of interest
//  nBits       length of the fingerprint; must be nonzero since every path
//              hash is folded into the result with a modulo nBits
//  targetSize  number of atoms in each path
//  fromAtoms   optional; when provided, only paths that start or end at one
//              of these atom indices are counted
//
// Returns a newly allocated SparseIntVect; the caller is responsible for
// deleting it.
SparseIntVect<boost::int64_t> *
getHashedTopologicalTorsionFingerprint(const ROMol &mol,
                                       unsigned int nBits,
                                       unsigned int targetSize,
                                       const std::vector<boost::uint32_t> *fromAtoms){
  // bit%nBits below is undefined for nBits==0
  PRECONDITION(nBits!=0,"nBits must be nonzero");

  std::vector<boost::uint32_t> atomCodes;
  atomCodes.reserve(mol.getNumAtoms());
  for(ROMol::ConstAtomIterator atomItI=mol.beginAtoms();
      atomItI!=mol.endAtoms();++atomItI){
    atomCodes.push_back(getAtomCode(*atomItI));
  }

  PATH_LIST paths=findAllPathsOfLengthN(mol,targetSize,false);

  // allocate the result only once the inputs have been processed, so that
  // nothing is held if one of the steps above fails
  SparseIntVect<boost::int64_t> *res=new SparseIntVect<boost::int64_t>(nBits);
  for(PATH_LIST::const_iterator pathIt=paths.begin();
      pathIt!=paths.end();++pathIt){
    const PATH_TYPE &path=*pathIt;
    if(fromAtoms){
      // keep the path only if one of its ends is a requested atom
      const bool frontWanted=std::find(fromAtoms->begin(),fromAtoms->end(),
                                       static_cast<boost::uint32_t>(path.front()))!=fromAtoms->end();
      const bool backWanted=frontWanted ||
        std::find(fromAtoms->begin(),fromAtoms->end(),
                  static_cast<boost::uint32_t>(path.back()))!=fromAtoms->end();
      if(!backWanted){
        continue;
      }
    }

    std::vector<boost::uint32_t> pathCodes(targetSize);
    for(unsigned int i=0;i<targetSize;++i){
      unsigned int code=atomCodes[path[i]];
      // subtract off the branching number: 1 for the two ends of the path,
      // 2 for the atoms in between
      if(i==0 || i==targetSize-1){
        code-=1;
      } else {
        code-=2;
      }
      pathCodes[i]=code;
    }
    size_t bit=getTopologicalTorsionHash(pathCodes);
    res->setVal(bit%nBits,res->getVal(bit%nBits)+1);
  }
  return res;
}
|
|
|
|
} // end of namespace AtomPairs
|
|
} // end of namespace RDKit
|