mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
455 lines
19 KiB
C++
455 lines
19 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (C) 2014 Novartis Institutes for BioMedical Research
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/format.hpp>
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
#include <boost/crc.hpp>
|
|
#include <boost/cstdint.hpp>
|
|
#include "../Descriptors/MolDescriptors.h"
|
|
|
|
#include "MolHash.h"
|
|
|
|
namespace RDKit
|
|
{
|
|
namespace MolHash
|
|
{
|
|
struct MolFragment // Reference to a fragment of source molecule
|
|
{
|
|
std::vector<const Atom*> Atoms;
|
|
std::vector<const Bond*> Bonds;
|
|
std::vector<boost::uint32_t> AtomsIdx;
|
|
std::vector<boost::uint32_t> BondsIdx;
|
|
std::map<boost::uint32_t,boost::uint32_t> MolAtomIdxMap; // Full Molecule to fragment indeces backward conversion map
|
|
public:
|
|
boost::uint32_t getNumAtoms()const {return AtomsIdx.size();}
|
|
boost::uint32_t getNumBonds()const {return BondsIdx.size();}
|
|
};
|
|
|
|
// INTERNAL FUNCTIONS:
|
|
static
|
|
HashCodeType computeMorganCodeHash (const MolFragment& mol
|
|
, const std::vector<boost::uint32_t>& atomLabels
|
|
, const std::vector<boost::uint32_t>& bondLabels);
|
|
static
|
|
void prepareMolFragment(MolFragment& m,
|
|
const ROMol &mol,
|
|
const std::vector<unsigned> *atomsToUse,
|
|
const std::vector<unsigned> *bondsToUse);
|
|
static
|
|
void prepareLabels(std::vector<boost::uint32_t>& atomLabels, std::vector<boost::uint32_t>& bondLabels
|
|
, const ROMol& mol, const MolFragment& m
|
|
, const std::vector<boost::uint32_t> *atomCodes
|
|
, const std::vector<boost::uint32_t> *bondCodes);
|
|
static
|
|
boost::uint32_t computeCRC32 (const void* data, size_t size)
|
|
{
|
|
boost::crc_32_type crc;
|
|
crc.process_bytes(data, size);
|
|
return crc.checksum();
|
|
}
|
|
//=============================================================================
|
|
// MolHash Module API implementation:
|
|
//=============================================================================
|
|
|
|
void fillAtomBondCodes(const ROMol &mol, boost::uint64_t flags // CodeFlags constants combination
|
|
, std::vector<boost::uint32_t> *atomCodes // NULL is allowed
|
|
, std::vector<boost::uint32_t> *bondCodes) // NULL is allowed
|
|
{
|
|
if(atomCodes)
|
|
{
|
|
unsigned n = mol.getNumAtoms();
|
|
atomCodes->resize(n);
|
|
for(unsigned i=0; i < n; i++)
|
|
{
|
|
if(0==(CF_ATOM_ALL & flags)) // NO LABELS
|
|
{
|
|
(*atomCodes)[i] = 1;
|
|
continue;
|
|
}
|
|
const Atom* atom = mol.getAtomWithIdx(i);
|
|
(*atomCodes)[i] = 0;
|
|
if(0!=(CF_ELEMENT & flags))
|
|
(*atomCodes)[i] |= atom->getAtomicNum();
|
|
if(0!=(CF_CHARGE & flags))
|
|
(*atomCodes)[i] |= (atom->getFormalCharge()+8) << 8; // allowed range [-8, +8]
|
|
if(0!=(CF_VALENCE & flags))
|
|
(*atomCodes)[i] |= (atom->getExplicitValence()) << 13; //getTotalValence()
|
|
if(0!=(CF_ATOM_CHIRALITY & flags)){
|
|
char v=0;
|
|
if(atom->getChiralTag()==Atom::CHI_TETRAHEDRAL_CW ||
|
|
atom->getChiralTag()==Atom::CHI_TETRAHEDRAL_CCW){
|
|
if(atom->hasProp("_CIPCode")){
|
|
std::string code=atom->getProp<std::string>("_CIPCode");
|
|
if(code=="R") v=1;
|
|
else if(code=="S") v=2;
|
|
} else if(atom->hasProp("_ringStereoAtoms")){
|
|
const INT_VECT &ringStereoAtoms=atom->getProp<INT_VECT>("_ringStereoAtoms");
|
|
if(ringStereoAtoms.size()){
|
|
if(ringStereoAtoms[0]<0){
|
|
v=1;
|
|
} else {
|
|
v=2;
|
|
}
|
|
if(ringStereoAtoms.size()>1){
|
|
BOOST_LOG(rdWarningLog)<<"Warning: atom with more than 1 ring-stereo atoms found."<<std::endl;
|
|
}
|
|
}
|
|
|
|
}
|
|
} else {
|
|
v=atom->getChiralTag();
|
|
}
|
|
(*atomCodes)[i] |= v << 18; // 2 bits
|
|
}
|
|
if(0!=(CF_ATOM_AROMATIC & flags))
|
|
(*atomCodes)[i] |= (atom->getIsAromatic() ? 1 : 0) << 20; // 1 bit
|
|
//if(0!=( & flags))
|
|
// (*atomCodes)[i] |= (atom-()) << 21; // 3 bits reserved
|
|
if(0!=(CF_ISOTOPE & flags))
|
|
(*atomCodes)[i] |= (atom->getIsotope()) << 24;
|
|
}
|
|
}
|
|
|
|
if(bondCodes)
|
|
{
|
|
std::map<unsigned, bool> bondsInRing;
|
|
const RingInfo::VECT_INT_VECT& rings = mol.getRingInfo()->bondRings();
|
|
for(RingInfo::VECT_INT_VECT::const_iterator r = rings.begin(); r != rings.end(); r++)
|
|
for(INT_VECT::const_iterator b = r->begin(); b != r->end(); b++)
|
|
if (bondsInRing.end() == bondsInRing.find(*b))
|
|
bondsInRing[(unsigned)*b] = true;
|
|
|
|
unsigned n = mol.getNumBonds();
|
|
bondCodes->resize(n);
|
|
for(unsigned i=0; i < n; i++)
|
|
{
|
|
if(0==(CF_BOND_ALL & flags)) // NO LABELS
|
|
{
|
|
(*bondCodes)[i] = 1;
|
|
continue;
|
|
}
|
|
const Bond* bond = mol.getBondWithIdx(i);
|
|
(*bondCodes)[i] = 0;
|
|
if(0!=(CF_BOND_ORDER & flags))
|
|
{
|
|
unsigned order = bond->getBondType();
|
|
if(0==(CF_BOND_AROMATIZATION & flags)) // ignore aromatization
|
|
{
|
|
static const unsigned orderMatch [Bond::ZERO+1] =
|
|
{
|
|
Bond::UNSPECIFIED,
|
|
Bond::SINGLE,
|
|
Bond::DOUBLE,
|
|
Bond::TRIPLE,
|
|
Bond::QUADRUPLE,
|
|
Bond::QUINTUPLE,
|
|
Bond::HEXTUPLE,
|
|
Bond::SINGLE, //ONEANDAHALF,
|
|
Bond::DOUBLE, //TWOANDAHALF,
|
|
Bond::TRIPLE, //THREEANDAHALF,
|
|
Bond::QUADRUPLE, //FOURANDAHALF,
|
|
Bond::QUINTUPLE, //FIVEANDAHALF,
|
|
Bond::SINGLE, //AROMATIC,
|
|
Bond::IONIC,
|
|
Bond::HYDROGEN,
|
|
Bond::THREECENTER,
|
|
Bond::DATIVEONE,
|
|
Bond::DATIVE,
|
|
Bond::DATIVEL,
|
|
Bond::DATIVER,
|
|
Bond::OTHER,
|
|
Bond::ZERO
|
|
};
|
|
order = orderMatch[order];
|
|
}
|
|
(*bondCodes)[i] |= order;
|
|
}
|
|
if(0!=(CF_BOND_AROMATIZATION & flags))
|
|
(*bondCodes)[i] |= ((bond->getIsAromatic() ? 1 : 0)) << 8;
|
|
if(0!=(CF_BOND_CHIRALITY & flags))
|
|
{
|
|
(*bondCodes)[i] |= bond->getStereo() << 9;
|
|
}
|
|
if(0!=(CF_BOND_IN_RING & flags))
|
|
(*bondCodes)[i] |= (bondsInRing.end() != bondsInRing.find(bond->getIdx()) ? 1 : 0) << 11; // 1 bit
|
|
}
|
|
}
|
|
}
|
|
|
|
//=============================================================================
|
|
|
|
// Computes a single hash code for `mol`, or for the fragment of it selected by
// `atomsToUse` / `bondsToUse`, using the optional per-atom / per-bond codes as labels.
// Returns 0 when the resulting fragment has no atoms or no bonds.
HashCodeType generateMoleculeHashCode(const ROMol &mol,
                                      const std::vector<unsigned> *atomsToUse,
                                      const std::vector<unsigned> *bondsToUse,
                                      const std::vector<boost::uint32_t> *atomCodes,
                                      const std::vector<boost::uint32_t> *bondCodes)
{
    MolFragment fragment;
    prepareMolFragment(fragment, mol, atomsToUse, bondsToUse);

    const bool hasAtoms = (fragment.getNumAtoms() != 0);
    const bool hasBonds = (fragment.getNumBonds() != 0);
    if (!(hasAtoms && hasBonds))
        return 0;

    std::vector<boost::uint32_t> labelsOfAtoms;
    std::vector<boost::uint32_t> labelsOfBonds;
    prepareLabels(labelsOfAtoms, labelsOfBonds, mol, fragment, atomCodes, bondCodes);

    return computeMorganCodeHash(fragment, labelsOfAtoms, labelsOfBonds);
}
|
|
|
|
|
|
void generateMoleculeHashSet(const ROMol &mol, HashSet& res,
|
|
const std::vector<unsigned> *atomsToUse,
|
|
const std::vector<unsigned> *bondsToUse)
|
|
{
|
|
res.Version = 100; // v. 1.0
|
|
res.Reserved= 0;
|
|
|
|
MolFragment m;
|
|
prepareMolFragment(m, mol, atomsToUse, bondsToUse);
|
|
|
|
res.NumAtoms = m.getNumAtoms();
|
|
res.NumBonds = m.getNumBonds();
|
|
if(0==m.getNumAtoms() || 0 == m.getNumBonds())
|
|
return;
|
|
|
|
std::string formula = RDKit::Descriptors::calcMolFormula(mol);
|
|
res.FormulaCRC32 = computeCRC32(formula.c_str(), formula.length());
|
|
|
|
boost::uint64_t flags = 0; // CodeFlags constants combination
|
|
std::vector<boost::uint32_t> atomCodes;
|
|
std::vector<boost::uint32_t> bondCodes;
|
|
std::vector<boost::uint32_t> atomLabels;
|
|
std::vector<boost::uint32_t> bondLabels;
|
|
|
|
// flags = CF_ATOM_ALL &(~(CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE));
|
|
flags = CF_ELEMENT | CF_CHARGE | CF_ATOM_AROMATIC; /// | CF_VALENCE
|
|
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
|
|
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
|
|
res.NonChiralAtomsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
|
|
|
|
flags = CF_BOND_ALL &(~CF_BOND_CHIRALITY);
|
|
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
|
|
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
|
|
res.NonChiralBondsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
|
|
|
|
flags = CF_ATOM_CHIRALITY | CF_ISOTOPE;
|
|
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
|
|
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
|
|
res.ChiralAtomsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
|
|
|
|
flags = CF_BOND_CHIRALITY;
|
|
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
|
|
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
|
|
res.ChiralBondsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
|
|
|
|
flags = CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE;
|
|
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
|
|
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
|
|
res.ChiralityHash = computeMorganCodeHash (m, atomLabels, bondLabels);
|
|
}
|
|
|
|
//=============================================================================
|
|
std::string generateMoleculeHashSet(const ROMol &mol,
|
|
const std::vector<unsigned> *atomsToUse,
|
|
const std::vector<unsigned> *bondsToUse)
|
|
{
|
|
std::string str;
|
|
HashSet res;
|
|
generateMoleculeHashSet(mol, res, atomsToUse, bondsToUse);
|
|
//char buf[64];
|
|
//snprintf(buf, sizeof(buf),"%u-%u-%u-", res.Version, res.NumAtoms,res.NumBonds);
|
|
str = (boost::format("%u-%u-%u-")%(res.Version)%(res.NumAtoms)%(res.NumBonds)).str() ;
|
|
str += encode(&res.FormulaCRC32, sizeof(res.FormulaCRC32));
|
|
str += "-";
|
|
str += encode(&res.NonChiralAtomsHash, sizeof(res.NonChiralAtomsHash));
|
|
str += "-";
|
|
str += encode(&res.NonChiralBondsHash, sizeof(res.NonChiralBondsHash));
|
|
str += "-";
|
|
str += encode(&res.ChiralAtomsHash, sizeof(res.ChiralAtomsHash));
|
|
str += "-";
|
|
str += encode(&res.ChiralBondsHash, sizeof(res.ChiralBondsHash));
|
|
str += "-";
|
|
str += encode(&res.ChiralityHash, sizeof(res.ChiralityHash));
|
|
return str.c_str();
|
|
}
|
|
//=============================================================================
|
|
// INTERNAL FUNCTIONS:
|
|
//=============================================================================
|
|
static
|
|
HashCodeType computeMorganCodeHash (const MolFragment& mol
|
|
, const std::vector<boost::uint32_t>& atomLabels
|
|
, const std::vector<boost::uint32_t>& bondLabels)
|
|
{
|
|
size_t nv = mol.getNumAtoms();
|
|
size_t ne = mol.getNumBonds();
|
|
std::vector<HashCodeType> currCodes(nv);
|
|
std::vector<HashCodeType> prevCodes(nv);
|
|
size_t nIterations = mol.getNumBonds();
|
|
if (nIterations > 5)
|
|
nIterations = 5;
|
|
|
|
for(unsigned molAtomIdx = 0; molAtomIdx < mol.getNumAtoms(); molAtomIdx++)
|
|
currCodes[molAtomIdx] = atomLabels[mol.AtomsIdx[molAtomIdx]];
|
|
|
|
for (size_t iter = 0; iter < nIterations; iter++)
|
|
{
|
|
for (size_t i = 0; i < nv; i++)
|
|
prevCodes[i] = currCodes[i];
|
|
|
|
for (size_t molBondIdx= 0; molBondIdx < ne; molBondIdx++)
|
|
{
|
|
const Bond* bond = mol.Bonds[molBondIdx];
|
|
unsigned order = bondLabels[mol.BondsIdx[molBondIdx]];
|
|
unsigned atom1 = mol.MolAtomIdxMap.find(bond->getBeginAtomIdx())->second;
|
|
unsigned atom2 = mol.MolAtomIdxMap.find(bond->getEndAtomIdx ())->second;
|
|
boost::uint32_t v1 = prevCodes[atom1];
|
|
boost::uint32_t v2 = prevCodes[atom2];
|
|
|
|
currCodes[atom1] += v2*v2 + (v2 + 23) * (order + 1721);
|
|
currCodes[atom2] += v1*v1 + (v1 + 23) * (order + 1721);
|
|
}
|
|
}
|
|
|
|
HashCodeType result = 0;
|
|
for(unsigned molAtomIdx = 0; molAtomIdx < nv; molAtomIdx++)
|
|
{
|
|
HashCodeType code = currCodes[molAtomIdx];
|
|
result += code * (code + 6849) + 29;
|
|
}
|
|
return result;
|
|
}
|
|
//=============================================================================
|
|
static
|
|
void prepareMolFragment(MolFragment& m,
|
|
const ROMol &mol,
|
|
const std::vector<unsigned> *atomsToUse,
|
|
const std::vector<unsigned> *bondsToUse)
|
|
{
|
|
if(0!=atomsToUse && atomsToUse->empty())
|
|
atomsToUse = 0;
|
|
if(0!=bondsToUse && bondsToUse->empty())
|
|
bondsToUse = 0;
|
|
|
|
if(0==atomsToUse && 0==bondsToUse) // whole molecule
|
|
{
|
|
unsigned n = mol.getNumAtoms();
|
|
m.AtomsIdx.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
m.AtomsIdx[i] = i;
|
|
|
|
n = mol.getNumBonds();
|
|
m.BondsIdx.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
m.BondsIdx[i] = i;
|
|
}
|
|
else if(0!=atomsToUse) // selected atoms only and all/selected bonds between them
|
|
{
|
|
std::map<unsigned, unsigned> addedBonds;
|
|
unsigned n = atomsToUse->size();
|
|
m.AtomsIdx.resize(n);
|
|
for(unsigned i = 0; i < n; i++) // add all selected atoms at first
|
|
m.AtomsIdx[i] = (*atomsToUse)[i];
|
|
|
|
for(unsigned i = 0; i < n; i++) // add bonds between all selected atoms
|
|
{
|
|
ROMol::OEDGE_ITER beg,end;
|
|
for(boost::tie(beg,end) = mol.getAtomBonds(mol.getAtomWithIdx(m.AtomsIdx[i])); beg!=end; beg++)
|
|
{
|
|
const Bond* bond = & *((mol)[*beg]);
|
|
if(addedBonds.end() != addedBonds.find(bond->getIdx()))
|
|
continue; // the bond has been already added
|
|
if(0!=bondsToUse
|
|
&& bondsToUse->end() == find(bondsToUse->begin(), bondsToUse->end(), bond->getIdx()))
|
|
continue; // skip unselected bond
|
|
|
|
unsigned endAtoms[2];
|
|
endAtoms[0] = bond->getBeginAtomIdx();
|
|
endAtoms[1] = bond->getEndAtomIdx ();
|
|
for(unsigned ai = 0; ai < 2 && 0!=bond; ai++) // both ending bonds of the atom
|
|
{
|
|
if(0!=atomsToUse
|
|
&& atomsToUse->end() == find(atomsToUse->begin(), atomsToUse->end(), endAtoms[ai]))
|
|
bond = 0; //check if both ending atoms of the bond are selected by atoms filter
|
|
}
|
|
if(0!=bond)
|
|
{
|
|
addedBonds[bond->getIdx()] = m.BondsIdx.size();
|
|
m.BondsIdx.push_back(bond->getIdx());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else if(0!=bondsToUse) // note that 0==atomsToUse in this case
|
|
{
|
|
std::map<unsigned, unsigned> addedAtoms;
|
|
unsigned n = bondsToUse->size();
|
|
m.BondsIdx.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
{
|
|
const Bond* bond = mol.getBondWithIdx(i);
|
|
m.BondsIdx[i] = bond->getIdx();
|
|
|
|
unsigned endAtoms[2];
|
|
endAtoms[0] = bond->getBeginAtomIdx();
|
|
endAtoms[1] = bond->getEndAtomIdx ();
|
|
for(unsigned ai = 0; ai < 2 && 0!=bond; ai++) // both ending bonds of the atom
|
|
{
|
|
if(addedAtoms.end() == addedAtoms.find(endAtoms[ai]))
|
|
{
|
|
addedAtoms[endAtoms[ai]] = addedAtoms.size();
|
|
m.AtomsIdx.push_back(endAtoms[ai]); // the atom has NOT been already added
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
unsigned n;
|
|
n = m.getNumAtoms();
|
|
m.Atoms.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
{
|
|
m.Atoms[i] = mol.getAtomWithIdx(m.AtomsIdx[i]);
|
|
m.MolAtomIdxMap[m.AtomsIdx[i]] = i;
|
|
}
|
|
|
|
n = m.getNumBonds();
|
|
m.Bonds.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
{
|
|
m.Bonds[i] = mol.getBondWithIdx(m.BondsIdx[i]);
|
|
}
|
|
}
|
|
//=============================================================================
|
|
static
|
|
void prepareLabels(std::vector<boost::uint32_t>& atomLabels, std::vector<boost::uint32_t>& bondLabels
|
|
, const ROMol& mol, const MolFragment& m
|
|
, const std::vector<boost::uint32_t> *atomCodes
|
|
, const std::vector<boost::uint32_t> *bondCodes)
|
|
{
|
|
RDUNUSED_PARAM(mol);
|
|
unsigned n;
|
|
n = m.getNumAtoms();
|
|
atomLabels.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
{
|
|
atomLabels[i] = atomCodes ? (*atomCodes)[m.AtomsIdx[i]] : 1;
|
|
}
|
|
|
|
n = m.getNumBonds();
|
|
bondLabels.resize(n);
|
|
for(unsigned i = 0; i < n; i++)
|
|
{
|
|
bondLabels[i] = bondCodes ? (*bondCodes)[m.BondsIdx[i]] : 1;
|
|
}
|
|
}
|
|
//=============================================================================
|
|
}}
|