Files
rdkit/Code/GraphMol/MolHash/MolHash.cpp
2015-10-18 14:02:29 -04:00

455 lines
19 KiB
C++

// $Id$
//
// Copyright (C) 2014 Novartis Institutes for BioMedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/format.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <boost/crc.hpp>
#include <boost/cstdint.hpp>
#include "../Descriptors/MolDescriptors.h"
#include "MolHash.h"
namespace RDKit
{
namespace MolHash
{
// Non-owning view of a sub-structure (or the whole) of a source molecule.
// The four vectors are parallel: Atoms[i] is the atom whose index in the
// source molecule is AtomsIdx[i]; likewise Bonds[i] / BondsIdx[i].
struct MolFragment
{
    std::vector<const Atom*>     Atoms;     // atoms of the fragment, by fragment position
    std::vector<const Bond*>     Bonds;     // bonds of the fragment, by fragment position
    std::vector<boost::uint32_t> AtomsIdx;  // source-molecule atom index for each fragment position
    std::vector<boost::uint32_t> BondsIdx;  // source-molecule bond index for each fragment position
    // Reverse lookup: source-molecule atom index -> position inside the fragment
    std::map<boost::uint32_t,boost::uint32_t> MolAtomIdxMap;

    // Number of atoms in the fragment (size of AtomsIdx).
    boost::uint32_t getNumAtoms() const
    {
        return static_cast<boost::uint32_t>(AtomsIdx.size());
    }
    // Number of bonds in the fragment (size of BondsIdx).
    boost::uint32_t getNumBonds() const
    {
        return static_cast<boost::uint32_t>(BondsIdx.size());
    }
};
// INTERNAL FUNCTIONS:
// Forward declarations of the file-local helpers defined at the end of this file.

// Folds the atom and bond labels of a fragment into one hash code by
// repeatedly mixing each atom's code with its bonded neighbours' codes.
static
HashCodeType computeMorganCodeHash (const MolFragment& mol
, const std::vector<boost::uint32_t>& atomLabels
, const std::vector<boost::uint32_t>& bondLabels);
// Fills m with the atoms and bonds of mol selected by atomsToUse / bondsToUse
// (a NULL or empty selection means "no filter").
static
void prepareMolFragment(MolFragment& m,
const ROMol &mol,
const std::vector<unsigned> *atomsToUse,
const std::vector<unsigned> *bondsToUse);
// Builds one label per fragment atom and per fragment bond from the supplied
// per-molecule code vectors; a NULL code vector yields the label 1 everywhere.
static
void prepareLabels(std::vector<boost::uint32_t>& atomLabels, std::vector<boost::uint32_t>& bondLabels
, const ROMol& mol, const MolFragment& m
, const std::vector<boost::uint32_t> *atomCodes
, const std::vector<boost::uint32_t> *bondCodes);
// Returns the CRC-32 checksum (boost::crc_32_type) of the `size` bytes
// starting at `data`.
static
boost::uint32_t computeCRC32 (const void* data, size_t size)
{
    boost::crc_32_type accumulator;
    accumulator.process_bytes(data, size);
    const boost::uint32_t digest = accumulator.checksum();
    return digest;
}
//=============================================================================
// MolHash Module API implementation:
//=============================================================================
void fillAtomBondCodes(const ROMol &mol, boost::uint64_t flags // CodeFlags constants combination
, std::vector<boost::uint32_t> *atomCodes // NULL is allowed
, std::vector<boost::uint32_t> *bondCodes) // NULL is allowed
{
if(atomCodes)
{
unsigned n = mol.getNumAtoms();
atomCodes->resize(n);
for(unsigned i=0; i < n; i++)
{
if(0==(CF_ATOM_ALL & flags)) // NO LABELS
{
(*atomCodes)[i] = 1;
continue;
}
const Atom* atom = mol.getAtomWithIdx(i);
(*atomCodes)[i] = 0;
if(0!=(CF_ELEMENT & flags))
(*atomCodes)[i] |= atom->getAtomicNum();
if(0!=(CF_CHARGE & flags))
(*atomCodes)[i] |= (atom->getFormalCharge()+8) << 8; // allowed range [-8, +8]
if(0!=(CF_VALENCE & flags))
(*atomCodes)[i] |= (atom->getExplicitValence()) << 13; //getTotalValence()
if(0!=(CF_ATOM_CHIRALITY & flags)){
char v=0;
if(atom->getChiralTag()==Atom::CHI_TETRAHEDRAL_CW ||
atom->getChiralTag()==Atom::CHI_TETRAHEDRAL_CCW){
if(atom->hasProp("_CIPCode")){
std::string code=atom->getProp<std::string>("_CIPCode");
if(code=="R") v=1;
else if(code=="S") v=2;
} else if(atom->hasProp("_ringStereoAtoms")){
const INT_VECT &ringStereoAtoms=atom->getProp<INT_VECT>("_ringStereoAtoms");
if(ringStereoAtoms.size()){
if(ringStereoAtoms[0]<0){
v=1;
} else {
v=2;
}
if(ringStereoAtoms.size()>1){
BOOST_LOG(rdWarningLog)<<"Warning: atom with more than 1 ring-stereo atoms found."<<std::endl;
}
}
}
} else {
v=atom->getChiralTag();
}
(*atomCodes)[i] |= v << 18; // 2 bits
}
if(0!=(CF_ATOM_AROMATIC & flags))
(*atomCodes)[i] |= (atom->getIsAromatic() ? 1 : 0) << 20; // 1 bit
//if(0!=( & flags))
// (*atomCodes)[i] |= (atom-()) << 21; // 3 bits reserved
if(0!=(CF_ISOTOPE & flags))
(*atomCodes)[i] |= (atom->getIsotope()) << 24;
}
}
if(bondCodes)
{
std::map<unsigned, bool> bondsInRing;
const RingInfo::VECT_INT_VECT& rings = mol.getRingInfo()->bondRings();
for(RingInfo::VECT_INT_VECT::const_iterator r = rings.begin(); r != rings.end(); r++)
for(INT_VECT::const_iterator b = r->begin(); b != r->end(); b++)
if (bondsInRing.end() == bondsInRing.find(*b))
bondsInRing[(unsigned)*b] = true;
unsigned n = mol.getNumBonds();
bondCodes->resize(n);
for(unsigned i=0; i < n; i++)
{
if(0==(CF_BOND_ALL & flags)) // NO LABELS
{
(*bondCodes)[i] = 1;
continue;
}
const Bond* bond = mol.getBondWithIdx(i);
(*bondCodes)[i] = 0;
if(0!=(CF_BOND_ORDER & flags))
{
unsigned order = bond->getBondType();
if(0==(CF_BOND_AROMATIZATION & flags)) // ignore aromatization
{
static const unsigned orderMatch [Bond::ZERO+1] =
{
Bond::UNSPECIFIED,
Bond::SINGLE,
Bond::DOUBLE,
Bond::TRIPLE,
Bond::QUADRUPLE,
Bond::QUINTUPLE,
Bond::HEXTUPLE,
Bond::SINGLE, //ONEANDAHALF,
Bond::DOUBLE, //TWOANDAHALF,
Bond::TRIPLE, //THREEANDAHALF,
Bond::QUADRUPLE, //FOURANDAHALF,
Bond::QUINTUPLE, //FIVEANDAHALF,
Bond::SINGLE, //AROMATIC,
Bond::IONIC,
Bond::HYDROGEN,
Bond::THREECENTER,
Bond::DATIVEONE,
Bond::DATIVE,
Bond::DATIVEL,
Bond::DATIVER,
Bond::OTHER,
Bond::ZERO
};
order = orderMatch[order];
}
(*bondCodes)[i] |= order;
}
if(0!=(CF_BOND_AROMATIZATION & flags))
(*bondCodes)[i] |= ((bond->getIsAromatic() ? 1 : 0)) << 8;
if(0!=(CF_BOND_CHIRALITY & flags))
{
(*bondCodes)[i] |= bond->getStereo() << 9;
}
if(0!=(CF_BOND_IN_RING & flags))
(*bondCodes)[i] |= (bondsInRing.end() != bondsInRing.find(bond->getIdx()) ? 1 : 0) << 11; // 1 bit
}
}
}
//=============================================================================
// Computes a single hash code for the selected part of `mol`.
//
//  atomsToUse / bondsToUse - optional atom / bond index filters defining the fragment
//  atomCodes  / bondCodes  - optional per-molecule atom / bond codes used as
//                            initial labels (label 1 is used when NULL)
//
// Returns 0 when the resulting fragment has no atoms or no bonds.
HashCodeType generateMoleculeHashCode(const ROMol &mol,
                                      const std::vector<unsigned> *atomsToUse,
                                      const std::vector<unsigned> *bondsToUse,
                                      const std::vector<boost::uint32_t> *atomCodes,
                                      const std::vector<boost::uint32_t> *bondCodes)
{
    MolFragment fragment;
    prepareMolFragment(fragment, mol, atomsToUse, bondsToUse);

    const bool hasAtoms = (fragment.getNumAtoms() != 0);
    const bool hasBonds = (fragment.getNumBonds() != 0);
    if(!(hasAtoms && hasBonds))
        return 0;

    std::vector<boost::uint32_t> perAtomLabels;
    std::vector<boost::uint32_t> perBondLabels;
    prepareLabels(perAtomLabels, perBondLabels, mol, fragment, atomCodes, bondCodes);
    return computeMorganCodeHash(fragment, perAtomLabels, perBondLabels);
}
void generateMoleculeHashSet(const ROMol &mol, HashSet& res,
const std::vector<unsigned> *atomsToUse,
const std::vector<unsigned> *bondsToUse)
{
res.Version = 100; // v. 1.0
res.Reserved= 0;
MolFragment m;
prepareMolFragment(m, mol, atomsToUse, bondsToUse);
res.NumAtoms = m.getNumAtoms();
res.NumBonds = m.getNumBonds();
if(0==m.getNumAtoms() || 0 == m.getNumBonds())
return;
std::string formula = RDKit::Descriptors::calcMolFormula(mol);
res.FormulaCRC32 = computeCRC32(formula.c_str(), formula.length());
boost::uint64_t flags = 0; // CodeFlags constants combination
std::vector<boost::uint32_t> atomCodes;
std::vector<boost::uint32_t> bondCodes;
std::vector<boost::uint32_t> atomLabels;
std::vector<boost::uint32_t> bondLabels;
// flags = CF_ATOM_ALL &(~(CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE));
flags = CF_ELEMENT | CF_CHARGE | CF_ATOM_AROMATIC; /// | CF_VALENCE
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.NonChiralAtomsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
flags = CF_BOND_ALL &(~CF_BOND_CHIRALITY);
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.NonChiralBondsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
flags = CF_ATOM_CHIRALITY | CF_ISOTOPE;
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.ChiralAtomsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
flags = CF_BOND_CHIRALITY;
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.ChiralBondsHash = computeMorganCodeHash (m, atomLabels, bondLabels);
flags = CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE;
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels (atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.ChiralityHash = computeMorganCodeHash (m, atomLabels, bondLabels);
}
//=============================================================================
// Returns the hash set of the selected part of `mol` as text.
//
// Layout: "<Version>-<NumAtoms>-<NumBonds>-" followed by the encode()d
// FormulaCRC32, NonChiralAtomsHash, NonChiralBondsHash, ChiralAtomsHash,
// ChiralBondsHash and ChiralityHash fields, separated by '-'.
// encode() is declared outside this file.
std::string generateMoleculeHashSet(const ROMol &mol,
                                    const std::vector<unsigned> *atomsToUse,
                                    const std::vector<unsigned> *bondsToUse)
{
    HashSet res;
    generateMoleculeHashSet(mol, res, atomsToUse, bondsToUse);

    std::string str = (boost::format("%u-%u-%u-")%(res.Version)%(res.NumAtoms)%(res.NumBonds)).str();
    str += encode(&res.FormulaCRC32, sizeof(res.FormulaCRC32));
    str += "-";
    str += encode(&res.NonChiralAtomsHash, sizeof(res.NonChiralAtomsHash));
    str += "-";
    str += encode(&res.NonChiralBondsHash, sizeof(res.NonChiralBondsHash));
    str += "-";
    str += encode(&res.ChiralAtomsHash, sizeof(res.ChiralAtomsHash));
    str += "-";
    str += encode(&res.ChiralBondsHash, sizeof(res.ChiralBondsHash));
    str += "-";
    str += encode(&res.ChiralityHash, sizeof(res.ChiralityHash));
    // Return the string itself: going through c_str() would build a second
    // string by scanning for a NUL terminator (extra copy, and truncation if
    // the text ever contained an embedded NUL).
    return str;
}
//=============================================================================
// INTERNAL FUNCTIONS:
//=============================================================================
// Computes a Morgan-style hash of a fragment.
//
//  mol        - the fragment (despite the name, not a whole molecule)
//  atomLabels - initial code of each fragment atom, indexed by FRAGMENT
//               position (same order as mol.AtomsIdx)
//  bondLabels - code of each fragment bond, indexed by FRAGMENT position
//               (same order as mol.BondsIdx)
//
// Every atom starts with its label; then, for min(5, number of bonds) rounds,
// each atom's code is mixed with the previous-round codes of its bonded
// neighbours and the bond label. The final per-atom codes are folded into one
// value. All arithmetic is unsigned and wraps around on overflow.
//
// Precondition: both end atoms of every fragment bond are present in
// mol.MolAtomIdxMap.
static
HashCodeType computeMorganCodeHash (const MolFragment& mol
                                  , const std::vector<boost::uint32_t>& atomLabels
                                  , const std::vector<boost::uint32_t>& bondLabels)
{
    const size_t nv = mol.getNumAtoms();
    const size_t ne = mol.getNumBonds();
    std::vector<HashCodeType> currCodes(nv);
    std::vector<HashCodeType> prevCodes(nv);
    size_t nIterations = ne;
    if (nIterations > 5)
        nIterations = 5;

    // The label vectors are indexed by fragment position, NOT by the atom's
    // index in the source molecule; indexing them through mol.AtomsIdx /
    // mol.BondsIdx would read the wrong entry (or past the end) for any
    // fragment that is not the whole molecule.
    for(size_t fragAtomIdx = 0; fragAtomIdx < nv; fragAtomIdx++)
        currCodes[fragAtomIdx] = atomLabels[fragAtomIdx];

    // Resolve each bond's end atoms to fragment positions once, instead of
    // repeating the map lookups in every round.
    std::vector<unsigned> bondBegin(ne);
    std::vector<unsigned> bondEnd(ne);
    for (size_t fragBondIdx = 0; fragBondIdx < ne; fragBondIdx++)
    {
        const Bond* bond = mol.Bonds[fragBondIdx];
        bondBegin[fragBondIdx] = mol.MolAtomIdxMap.find(bond->getBeginAtomIdx())->second;
        bondEnd  [fragBondIdx] = mol.MolAtomIdxMap.find(bond->getEndAtomIdx  ())->second;
    }

    for (size_t iter = 0; iter < nIterations; iter++)
    {
        prevCodes = currCodes; // neighbours contribute their previous-round codes
        for (size_t fragBondIdx = 0; fragBondIdx < ne; fragBondIdx++)
        {
            const unsigned order = bondLabels[fragBondIdx];
            const unsigned atom1 = bondBegin[fragBondIdx];
            const unsigned atom2 = bondEnd  [fragBondIdx];
            const boost::uint32_t v1 = prevCodes[atom1];
            const boost::uint32_t v2 = prevCodes[atom2];
            currCodes[atom1] += v2*v2 + (v2 + 23) * (order + 1721);
            currCodes[atom2] += v1*v1 + (v1 + 23) * (order + 1721);
        }
    }

    // Order-independent fold of the per-atom codes.
    HashCodeType result = 0;
    for(size_t fragAtomIdx = 0; fragAtomIdx < nv; fragAtomIdx++)
    {
        const HashCodeType code = currCodes[fragAtomIdx];
        result += code * (code + 6849) + 29;
    }
    return result;
}
//=============================================================================
// Fills `m` with the part of `mol` selected by the optional filters.
//
//  m          - output fragment; expected to be empty on entry
//  mol        - source molecule
//  atomsToUse - atom indices to keep; NULL or empty means "no atom filter"
//  bondsToUse - bond indices to keep; NULL or empty means "no bond filter"
//
// Selection rules:
//  * no filters         : all atoms and all bonds of the molecule;
//  * atom filter given  : the listed atoms plus every bond whose two end
//                         atoms are both listed (and, if a bond filter is
//                         also given, that is listed there as well);
//  * only a bond filter : the listed bonds plus all of their end atoms.
static
void prepareMolFragment(MolFragment& m,
                        const ROMol &mol,
                        const std::vector<unsigned> *atomsToUse,
                        const std::vector<unsigned> *bondsToUse)
{
    // Treat an empty selection list the same as no list at all.
    if(0!=atomsToUse && atomsToUse->empty())
        atomsToUse = 0;
    if(0!=bondsToUse && bondsToUse->empty())
        bondsToUse = 0;

    if(0==atomsToUse && 0==bondsToUse) // whole molecule
    {
        unsigned n = mol.getNumAtoms();
        m.AtomsIdx.resize(n);
        for(unsigned i = 0; i < n; i++)
            m.AtomsIdx[i] = i;
        n = mol.getNumBonds();
        m.BondsIdx.resize(n);
        for(unsigned i = 0; i < n; i++)
            m.BondsIdx[i] = i;
    }
    else if(0!=atomsToUse) // selected atoms only and all/selected bonds between them
    {
        std::map<unsigned, bool> addedBonds; // bonds already put into the fragment
        const unsigned n = atomsToUse->size();
        m.AtomsIdx.resize(n);
        for(unsigned i = 0; i < n; i++) // add all selected atoms at first
            m.AtomsIdx[i] = (*atomsToUse)[i];
        for(unsigned i = 0; i < n; i++) // add bonds between all selected atoms
        {
            ROMol::OEDGE_ITER beg,end;
            for(boost::tie(beg,end) = mol.getAtomBonds(mol.getAtomWithIdx(m.AtomsIdx[i])); beg!=end; beg++)
            {
                const Bond* bond = & *((mol)[*beg]);
                const unsigned bondIdx = bond->getIdx();
                if(addedBonds.end() != addedBonds.find(bondIdx))
                    continue; // the bond has been already added (seen from its other end)
                if(0!=bondsToUse
                  && bondsToUse->end() == std::find(bondsToUse->begin(), bondsToUse->end(), bondIdx))
                    continue; // skip unselected bond
                // keep the bond only if both of its end atoms pass the atom filter
                const bool beginSelected = atomsToUse->end()
                    != std::find(atomsToUse->begin(), atomsToUse->end(), bond->getBeginAtomIdx());
                const bool endSelected   = atomsToUse->end()
                    != std::find(atomsToUse->begin(), atomsToUse->end(), bond->getEndAtomIdx());
                if(!(beginSelected && endSelected))
                    continue;
                addedBonds[bondIdx] = true;
                m.BondsIdx.push_back(bondIdx);
            }
        }
    }
    else if(0!=bondsToUse) // note that 0==atomsToUse in this case
    {
        std::map<unsigned, bool> addedAtoms; // atoms already put into the fragment
        const unsigned n = bondsToUse->size();
        m.BondsIdx.resize(n);
        for(unsigned i = 0; i < n; i++)
        {
            // Look the bond up through the selection list: using the loop
            // counter directly would pick the first n bonds of the molecule
            // instead of the requested ones.
            const Bond* bond = mol.getBondWithIdx((*bondsToUse)[i]);
            m.BondsIdx[i] = bond->getIdx();
            unsigned endAtoms[2];
            endAtoms[0] = bond->getBeginAtomIdx();
            endAtoms[1] = bond->getEndAtomIdx  ();
            for(unsigned ai = 0; ai < 2; ai++) // both end atoms of the bond
            {
                if(addedAtoms.end() == addedAtoms.find(endAtoms[ai]))
                {
                    addedAtoms[endAtoms[ai]] = true;
                    m.AtomsIdx.push_back(endAtoms[ai]); // the atom has NOT been already added
                }
            }
        }
    }

    // Resolve the collected indices to atom/bond pointers and build the
    // source-index -> fragment-position map for atoms.
    unsigned n = m.getNumAtoms();
    m.Atoms.resize(n);
    for(unsigned i = 0; i < n; i++)
    {
        m.Atoms[i] = mol.getAtomWithIdx(m.AtomsIdx[i]);
        m.MolAtomIdxMap[m.AtomsIdx[i]] = i;
    }
    n = m.getNumBonds();
    m.Bonds.resize(n);
    for(unsigned i = 0; i < n; i++)
    {
        m.Bonds[i] = mol.getBondWithIdx(m.BondsIdx[i]);
    }
}
//=============================================================================
// Produces the initial labels for a fragment's atoms and bonds.
//
//  atomLabels - output, one entry per fragment atom (fragment order)
//  bondLabels - output, one entry per fragment bond (fragment order)
//  mol        - unused
//  m          - the fragment
//  atomCodes  - per-molecule atom codes, looked up via m.AtomsIdx; when NULL
//               every atom label is 1
//  bondCodes  - per-molecule bond codes, looked up via m.BondsIdx; when NULL
//               every bond label is 1
static
void prepareLabels(std::vector<boost::uint32_t>& atomLabels, std::vector<boost::uint32_t>& bondLabels
                 , const ROMol& mol, const MolFragment& m
                 , const std::vector<boost::uint32_t> *atomCodes
                 , const std::vector<boost::uint32_t> *bondCodes)
{
    RDUNUSED_PARAM(mol);

    const unsigned atomCount = m.getNumAtoms();
    if(atomCodes)
    {
        atomLabels.resize(atomCount);
        for(unsigned pos = 0; pos < atomCount; ++pos)
            atomLabels[pos] = (*atomCodes)[m.AtomsIdx[pos]];
    }
    else
    {
        atomLabels.assign(atomCount, 1);
    }

    const unsigned bondCount = m.getNumBonds();
    if(bondCodes)
    {
        bondLabels.resize(bondCount);
        for(unsigned pos = 0; pos < bondCount; ++pos)
            bondLabels[pos] = (*bondCodes)[m.BondsIdx[pos]];
    }
    else
    {
        bondLabels.assign(bondCount, 1);
    }
}
//=============================================================================
}}