Files
rdkit/Code/GraphMol/MolHash/MolHash.cpp
2015-11-14 14:58:11 +01:00

426 lines
16 KiB
C++

// $Id$
//
// Copyright (C) 2014 Novartis Institutes for BioMedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <stdio.h>
#include <stdlib.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/format.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <boost/crc.hpp>
#include <boost/cstdint.hpp>
#include "../Descriptors/MolDescriptors.h"
#include "MolHash.h"
namespace RDKit {
namespace MolHash {
// Describes a fragment (subset of atoms and bonds) of a source molecule.
// The Atom/Bond pointers are obtained from the source molecule; this struct
// never deletes them.
struct MolFragment {
  // Pointers to the fragment's atoms and bonds, parallel to AtomsIdx/BondsIdx.
  std::vector<const Atom *> Atoms;
  std::vector<const Bond *> Bonds;
  // Indices (in the full molecule) of the fragment's atoms and bonds.
  std::vector<boost::uint32_t> AtomsIdx;
  std::vector<boost::uint32_t> BondsIdx;
  // Backward conversion map: atom index in the full molecule -> position of
  // that atom inside this fragment.
  std::map<boost::uint32_t, boost::uint32_t> MolAtomIdxMap;

  // Number of atoms in the fragment (size of AtomsIdx).
  boost::uint32_t getNumAtoms() const {
    return static_cast<boost::uint32_t>(AtomsIdx.size());
  }
  // Number of bonds in the fragment (size of BondsIdx).
  boost::uint32_t getNumBonds() const {
    return static_cast<boost::uint32_t>(BondsIdx.size());
  }
};
// INTERNAL FUNCTIONS:
// Forward declarations of the file-local (static) helpers; they are defined
// further down in this file, after the public API.
//
// Computes a hash code for the fragment `mol` from the given per-atom and
// per-bond labels.
static HashCodeType computeMorganCodeHash(
const MolFragment &mol, const std::vector<boost::uint32_t> &atomLabels,
const std::vector<boost::uint32_t> &bondLabels);
// Fills `m` with the atoms and bonds of `mol` selected by atomsToUse and
// bondsToUse; a NULL or empty selection vector means "no filter".
static void prepareMolFragment(MolFragment &m, const ROMol &mol,
const std::vector<unsigned> *atomsToUse,
const std::vector<unsigned> *bondsToUse);
// Fills atomLabels/bondLabels with one entry per atom/bond of fragment `m`,
// taken from atomCodes/bondCodes, or set to 1 when the codes pointer is NULL.
static void prepareLabels(std::vector<boost::uint32_t> &atomLabels,
std::vector<boost::uint32_t> &bondLabels,
const ROMol &mol, const MolFragment &m,
const std::vector<boost::uint32_t> *atomCodes,
const std::vector<boost::uint32_t> *bondCodes);
// Returns the checksum produced by boost::crc_32_type over the `size` bytes
// starting at `data`.
static boost::uint32_t computeCRC32(const void *data, size_t size) {
  boost::crc_32_type accumulator;
  accumulator.process_bytes(data, size);
  const boost::uint32_t checksum = accumulator.checksum();
  return checksum;
}
//=============================================================================
// MolHash Module API implementation:
//=============================================================================
void fillAtomBondCodes(
const ROMol &mol, boost::uint64_t flags // CodeFlags constants combination
,
std::vector<boost::uint32_t> *atomCodes // NULL is allowed
,
std::vector<boost::uint32_t> *bondCodes) // NULL is allowed
{
if (atomCodes) {
unsigned n = mol.getNumAtoms();
atomCodes->resize(n);
for (unsigned i = 0; i < n; i++) {
if (0 == (CF_ATOM_ALL & flags)) // NO LABELS
{
(*atomCodes)[i] = 1;
continue;
}
const Atom *atom = mol.getAtomWithIdx(i);
(*atomCodes)[i] = 0;
if (0 != (CF_ELEMENT & flags)) (*atomCodes)[i] |= atom->getAtomicNum();
if (0 != (CF_CHARGE & flags))
(*atomCodes)[i] |= (atom->getFormalCharge() + 8)
<< 8; // allowed range [-8, +8]
if (0 != (CF_VALENCE & flags))
(*atomCodes)[i] |= (atom->getExplicitValence())
<< 13; // getTotalValence()
if (0 != (CF_ATOM_CHIRALITY & flags)) {
char v = 0;
if (atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW ||
atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW) {
if (atom->hasProp("_CIPCode")) {
std::string code = atom->getProp<std::string>("_CIPCode");
if (code == "R")
v = 1;
else if (code == "S")
v = 2;
} else if (atom->hasProp("_ringStereoAtoms")) {
const INT_VECT &ringStereoAtoms =
atom->getProp<INT_VECT>("_ringStereoAtoms");
if (ringStereoAtoms.size()) {
if (ringStereoAtoms[0] < 0) {
v = 1;
} else {
v = 2;
}
if (ringStereoAtoms.size() > 1) {
BOOST_LOG(rdWarningLog)
<< "Warning: atom with more than 1 ring-stereo atoms found."
<< std::endl;
}
}
}
} else {
v = atom->getChiralTag();
}
(*atomCodes)[i] |= v << 18; // 2 bits
}
if (0 != (CF_ATOM_AROMATIC & flags))
(*atomCodes)[i] |= (atom->getIsAromatic() ? 1 : 0) << 20; // 1 bit
// if(0!=( & flags))
// (*atomCodes)[i] |= (atom-()) << 21; // 3 bits reserved
if (0 != (CF_ISOTOPE & flags))
(*atomCodes)[i] |= (atom->getIsotope()) << 24;
}
}
if (bondCodes) {
std::map<unsigned, bool> bondsInRing;
const RingInfo::VECT_INT_VECT &rings = mol.getRingInfo()->bondRings();
for (RingInfo::VECT_INT_VECT::const_iterator r = rings.begin();
r != rings.end(); r++)
for (INT_VECT::const_iterator b = r->begin(); b != r->end(); b++)
if (bondsInRing.end() == bondsInRing.find(*b))
bondsInRing[(unsigned)*b] = true;
unsigned n = mol.getNumBonds();
bondCodes->resize(n);
for (unsigned i = 0; i < n; i++) {
if (0 == (CF_BOND_ALL & flags)) // NO LABELS
{
(*bondCodes)[i] = 1;
continue;
}
const Bond *bond = mol.getBondWithIdx(i);
(*bondCodes)[i] = 0;
if (0 != (CF_BOND_ORDER & flags)) {
unsigned order = bond->getBondType();
if (0 == (CF_BOND_AROMATIZATION & flags)) // ignore aromatization
{
static const unsigned orderMatch[Bond::ZERO + 1] = {
Bond::UNSPECIFIED, Bond::SINGLE, Bond::DOUBLE,
Bond::TRIPLE, Bond::QUADRUPLE, Bond::QUINTUPLE,
Bond::HEXTUPLE,
Bond::SINGLE, // ONEANDAHALF,
Bond::DOUBLE, // TWOANDAHALF,
Bond::TRIPLE, // THREEANDAHALF,
Bond::QUADRUPLE, // FOURANDAHALF,
Bond::QUINTUPLE, // FIVEANDAHALF,
Bond::SINGLE, // AROMATIC,
Bond::IONIC, Bond::HYDROGEN, Bond::THREECENTER,
Bond::DATIVEONE, Bond::DATIVE, Bond::DATIVEL,
Bond::DATIVER, Bond::OTHER, Bond::ZERO};
order = orderMatch[order];
}
(*bondCodes)[i] |= order;
}
if (0 != (CF_BOND_AROMATIZATION & flags))
(*bondCodes)[i] |= ((bond->getIsAromatic() ? 1 : 0)) << 8;
if (0 != (CF_BOND_CHIRALITY & flags)) {
(*bondCodes)[i] |= bond->getStereo() << 9;
}
if (0 != (CF_BOND_IN_RING & flags))
(*bondCodes)[i] |=
(bondsInRing.end() != bondsInRing.find(bond->getIdx()) ? 1 : 0)
<< 11; // 1 bit
}
}
}
//=============================================================================
// Computes a single hash code for `mol`, or for the part of it selected by
// atomsToUse / bondsToUse.
//
//   atomsToUse, bondsToUse - optional selections, forwarded to
//                            prepareMolFragment
//   atomCodes, bondCodes   - optional per-atom / per-bond codes, forwarded to
//                            prepareLabels
//
// Returns 0 when the selected fragment has no atoms or no bonds.
HashCodeType generateMoleculeHashCode(
    const ROMol &mol, const std::vector<unsigned> *atomsToUse,
    const std::vector<unsigned> *bondsToUse,
    const std::vector<boost::uint32_t> *atomCodes,
    const std::vector<boost::uint32_t> *bondCodes) {
  MolFragment fragment;
  prepareMolFragment(fragment, mol, atomsToUse, bondsToUse);

  const bool hasAtoms = fragment.getNumAtoms() != 0;
  const bool hasBonds = fragment.getNumBonds() != 0;
  if (!(hasAtoms && hasBonds)) {
    return 0;
  }

  std::vector<boost::uint32_t> labelsOfAtoms;
  std::vector<boost::uint32_t> labelsOfBonds;
  prepareLabels(labelsOfAtoms, labelsOfBonds, mol, fragment, atomCodes,
                bondCodes);
  return computeMorganCodeHash(fragment, labelsOfAtoms, labelsOfBonds);
}
void generateMoleculeHashSet(const ROMol &mol, HashSet &res,
const std::vector<unsigned> *atomsToUse,
const std::vector<unsigned> *bondsToUse) {
res.Version = 100; // v. 1.0
res.Reserved = 0;
MolFragment m;
prepareMolFragment(m, mol, atomsToUse, bondsToUse);
res.NumAtoms = m.getNumAtoms();
res.NumBonds = m.getNumBonds();
if (0 == m.getNumAtoms() || 0 == m.getNumBonds()) return;
std::string formula = RDKit::Descriptors::calcMolFormula(mol);
res.FormulaCRC32 = computeCRC32(formula.c_str(), formula.length());
boost::uint64_t flags = 0; // CodeFlags constants combination
std::vector<boost::uint32_t> atomCodes;
std::vector<boost::uint32_t> bondCodes;
std::vector<boost::uint32_t> atomLabels;
std::vector<boost::uint32_t> bondLabels;
// flags = CF_ATOM_ALL &(~(CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY |
// CF_ISOTOPE));
flags = CF_ELEMENT | CF_CHARGE | CF_ATOM_AROMATIC; /// | CF_VALENCE
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.NonChiralAtomsHash = computeMorganCodeHash(m, atomLabels, bondLabels);
flags = CF_BOND_ALL & (~CF_BOND_CHIRALITY);
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.NonChiralBondsHash = computeMorganCodeHash(m, atomLabels, bondLabels);
flags = CF_ATOM_CHIRALITY | CF_ISOTOPE;
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.ChiralAtomsHash = computeMorganCodeHash(m, atomLabels, bondLabels);
flags = CF_BOND_CHIRALITY;
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.ChiralBondsHash = computeMorganCodeHash(m, atomLabels, bondLabels);
flags = CF_BOND_CHIRALITY | CF_ATOM_CHIRALITY | CF_ISOTOPE;
fillAtomBondCodes(mol, flags, &atomCodes, &bondCodes);
prepareLabels(atomLabels, bondLabels, mol, m, &atomCodes, &bondCodes);
res.ChiralityHash = computeMorganCodeHash(m, atomLabels, bondLabels);
}
//=============================================================================
// Returns the hash set of `mol` (or of the fragment selected by atomsToUse /
// bondsToUse) as a single string of the form
//   <Version>-<NumAtoms>-<NumBonds>-<FormulaCRC32>-<NonChiralAtomsHash>-
//   <NonChiralBondsHash>-<ChiralAtomsHash>-<ChiralBondsHash>-<ChiralityHash>
// where the first three fields are printed as unsigned decimals and the
// remaining fields are passed through encode().
std::string generateMoleculeHashSet(const ROMol &mol,
                                    const std::vector<unsigned> *atomsToUse,
                                    const std::vector<unsigned> *bondsToUse) {
  HashSet res;
  generateMoleculeHashSet(mol, res, atomsToUse, bondsToUse);

  std::string str = (boost::format("%u-%u-%u-") % (res.Version) %
                     (res.NumAtoms) % (res.NumBonds)).str();
  str += encode(&res.FormulaCRC32, sizeof(res.FormulaCRC32));
  str += "-";
  str += encode(&res.NonChiralAtomsHash, sizeof(res.NonChiralAtomsHash));
  str += "-";
  str += encode(&res.NonChiralBondsHash, sizeof(res.NonChiralBondsHash));
  str += "-";
  str += encode(&res.ChiralAtomsHash, sizeof(res.ChiralAtomsHash));
  str += "-";
  str += encode(&res.ChiralBondsHash, sizeof(res.ChiralBondsHash));
  str += "-";
  str += encode(&res.ChiralityHash, sizeof(res.ChiralityHash));
  // Return the string itself: going through c_str() would rebuild it from a
  // NUL-terminated buffer (extra scan + copy, truncation at an embedded NUL).
  return str;
}
//=============================================================================
// INTERNAL FUNCTIONS:
//=============================================================================
// Computes a hash code for the fragment `mol`.
//
//   mol        - fragment to hash
//   atomLabels - initial code of each fragment atom, indexed by the atom's
//                position inside the fragment (the layout prepareLabels
//                produces); must hold at least mol.getNumAtoms() entries
//   bondLabels - label of each fragment bond, indexed by the bond's position
//                inside the fragment; must hold at least mol.getNumBonds()
//                entries
//
// Every atom starts with its label. In each round (min(number of bonds, 5)
// rounds in total) every bond adds a value derived from the previous-round
// code of one end atom and the bond label to the code of the other end atom.
// The result combines the final atom codes with a commutative sum, so it does
// not depend on atom order.
static HashCodeType computeMorganCodeHash(
    const MolFragment &mol, const std::vector<boost::uint32_t> &atomLabels,
    const std::vector<boost::uint32_t> &bondLabels) {
  const size_t nv = mol.getNumAtoms();
  const size_t ne = mol.getNumBonds();
  std::vector<HashCodeType> currCodes(nv);
  std::vector<HashCodeType> prevCodes(nv);
  const size_t nIterations = ne > 5 ? 5 : ne;

  // Labels are stored per fragment position, so they must be read with the
  // fragment position and not with the atom index in the full molecule.
  for (size_t fragAtomIdx = 0; fragAtomIdx < nv; fragAtomIdx++)
    currCodes[fragAtomIdx] = atomLabels[fragAtomIdx];

  for (size_t iter = 0; iter < nIterations; iter++) {
    prevCodes = currCodes;
    for (size_t fragBondIdx = 0; fragBondIdx < ne; fragBondIdx++) {
      const Bond *bond = mol.Bonds[fragBondIdx];
      const unsigned order = bondLabels[fragBondIdx];
      // Translate the bond's end atoms from molecule indices to fragment
      // positions. prepareMolFragment only keeps bonds whose end atoms are
      // both part of the fragment, so the lookups are expected to succeed.
      const unsigned atom1 =
          mol.MolAtomIdxMap.find(bond->getBeginAtomIdx())->second;
      const unsigned atom2 =
          mol.MolAtomIdxMap.find(bond->getEndAtomIdx())->second;
      const boost::uint32_t v1 = prevCodes[atom1];
      const boost::uint32_t v2 = prevCodes[atom2];
      currCodes[atom1] += v2 * v2 + (v2 + 23) * (order + 1721);
      currCodes[atom2] += v1 * v1 + (v1 + 23) * (order + 1721);
    }
  }

  HashCodeType result = 0;
  for (size_t fragAtomIdx = 0; fragAtomIdx < nv; fragAtomIdx++) {
    const HashCodeType code = currCodes[fragAtomIdx];
    result += code * (code + 6849) + 29;
  }
  return result;
}
//=============================================================================
// Fills `m` with the part of `mol` selected by atomsToUse / bondsToUse.
//
//   m          - fragment to fill; expected to be empty on entry
//   mol        - source molecule
//   atomsToUse - atom indices to keep; NULL or empty means "no atom filter"
//   bondsToUse - bond indices to keep; NULL or empty means "no bond filter"
//
// Selection rules:
//   * no filter at all: all atoms and all bonds of `mol`;
//   * atom filter (with or without bond filter): the listed atoms, plus every
//     bond whose two end atoms are both listed and which also passes the bond
//     filter when one is given;
//   * bond filter only: the listed bonds, plus all of their end atoms.
// Finally Atoms, Bonds and MolAtomIdxMap are derived from AtomsIdx/BondsIdx.
static void prepareMolFragment(MolFragment &m, const ROMol &mol,
                               const std::vector<unsigned> *atomsToUse,
                               const std::vector<unsigned> *bondsToUse) {
  // An empty selection is treated exactly like a missing one.
  if (0 != atomsToUse && atomsToUse->empty()) atomsToUse = 0;
  if (0 != bondsToUse && bondsToUse->empty()) bondsToUse = 0;

  if (0 == atomsToUse && 0 == bondsToUse) {
    // whole molecule
    unsigned n = mol.getNumAtoms();
    m.AtomsIdx.resize(n);
    for (unsigned i = 0; i < n; i++) m.AtomsIdx[i] = i;
    n = mol.getNumBonds();
    m.BondsIdx.resize(n);
    for (unsigned i = 0; i < n; i++) m.BondsIdx[i] = i;
  } else if (0 != atomsToUse) {
    // selected atoms only and all/selected bonds between them
    std::map<unsigned, unsigned> addedBonds;
    const unsigned n = atomsToUse->size();
    m.AtomsIdx.assign(atomsToUse->begin(), atomsToUse->end());
    for (unsigned i = 0; i < n; i++) {
      ROMol::OEDGE_ITER beg, end;
      for (boost::tie(beg, end) =
               mol.getAtomBonds(mol.getAtomWithIdx(m.AtomsIdx[i]));
           beg != end; ++beg) {
        const Bond *bond = &*((mol)[*beg]);
        const unsigned bondIdx = bond->getIdx();
        if (addedBonds.end() != addedBonds.find(bondIdx))
          continue;  // the bond has been already added
        if (0 != bondsToUse &&
            bondsToUse->end() ==
                std::find(bondsToUse->begin(), bondsToUse->end(), bondIdx))
          continue;  // skip unselected bond
        // keep the bond only if both of its end atoms are selected
        if (atomsToUse->end() == std::find(atomsToUse->begin(),
                                           atomsToUse->end(),
                                           bond->getBeginAtomIdx()) ||
            atomsToUse->end() == std::find(atomsToUse->begin(),
                                           atomsToUse->end(),
                                           bond->getEndAtomIdx()))
          continue;
        addedBonds[bondIdx] = m.BondsIdx.size();
        m.BondsIdx.push_back(bondIdx);
      }
    }
  } else if (0 != bondsToUse) {
    // note that 0==atomsToUse in this case
    std::map<unsigned, unsigned> addedAtoms;
    const unsigned n = bondsToUse->size();
    m.BondsIdx.resize(n);
    for (unsigned i = 0; i < n; i++) {
      // Look the bond up by the selected index, not by the loop counter;
      // otherwise bonds 0..n-1 would be used regardless of the selection.
      const Bond *bond = mol.getBondWithIdx((*bondsToUse)[i]);
      m.BondsIdx[i] = bond->getIdx();
      unsigned endAtoms[2];
      endAtoms[0] = bond->getBeginAtomIdx();
      endAtoms[1] = bond->getEndAtomIdx();
      for (unsigned ai = 0; ai < 2; ai++) {  // both end atoms of the bond
        if (addedAtoms.end() == addedAtoms.find(endAtoms[ai])) {
          // the atom has NOT been already added
          const unsigned position = m.AtomsIdx.size();
          addedAtoms[endAtoms[ai]] = position;
          m.AtomsIdx.push_back(endAtoms[ai]);
        }
      }
    }
  }

  // Resolve indices to Atom/Bond pointers and build the reverse atom map.
  unsigned n = m.getNumAtoms();
  m.Atoms.resize(n);
  for (unsigned i = 0; i < n; i++) {
    m.Atoms[i] = mol.getAtomWithIdx(m.AtomsIdx[i]);
    m.MolAtomIdxMap[m.AtomsIdx[i]] = i;
  }
  n = m.getNumBonds();
  m.Bonds.resize(n);
  for (unsigned i = 0; i < n; i++) {
    m.Bonds[i] = mol.getBondWithIdx(m.BondsIdx[i]);
  }
}
//=============================================================================
// Builds the label vectors for fragment `m`.
//
//   atomLabels - output; resized to m.getNumAtoms(). Entry i is the code of
//                the i-th fragment atom, read from atomCodes at that atom's
//                index in the full molecule, or 1 when atomCodes is NULL
//   bondLabels - output; same scheme for the fragment's bonds and bondCodes
//   mol        - unused
static void prepareLabels(std::vector<boost::uint32_t> &atomLabels,
                          std::vector<boost::uint32_t> &bondLabels,
                          const ROMol &mol, const MolFragment &m,
                          const std::vector<boost::uint32_t> *atomCodes,
                          const std::vector<boost::uint32_t> *bondCodes) {
  RDUNUSED_PARAM(mol);

  const unsigned atomCount = m.getNumAtoms();
  if (atomCodes) {
    atomLabels.resize(atomCount);
    for (unsigned pos = 0; pos != atomCount; ++pos) {
      atomLabels[pos] = (*atomCodes)[m.AtomsIdx[pos]];
    }
  } else {
    atomLabels.assign(atomCount, 1);
  }

  const unsigned bondCount = m.getNumBonds();
  if (bondCodes) {
    bondLabels.resize(bondCount);
    for (unsigned pos = 0; pos != bondCount; ++pos) {
      bondLabels[pos] = (*bondCodes)[m.BondsIdx[pos]];
    }
  } else {
    bondLabels.assign(bondCount, 1);
  }
}
//=============================================================================
}
}