diff --git a/Code/GraphMol/Descriptors/CMakeLists.txt b/Code/GraphMol/Descriptors/CMakeLists.txt index 4ca1e1426..035492e9c 100644 --- a/Code/GraphMol/Descriptors/CMakeLists.txt +++ b/Code/GraphMol/Descriptors/CMakeLists.txt @@ -1,6 +1,6 @@ rdkit_library(Descriptors Crippen.cpp MolDescriptors.cpp MolSurf.cpp Lipinski.cpp ConnectivityDescriptors.cpp - MQN.cpp + MQN.cpp USRDescriptor.cpp LINK_LIBRARIES PartialCharges SmilesParse FileParsers Subgraphs SubstructMatch ${RDKit_THREAD_LIBS}) @@ -8,6 +8,7 @@ rdkit_headers(Crippen.h Lipinski.h MolDescriptors.h MolSurf.h ConnectivityDescriptors.h MQN.h + USRDescriptor.h DEST GraphMol/Descriptors) rdkit_test(testDescriptors test.cpp diff --git a/Code/GraphMol/Descriptors/USRDescriptor.cpp b/Code/GraphMol/Descriptors/USRDescriptor.cpp new file mode 100644 index 000000000..749f05fb2 --- /dev/null +++ b/Code/GraphMol/Descriptors/USRDescriptor.cpp @@ -0,0 +1,149 @@ +// +// Copyright (C) 2011-2013 Greg Landrum +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + + +#include +#include +#include +#include +#include +#include "USRDescriptor.h" + + +namespace RDKit{ + + namespace { + void calcDistances(const RDGeom::Point3DConstPtrVect &coords, + const RDGeom::Point3D &point, + RDNumeric::Vector &distances) { + PRECONDITION(distances.size() == coords.size(), "distances and coords must be the same size"); + RDGeom::Point3D tmpPt; + unsigned int i = 0; + // loop over coordinates + BOOST_FOREACH(const RDGeom::Point3D *tpp, coords) { + distances[i++] = (*tpp-point).length(); + } + } + + void calcCentroid(const RDGeom::Point3DConstPtrVect &coords, + RDGeom::Point3D &pt) { + PRECONDITION(coords.size() != 0,"no coordinates"); + // set pt to zero + pt *= 0.0; + // loop over coordinates + BOOST_FOREACH(const RDGeom::Point3D *opt, coords) { + pt += *opt; + } + pt /= coords.size(); + } + + void calcMoments(const RDNumeric::Vector &dist, + RDNumeric::Vector &moments) { + PRECONDITION(moments.size() == 3, "moments must have 3 elements"); + PRECONDITION(dist.size() != 0,"no distances"); + // set all elements to zero + moments *= 0.0; + unsigned int numPts = dist.size(); + // 1. moment: mean + for (unsigned int i = 0; i < numPts; ++i) { + moments[0] += dist[i]; + } + moments[0] /= numPts; + // 2. moment: standard deviation + // 3. moment: cubic root of skewness + for (unsigned int i = 0; i < numPts; ++i) { + double diff = dist[i] - moments[0]; + moments[1] += diff * diff; + moments[2] += diff * diff * diff; + } + moments[1] = sqrt(moments[1] / numPts); + moments[2] /= numPts; + moments[2] = cbrt(moments[2] / (moments[1] * moments[1] * moments[1])); + } + + void addMoments(std::vector &d, + const RDNumeric::Vector &m, + int idx) { + for (unsigned int i = 0; i < m.size(); ++i) { + d[i + idx] = m[i]; + } + } + + } // end namespace + + namespace Descriptors { + + void USR(const ROMol &mol, std::vector &descriptor, int confId) { + PRECONDITION(descriptor.size() == 12, "descriptor must have 12 elements"); + unsigned int na = mol.getNumAtoms(); + // check that number of atoms > 3 + if (na < 3) { + throw ValueErrorException("Number of atoms must be greater than 3"); + } + // check that minimum a conformer exists + if (mol.getNumConformers() == 0) { + throw ConformerException("No conformations available on this molecule"); + } + + const Conformer &conf = mol.getConformer(confId); + RDGeom::Point3DConstPtrVect coords(na); + // loop over atoms + for (unsigned int ai = 0; ai < na; ++ai) { + coords[ai] = &conf.getAtomPos(ai); + } + calcUSRForPoints(coords, descriptor); + } + + void calcUSRForPoints(const RDGeom::Point3DConstPtrVect &coords, + std::vector &descriptor) { + PRECONDITION(descriptor.size() == 12, "descriptor must have 12 elements"); + RDGeom::Point3D pt; + RDGeom::Point3D pt2; + RDNumeric::Vector dist(coords.size()); + RDNumeric::Vector moments(3, 0.0); + + // ctd = centroid + calcCentroid(coords, pt); + calcDistances(coords, pt, dist); + calcMoments(dist, moments); + addMoments(descriptor, moments, 0); + + // catc = closest atom to centroid + pt = (*coords[dist.smallestValId()]); // catc + pt2 = (*coords[dist.largestValId()]); // fatc + calcDistances(coords, pt, dist); + calcMoments(dist, moments); + addMoments(descriptor, moments, 3); + + // fatc = farthest atom to centroid + calcDistances(coords, pt2, dist); + calcMoments(dist, moments); + addMoments(descriptor, moments, 6); + + // fatf = farthest atom to fatc + pt = (*coords[dist.largestValId()]); + calcDistances(coords, pt, dist); + calcMoments(dist, moments); + addMoments(descriptor, moments, 9); + } + + double calcUSRScore(const std::vector &d1, const std::vector &d2) { + PRECONDITION(d1.size() == d2.size(), "descriptors must have the same size"); + double score = 0.0; + unsigned int num = d1.size(); + for (unsigned int i = 0; i < num; ++i) { + score += fabs(d1[i] - d2[i]); + } + score /= num; + return 1.0 / (1.0 + score); + } + + } // end of namespace Descriptors +} //end of namespace RDKit diff --git a/Code/GraphMol/Descriptors/USRDescriptor.h b/Code/GraphMol/Descriptors/USRDescriptor.h new file mode 100644 index 000000000..6e801cd0c --- /dev/null +++ b/Code/GraphMol/Descriptors/USRDescriptor.h @@ -0,0 +1,64 @@ +// +// Copyright (C) 2011-2013 Greg Landrum +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +/*! \file USRDescriptor.h + + \brief Contains the USR descriptor. Use MolDescriptors.h in client code. + +*/ +#ifndef __RD_USR_H__ +#define __RD_USR_H__ + +#include +#include + +namespace RDKit{ + class ROMol; + class Conformer; + namespace Descriptors { + /*! + Calculates the ultra-fast shape recognition (USR) + + Reference: P. J. Ballester, W. G. Richards, JCC (2007), 28, 1711 - 1723. + + Derived from RDKit Python implementation of Jan Domanski + who derived his code from Adrian Schreyer's code: + http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd25370e7ef93301d0e28d49a0851c2/usrcat/geometry.py?at=default + + \param mol the molecule of interest + \param descriptor storage for the computed USR descriptor + \param confId the conformer Id + + */ + void USR(const ROMol &mol, std::vector &descriptor, int confId = -1); + + /*! + Calculates the USR descriptor for a single conformer + + \param coords the 3D coordinates of the atoms of a conformer + \param descriptor storage for the computed USR descriptor + + */ + void calcUSRForPoints(const RDGeom::Point3DConstPtrVect &coords, std::vector &descriptor); + + /*! + Calculates the score between two USR descriptors + + \param d1 descriptor 1 + \param d2 descriptor 2 + + \return the score + */ + double calcUSRScore(const std::vector &d1, const std::vector &d2); + + } // end of namespace Descriptors +} //end of namespace RDKit + +#endif diff --git a/Code/GraphMol/Descriptors/test.cpp b/Code/GraphMol/Descriptors/test.cpp index ff57fc2ad..52a59788d 100644 --- a/Code/GraphMol/Descriptors/test.cpp +++ b/Code/GraphMol/Descriptors/test.cpp @@ -29,7 +29,7 @@ #include #include - +#include #include #include @@ -1606,6 +1606,57 @@ void testMQNs(){ BOOST_LOG(rdErrorLog) << " done" << std::endl; } +void testUSRDescriptor(){ + BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl; + BOOST_LOG(rdErrorLog) << " Test USR Descriptor" << std::endl; + std::vector descriptor(12); + + // no conformers + ROMol *mol = SmilesToMol("C1CCCCC1"); + bool ok = false; + try { + USR(*mol, descriptor); + } catch (ConformerException &e) { + ok = true; + } + TEST_ASSERT(ok); + + // number of atoms < 3 + mol = SmilesToMol("CC"); + ok = false; + try { + USR(*mol, descriptor); + } catch (ValueErrorException &e) { + ok = true; + } + TEST_ASSERT(ok); + + // DESCRIPTOR + // comparing to results produced by Adrian Schreyer's code + // http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd25370e7ef93301d0e28d49a0851c2/usrcat/geometry.py?at=default + double refValues[12] = {2.37938524, 0.62181927, -0.89089872, 2.63773456, 1.1577952, -0.6937349, + 3.38248245, 1.59816952, -0.72933115, 3.38248245, 1.59816952,-0.72933115}; + std::vector refUSR (refValues, refValues + sizeof(refValues) / sizeof(double) ); + std::string rdbase = getenv("RDBASE"); + std::string fname1 = rdbase + "/Code/GraphMol/Descriptors/test_data/benzene.mol"; + mol = MolFileToMol(fname1, true, false, true); + std::vector myUSR(12); + USR(*mol, myUSR); + for (unsigned int i = 0; i < myUSR.size(); ++i) { + TEST_ASSERT(feq(myUSR[i], refUSR[i])); + } + + // SCORE + // descriptors and reference score from JCC (2007), 28, 1711-1723. + double m1[12]={4.44, 2.98, 1.04, 4.55, 4.70, 0.23, 8.30, 16.69, -22.97, 7.37, 15.64, 0.51}; + double m2[12]={4.39, 3.11, 1.36, 4.50, 4.44, 0.09, 8.34, 16.78, -23.20, 7.15, 16.52, 0.13}; + std::vector d1 (m1, m1 + sizeof(m1) / sizeof(double) ); + std::vector d2 (m2, m2 + sizeof(m2) / sizeof(double) ); + TEST_ASSERT(feq(calcUSRScore(d1, d2), 0.812, 0.001)); + + BOOST_LOG(rdErrorLog) << " done" << std::endl; +} + void testGitHubIssue56(){ BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl; BOOST_LOG(rdErrorLog) << " Test GitHub Issue 56." << std::endl; @@ -1685,6 +1736,7 @@ int main(){ testRingDescriptors(); testMiscCountDescriptors(); testMQNs(); + testUSRDescriptor(); #endif testGitHubIssue56(); diff --git a/Code/Numerics/Vector.h b/Code/Numerics/Vector.h index 3346360d7..63be7602f 100644 --- a/Code/Numerics/Vector.h +++ b/Code/Numerics/Vector.h @@ -235,6 +235,20 @@ namespace RDNumeric { return id; } + //! \brief Gets the ID of the entry that has the smallest value + inline unsigned int smallestValId() const { + TYPE res = (TYPE)(1.e8); + unsigned int i, id=d_size; + TYPE *data = d_data.get(); + for (i = 0; i < d_size; i++) { + if (data[i] < res) { + res = data[i]; + id = i; + } + } + return id; + } + //! returns the dot product between two Vectors inline TYPE dotProduct(const Vector other) { PRECONDITION(d_size == other.size(), "Size mismatch in vector doct product");