diff --git a/Code/GraphMol/Descriptors/USRDescriptor.cpp b/Code/GraphMol/Descriptors/USRDescriptor.cpp index 9529edf5e..c2318a6e7 100644 --- a/Code/GraphMol/Descriptors/USRDescriptor.cpp +++ b/Code/GraphMol/Descriptors/USRDescriptor.cpp @@ -11,11 +11,15 @@ #include #include -#include -#include +#include +#include +#include #include #include "USRDescriptor.h" +#include +#include +#include namespace RDKit{ @@ -72,28 +76,86 @@ namespace RDKit{ void calcMoments(const std::vector &dist, std::vector &descriptor, int idx) { - PRECONDITION(!dist.empty(),"no distances"); std::vector moments (3, 0.0); unsigned int numPts = dist.size(); - // 1. moment: mean - for (unsigned int i = 0; i < numPts; ++i) { - moments[0] += dist[i]; + if (numPts > 0) { + // 1. moment: mean + for (unsigned int i = 0; i < numPts; ++i) { + moments[0] += dist[i]; + } + moments[0] /= numPts; + // 2. moment: standard deviation + // 3. moment: cubic root of skewness + for (unsigned int i = 0; i < numPts; ++i) { + double diff = dist[i] - moments[0]; + moments[1] += diff * diff; + moments[2] += diff * diff * diff; + } + moments[1] = sqrt(moments[1] / numPts); + moments[2] /= numPts; + moments[2] = cbrt(moments[2] / (moments[1] * moments[1] * moments[1])); } - moments[0] /= numPts; - // 2. moment: standard deviation - // 3. moment: cubic root of skewness - for (unsigned int i = 0; i < numPts; ++i) { - double diff = dist[i] - moments[0]; - moments[1] += diff * diff; - moments[2] += diff * diff * diff; - } - moments[1] = sqrt(moments[1] / numPts); - moments[2] /= numPts; - moments[2] = cbrt(moments[2] / (moments[1] * moments[1] * moments[1])); // add moments to descriptor std::copy(moments.begin(), moments.end(), descriptor.begin()+idx); } + class ss_matcher { + public: + ss_matcher() {}; + ss_matcher(const std::string &pattern){ + RDKit::RWMol *p = RDKit::SmartsToMol(pattern); + TEST_ASSERT(p); + m_matcher.reset(p); + }; + const RDKit::ROMol *getMatcher() const { return m_matcher.get(); }; + private: + RDKit::ROMOL_SPTR m_matcher; + }; + + /* + // Definitions for feature points adapted from: + // Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998) + const char *smartsPatterns[4] = { + "[$([C;H2,H1](!=*)[C;H2,H1][C;H2,H1][$([C;H1,H2,H3]);!$(C=*)]),\ +$(C([C;H2,H3])([C;H2,H3])[C;H2,H3])]", // Hydrophobic + "[a]", //Aromatic + "[$([N;!H0;v3,v4&+1]),$([O,S;H1;+0]),n&H1&+0]", // Donor + "[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),\ +$([N;v3;!$(N-*=[O,N,P,S])]),n&H0&+0,$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]" // Acceptor + };*/ + // Definitions for feature points from + // http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd2?at=default + const char *smartsPatterns[4] = { + "[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]", // hydrophobic + "[a]", // aromatic + "[$([O,S;H1;v2]-[!$(*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),\ +$([N&v3;H1,H2]-[!$(*=[O,N,P,S])]),$([N;v3;H0]),$([n,o,s;+0]),F]", // acceptor + "[N!H0v3,N!H0+v4,OH+0,SH+0,nH+0]" // donor + }; + std::vector featureSmarts(smartsPatterns,smartsPatterns+4); + typedef boost::flyweight,boost::flyweights::no_tracking > pattern_flyweight; + + void getAtomIdsForFeatures(const ROMol &mol, std::vector > &atomIds) { + unsigned int numFeatures = featureSmarts.size(); + PRECONDITION(atomIds.size() == numFeatures, "atomIds must have 4 elements"); + std::vector featureMatchers; + featureMatchers.reserve(numFeatures); + BOOST_FOREACH(std::string feature, featureSmarts) { + const ROMol *matcher = pattern_flyweight(feature).get().getMatcher(); + featureMatchers.push_back(matcher); + } + for (unsigned int i = 0; i < numFeatures; ++i) { + std::vector matchVect; + // to maintain thread safety, we have to copy the pattern molecules: + SubstructMatch(mol,ROMol(*featureMatchers[i],true),matchVect); + BOOST_FOREACH(MatchVectType mv, matchVect) { + for (MatchVectType::const_iterator mIt = mv.begin(); mIt != mv.end(); ++mIt){ + atomIds[i].push_back(mIt->second); + } + } + } // end loop over features + } + } // end namespace namespace Descriptors { @@ -124,6 +186,59 @@ namespace RDKit{ calcUSRFromDistributions(dist, descriptor); } + void USRCAT(const ROMol &mol, std::vector &descriptor, + std::vector > &atomIds, int confId) { + unsigned int na = mol.getNumAtoms(); + // check that number of atoms > 3 + if (na < 3) { + throw ValueErrorException("Number of atoms must be greater than 3"); + } + // check that minimum a conformer exists + if (mol.getNumConformers() == 0) { + throw ConformerException("No conformations available on this molecule"); + } + + // get atom selections + unsigned int numClasses = atomIds.size(); + if (numClasses > 0) { // user provided atom selections + PRECONDITION(descriptor.size() == (numClasses+1)*12, "descriptor must have (numClasses+1)*12 elements"); + } else { // use feature definitions of FeatureMorgan fingerprint + numClasses = 4; + PRECONDITION(descriptor.size() == 60, "descriptor must have 60 elements"); + atomIds.resize(numClasses); + getAtomIdsForFeatures(mol, atomIds); + } + + const Conformer &conf = mol.getConformer(confId); + RDGeom::Point3DConstPtrVect coords(na); + // loop over atoms + for (unsigned int ai = 0; ai < na; ++ai) { + coords[ai] = &conf.getAtomPos(ai); + } + // the original USR + std::vector > dist(4); + std::vector points(4); + calcUSRDistributions(coords, dist, points); + std::vector tmpDescriptor(12); + calcUSRFromDistributions(dist, tmpDescriptor); + std::copy(tmpDescriptor.begin(), tmpDescriptor.end(), descriptor.begin()); + + // loop over the atom selections + unsigned int featIdx = 12; + BOOST_FOREACH(std::vector atoms, atomIds) { + // reduce the coordinates to the atoms of interest + RDGeom::Point3DConstPtrVect reducedCoords(atoms.size()); + unsigned int i = 0; + BOOST_FOREACH(unsigned int idx, atoms) { + reducedCoords[i++] = coords[idx]; + } + calcUSRDistributionsFromPoints(reducedCoords, points, dist); + calcUSRFromDistributions(dist, tmpDescriptor); + std::copy(tmpDescriptor.begin(), tmpDescriptor.end(), descriptor.begin()+featIdx); + featIdx += 12; + } + } + void calcUSRDistributions(const RDGeom::Point3DConstPtrVect &coords, std::vector > &dist, std::vector &points) { @@ -154,20 +269,28 @@ namespace RDKit{ void calcUSRFromDistributions(const std::vector > &dist, std::vector &descriptor) { + PRECONDITION(descriptor.size() == 3*dist.size(), "descriptor must have 3 times more elements than dist"); for (unsigned int i = 0; i < dist.size(); ++i) { calcMoments(dist[i], descriptor, 3*i); } } - double calcUSRScore(const std::vector &d1, const std::vector &d2) { + double calcUSRScore(const std::vector &d1, const std::vector &d2, + const std::vector &weights) { + unsigned int num = 12; // length of each subset PRECONDITION(d1.size() == d2.size(), "descriptors must have the same size"); - double score = 0.0; - unsigned int num = d1.size(); - for (unsigned int i = 0; i < num; ++i) { - score += fabs(d1[i] - d2[i]); + PRECONDITION(weights.size() == (d1.size()/num), "size of weights not correct"); + double score = 1.0; + for (unsigned int w; w < weights.size(); ++w) { + double tmpScore = 0.0; + unsigned int offset = num*w; + for (unsigned int i = 0; i < num; ++i) { + tmpScore += fabs(d1[i+offset] - d2[i+offset]); + } + tmpScore /= num; + score += weights[w]*tmpScore; } - score /= num; - return 1.0 / (1.0 + score); + return 1.0 / score; } } // end of namespace Descriptors diff --git a/Code/GraphMol/Descriptors/USRDescriptor.h b/Code/GraphMol/Descriptors/USRDescriptor.h index 513e141c5..0f4702146 100644 --- a/Code/GraphMol/Descriptors/USRDescriptor.h +++ b/Code/GraphMol/Descriptors/USRDescriptor.h @@ -24,13 +24,13 @@ namespace RDKit{ class Conformer; namespace Descriptors { /*! - Calculates the ultra-fast shape recognition (USR) + Calculates the ultra-fast shape recognition (USR) descriptor Reference: P. J. Ballester, W. G. Richards, JCC (2007), 28, 1711 - 1723. Derived from RDKit Python implementation of Jan Domanski who derived his code from Adrian Schreyer's code: - http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd25370e7ef93301d0e28d49a0851c2/usrcat/geometry.py?at=default + http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd2?at=default \param mol the molecule of interest \param descriptor storage for the computed USR descriptor @@ -39,6 +39,21 @@ namespace RDKit{ */ void USR(const ROMol &mol, std::vector &descriptor, int confId = -1); + /*! + Calculates the ultra-fast shape recognition with CREDO atom types (USRCAT) descriptor + + Reference: A. M. Schreyer, T. Blundell, J. Cheminf. (2012), 4, 27. + + Derived from Python implementation Adrian Schreyer: + http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd2?at=default + + \param mol the molecule of interest + \param descriptor storage for the computed USR descriptor + \param confId the conformer Id + + */ + void USRCAT(const ROMol &mol, std::vector &descriptor, + std::vector > &atomIds, int confId = -1); /*! Calculates the four distance distributions for the USR descriptor @@ -74,14 +89,16 @@ namespace RDKit{ std::vector &descriptor); /*! - Calculates the score between two USR descriptors + Calculates the score between two USRCAT descriptors with weights - \param d1 descriptor 1 - \param d2 descriptor 2 + \param d1 descriptor 1 + \param d2 descriptor 2 + \param weights the weights for each subset of moments - \return the score - */ - double calcUSRScore(const std::vector &d1, const std::vector &d2); + \return the score + */ + double calcUSRScore(const std::vector &d1, const std::vector &d2, + const std::vector &weights); } // end of namespace Descriptors } //end of namespace RDKit diff --git a/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp b/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp index 37bd5bc3a..d855520ad 100644 --- a/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp +++ b/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp @@ -544,18 +544,67 @@ namespace { return pyDescr; } - double GetUSRScore(python::object descriptor1, python::object descriptor2) { + double GetUSRScore(python::object descriptor1, python::object descriptor2, + python::object weights) { unsigned int numElements = python::extract(descriptor1.attr("__len__")()); if (numElements != python::extract(descriptor2.attr("__len__")()) ) { throw_value_error("descriptors must have the same length"); } + unsigned int numWeights = numElements / 12; + unsigned int numPyWeights = python::extract(weights.attr("__len__")()); + std::vector w(numWeights, 1.0); // default weights: all to 1.0 + if ((numPyWeights > 0) && (numPyWeights != numWeights)) { + throw_value_error("number of weights is not correct"); + } else if (numPyWeights == numWeights) { + for (unsigned int i = 0; i < numWeights; ++i) { + w[i] = python::extract(weights[i]); + } + } std::vector d1(numElements); std::vector d2(numElements); for (unsigned int i = 0; i < numElements; ++i) { d1[i] = python::extract(descriptor1[i]); d2[i] = python::extract(descriptor2[i]); } - return RDKit::Descriptors::calcUSRScore(d1, d2); + return RDKit::Descriptors::calcUSRScore(d1, d2, w); + } + + python::list GetUSRCAT(const RDKit::ROMol &mol, python::object atomSelections, int confId) { + if (mol.getNumConformers() == 0) { + throw_value_error("no conformers"); + } + if (mol.getNumAtoms() < 3) { + throw_value_error("too few atoms (minimum three)"); + } + + // check if there is an atom selection provided + std::vector > atomIds; + unsigned int sizeDescriptor = 60; + if (atomSelections != python::object()) { + // make sure the optional argument actually was a list + python::list typecheck = python::extract(atomSelections); + unsigned int numSel = python::extract(atomSelections.attr("__len__")()); + if (numSel == 0) { + throw_value_error("empty atom selections"); + } + atomIds.resize(numSel); + for (unsigned int i = 0; i < numSel; ++i) { + unsigned int numPts = python::extract(atomSelections[i].attr("__len__")()); + std::vector tmpIds(numPts); + for (unsigned int j = 0; j < numPts; ++j) { + tmpIds[j] = python::extract(atomSelections[i][j]) - 1; + } + atomIds[i] = tmpIds; + } + sizeDescriptor = 12 * (numSel+1); + } + std::vector descriptor(sizeDescriptor); + RDKit::Descriptors::USRCAT(mol, descriptor, atomIds, confId); + python::list pyDescr; + BOOST_FOREACH(double d, descriptor) { + pyDescr.append(d); + } + return pyDescr; } python::list CalcSlogPVSA(const RDKit::ROMol &mol, @@ -817,10 +866,17 @@ BOOST_PYTHON_MODULE(rdMolDescriptors) { python::def("GetUSRFromDistributions", GetUSRFromDistributions, (python::arg("distances")), docString.c_str()); - docString="Returns the USR score for two USR descriptors"; + docString="Returns the USR score for two USR or USRCAT descriptors"; python::def("GetUSRScore", GetUSRScore, (python::arg("descriptor1"), - python::arg("descriptor2")), + python::arg("descriptor2"), + python::arg("weights")=python::list()), + docString.c_str()); + docString="Returns a USRCAT descriptor for one conformer of a molecule"; + python::def("GetUSRCAT", GetUSRCAT, + (python::arg("mol"), + python::arg("atomSelections")=python::object(), + python::arg("confId")=-1), docString.c_str()); docString="returns (as a list of 2-tuples) the contributions of each atom to\n" diff --git a/Code/GraphMol/Descriptors/Wrap/testMolDescriptors.py b/Code/GraphMol/Descriptors/Wrap/testMolDescriptors.py index 5bdc1e57f..e4e0f2657 100644 --- a/Code/GraphMol/Descriptors/Wrap/testMolDescriptors.py +++ b/Code/GraphMol/Descriptors/Wrap/testMolDescriptors.py @@ -245,6 +245,23 @@ class TestCase(unittest.TestCase) : self.failUnlessEqual(rdMD.GetUSRScore(usr, usr2), 1.0) + def testUSRCAT(self): + mol = Chem.MolFromSmiles("CC") + AllChem.Compute2DCoords(mol) + self.failUnlessRaises(ValueError, lambda : rdMD.GetUSRCAT(mol)) + mol = Chem.MolFromSmiles("C1CCCCC1") + mol = Chem.AddHs(mol) + self.failUnlessRaises(ValueError, lambda : rdMD.GetUSRCAT(mol)) + AllChem.Compute2DCoords(mol) + usr = rdMD.GetUSRCAT(mol) + self.failUnlessEqual(len(usr), 60) + self.failUnlessRaises(ValueError, lambda : rdMD.GetUSRCAT(mol, atomSelections=[])) + atoms = [[1, 2, 3, 4, 5, 6], [], [], []] + usr2 = rdMD.GetUSRCAT(mol, atomSelections=atoms) + self.failUnlessEqual(len(usr), 60) + self.failUnlessEqual(rdMD.GetUSRScore(usr, usr2, weights=[1.0, 1.0, 1.0, 1.0, 1.0]), 1.0) + + def testMolWt(self): mol = Chem.MolFromSmiles("C"); amw = rdMD._CalcMolWt(mol); diff --git a/Code/GraphMol/Descriptors/test.cpp b/Code/GraphMol/Descriptors/test.cpp index eb988b875..edc34623a 100644 --- a/Code/GraphMol/Descriptors/test.cpp +++ b/Code/GraphMol/Descriptors/test.cpp @@ -1638,7 +1638,7 @@ void testUSRDescriptor(){ 3.38248245, 1.59816952, -0.72933115, 3.38248245, 1.59816952,-0.72933115}; std::vector refUSR (refValues, refValues + sizeof(refValues) / sizeof(double) ); std::string rdbase = getenv("RDBASE"); - std::string fname1 = rdbase + "/Code/GraphMol/Descriptors/test_data/benzene.mol"; + std::string fname1 = rdbase + "/Code/GraphMol/Descriptors/test_data/cyclohexane.mol"; mol = MolFileToMol(fname1, true, false, true); std::vector myUSR(12); USR(*mol, myUSR); @@ -1657,7 +1657,70 @@ void testUSRScore(){ double m2[12]={4.39, 3.11, 1.36, 4.50, 4.44, 0.09, 8.34, 16.78, -23.20, 7.15, 16.52, 0.13}; std::vector d1 (m1, m1 + sizeof(m1) / sizeof(double) ); std::vector d2 (m2, m2 + sizeof(m2) / sizeof(double) ); - TEST_ASSERT(feq(calcUSRScore(d1, d2), 0.812, 0.001)); + std::vector weights(1, 1.0); + TEST_ASSERT(feq(calcUSRScore(d1, d2, weights), 0.812, 0.001)); + + BOOST_LOG(rdErrorLog) << " done" << std::endl; +} + +void testUSRCATDescriptor(){ + BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl; + BOOST_LOG(rdErrorLog) << " Test USRCAT Descriptor" << std::endl; + std::vector descriptor(12); + + // no conformers + ROMol *mol = SmilesToMol("C1CCCCC1"); + bool ok = false; + try { + USR(*mol, descriptor); + } catch (ConformerException &e) { + ok = true; + } + TEST_ASSERT(ok); + + // number of atoms < 3 + mol = SmilesToMol("CC"); + ok = false; + try { + USR(*mol, descriptor); + } catch (ValueErrorException &e) { + ok = true; + } + TEST_ASSERT(ok); + + // DESCRIPTOR + // comparing to results produced by Adrian Schreyer's code + // http://hg.adrianschreyer.eu/usrcat/src/70e075d93cd2?at=default + double refValues[60] = {2.37938524, 0.62181927, -0.89089872, 2.63773456, 1.1577952, -0.6937349, + 3.38248245, 1.59816952, -0.72933115, 3.38248245, 1.59816952,-0.72933115, + 1.50000000, 0.00000000, -0.77827171, 1.86602540, 1.00893468,-0.89059233, + 3.02358914, 1.02718486, -0.64081261, 3.02358914, 1.02718486,-0.64081261, + 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000}; + + std::vector refUSR (refValues, refValues + sizeof(refValues) / sizeof(double) ); + std::string rdbase = getenv("RDBASE"); + std::string fname1 = rdbase + "/Code/GraphMol/Descriptors/test_data/cyclohexane.mol"; + mol = MolFileToMol(fname1, true, false, true); + std::vector > atomIds; + std::vector myUSR(12); + USRCAT(*mol, myUSR, atomIds); + for (unsigned int i = 0; i < myUSR.size(); ++i) { + TEST_ASSERT(feq(myUSR[i], refUSR[i])); + } + + atomIds.resize(4); + unsigned int h[6] = {0, 1, 2, 3, 4, 5}; + std::vector hydrophobic(h, h + sizeof(h) / sizeof(unsigned int)); + atomIds[0] = hydrophobic; + USRCAT(*mol, myUSR, atomIds); + for (unsigned int i = 0; i < myUSR.size(); ++i) { + TEST_ASSERT(feq(myUSR[i], refUSR[i])); + } BOOST_LOG(rdErrorLog) << " done" << std::endl; } diff --git a/Code/GraphMol/Descriptors/test_data/benzene.mol b/Code/GraphMol/Descriptors/test_data/cyclohexane.mol similarity index 98% rename from Code/GraphMol/Descriptors/test_data/benzene.mol rename to Code/GraphMol/Descriptors/test_data/cyclohexane.mol index a0fa793fb..ef49c56b0 100644 --- a/Code/GraphMol/Descriptors/test_data/benzene.mol +++ b/Code/GraphMol/Descriptors/test_data/cyclohexane.mol @@ -1,4 +1,4 @@ -benzene +cyclohexane RDKit 2D 18 18 0 0 0 0 0 0 0 0999 V2000 @@ -39,6 +39,6 @@ benzene 6 17 1 0 6 18 1 0 M END -benzene +cyclohexane $$$$