// $Id$ // // Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #define PY_ARRAY_UNIQUE_SYMBOL rdmetric_array_API #include #include #include #include #include #include #include #include #include #include using namespace RDDataManip; void wrap_MMcalc(); namespace python = boost::python; namespace RDDataManip { template class PyObjectManager : public boost::noncopyable { public: PyObjectManager() = default; PyObjectManager(T *obj) : d_obj(obj) {} ~PyObjectManager() { Py_XDECREF((PyObject *)d_obj); } PyObjectManager &operator=(T *obj) { Py_XDECREF((PyObject *)d_obj); d_obj = obj; return *this; } T *get() { return d_obj; } T *release() { return std::exchange(d_obj, nullptr); } private: T *d_obj = nullptr; }; PyObject *getEuclideanDistMat(python::object descripMat) { // Bit of a pain involved here, we accept three types of PyObjects here // 1. A Numeric Array // - first find what 'type' of entry we have (float, double and int is all // we recognize for now) // - then point to contiguous piece of memory from the array that contains // the data with a type* // - then make a new type** pointer so that double index into this // contiguous memory will work // and then pass it along to the distance calculator // 2. A list of Numeric Vector (or 1D arrays) // - in this case wrap descripMat with a PySequenceHolder where // type is the // type of entry in vector (accepted types are int, double and float // - Then pass the PySequenceHolder to the metric calculator // 3. A list (or tuple) of lists (or tuple) // - In this case other than wrapping descripMat with a PySequenceHolder // each of the individual list in there are also wrapped by a // PySequenceHolder // - so the distance calculator is passed in a // "PySequenceHolder>" // - FIX: not that we always convert entry values to double here, even if // we passed // in a list of list of ints (or floats). Given that lists can be // heterogeneous, I do not // know how to ask a list what type of entries if contains. // // OK my brain is going to explode now // first deal with situation where we have an Numeric Array PyObject *descMatObj = descripMat.ptr(); PyObjectManager distRes; if (PyArray_Check(descMatObj)) { // get the dimensions of the array int nrows = PyArray_DIM((PyArrayObject *)descMatObj, 0); int ncols = PyArray_DIM((PyArrayObject *)descMatObj, 1); int i; CHECK_INVARIANT((nrows > 0) && (ncols > 0), ""); npy_intp dMatLen = nrows * (nrows - 1) / 2; // now that we have the dimensions declare the distance matrix which is // always a // 1D double array distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE); // grab a pointer to the data in the array so that we can directly put // values in there // and avoid copying : auto *dMat = (double *)PyArray_DATA(distRes.get()); PyObjectManager copy( (PyArrayObject *)PyArray_ContiguousFromObject( descMatObj, PyArray_DESCR((PyArrayObject *)descMatObj)->type_num, 2, 2)); // if we have double array if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_DOUBLE) { auto *desc = (double *)PyArray_DATA((PyArrayObject *)descMatObj); // REVIEW: create an adaptor object to hold a double * and support // operator[]() so that we don't have to do this stuff: // here is the 2D array trick this so that when the distance calaculator // asks for desc2D[i] we basically get the ith row as double* std::unique_ptr desc2D(new double *[nrows]); for (i = 0; i < nrows; i++) { desc2D[i] = desc; desc += ncols; } MetricMatrixCalc mmCalc; mmCalc.setMetricFunc(&EuclideanDistanceMetric); mmCalc.calcMetricMatrix(desc2D.get(), nrows, ncols, dMat); // we got the distance matrix we are happy so return return PyArray_Return(distRes.release()); } // if we have a float array else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_FLOAT) { auto *desc = (float *)PyArray_DATA(copy.get()); std::unique_ptr desc2D(new float *[nrows]); for (i = 0; i < nrows; i++) { desc2D[i] = desc; desc += ncols; } MetricMatrixCalc mmCalc; mmCalc.setMetricFunc(&EuclideanDistanceMetric); mmCalc.calcMetricMatrix(desc2D.get(), nrows, ncols, dMat); return PyArray_Return(distRes.release()); } // if we have an integer array else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_INT) { int *desc = (int *)PyArray_DATA(copy.get()); std::unique_ptr desc2D(new int *[nrows]); for (i = 0; i < nrows; i++) { desc2D[i] = desc; desc += ncols; } MetricMatrixCalc mmCalc; mmCalc.setMetricFunc(&EuclideanDistanceMetric); mmCalc.calcMetricMatrix(desc2D.get(), nrows, ncols, dMat); return PyArray_Return(distRes.release()); } else { // unrecognized type for the matrix, throw up throw_value_error( "The array has to be of type int, float, or double for " "GetEuclideanDistMat"); } } // done with an array input else { // REVIEW: removed a ton of code here // we have probably have a list or a tuple unsigned int ncols = 0; unsigned int nrows = boost::python::len(descripMat); CHECK_INVARIANT(nrows > 0, "Empty list passed in"); npy_intp dMatLen = nrows * (nrows - 1) / 2; distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE); auto *dMat = (double *)PyArray_DATA(distRes.get()); // assume that we a have a list of list of values (that can be extracted to // double) std::vector> dData; dData.reserve(nrows); for (unsigned int i = 0; i < nrows; i++) { // PySequenceHolder row(seq[i]); PySequenceHolder row(descripMat[i]); if (i == 0) { ncols = row.size(); } else if (row.size() != ncols) { throw_value_error("All subsequences must be the same length"); } dData.push_back(row); } MetricMatrixCalc>, PySequenceHolder> mmCalc; mmCalc.setMetricFunc(&EuclideanDistanceMetric, PySequenceHolder>); mmCalc.calcMetricMatrix(dData, nrows, ncols, dMat); } return PyArray_Return(distRes.release()); } PyObject *getTanimotoDistMat(python::object bitVectList) { // we will assume here that we have a either a list of ExplicitBitVectors or // SparseBitVects unsigned int nrows = boost::python::len(bitVectList); CHECK_INVARIANT(nrows > 1, ""); // First check what type of vector we have python::object v1 = bitVectList[0]; python::extract ebvWorks(v1); python::extract sbvWorks(v1); if (!ebvWorks.check() && !sbvWorks.check()) { throw_value_error( "GetTanimotoDistMat can only take a sequence of ExplicitBitVects or " "SparseBitvects"); } npy_intp dMatLen = nrows * (nrows - 1) / 2; auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE); auto *sMat = (double *)PyArray_DATA(simRes); if (ebvWorks.check()) { PySequenceHolder dData(bitVectList); MetricMatrixCalc, ExplicitBitVect> mmCalc; mmCalc.setMetricFunc( &TanimotoDistanceMetric); mmCalc.calcMetricMatrix(dData, nrows, 0, sMat); } else if (sbvWorks.check()) { PySequenceHolder dData(bitVectList); MetricMatrixCalc, SparseBitVect> mmCalc; mmCalc.setMetricFunc(&TanimotoDistanceMetric); mmCalc.calcMetricMatrix(dData, nrows, 0, sMat); } return PyArray_Return(simRes); } PyObject *getTanimotoSimMat(python::object bitVectList) { // we will assume here that we have a either a list of ExplicitBitVectors or // SparseBitVects unsigned int nrows = boost::python::len(bitVectList); CHECK_INVARIANT(nrows > 1, ""); // First check what type of vector we have python::object v1 = bitVectList[0]; python::extract ebvWorks(v1); python::extract sbvWorks(v1); if (!ebvWorks.check() && !sbvWorks.check()) { throw_value_error( "GetTanimotoDistMat can only take a sequence of ExplicitBitVects or " "SparseBitvects"); } npy_intp dMatLen = nrows * (nrows - 1) / 2; auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE); auto *sMat = (double *)PyArray_DATA(simRes); if (ebvWorks.check()) { PySequenceHolder dData(bitVectList); MetricMatrixCalc, ExplicitBitVect> mmCalc; mmCalc.setMetricFunc( &TanimotoSimilarityMetric); mmCalc.calcMetricMatrix(dData, nrows, 0, sMat); } else if (sbvWorks.check()) { PySequenceHolder dData(bitVectList); MetricMatrixCalc, SparseBitVect> mmCalc; mmCalc.setMetricFunc( &TanimotoSimilarityMetric); mmCalc.calcMetricMatrix(dData, nrows, 0, sMat); } return PyArray_Return(simRes); } } // namespace RDDataManip BOOST_PYTHON_MODULE(rdMetricMatrixCalc) { python::scope().attr("__doc__") = "Module containing the calculator for metric matrix calculation, \n" "e.g. similarity and distance matrices"; rdkit_import_array(); std::string docString; docString = "Compute the distance matrix from a descriptor matrix using the Euclidean distance metric\n\n\ ARGUMENTS: \n\ \n\ descripMat - A python object of any one of the following types \n\ 1. A numeric array of dimensions n by m where n is the number of items in the data set \n\ and m is the number of descriptors \n\ 2. A list of Numeric Vectors (or 1D arrays), each entry in the list corresponds \n\ to descriptor vector for one item \n\ 3. A list (or tuple) of lists (or tuples) of values, where the values can be extracted to \n\ double. \n\n\ RETURNS: \n\ A numeric one-dimensional array containing the lower triangle elements of the symmetric distance matrix\n\n"; python::def("GetEuclideanDistMat", RDDataManip::getEuclideanDistMat, docString.c_str(), python::args("descripMat")); docString = "Compute the distance matrix from a list of BitVects using the Tanimoto distance metric\n\n\ ARGUMENTS: \n\ \n\ bitVectList - a list of bit vectors. Currently this works only for a list of explicit bit vectors, \n\ needs to be expanded to support a list of SparseBitVects\n\n\ RETURNS: \n\ A numeric 1 dimensional array containing the lower triangle elements of the\n\ symmetric distance matrix\n\n"; python::def("GetTanimotoDistMat", RDDataManip::getTanimotoDistMat, docString.c_str(), python::args("bitVectList")); docString = "Compute the similarity matrix from a list of BitVects \n\n\ ARGUMENTS: \n\ \n\ bitVectList - a list of bit vectors. Currently this works only for a list of explicit bit vectors, \n\ needs to be expanded to support a list of SparseBitVects\n\n\ RETURNS: \n\ A numeric 1 dimensional array containing the lower triangle elements of the symmetric similarity matrix\n\n"; python::def("GetTanimotoSimMat", RDDataManip::getTanimotoSimMat, docString.c_str(), python::args("bitVectList")); }