Files
rdkit/Code/DataManip/MetricMatrixCalc/Wrap/rdMetricMatrixCalc.cpp
Ricardo Rodriguez a4b63d7df5 Minor refactor of the python wrappers (#8847)
* refactor python wrappers

* fix FilterHierarchyMatcher converted already registered warning
2025-10-05 09:42:31 +02:00

322 lines
12 KiB
C++

// $Id$
//
// Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#define PY_ARRAY_UNIQUE_SYMBOL rdmetric_array_API
#include <RDBoost/python.h>
#include <RDBoost/boost_numpy.h>
#include <RDBoost/PySequenceHolder.h>
#include <RDBoost/Wrap.h>
#include <RDBoost/import_array.h>
#include <RDGeneral/types.h>
#include <DataManip/MetricMatrixCalc/MetricMatrixCalc.h>
#include <DataManip/MetricMatrixCalc/MetricFuncs.h>
#include <DataStructs/BitVects.h>
#include <string>
using namespace RDDataManip;
void wrap_MMcalc();
namespace python = boost::python;
namespace RDDataManip {
template <typename T>
class PyObjectManager : public boost::noncopyable {
public:
PyObjectManager() = default;
PyObjectManager(T *obj) : d_obj(obj) {}
~PyObjectManager() { Py_XDECREF((PyObject *)d_obj); }
PyObjectManager &operator=(T *obj) {
Py_XDECREF((PyObject *)d_obj);
d_obj = obj;
return *this;
}
T *get() { return d_obj; }
T *release() { return std::exchange(d_obj, nullptr); }
private:
T *d_obj = nullptr;
};
PyObject *getEuclideanDistMat(python::object descripMat) {
// Bit of a pain involved here, we accept three types of PyObjects here
// 1. A Numeric Array
// - first find what 'type' of entry we have (float, double and int is all
// we recognize for now)
// - then point to contiguous piece of memory from the array that contains
// the data with a type*
// - then make a new type** pointer so that double index into this
// contiguous memory will work
// and then pass it along to the distance calculator
// 2. A list of Numeric Vector (or 1D arrays)
// - in this case wrap descripMat with a PySequenceHolder<type*> where
// type is the
// type of entry in vector (accepted types are int, double and float
// - Then pass the PySequenceHolder to the metric calculator
// 3. A list (or tuple) of lists (or tuple)
// - In this case other than wrapping descripMat with a PySequenceHolder
// each of the individual list in there are also wrapped by a
// PySequenceHolder
// - so the distance calculator is passed in a
// "PySequenceHolder<PySequenceHolder<double>>"
// - FIX: not that we always convert entry values to double here, even if
// we passed
// in a list of list of ints (or floats). Given that lists can be
// heterogeneous, I do not
// know how to ask a list what type of entries if contains.
//
// OK my brain is going to explode now
// first deal with situation where we have an Numeric Array
PyObject *descMatObj = descripMat.ptr();
PyObjectManager<PyArrayObject> distRes;
if (PyArray_Check(descMatObj)) {
// get the dimensions of the array
int nrows = PyArray_DIM((PyArrayObject *)descMatObj, 0);
int ncols = PyArray_DIM((PyArrayObject *)descMatObj, 1);
int i;
CHECK_INVARIANT((nrows > 0) && (ncols > 0), "");
npy_intp dMatLen = nrows * (nrows - 1) / 2;
// now that we have the dimensions declare the distance matrix which is
// always a
// 1D double array
distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
// grab a pointer to the data in the array so that we can directly put
// values in there
// and avoid copying :
auto *dMat = (double *)PyArray_DATA(distRes.get());
PyObjectManager<PyArrayObject> copy(
(PyArrayObject *)PyArray_ContiguousFromObject(
descMatObj, PyArray_DESCR((PyArrayObject *)descMatObj)->type_num, 2,
2));
// if we have double array
if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_DOUBLE) {
auto *desc = (double *)PyArray_DATA((PyArrayObject *)descMatObj);
// REVIEW: create an adaptor object to hold a double * and support
// operator[]() so that we don't have to do this stuff:
// here is the 2D array trick this so that when the distance calaculator
// asks for desc2D[i] we basically get the ith row as double*
std::unique_ptr<double *[]> desc2D(new double *[nrows]);
for (i = 0; i < nrows; i++) {
desc2D[i] = desc;
desc += ncols;
}
MetricMatrixCalc<double **, double *> mmCalc;
mmCalc.setMetricFunc(&EuclideanDistanceMetric<double *, double *>);
mmCalc.calcMetricMatrix(desc2D.get(), nrows, ncols, dMat);
// we got the distance matrix we are happy so return
return PyArray_Return(distRes.release());
}
// if we have a float array
else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num ==
NPY_FLOAT) {
auto *desc = (float *)PyArray_DATA(copy.get());
std::unique_ptr<float *[]> desc2D(new float *[nrows]);
for (i = 0; i < nrows; i++) {
desc2D[i] = desc;
desc += ncols;
}
MetricMatrixCalc<float **, float *> mmCalc;
mmCalc.setMetricFunc(&EuclideanDistanceMetric<float *, float *>);
mmCalc.calcMetricMatrix(desc2D.get(), nrows, ncols, dMat);
return PyArray_Return(distRes.release());
}
// if we have an integer array
else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_INT) {
int *desc = (int *)PyArray_DATA(copy.get());
std::unique_ptr<int *[]> desc2D(new int *[nrows]);
for (i = 0; i < nrows; i++) {
desc2D[i] = desc;
desc += ncols;
}
MetricMatrixCalc<int **, int *> mmCalc;
mmCalc.setMetricFunc(&EuclideanDistanceMetric<int *, int *>);
mmCalc.calcMetricMatrix(desc2D.get(), nrows, ncols, dMat);
return PyArray_Return(distRes.release());
} else {
// unrecognized type for the matrix, throw up
throw_value_error(
"The array has to be of type int, float, or double for "
"GetEuclideanDistMat");
}
} // done with an array input
else {
// REVIEW: removed a ton of code here
// we have probably have a list or a tuple
unsigned int ncols = 0;
unsigned int nrows = boost::python::len(descripMat);
CHECK_INVARIANT(nrows > 0, "Empty list passed in");
npy_intp dMatLen = nrows * (nrows - 1) / 2;
distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
auto *dMat = (double *)PyArray_DATA(distRes.get());
// assume that we a have a list of list of values (that can be extracted to
// double)
std::vector<PySequenceHolder<double>> dData;
dData.reserve(nrows);
for (unsigned int i = 0; i < nrows; i++) {
// PySequenceHolder<double> row(seq[i]);
PySequenceHolder<double> row(descripMat[i]);
if (i == 0) {
ncols = row.size();
} else if (row.size() != ncols) {
throw_value_error("All subsequences must be the same length");
}
dData.push_back(row);
}
MetricMatrixCalc<std::vector<PySequenceHolder<double>>,
PySequenceHolder<double>>
mmCalc;
mmCalc.setMetricFunc(&EuclideanDistanceMetric<PySequenceHolder<double>,
PySequenceHolder<double>>);
mmCalc.calcMetricMatrix(dData, nrows, ncols, dMat);
}
return PyArray_Return(distRes.release());
}
PyObject *getTanimotoDistMat(python::object bitVectList) {
// we will assume here that we have a either a list of ExplicitBitVectors or
// SparseBitVects
unsigned int nrows = boost::python::len(bitVectList);
CHECK_INVARIANT(nrows > 1, "");
// First check what type of vector we have
python::object v1 = bitVectList[0];
python::extract<ExplicitBitVect> ebvWorks(v1);
python::extract<SparseBitVect> sbvWorks(v1);
if (!ebvWorks.check() && !sbvWorks.check()) {
throw_value_error(
"GetTanimotoDistMat can only take a sequence of ExplicitBitVects or "
"SparseBitvects");
}
npy_intp dMatLen = nrows * (nrows - 1) / 2;
auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
auto *sMat = (double *)PyArray_DATA(simRes);
if (ebvWorks.check()) {
PySequenceHolder<ExplicitBitVect> dData(bitVectList);
MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
mmCalc.setMetricFunc(
&TanimotoDistanceMetric<ExplicitBitVect, ExplicitBitVect>);
mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
} else if (sbvWorks.check()) {
PySequenceHolder<SparseBitVect> dData(bitVectList);
MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
mmCalc.setMetricFunc(&TanimotoDistanceMetric<SparseBitVect, SparseBitVect>);
mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
}
return PyArray_Return(simRes);
}
PyObject *getTanimotoSimMat(python::object bitVectList) {
// we will assume here that we have a either a list of ExplicitBitVectors or
// SparseBitVects
unsigned int nrows = boost::python::len(bitVectList);
CHECK_INVARIANT(nrows > 1, "");
// First check what type of vector we have
python::object v1 = bitVectList[0];
python::extract<ExplicitBitVect> ebvWorks(v1);
python::extract<SparseBitVect> sbvWorks(v1);
if (!ebvWorks.check() && !sbvWorks.check()) {
throw_value_error(
"GetTanimotoDistMat can only take a sequence of ExplicitBitVects or "
"SparseBitvects");
}
npy_intp dMatLen = nrows * (nrows - 1) / 2;
auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
auto *sMat = (double *)PyArray_DATA(simRes);
if (ebvWorks.check()) {
PySequenceHolder<ExplicitBitVect> dData(bitVectList);
MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
mmCalc.setMetricFunc(
&TanimotoSimilarityMetric<ExplicitBitVect, ExplicitBitVect>);
mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
} else if (sbvWorks.check()) {
PySequenceHolder<SparseBitVect> dData(bitVectList);
MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
mmCalc.setMetricFunc(
&TanimotoSimilarityMetric<SparseBitVect, SparseBitVect>);
mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
}
return PyArray_Return(simRes);
}
} // namespace RDDataManip
BOOST_PYTHON_MODULE(rdMetricMatrixCalc) {
python::scope().attr("__doc__") =
"Module containing the calculator for metric matrix calculation, \n"
"e.g. similarity and distance matrices";
rdkit_import_array();
std::string docString;
docString =
"Compute the distance matrix from a descriptor matrix using the Euclidean distance metric\n\n\
ARGUMENTS: \n\
\n\
descripMat - A python object of any one of the following types \n\
1. A numeric array of dimensions n by m where n is the number of items in the data set \n\
and m is the number of descriptors \n\
2. A list of Numeric Vectors (or 1D arrays), each entry in the list corresponds \n\
to descriptor vector for one item \n\
3. A list (or tuple) of lists (or tuples) of values, where the values can be extracted to \n\
double. \n\n\
RETURNS: \n\
A numeric one-dimensional array containing the lower triangle elements of the symmetric distance matrix\n\n";
python::def("GetEuclideanDistMat", RDDataManip::getEuclideanDistMat,
docString.c_str(), python::args("descripMat"));
docString =
"Compute the distance matrix from a list of BitVects using the Tanimoto distance metric\n\n\
ARGUMENTS: \n\
\n\
bitVectList - a list of bit vectors. Currently this works only for a list of explicit bit vectors, \n\
needs to be expanded to support a list of SparseBitVects\n\n\
RETURNS: \n\
A numeric 1 dimensional array containing the lower triangle elements of the\n\
symmetric distance matrix\n\n";
python::def("GetTanimotoDistMat", RDDataManip::getTanimotoDistMat,
docString.c_str(), python::args("bitVectList"));
docString =
"Compute the similarity matrix from a list of BitVects \n\n\
ARGUMENTS: \n\
\n\
bitVectList - a list of bit vectors. Currently this works only for a list of explicit bit vectors, \n\
needs to be expanded to support a list of SparseBitVects\n\n\
RETURNS: \n\
A numeric 1 dimensional array containing the lower triangle elements of the symmetric similarity matrix\n\n";
python::def("GetTanimotoSimMat", RDDataManip::getTanimotoSimMat,
docString.c_str(), python::args("bitVectList"));
}