Files
rdkit/Code/DataManip/MetricMatrixCalc/Wrap/rdMetricMatrixCalc.cpp

283 lines
12 KiB
C++
Executable File

// $Id$
//
// Copyright (C) 2003-2006 Rational Discovery LLC
//
// @@ All Rights Reserved @@
//
#define PY_ARRAY_UNIQUE_SYMBOL rdmetric_array_API
#include <boost/python.hpp>
#include <boost/python/numeric.hpp>
#include "Numeric/arrayobject.h"
#include <RDBoost/PySequenceHolder.h>
#include <RDBoost/Wrap.h>
#include <RDGeneral/types.h>
#include <DataManip/MetricMatrixCalc/MetricMatrixCalc.h>
#include <DataManip/MetricMatrixCalc/MetricFuncs.h>
#include <DataStructs/BitVects.h>
#include <string>
// Make the names from the RDDataManip namespace visible at file scope.
using namespace RDDataManip;
// Forward declaration only; no definition or call site is visible in this
// part of the file. NOTE(review): confirm whether this is still needed.
void wrap_MMcalc();
// Short alias for the Boost.Python namespace used by the wrappers below.
namespace python = boost::python;
namespace RDDataManip {
PyObject *getEuclideanDistMat(python::object descripMat) {
// Bit of a pain involved here, we accept three types of PyObjects here
// 1. A Numeric Array
// - first find what 'type' of entry we have (float, double and int is all we recognize for now)
// - then point to contiguous piece of memory from the array that contains the data with a type*
// - then make a new type** pointer so that double index into this contiguous memory will work
// and then pass it along to the distance calculator
// 2. A list of Numeric Vector (or 1D arrays)
// - in this case wrap descripMat with a PySequenceHolder<type*> where type is the
// type of entry in vector (accepted types are int, double and float
// - Then pass the PySequenceHolder to the metrci calculator
// 3. A list (or tuple) of lists (or tuple)
// - In this case other than wrapping descripMat with a PySequenceHolder
// each of the indivual list in there are also wrapped by a PySequenceHolder
// - so the distance calculator is passed in a "PySequenceHolder<PySequenceHolder<double>>"
// - FIX: not that we always convert entry values to double here, even if we passed
// in a list of list of ints (or floats). Given that lists can be heterogeneous, I do not
// know how to ask a list what type of entries if contains.
//
// OK my brain is going to explode now
// first deal with situation where we have an Numeric Array
PyObject *descMatObj = descripMat.ptr();
PyArrayObject *distRes;
if (PyArray_Check(descMatObj)) {
// get the dimensions of the array
int nrows = ((PyArrayObject *)descMatObj)->dimensions[0];
int ncols = ((PyArrayObject *)descMatObj)->dimensions[1];
int i;
CHECK_INVARIANT((nrows > 0) && (ncols > 0), "");
int dMatLen = nrows*(nrows-1)/2;
// now that we have the dimensions declare the distance matrix which is always a
// 1D double array
distRes = (PyArrayObject *)PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE);
// grab a pointer to the data in the array so that we can directly put values in there
// and avoid copying (as I understand it PyArray_FromDimsAndData will do it here for us
// because python will never free the malloced memory this way)
double *dMat = (double *)distRes->data;
// if we have double array
PyArrayObject *copy;
copy = (PyArrayObject *)PyArray_ContiguousFromObject(descMatObj,
((PyArrayObject *)descMatObj)->descr->type_num,
2,2);
if (((PyArrayObject *)descMatObj)->descr->type_num == PyArray_DOUBLE) {
double *desc = (double *)copy->data;
// REVIEW: create an adaptor object to hold a double * and support
// operator[]() so that we don't have to do this stuff:
// here is the 2D array trick this so that when the distance calaculator
// asks for desc2D[i] we basically get the ith row as double*
double **desc2D = new double*[nrows];
for (i = 0; i < nrows; i++) {
desc2D[i] = desc;
desc += ncols;
}
MetricMatrixCalc<double**, double*> mmCalc;
mmCalc.setMetricFunc(&EuclideanDistance<double *, double *>);
mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
delete [] desc2D;
// we got the distance matrix we are happy so return
return PyArray_Return(distRes);
}
// if we have a float array
else if (((PyArrayObject *)descMatObj)->descr->type_num == PyArray_FLOAT) {
float* desc = (float *)copy->data;
float **desc2D = new float*[nrows];
for (i = 0; i < nrows; i++) {
desc2D[i] = desc;
desc += ncols;
}
MetricMatrixCalc<float**, float*> mmCalc;
mmCalc.setMetricFunc(&EuclideanDistance<float *, float*>);
mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
delete [] desc2D;
return PyArray_Return(distRes);
}
// if we have an interger array
else if (((PyArrayObject *)descMatObj)->descr->type_num == PyArray_INT) {
int *desc = (int *)copy->data;
int **desc2D = new int*[nrows];
for (i = 0; i < nrows; i++) {
desc2D[i] = desc;
desc += ncols;
}
MetricMatrixCalc<int**, int*> mmCalc;
mmCalc.setMetricFunc(&EuclideanDistance<int *, int*>);
mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
delete [] desc2D;
return PyArray_Return(distRes);
}
else {
// unreconiged type for the matrix, throw up
throw_value_error("The array has to be of type int, float, or double for GetEuclideanDistMat");
}
} // done with an array input
else {
// REVIEW: removed a ton of code here
// we have probably have a list or a tuple
int nrows, ncols, i;
ncols = 0;
nrows = python::extract<int>(descripMat.attr("__len__")());
CHECK_INVARIANT(nrows > 0, "Empty list passed in");
int dMatLen = nrows*(nrows-1)/2;
distRes = (PyArrayObject *)PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE);
double *dMat = (double *)distRes->data;
// assume that we a have a list of list of values (that can be extracted to double)
std::vector<PySequenceHolder<double> > dData;
dData.reserve(nrows);
for (i = 0; i < nrows; i++) {
//PySequenceHolder<double> row(seq[i]);
PySequenceHolder<double> row(descripMat[i]);
if(i==0){
ncols = row.size();
} else if( row.size() != ncols ){
throw_value_error("All subsequences must be the same length");
}
dData.push_back(row);
}
MetricMatrixCalc< std::vector<PySequenceHolder<double> >, PySequenceHolder<double> > mmCalc;
mmCalc.setMetricFunc(&EuclideanDistance< PySequenceHolder<double>, PySequenceHolder<double> >);
mmCalc.calcMetricMatrix(dData, nrows, ncols, dMat);
}
return PyArray_Return(distRes);
}
// Computes the Tanimoto distance matrix for a sequence of bit vectors.
//
// bitVectList must be a sequence with more than one entry, holding either
// ExplicitBitVects or SparseBitVects. Only the first entry is inspected to
// decide which of the two types the sequence holds.
//
// Returns a new 1D double array of length nrows*(nrows-1)/2.
//
// Raises a value error if the first entry is neither an ExplicitBitVect nor a
// SparseBitVect, and an invariant violation if there are fewer than two
// entries.
//
// The result array is held in a python::handle<> while it is being filled, so
// it is released if the calculation throws and a failed allocation is
// reported as a Python error rather than dereferenced.
PyObject *getTanimotoDistMat(python::object bitVectList) {
  int nrows = python::extract<int>(bitVectList.attr("__len__")());
  CHECK_INVARIANT(nrows > 1, "");

  // use the first element to find out what type of vector we have
  python::object v1 = bitVectList[0];
  const bool isExplicit = python::extract<ExplicitBitVect>(v1).check();
  const bool isSparse = python::extract<SparseBitVect>(v1).check();
  if (!isExplicit && !isSparse) {
    throw_value_error("GetTanimotoDistMat can only take a sequence of ExplicitBitVects or SparseBitvects");
  }

  int dMatLen = nrows * (nrows - 1) / 2;
  python::handle<> simRes(PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE));
  double *sMat = reinterpret_cast<double *>(
      reinterpret_cast<PyArrayObject *>(simRes.get())->data);

  // explicit bit vectors take precedence if both extractions would succeed;
  // bit vectors have no fixed descriptor dimension, so 0 is passed for it
  if (isExplicit) {
    PySequenceHolder<ExplicitBitVect> dData(bitVectList);
    MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
    mmCalc.setMetricFunc(&TanimotoDistance<ExplicitBitVect, ExplicitBitVect>);
    mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
  } else {
    PySequenceHolder<SparseBitVect> dData(bitVectList);
    MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
    mmCalc.setMetricFunc(&TanimotoDistance<SparseBitVect, SparseBitVect>);
    mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
  }
  return PyArray_Return(reinterpret_cast<PyArrayObject *>(simRes.release()));
}
// Computes the Tanimoto similarity matrix for a sequence of bit vectors.
//
// bitVectList must be a sequence with more than one entry, holding either
// ExplicitBitVects or SparseBitVects. Only the first entry is inspected to
// decide which of the two types the sequence holds.
//
// Returns a new 1D double array of length nrows*(nrows-1)/2.
//
// Raises a value error if the first entry is neither an ExplicitBitVect nor a
// SparseBitVect, and an invariant violation if there are fewer than two
// entries.
//
// The result array is held in a python::handle<> while it is being filled, so
// it is released if the calculation throws and a failed allocation is
// reported as a Python error rather than dereferenced.
PyObject *getTanimotoSimMat(python::object bitVectList) {
  int nrows = python::extract<int>(bitVectList.attr("__len__")());
  CHECK_INVARIANT(nrows > 1, "");

  // use the first element to find out what type of vector we have
  python::object v1 = bitVectList[0];
  const bool isExplicit = python::extract<ExplicitBitVect>(v1).check();
  const bool isSparse = python::extract<SparseBitVect>(v1).check();
  if (!isExplicit && !isSparse) {
    // the message names this function (it used to name GetTanimotoDistMat)
    throw_value_error("GetTanimotoSimMat can only take a sequence of ExplicitBitVects or SparseBitvects");
  }

  int dMatLen = nrows * (nrows - 1) / 2;
  python::handle<> simRes(PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE));
  double *sMat = reinterpret_cast<double *>(
      reinterpret_cast<PyArrayObject *>(simRes.get())->data);

  // explicit bit vectors take precedence if both extractions would succeed;
  // bit vectors have no fixed descriptor dimension, so 0 is passed for it
  if (isExplicit) {
    PySequenceHolder<ExplicitBitVect> dData(bitVectList);
    MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
    mmCalc.setMetricFunc(&TanimotoSimilarity<ExplicitBitVect, ExplicitBitVect>);
    mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
  } else {
    PySequenceHolder<SparseBitVect> dData(bitVectList);
    MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
    mmCalc.setMetricFunc(&TanimotoSimilarity<SparseBitVect, SparseBitVect>);
    mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
  }
  return PyArray_Return(reinterpret_cast<PyArrayObject *>(simRes.release()));
}
}
// Python module definition: sets the module docstring, initializes the array
// C API, registers the exception translators and exposes the three matrix
// calculators.
//
// The docstrings of the two Tanimoto functions used to state that only
// explicit bit vectors were supported; the wrappers accept sequences of
// SparseBitVects as well, so the text now says so. All other docstring text
// is unchanged.
BOOST_PYTHON_MODULE(rdMetricMatrixCalc)
{
  python::scope().attr("__doc__") =
    "Module containing the calculator for metric matrix calculation, \n"
    "e.g. simialrity and distance matrices"
    ;

  // must run before any of the PyArray_* calls made by the wrappers
  import_array();

  // map the C++ index/value error exceptions onto Python exceptions
  python::register_exception_translator<IndexErrorException>(&translate_index_error);
  python::register_exception_translator<ValueErrorException>(&translate_value_error);

  std::string docString;

  docString =
    "Compute the distance matrix from a descriptor matrix using Euclidean distance metric\n\n"
    "ARGUMENTS: \n"
    "\n"
    "descripMat - A python object of any one of the folliwng type \n"
    "1. A numeric array of dimensions n by m where n is the number of items in the data set \n"
    "and m is the number of descriptors \n"
    "2. A list of Numeric Vectors (or 1D arrays), each entry in the list corresponds \n"
    "to descriptor vector for one item \n"
    "3. A list (or tuple) of list (or tuple) of values, where the values can be extracted to \n"
    "double. \n\n"
    "RETURNS: \n"
    "A numeric 1 dimensional array containing the lower triangle elements of the symmetric distance matrix\n\n";
  python::def("GetEuclideanDistMat", RDDataManip::getEuclideanDistMat,
              docString.c_str());

  docString =
    "Compute the distance matrix from a list of BitVects \n\n"
    "ARGUMENTS: \n"
    "\n"
    "bitVectList - a list of bit vectors. Either a list of ExplicitBitVects or a list of \n"
    "SparseBitVects is accepted; the type is determined from the first entry\n\n"
    "RETURNS: \n"
    "A numeric 1 dimensional array containing the lower triangle elements of the\n"
    "symmetric distance matrix\n\n";
  python::def("GetTanimotoDistMat", RDDataManip::getTanimotoDistMat,
              docString.c_str());

  docString =
    "Compute the similarity matrix from a list of BitVects \n\n"
    "ARGUMENTS: \n"
    "\n"
    "bitVectList - a list of bit vectors. Either a list of ExplicitBitVects or a list of \n"
    "SparseBitVects is accepted; the type is determined from the first entry\n\n"
    "RETURNS: \n"
    "A numeric 1 dimensional array containing the lower triangle elements of the symmetric similarity matrix\n\n";
  python::def("GetTanimotoSimMat", RDDataManip::getTanimotoSimMat,
              docString.c_str());
}