rdkit/Code/DataManip/MetricMatrixCalc/Wrap/rdMetricMatrixCalc.cpp

// $Id$
//
//  Copyright (C) 2003-2006 Rational Discovery LLC
//
//  @@ All Rights Reserved @@
//
#define PY_ARRAY_UNIQUE_SYMBOL rdmetric_array_API
#include <boost/python.hpp>
#include <boost/python/numeric.hpp>
#include "Numeric/arrayobject.h"

#include <RDBoost/PySequenceHolder.h>
#include <RDBoost/Wrap.h>

#include <RDGeneral/types.h>

#include <DataManip/MetricMatrixCalc/MetricMatrixCalc.h>
#include <DataManip/MetricMatrixCalc/MetricFuncs.h>
#include <DataStructs/BitVects.h>
#include <string>

// Pull the metric-matrix calculator types into scope for the wrapper code below.
using namespace RDDataManip;

// NOTE(review): declared here but neither defined nor called in the visible
// part of this file - presumably a leftover; confirm before removing.
void wrap_MMcalc();

// Short alias used throughout this file for the Boost.Python namespace.
namespace python = boost::python;
namespace RDDataManip {

  PyObject *getEuclideanDistMat(python::object descripMat) {
    // Bit of a pain involved here, we accept three types of PyObjects here
    // 1. A Numeric Array
    //     - first find what 'type' of entry we have (float, double and int is all we recognize for now)
    //     - then point to contiguous piece of memory from the array that contains the data with a type*
    //     - then make a new type** pointer so that double index into this contiguous memory will work
    //       and then pass it along to the distance calculator
    // 2. A list of Numeric Vector (or 1D arrays)
    //     - in this case wrap descripMat with a PySequenceHolder<type*> where type is the
    //       type of entry in vector (accepted types are int, double and float
    //     - Then pass the PySequenceHolder to the metrci calculator
    // 3. A list (or tuple) of lists (or tuple)
    //     - In this case other than wrapping descripMat with a PySequenceHolder
    //       each of the indivual list in there are also wrapped by a PySequenceHolder
    //     - so the distance calculator is passed in a "PySequenceHolder<PySequenceHolder<double>>"
    //     - FIX: not that we always convert entry values to double here, even if we passed
    //       in a list of list of ints (or floats). Given that lists can be heterogeneous, I do not
    //       know how to ask a list what type of entries if contains.
    //
    //  OK my brain is going to explode now

    // first deal with situation where we have an Numeric Array
    PyObject *descMatObj = descripMat.ptr();
    PyArrayObject *distRes;
    if (PyArray_Check(descMatObj)) {
      // get the dimensions of the array
      int nrows = ((PyArrayObject *)descMatObj)->dimensions[0];
      int ncols = ((PyArrayObject *)descMatObj)->dimensions[1];
      int i;
      CHECK_INVARIANT((nrows > 0) && (ncols > 0), "");

      int dMatLen = nrows*(nrows-1)/2;

      // now that we have the dimensions declare the distance matrix which is always a
      // 1D double array
      distRes = (PyArrayObject *)PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE);

      // grab a pointer to the data in the array so that we can directly put values in there
      // and avoid copying (as I understand it PyArray_FromDimsAndData will do it here for us
      // because python will never free the malloced memory this way)
      double *dMat = (double *)distRes->data;

      // if we have double array
      PyArrayObject *copy;
      copy = (PyArrayObject *)PyArray_ContiguousFromObject(descMatObj,
							   ((PyArrayObject *)descMatObj)->descr->type_num,
							   2,2);
      if (((PyArrayObject *)descMatObj)->descr->type_num == PyArray_DOUBLE) {
        double *desc = (double *)copy->data;

	// REVIEW: create an adaptor object to hold a double * and support
	//  operator[]() so that we don't have to do this stuff:

        // here is the 2D array trick this so that when the distance calaculator
        // asks for desc2D[i] we basically get the ith row as double*
        double **desc2D = new double*[nrows];
        for (i = 0; i < nrows; i++) {
          desc2D[i] = desc;
          desc += ncols;
        }
        MetricMatrixCalc<double**, double*> mmCalc;
        mmCalc.setMetricFunc(&EuclideanDistance<double *, double *>);
        mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);

        delete [] desc2D;
        // we got the distance matrix we are happy so return
        return PyArray_Return(distRes);
      }

      // if we have a float array
      else if (((PyArrayObject *)descMatObj)->descr->type_num == PyArray_FLOAT) {
        float* desc = (float *)copy->data;
        float **desc2D = new float*[nrows];
        for (i = 0; i < nrows; i++) {
          desc2D[i] = desc;
          desc += ncols;
        }
        MetricMatrixCalc<float**, float*> mmCalc;
        mmCalc.setMetricFunc(&EuclideanDistance<float *, float*>);
        mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
        delete [] desc2D;
        return PyArray_Return(distRes);
      }

      // if we have an interger array
      else if (((PyArrayObject *)descMatObj)->descr->type_num == PyArray_INT) {
        int *desc = (int *)copy->data;
        int **desc2D = new int*[nrows];
        for (i = 0; i < nrows; i++) {
          desc2D[i] = desc;
          desc += ncols;
        }
        MetricMatrixCalc<int**, int*> mmCalc;
        mmCalc.setMetricFunc(&EuclideanDistance<int *, int*>);
        mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
        delete [] desc2D;
        return PyArray_Return(distRes);
      }
      else {
        // unreconiged type for the matrix, throw up
        throw_value_error("The array has to be of type int, float, or double for GetEuclideanDistMat");
      }
    } // done with an array input
    else {
      // REVIEW: removed a ton of code here

      // we have probably have a list or a tuple

      int nrows, ncols, i;
      ncols = 0;
      nrows = python::extract<int>(descripMat.attr("__len__")());
      CHECK_INVARIANT(nrows > 0, "Empty list passed in");

      int dMatLen = nrows*(nrows-1)/2;
      distRes = (PyArrayObject *)PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE);
      double *dMat = (double *)distRes->data;

      // assume that we a have a list of list of values (that can be extracted to double)
      std::vector<PySequenceHolder<double> > dData;
      dData.reserve(nrows);
      for (i = 0; i < nrows; i++) {
	//PySequenceHolder<double> row(seq[i]);
        PySequenceHolder<double> row(descripMat[i]);
        if(i==0){
	  ncols = row.size();
	} else if( row.size() != ncols ){
	  throw_value_error("All subsequences must be the same length");
	}
	dData.push_back(row);
      }

      MetricMatrixCalc< std::vector<PySequenceHolder<double> >, PySequenceHolder<double> > mmCalc;
      mmCalc.setMetricFunc(&EuclideanDistance< PySequenceHolder<double>, PySequenceHolder<double> >);
      mmCalc.calcMetricMatrix(dData, nrows, ncols, dMat);
    }
    return PyArray_Return(distRes);
  }

  //! Computes the Tanimoto distance matrix for a sequence of bit vectors.
  //!
  //! \param bitVectList  a Python sequence with more than one entry. The type
  //!                     of the first entry (ExplicitBitVect or SparseBitVect)
  //!                     decides how the whole sequence is interpreted.
  //! \return a new 1D double array of length nrows*(nrows-1)/2 holding the
  //!         values written by MetricMatrixCalc::calcMetricMatrix
  //!
  //! A Python ValueError is raised (via throw_value_error) if the first entry
  //! is neither an ExplicitBitVect nor a SparseBitVect.
  PyObject *getTanimotoDistMat(python::object bitVectList) {
    int nrows = python::extract<int>(bitVectList.attr("__len__")());
    CHECK_INVARIANT(nrows > 1, "");

    // Inspect the first entry to find out which bit vector type we were given
    python::object v1 = bitVectList[0];
    python::extract<ExplicitBitVect> ebvWorks(v1);
    python::extract<SparseBitVect> sbvWorks(v1);
    const bool isExplicit = ebvWorks.check();
    if (!isExplicit && !sbvWorks.check()) {
      throw_value_error("GetTanimotoDistMat can only take a sequence of ExplicitBitVects or SparseBitvects");
    }

    // handle<> throws error_already_set if the allocation returned NULL and
    // drops the reference again should the calculation below throw.
    int dMatLen = nrows*(nrows-1)/2;
    python::handle<> simRes(PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE));
    double *sMat = (double *)((PyArrayObject *)simRes.get())->data;

    if (isExplicit) {
      PySequenceHolder<ExplicitBitVect> dData(bitVectList);
      MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
      mmCalc.setMetricFunc(&TanimotoDistance<ExplicitBitVect, ExplicitBitVect>);
      mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
    } else {
      // the check above guarantees that this is the SparseBitVect case
      PySequenceHolder<SparseBitVect> dData(bitVectList);
      MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
      mmCalc.setMetricFunc(&TanimotoDistance<SparseBitVect, SparseBitVect>);
      mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
    }

    // hand ownership of the result over to the caller
    return PyArray_Return((PyArrayObject *)simRes.release());
  }

  //! Computes the Tanimoto similarity matrix for a sequence of bit vectors.
  //!
  //! \param bitVectList  a Python sequence with more than one entry. The type
  //!                     of the first entry (ExplicitBitVect or SparseBitVect)
  //!                     decides how the whole sequence is interpreted.
  //! \return a new 1D double array of length nrows*(nrows-1)/2 holding the
  //!         values written by MetricMatrixCalc::calcMetricMatrix
  //!
  //! A Python ValueError is raised (via throw_value_error) if the first entry
  //! is neither an ExplicitBitVect nor a SparseBitVect.
  PyObject *getTanimotoSimMat(python::object bitVectList) {
    int nrows = python::extract<int>(bitVectList.attr("__len__")());
    CHECK_INVARIANT(nrows > 1, "");

    // Inspect the first entry to find out which bit vector type we were given
    python::object v1 = bitVectList[0];
    python::extract<ExplicitBitVect> ebvWorks(v1);
    python::extract<SparseBitVect> sbvWorks(v1);
    const bool isExplicit = ebvWorks.check();
    if (!isExplicit && !sbvWorks.check()) {
      // report the name of *this* function (the message used to name GetTanimotoDistMat)
      throw_value_error("GetTanimotoSimMat can only take a sequence of ExplicitBitVects or SparseBitvects");
    }

    // handle<> throws error_already_set if the allocation returned NULL and
    // drops the reference again should the calculation below throw.
    int dMatLen = nrows*(nrows-1)/2;
    python::handle<> simRes(PyArray_FromDims(1, &dMatLen, PyArray_DOUBLE));
    double *sMat = (double *)((PyArrayObject *)simRes.get())->data;

    if (isExplicit) {
      PySequenceHolder<ExplicitBitVect> dData(bitVectList);
      MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
      mmCalc.setMetricFunc(&TanimotoSimilarity<ExplicitBitVect, ExplicitBitVect>);
      mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
    } else {
      // the check above guarantees that this is the SparseBitVect case
      PySequenceHolder<SparseBitVect> dData(bitVectList);
      MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
      mmCalc.setMetricFunc(&TanimotoSimilarity<SparseBitVect, SparseBitVect>);
      mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
    }

    // hand ownership of the result over to the caller
    return PyArray_Return((PyArrayObject *)simRes.release());
  }
}

// Python module definition: sets the module docstring, initializes the array
// C API, registers the exception translators and exposes the three matrix
// calculators (GetEuclideanDistMat, GetTanimotoDistMat, GetTanimotoSimMat).
BOOST_PYTHON_MODULE(rdMetricMatrixCalc)
{
  python::scope().attr("__doc__") =
    "Module containing the calculator for metric matrix calculation, \n"
    "e.g. similarity and distance matrices"
    ;

  // must run before any PyArray_* call made by the wrapped functions
  import_array();
  python::register_exception_translator<IndexErrorException>(&translate_index_error);
  python::register_exception_translator<ValueErrorException>(&translate_value_error);

  // reused for each function's documentation; def() is handed the C string
  std::string docString;
  docString = "Compute the distance matrix from a descriptor matrix using Euclidean distance metric\n\n\
  ARGUMENTS: \n\
\n\
    descripMat - A python object of any one of the following types \n\
                   1. A numeric array of dimensions n by m where n is the number of items in the data set \n\
                       and m is the number of descriptors \n\
                   2. A list of Numeric Vectors (or 1D arrays), each entry in the list corresponds \n\
                       to descriptor vector for one item \n\
                   3. A list (or tuple) of list (or tuple) of values, where the values can be extracted to \n\
                       double. \n\n\
  RETURNS: \n\
    A numeric 1 dimensional array containing the lower triangle elements of the symmetric distance matrix\n\n";

  python::def("GetEuclideanDistMat", RDDataManip::getEuclideanDistMat,
              docString.c_str());

  // the Tanimoto wrappers accept ExplicitBitVects as well as SparseBitVects,
  // so the documentation must not claim otherwise
  docString = "Compute the distance matrix from a list of BitVects \n\n\
  ARGUMENTS: \n\
\n\
    bitVectList - a list of bit vectors. The type of the first entry (ExplicitBitVect or \n\
                  SparseBitVect) determines how the whole list is interpreted\n\n\
  RETURNS: \n\
    A numeric 1 dimensional array containing the lower triangle elements of the\n\
    symmetric distance matrix\n\n";
  python::def("GetTanimotoDistMat", RDDataManip::getTanimotoDistMat,
              docString.c_str());

  docString = "Compute the similarity matrix from a list of BitVects \n\n\
  ARGUMENTS: \n\
\n\
    bitVectList - a list of bit vectors. The type of the first entry (ExplicitBitVect or \n\
                  SparseBitVect) determines how the whole list is interpreted\n\n\
  RETURNS: \n\
    A numeric 1 dimensional array containing the lower triangle elements of the symmetric similarity matrix\n\n";
  python::def("GetTanimotoSimMat", RDDataManip::getTanimotoSimMat,
              docString.c_str());
}