mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Defer numpy initialization to first use (#9127)
* Defer numpy initialization to first use in rdchem, rdmolops, cDataStructs
`from rdkit import Chem` unconditionally bootstrapped numpy (~120ms) via
import_array()/boost::python::numpy::initialize() in module init functions,
even when no numpy-dependent APIs were called. This is costly in cold-start
environments like AWS Lambda.
Move numpy initialization behind lazy guards (static bool + first-call init)
in rdchem.so, rdmolops.so, and cDataStructs.so. Numpy now loads only when
an API that actually needs it is invoked (GetDistanceMatrix, GetPositions,
SetPositions, GetAdjacencyMatrix, ConvertToNumpyArray, etc.).
Also change Conformer::SetPos to accept python::object instead of
np::ndarray to prevent Boost.Python from requiring numpy type conversion
before the lazy guard runs.
Adds test_lazy_numpy.py with subprocess-based tests verifying:
- `from rdkit import Chem` does not load numpy
- MolFromSmiles/MolToSmiles work without numpy
- numpy loads on demand when array APIs are called
* skip inchi tests if not available
* switch to threadsafe once_flag, like elsewhere
* finish ifdef style
* switch to magic static style
* Revert "switch to magic static style"
This reverts commit 7300188db7.
This commit is contained in:
@@ -10,6 +10,10 @@
|
||||
//
|
||||
#define PY_ARRAY_UNIQUE_SYMBOL rddatastructs_array_API
|
||||
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
#include <mutex>
|
||||
#endif
|
||||
|
||||
#include <RDBoost/python.h>
|
||||
#include <RDBoost/Wrap.h>
|
||||
#include <DataStructs/BitVects.h>
|
||||
@@ -32,9 +36,26 @@ void wrap_realValVect();
|
||||
void wrap_sparseIntVect();
|
||||
void wrap_FPB();
|
||||
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
static std::once_flag s_ds_numpy_init_flag;
|
||||
#endif
|
||||
|
||||
static void ds_ensure_numpy() {
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
std::call_once(s_ds_numpy_init_flag, rdkit_import_array);
|
||||
#else
|
||||
static bool initialized = false;
|
||||
if (!initialized) {
|
||||
initialized = true;
|
||||
rdkit_import_array();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace {
|
||||
template <typename T, typename U>
|
||||
void converter(const T &v, python::object destArray, U func) {
|
||||
ds_ensure_numpy();
|
||||
if (!PyArray_Check(destArray.ptr())) {
|
||||
throw_value_error("Expecting a Numeric array object");
|
||||
}
|
||||
@@ -64,7 +85,6 @@ void convertToDoubleNumpyArray(const T &v, python::object destArray) {
|
||||
}
|
||||
|
||||
BOOST_PYTHON_MODULE(cDataStructs) {
|
||||
rdkit_import_array();
|
||||
python::scope().attr("__doc__") =
|
||||
"Module containing an assortment of functionality for basic data "
|
||||
"structures.\n"
|
||||
|
||||
@@ -93,3 +93,6 @@ add_pytest(pyCDXMLTest
|
||||
|
||||
add_pytest(pySubsetTest
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_subset.py)
|
||||
|
||||
# Registers test_lazy_numpy.py, which checks that importing rdkit.Chem does
# not load numpy until an array-returning API is called.
add_pytest(pyTestLazyNumpy
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_lazy_numpy.py)
|
||||
|
||||
@@ -36,6 +36,7 @@ RDGeom::Point3D GetAtomPos(const Conformer *conf, unsigned int aid) {
|
||||
}
|
||||
|
||||
PyObject *GetPos(const Conformer *conf) {
|
||||
rdkit_rdchem_ensure_numpy();
|
||||
const RDGeom::POINT3D_VECT &pos = conf->getPositions();
|
||||
|
||||
// define a 2D array with the following size
|
||||
@@ -58,7 +59,9 @@ PyObject *GetPos(const Conformer *conf) {
|
||||
return PyArray_Return(res);
|
||||
}
|
||||
|
||||
void SetPos(Conformer *conf, np::ndarray const &array) {
|
||||
void SetPos(Conformer *conf, python::object const &arrayObj) {
|
||||
rdkit_rdchem_ensure_numpy();
|
||||
np::ndarray array = python::extract<np::ndarray>(arrayObj);
|
||||
if (array.get_dtype() != np::dtype::get_builtin<double>()) {
|
||||
PyErr_SetString(PyExc_TypeError, "Incorrect array data type");
|
||||
python::throw_error_already_set();
|
||||
|
||||
@@ -503,6 +503,7 @@ VECT_INT_VECT getSymmSSSR(ROMol &mol, bool includeDativeBonds,
|
||||
PyObject *getDistanceMatrix(ROMol &mol, bool useBO = false,
|
||||
bool useAtomWts = false, bool force = false,
|
||||
const char *prefix = nullptr) {
|
||||
rdkit_rdmolops_ensure_numpy();
|
||||
int nats = mol.getNumAtoms();
|
||||
npy_intp dims[2];
|
||||
dims[0] = nats;
|
||||
@@ -521,6 +522,7 @@ PyObject *getDistanceMatrix(ROMol &mol, bool useBO = false,
|
||||
PyObject *get3DDistanceMatrix(ROMol &mol, int confId = -1,
|
||||
bool useAtomWts = false, bool force = false,
|
||||
const char *prefix = nullptr) {
|
||||
rdkit_rdmolops_ensure_numpy();
|
||||
int nats = mol.getNumAtoms();
|
||||
npy_intp dims[2];
|
||||
dims[0] = nats;
|
||||
@@ -542,6 +544,7 @@ PyObject *get3DDistanceMatrix(ROMol &mol, int confId = -1,
|
||||
|
||||
PyObject *getAdjacencyMatrix(ROMol &mol, bool useBO = false, int emptyVal = 0,
|
||||
bool force = false, const char *prefix = nullptr) {
|
||||
rdkit_rdmolops_ensure_numpy();
|
||||
int nats = mol.getNumAtoms();
|
||||
npy_intp dims[2];
|
||||
dims[0] = nats;
|
||||
|
||||
@@ -21,11 +21,36 @@
|
||||
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
#include <mutex>
|
||||
#endif
|
||||
|
||||
#include "seqs.hpp"
|
||||
namespace python = boost::python;
|
||||
using namespace RDKit;
|
||||
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
static std::once_flag s_rdchem_numpy_init_flag;
|
||||
#endif
|
||||
|
||||
void rdkit_rdchem_ensure_numpy() {
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
std::call_once(s_rdchem_numpy_init_flag, []() {
|
||||
const bool register_scalar_converters = false;
|
||||
boost::python::numpy::initialize(register_scalar_converters);
|
||||
rdkit_import_array();
|
||||
});
|
||||
#else
|
||||
static bool initialized = false;
|
||||
if (!initialized) {
|
||||
initialized = true;
|
||||
const bool register_scalar_converters = false;
|
||||
boost::python::numpy::initialize(register_scalar_converters);
|
||||
rdkit_import_array();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace RDKit {
|
||||
void tossit() { throw IndexErrorException(1); }
|
||||
} // namespace RDKit
|
||||
@@ -102,12 +127,9 @@ T *next_ptr(O &self) {
|
||||
BOOST_PYTHON_MODULE(rdchem) {
|
||||
python::scope().attr("__doc__") =
|
||||
"Module containing the core chemistry functionality of the RDKit";
|
||||
const bool register_scalar_converters = false;
|
||||
boost::python::numpy::initialize(register_scalar_converters);
|
||||
RegisterListConverter<RDKit::Atom *>();
|
||||
RegisterListConverter<RDKit::Bond *>();
|
||||
RegisterListConverter<RDKit::CONFORMER_SPTR>();
|
||||
rdkit_import_array();
|
||||
|
||||
// this is one of those parts where I think I wish that I knew how to do
|
||||
// template meta-programming
|
||||
|
||||
@@ -18,4 +18,6 @@ class ConformerException;
|
||||
}
|
||||
void rdExceptionTranslator(RDKit::ConformerException const &x);
|
||||
|
||||
// Performs rdchem's deferred numpy setup (boost::python::numpy::initialize
// followed by rdkit_import_array) on the first call; later calls skip it.
// Wrappers that use numpy call this first.
void rdkit_rdchem_ensure_numpy();
|
||||
|
||||
#endif
|
||||
|
||||
@@ -17,17 +17,35 @@
|
||||
#include <RDBoost/import_array.h>
|
||||
#include <RDGeneral/Exceptions.h>
|
||||
#include <GraphMol/SanitException.h>
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
#include <mutex>
|
||||
#endif
|
||||
|
||||
namespace python = boost::python;
|
||||
using namespace RDKit;
|
||||
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
static std::once_flag s_rdmolops_numpy_init_flag;
|
||||
#endif
|
||||
|
||||
void rdkit_rdmolops_ensure_numpy() {
|
||||
#ifdef RDK_BUILD_THREADSAFE_SSS
|
||||
std::call_once(s_rdmolops_numpy_init_flag, rdkit_import_array);
|
||||
#else
|
||||
static bool initialized = false;
|
||||
if (!initialized) {
|
||||
initialized = true;
|
||||
rdkit_import_array();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void wrap_molops();
|
||||
void wrap_chiralityops();
|
||||
|
||||
BOOST_PYTHON_MODULE(rdmolops) {
|
||||
python::scope().attr("__doc__") =
|
||||
"Module containing RDKit functionality for manipulating molecules.";
|
||||
rdkit_import_array();
|
||||
|
||||
// ******************************
|
||||
// Functions from MolOps
|
||||
|
||||
@@ -13,4 +13,6 @@
|
||||
|
||||
#define PY_ARRAY_UNIQUE_SYMBOL rdmolops_array_API
|
||||
|
||||
// Performs rdmolops' deferred numpy C-API import (rdkit_import_array) on the
// first call; later calls skip it. Wrappers that build numpy arrays call
// this first.
void rdkit_rdmolops_ensure_numpy();
|
||||
|
||||
#endif
|
||||
|
||||
164
Code/GraphMol/Wrap/test_lazy_numpy.py
Normal file
164
Code/GraphMol/Wrap/test_lazy_numpy.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#
|
||||
# Copyright (C) 2025 RDKit contributors
|
||||
# All Rights Reserved
|
||||
#
|
||||
"""Tests that 'from rdkit import Chem' does not eagerly load numpy,
|
||||
and that core Chem functionality works before numpy is loaded.
|
||||
|
||||
Because numpy's import state is process-global and cannot be unloaded,
|
||||
import-ordering tests must run in a **fresh subprocess**.
|
||||
"""
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import unittest
|
||||
|
||||
|
||||
def _run_snippet(code: str) -> subprocess.CompletedProcess:
|
||||
"""Run *code* in a clean Python subprocess and return the result."""
|
||||
return subprocess.run(
|
||||
[sys.executable, "-c", textwrap.dedent(code)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
|
||||
class TestLazyNumpy(unittest.TestCase):
    """Checks that rdkit only loads numpy once an array API actually needs it.

    Each check runs its snippet through ``_run_snippet`` (a fresh Python
    subprocess), so numpy's import state from one check cannot leak into
    another.
    """

    def _assert_snippet_passes(self, code):
        """Run *code* in a subprocess; fail with its output on a non-zero exit."""
        result = _run_snippet(code)
        self.assertEqual(result.returncode, 0, result.stderr or result.stdout)

    def test_chem_import_does_not_load_numpy(self):
        """Importing rdkit.Chem must not pull numpy into sys.modules."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem
            # numpy must not have been imported as a side-effect
            if "numpy" in sys.modules:
                sys.exit("FAIL: numpy was loaded by 'from rdkit import Chem'")
        """)

    def test_basic_smiles_roundtrip_without_numpy(self):
        """MolFromSmiles / MolToSmiles must work before numpy is loaded."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem

            mol = Chem.MolFromSmiles("c1ccccc1")
            assert mol is not None, "MolFromSmiles returned None"

            smi = Chem.MolToSmiles(mol)
            assert smi == "c1ccccc1", f"unexpected SMILES: {smi}"

            assert "numpy" not in sys.modules, "numpy crept in during SMILES ops"
        """)

    def test_mol_operations_without_numpy(self):
        """Core mol operations (AddHs, sanitize, substruct) work pre-numpy."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem
            from rdkit.Chem import inchi

            mol = Chem.MolFromSmiles("CCO")
            assert mol is not None

            # AddHs / RemoveHs
            molH = Chem.AddHs(mol)
            assert molH.GetNumAtoms() == 9  # 3 heavy + 6 H
            mol2 = Chem.RemoveHs(molH)
            assert mol2.GetNumAtoms() == 3

            # Substructure match
            query = Chem.MolFromSmarts("[OH]")
            assert mol.HasSubstructMatch(query)

            # MolToMolBlock (V2000)
            mb = Chem.MolToMolBlock(mol)
            assert "V2000" in mb

            if inchi.INCHI_AVAILABLE:
                # InChI round-trip
                inchi_str = inchi.MolToInchi(mol)
                assert inchi_str.startswith("InChI=")

            assert "numpy" not in sys.modules, "numpy crept in during mol ops"
        """)

    def test_numpy_loads_on_demand(self):
        """Calling a numpy-returning API (GetDistanceMatrix) loads numpy lazily."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem

            assert "numpy" not in sys.modules, "numpy loaded too early"

            mol = Chem.MolFromSmiles("CCO")
            dm = Chem.GetDistanceMatrix(mol)

            assert "numpy" in sys.modules, "numpy should be loaded after GetDistanceMatrix"
            assert dm.shape == (3, 3), f"unexpected shape: {dm.shape}"
            assert dm[0, 2] == 2.0, f"unexpected distance: {dm[0, 2]}"
        """)

    def test_conformer_positions_loads_numpy(self):
        """GetPositions() loads numpy on first call."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem
            from rdkit.Chem import rdchem

            assert "numpy" not in sys.modules, "numpy loaded too early"

            mol = Chem.MolFromSmiles("C")
            conf = rdchem.Conformer(mol.GetNumAtoms())
            conf.SetAtomPosition(0, (1.0, 2.0, 3.0))
            mol.AddConformer(conf, assignId=True)

            pos = mol.GetConformer().GetPositions()
            assert "numpy" in sys.modules, "numpy should load after GetPositions"
            assert pos.shape == (1, 3), f"unexpected shape: {pos.shape}"
            assert abs(pos[0, 0] - 1.0) < 1e-6
        """)

    def test_set_positions_initializes_numpy(self):
        """SetPositions() works when it is the first numpy-dependent RDKit call."""
        self._assert_snippet_passes("""\
            import numpy as np
            from rdkit.Chem import rdchem

            conf = rdchem.Conformer(1)
            # No other array API has run in this process, so SetPositions has
            # to perform the deferred numpy initialization itself.
            conf.SetPositions(np.array([[1.0, 2.0, 3.0]], dtype=np.float64))

            pt = conf.GetAtomPosition(0)
            assert abs(pt.x - 1.0) < 1e-6, f"unexpected x: {pt.x}"
            assert abs(pt.y - 2.0) < 1e-6, f"unexpected y: {pt.y}"
            assert abs(pt.z - 3.0) < 1e-6, f"unexpected z: {pt.z}"
        """)

    def test_datastructs_convert_loads_numpy(self):
        """DataStructs.ConvertToNumpyArray loads numpy on demand."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem, DataStructs

            assert "numpy" not in sys.modules, "numpy loaded too early"

            fp = Chem.RDKFingerprint(Chem.MolFromSmiles("c1ccccc1"))

            import numpy as np
            arr = np.zeros(len(fp), dtype=int)
            DataStructs.ConvertToNumpyArray(fp, arr)
            assert arr.sum() > 0, "fingerprint should have bits set"
        """)

    def test_adjacency_matrix_loads_numpy(self):
        """GetAdjacencyMatrix loads numpy on demand and returns correct result."""
        self._assert_snippet_passes("""\
            import sys
            from rdkit import Chem

            assert "numpy" not in sys.modules, "numpy loaded too early"

            mol = Chem.MolFromSmiles("CC")
            adj = Chem.GetAdjacencyMatrix(mol)

            assert "numpy" in sys.modules, "numpy should load after GetAdjacencyMatrix"
            assert adj[0, 1] == 1
            assert adj[1, 0] == 1
            assert adj[0, 0] == 0
        """)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user