Defer numpy initialization to first use (#9127)

* Defer numpy initialization to first use in rdchem, rdmolops, cDataStructs

`from rdkit import Chem` unconditionally bootstrapped numpy (~120ms) via
import_array()/boost::python::numpy::initialize() in module init functions,
even when no numpy-dependent APIs were called. This is costly in cold-start
environments like AWS Lambda.

Move numpy initialization behind lazy guards (static bool + first-call init)
in rdchem.so, rdmolops.so, and cDataStructs.so. Numpy now loads only when
an API that actually needs it is invoked (GetDistanceMatrix, GetPositions,
SetPositions, GetAdjacencyMatrix, ConvertToNumpyArray, etc.).

Also change Conformer::SetPos to accept python::object instead of
np::ndarray to prevent Boost.Python from requiring numpy type conversion
before the lazy guard runs.

Adds test_lazy_numpy.py with subprocess-based tests verifying:
- `from rdkit import Chem` does not load numpy
- MolFromSmiles/MolToSmiles work without numpy
- numpy loads on demand when array APIs are called

* skip inchi tests if not available

* switch to threadsafe once_flag, like elsewhere

* finish ifdef style

* switch to magic static style

* Revert "switch to magic static style"

This reverts commit 7300188db7.
This commit is contained in:
Yakov Pechersky
2026-02-23 12:42:42 -05:00
committed by GitHub
parent 474df5a9a8
commit 872b054d5c
9 changed files with 243 additions and 6 deletions

View File

@@ -10,6 +10,10 @@
//
#define PY_ARRAY_UNIQUE_SYMBOL rddatastructs_array_API
#ifdef RDK_BUILD_THREADSAFE_SSS
#include <mutex>
#endif
#include <RDBoost/python.h>
#include <RDBoost/Wrap.h>
#include <DataStructs/BitVects.h>
@@ -32,9 +36,26 @@ void wrap_realValVect();
void wrap_sparseIntVect();
void wrap_FPB();
#ifdef RDK_BUILD_THREADSAFE_SSS
// Guards the one-time numpy import performed by ds_ensure_numpy().
static std::once_flag s_ds_numpy_init_flag;
#endif
// Runs rdkit_import_array() the first time it is called and does nothing on
// later calls. Wrapper functions that touch the numpy C-API call this before
// doing so, which lets the module be imported without importing numpy.
//
// NOTE(review): how rdkit_import_array() reports a failed import is not
// visible from here - confirm whether callers need an explicit error check.
static void ds_ensure_numpy() {
#ifdef RDK_BUILD_THREADSAFE_SSS
  std::call_once(s_ds_numpy_init_flag, rdkit_import_array);
#else
  static bool initialized = false;
  if (!initialized) {
    rdkit_import_array();
    // Record completion only after the import call has returned. This mirrors
    // std::call_once, which leaves its flag unset (and so retries on the next
    // call) when the callable exits via an exception.
    initialized = true;
  }
#endif
}
namespace {
template <typename T, typename U>
void converter(const T &v, python::object destArray, U func) {
ds_ensure_numpy();
if (!PyArray_Check(destArray.ptr())) {
throw_value_error("Expecting a Numeric array object");
}
@@ -64,7 +85,6 @@ void convertToDoubleNumpyArray(const T &v, python::object destArray) {
}
BOOST_PYTHON_MODULE(cDataStructs) {
rdkit_import_array();
python::scope().attr("__doc__") =
"Module containing an assortment of functionality for basic data "
"structures.\n"

View File

@@ -93,3 +93,6 @@ add_pytest(pyCDXMLTest
add_pytest(pySubsetTest
${CMAKE_CURRENT_SOURCE_DIR}/test_subset.py)
# Registers test_lazy_numpy.py, which checks that importing rdkit.Chem does not
# load numpy until an array-returning API is actually called.
add_pytest(pyTestLazyNumpy
${CMAKE_CURRENT_SOURCE_DIR}/test_lazy_numpy.py)

View File

@@ -36,6 +36,7 @@ RDGeom::Point3D GetAtomPos(const Conformer *conf, unsigned int aid) {
}
PyObject *GetPos(const Conformer *conf) {
rdkit_rdchem_ensure_numpy();
const RDGeom::POINT3D_VECT &pos = conf->getPositions();
// define a 2D array with the following size
@@ -58,7 +59,9 @@ PyObject *GetPos(const Conformer *conf) {
return PyArray_Return(res);
}
void SetPos(Conformer *conf, np::ndarray const &array) {
void SetPos(Conformer *conf, python::object const &arrayObj) {
rdkit_rdchem_ensure_numpy();
np::ndarray array = python::extract<np::ndarray>(arrayObj);
if (array.get_dtype() != np::dtype::get_builtin<double>()) {
PyErr_SetString(PyExc_TypeError, "Incorrect array data type");
python::throw_error_already_set();

View File

@@ -503,6 +503,7 @@ VECT_INT_VECT getSymmSSSR(ROMol &mol, bool includeDativeBonds,
PyObject *getDistanceMatrix(ROMol &mol, bool useBO = false,
bool useAtomWts = false, bool force = false,
const char *prefix = nullptr) {
rdkit_rdmolops_ensure_numpy();
int nats = mol.getNumAtoms();
npy_intp dims[2];
dims[0] = nats;
@@ -521,6 +522,7 @@ PyObject *getDistanceMatrix(ROMol &mol, bool useBO = false,
PyObject *get3DDistanceMatrix(ROMol &mol, int confId = -1,
bool useAtomWts = false, bool force = false,
const char *prefix = nullptr) {
rdkit_rdmolops_ensure_numpy();
int nats = mol.getNumAtoms();
npy_intp dims[2];
dims[0] = nats;
@@ -542,6 +544,7 @@ PyObject *get3DDistanceMatrix(ROMol &mol, int confId = -1,
PyObject *getAdjacencyMatrix(ROMol &mol, bool useBO = false, int emptyVal = 0,
bool force = false, const char *prefix = nullptr) {
rdkit_rdmolops_ensure_numpy();
int nats = mol.getNumAtoms();
npy_intp dims[2];
dims[0] = nats;

View File

@@ -21,11 +21,36 @@
#include <sstream>
#include <utility>
#ifdef RDK_BUILD_THREADSAFE_SSS
#include <mutex>
#endif
#include "seqs.hpp"
namespace python = boost::python;
using namespace RDKit;
// Performs the numpy bootstrap for rdchem: initializes boost::python::numpy
// (passing false for register_scalar_converters) and then calls
// rdkit_import_array(). Kept in one place so that the thread-safe and the
// single-threaded paths below cannot drift apart.
static void rdchem_init_numpy() {
  const bool register_scalar_converters = false;
  boost::python::numpy::initialize(register_scalar_converters);
  rdkit_import_array();
}

#ifdef RDK_BUILD_THREADSAFE_SSS
// Guards the one-time numpy bootstrap performed by rdkit_rdchem_ensure_numpy().
static std::once_flag s_rdchem_numpy_init_flag;
#endif
// Runs the numpy bootstrap for rdchem the first time it is called and does
// nothing on later calls. Wrapper functions that use numpy call this first,
// which lets the module be imported without importing numpy.
void rdkit_rdchem_ensure_numpy() {
#ifdef RDK_BUILD_THREADSAFE_SSS
  std::call_once(s_rdchem_numpy_init_flag, rdchem_init_numpy);
#else
  static bool initialized = false;
  if (!initialized) {
    rdchem_init_numpy();
    // Record completion only after the bootstrap has returned. This mirrors
    // std::call_once, which leaves its flag unset (and so retries on the next
    // call) when the callable exits via an exception.
    initialized = true;
  }
#endif
}
namespace RDKit {
void tossit() { throw IndexErrorException(1); }
} // namespace RDKit
@@ -102,12 +127,9 @@ T *next_ptr(O &self) {
BOOST_PYTHON_MODULE(rdchem) {
python::scope().attr("__doc__") =
"Module containing the core chemistry functionality of the RDKit";
const bool register_scalar_converters = false;
boost::python::numpy::initialize(register_scalar_converters);
RegisterListConverter<RDKit::Atom *>();
RegisterListConverter<RDKit::Bond *>();
RegisterListConverter<RDKit::CONFORMER_SPTR>();
rdkit_import_array();
// this is one of those parts where I think I wish that I knew how to do
// template meta-programming

View File

@@ -18,4 +18,6 @@ class ConformerException;
}
void rdExceptionTranslator(RDKit::ConformerException const &x);
// Performs rdchem's deferred numpy initialization on first call; later calls
// are no-ops. Call this before using any numpy API from rdchem wrapper code.
void rdkit_rdchem_ensure_numpy();
#endif

View File

@@ -17,17 +17,35 @@
#include <RDBoost/import_array.h>
#include <RDGeneral/Exceptions.h>
#include <GraphMol/SanitException.h>
#ifdef RDK_BUILD_THREADSAFE_SSS
#include <mutex>
#endif
namespace python = boost::python;
using namespace RDKit;
#ifdef RDK_BUILD_THREADSAFE_SSS
// Guards the one-time numpy import performed by rdkit_rdmolops_ensure_numpy().
static std::once_flag s_rdmolops_numpy_init_flag;
#endif
// Runs rdkit_import_array() the first time it is called and does nothing on
// later calls. Wrapper functions that build numpy arrays call this first,
// which lets the module be imported without importing numpy.
//
// NOTE(review): how rdkit_import_array() reports a failed import is not
// visible from here - confirm whether callers need an explicit error check.
void rdkit_rdmolops_ensure_numpy() {
#ifdef RDK_BUILD_THREADSAFE_SSS
  std::call_once(s_rdmolops_numpy_init_flag, rdkit_import_array);
#else
  static bool initialized = false;
  if (!initialized) {
    rdkit_import_array();
    // Record completion only after the import call has returned. This mirrors
    // std::call_once, which leaves its flag unset (and so retries on the next
    // call) when the callable exits via an exception.
    initialized = true;
  }
#endif
}
void wrap_molops();
void wrap_chiralityops();
BOOST_PYTHON_MODULE(rdmolops) {
python::scope().attr("__doc__") =
"Module containing RDKit functionality for manipulating molecules.";
rdkit_import_array();
// ******************************
// Functions from MolOps

View File

@@ -13,4 +13,6 @@
#define PY_ARRAY_UNIQUE_SYMBOL rdmolops_array_API
// Performs rdmolops' deferred numpy import on first call; later calls are
// no-ops. Call this before using any numpy API from rdmolops wrapper code.
void rdkit_rdmolops_ensure_numpy();
#endif

View File

@@ -0,0 +1,164 @@
#
# Copyright (C) 2025 RDKit contributors
# All Rights Reserved
#
"""Tests that 'from rdkit import Chem' does not eagerly load numpy,
and that core Chem functionality works before numpy is loaded.
Because numpy's import state is process-global and cannot be unloaded,
import-ordering tests must run in a **fresh subprocess**.
"""
import subprocess
import sys
import textwrap
import unittest
def _run_snippet(code: str) -> subprocess.CompletedProcess:
"""Run *code* in a clean Python subprocess and return the result."""
return subprocess.run(
[sys.executable, "-c", textwrap.dedent(code)],
capture_output=True,
text=True,
timeout=60,
)
# Every test hands a small script to _run_snippet so that it runs in its own
# interpreter; the checks are about what ends up in that process's sys.modules.
# A non-zero exit status fails the test, and the child's stderr (or stdout when
# stderr is empty) is used as the failure message.
class TestLazyNumpy(unittest.TestCase):
# Import only: the snippet exits with an error message if numpy is present in
# sys.modules right after `from rdkit import Chem`.
def test_chem_import_does_not_load_numpy(self):
"""Importing rdkit.Chem must not pull numpy into sys.modules."""
result = _run_snippet("""\
import sys
from rdkit import Chem
# numpy must not have been imported as a side-effect
if "numpy" in sys.modules:
sys.exit("FAIL: numpy was loaded by 'from rdkit import Chem'")
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Parses and re-serializes benzene, then checks numpy is still not loaded.
def test_basic_smiles_roundtrip_without_numpy(self):
"""MolFromSmiles / MolToSmiles must work before numpy is loaded."""
result = _run_snippet("""\
import sys
from rdkit import Chem
mol = Chem.MolFromSmiles("c1ccccc1")
assert mol is not None, "MolFromSmiles returned None"
smi = Chem.MolToSmiles(mol)
assert smi == "c1ccccc1", f"unexpected SMILES: {smi}"
assert "numpy" not in sys.modules, "numpy crept in during SMILES ops"
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Exercises AddHs/RemoveHs, a SMARTS match and MolToMolBlock on ethanol; the
# InChI step only runs when inchi.INCHI_AVAILABLE is true. The final assertion
# checks that none of these calls loaded numpy.
def test_mol_operations_without_numpy(self):
"""Core mol operations (AddHs, sanitize, substruct) work pre-numpy."""
result = _run_snippet("""\
import sys
from rdkit import Chem
from rdkit.Chem import inchi
mol = Chem.MolFromSmiles("CCO")
assert mol is not None
# AddHs / RemoveHs
molH = Chem.AddHs(mol)
assert molH.GetNumAtoms() == 9 # 3 heavy + 6 H
mol2 = Chem.RemoveHs(molH)
assert mol2.GetNumAtoms() == 3
# Substructure match
query = Chem.MolFromSmarts("[OH]")
assert mol.HasSubstructMatch(query)
# MolToMolBlock (V2000)
mb = Chem.MolToMolBlock(mol)
assert "V2000" in mb
if inchi.INCHI_AVAILABLE:
# InChI round-trip
inchi_str = inchi.MolToInchi(mol)
assert inchi_str.startswith("InChI=")
assert "numpy" not in sys.modules, "numpy crept in during mol ops"
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Checks sys.modules before and after GetDistanceMatrix, and validates the
# shape and one entry of the returned matrix.
def test_numpy_loads_on_demand(self):
"""Calling a numpy-returning API (GetDistanceMatrix) loads numpy lazily."""
result = _run_snippet("""\
import sys
from rdkit import Chem
assert "numpy" not in sys.modules, "numpy loaded too early"
mol = Chem.MolFromSmiles("CCO")
dm = Chem.GetDistanceMatrix(mol)
assert "numpy" in sys.modules, "numpy should be loaded after GetDistanceMatrix"
assert dm.shape == (3, 3), f"unexpected shape: {dm.shape}"
assert dm[0, 2] == 2.0, f"unexpected distance: {dm[0, 2]}"
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Note: the snippet below only calls GetPositions(); SetPositions() is not
# exercised by this test.
def test_conformer_positions_loads_numpy(self):
"""GetPositions() / SetPositions() load numpy on first call."""
result = _run_snippet("""\
import sys
from rdkit import Chem
from rdkit.Chem import rdchem
assert "numpy" not in sys.modules, "numpy loaded too early"
mol = Chem.MolFromSmiles("C")
conf = rdchem.Conformer(mol.GetNumAtoms())
conf.SetAtomPosition(0, (1.0, 2.0, 3.0))
mol.AddConformer(conf, assignId=True)
pos = mol.GetConformer().GetPositions()
assert "numpy" in sys.modules, "numpy should load after GetPositions"
assert pos.shape == (1, 3), f"unexpected shape: {pos.shape}"
assert abs(pos[0, 0] - 1.0) < 1e-6
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Note: the snippet imports numpy itself to build the destination array, so
# what this verifies is that importing Chem and DataStructs leaves numpy
# unloaded and that ConvertToNumpyArray then fills the array.
def test_datastructs_convert_loads_numpy(self):
"""DataStructs.ConvertToNumpyArray loads numpy on demand."""
result = _run_snippet("""\
import sys
from rdkit import Chem, DataStructs
assert "numpy" not in sys.modules, "numpy loaded too early"
fp = Chem.RDKFingerprint(Chem.MolFromSmiles("c1ccccc1"))
import numpy as np
arr = np.zeros(len(fp), dtype=int)
DataStructs.ConvertToNumpyArray(fp, arr)
assert arr.sum() > 0, "fingerprint should have bits set"
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Checks sys.modules before and after GetAdjacencyMatrix on ethane, and
# validates three entries of the returned matrix.
def test_adjacency_matrix_loads_numpy(self):
"""GetAdjacencyMatrix loads numpy on demand and returns correct result."""
result = _run_snippet("""\
import sys
from rdkit import Chem
assert "numpy" not in sys.modules, "numpy loaded too early"
mol = Chem.MolFromSmiles("CC")
adj = Chem.GetAdjacencyMatrix(mol)
assert "numpy" in sys.modules, "numpy should load after GetAdjacencyMatrix"
assert adj[0, 1] == 1
assert adj[1, 0] == 1
assert adj[0, 0] == 0
""")
self.assertEqual(result.returncode, 0, result.stderr or result.stdout)
# Lets the file be run directly as a script, in addition to being run through
# the registered pytest target.
if __name__ == '__main__':
unittest.main()