From 34444086926ad9d550c75a996fc9b9e63c9da143 Mon Sep 17 00:00:00 2001
From: Greg Landrum <greg.landrum@gmail.com>
Date: Wed, 4 Feb 2026 12:06:21 +0100
Subject: [PATCH] Get things working with numpy 2.4 and pandas 3.0 (#9072)

* get BertzCT working with numpy 2.4

* tests pass with Pandas 3.0
(on windows at least)

* update testRanker too

* update nb test

* run win32 CI tests with different pandas versions
also updates boost version

* works with pandas 2.0?

* update linux_build_py311 -> linux_build_py312
test both old and new pandas

can't go higher with the python version yet because the older pandas and numpy are not available.

* doctest fix?

---------

Co-authored-by: = <=>
---
 ..._build_py311.yml => linux_build_py312.yml} | 20 ++++++--
 .azure-pipelines/vs_build.yml                 | 14 +++++-
 Code/ML/InfoTheory/Wrap/testRanker.py         | 22 ++++-----
 Docs/Book/GettingStartedInPython.rst          |  2 +-
 azure-pipelines.yml                           | 10 ++--
 rdkit/Chem/GraphDescriptors.py                | 16 +++----
 rdkit/Chem/PandasPatcher.py                   |  9 ++--
 rdkit/Chem/PandasTools.py                     |  2 +-
 rdkit/Chem/UnitTestPandasTools.py             |  2 +-
 rdkit/Chem/nbtests/github4823.ipynb           | 47 ++++++++++++-------
 10 files changed, 92 insertions(+), 52 deletions(-)
 rename .azure-pipelines/{linux_build_py311.yml => linux_build_py312.yml} (79%)

diff --git a/.azure-pipelines/linux_build_py311.yml b/.azure-pipelines/linux_build_py312.yml
similarity index 79%
rename from .azure-pipelines/linux_build_py311.yml
rename to .azure-pipelines/linux_build_py312.yml
index 5acb7ffc1..ab8ec6d97 100644
--- a/.azure-pipelines/linux_build_py311.yml
+++ b/.azure-pipelines/linux_build_py312.yml
@@ -7,9 +7,11 @@ steps:
     conda update -q conda
     conda info -a
     conda create --name rdkit_build -c conda-forge --override-channels  $(python) cmake \
-        boost-cpp=$(boost_version) \
-        boost=$(boost_version) \
-        numpy=1.24.3 pillow eigen pandas=2.1 matplotlib-base=3.8 \
+        libboost-python-devel=$(boost_version) \
+        libboost-python=$(boost_version) \
+        libboost-devel=$(boost_version) \
+        libboost=$(boost_version) \
+        numpy=2.4 pillow eigen pandas=3 matplotlib-base=3.8 \
         cairo
     conda activate rdkit_build
     conda config --env --add channels conda-forge
@@ -66,6 +68,18 @@ steps:
     cd build
     ctest -j $( $(number_of_cores) ) --output-on-failure -T Test
   displayName: Run tests
+- bash: |
+    source ${CONDA}/etc/profile.d/conda.sh
+    conda activate rdkit_build
+    conda install -c conda-forge --override-channels numpy=1.26 pandas=2.2
+    export RDBASE=`pwd`
+    export PYTHONPATH=${RDBASE}:${PYTHONPATH}
+    export LD_LIBRARY_PATH=${RDBASE}/lib:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
+    echo "LD_LIBRARY_PATH: " $LD_LIBRARY_PATH
+    export QT_QPA_PLATFORM='offscreen'
+    cd build
+    ctest -j $( $(number_of_cores) ) --output-on-failure -T Test
+  displayName: Run tests with older numpy and pandas
 - bash: |
     source ${CONDA}/etc/profile.d/conda.sh
     conda activate rdkit_build
diff --git a/.azure-pipelines/vs_build.yml b/.azure-pipelines/vs_build.yml
index e06511b0f..4f59647f6 100644
--- a/.azure-pipelines/vs_build.yml
+++ b/.azure-pipelines/vs_build.yml
@@ -7,11 +7,12 @@ steps:
     conda install -n base conda-libmamba-solver
     conda config --set solver libmamba
     conda create --name rdkit_build -c conda-forge --override-channels  $(python) ^
-        boost=$(boost_version) boost-cpp=$(boost_version) ^
+        libboost-python=$(boost_version) ^
-        libboost-python-devel=(boost_version) ^
+        libboost-python-devel=$(boost_version) ^
         libboost=$(boost_version) ^
         libboost-devel=$(boost_version) ^
-        numpy matplotlib cairo pillow eigen pandas=2.1 ^
+        numpy matplotlib cairo pillow eigen ^
+        numpy=2.4 pandas=3 ^
         sphinx myst-parser ipython jupyter pytest nbval cmake 
     call activate rdkit_build
     conda config --env --add channels conda-forge 
@@ -59,6 +60,15 @@ steps:
     cd build
     ctest -C Release -j $(number_of_cores) --output-on-failure -T Test
   displayName: Run tests
+- script: |
+    call activate rdkit_build
+    conda install -c conda-forge --override-channels  numpy=1.26 pandas=2.2
+    set RDBASE=%cd%
+    set PYTHONPATH=%RDBASE%;%PYTHONPATH%
+    set PATH=%RDBASE%\lib;%PATH%
+    cd build
+    ctest -C Release -j $(number_of_cores) --output-on-failure -T Test
+  displayName: Run with older numpy and pandas
 - script: |
     call activate rdkit_build
     conda install -c conda-forge --override-channels  sphinx myst-parser
diff --git a/Code/ML/InfoTheory/Wrap/testRanker.py b/Code/ML/InfoTheory/Wrap/testRanker.py
index 65f49a4d9..76502055b 100644
--- a/Code/ML/InfoTheory/Wrap/testRanker.py
+++ b/Code/ML/InfoTheory/Wrap/testRanker.py
@@ -19,33 +19,33 @@ class TestCase(unittest.TestCase):
     pass
 
   def test0GainFuns(self):
-    arr = numpy.array([9, 5])
+    arr = numpy.array([9, 5],float)
     self.assertTrue(feq(rdit.InfoEntropy(arr), 0.9403))
-    arr = numpy.array([9, 9])
+    arr = numpy.array([9, 9],float)
     self.assertTrue(feq(rdit.InfoEntropy(arr), 1.0000))
-    arr = numpy.array([5, 5])
+    arr = numpy.array([5, 5],float)
     self.assertTrue(feq(rdit.InfoEntropy(arr), 1.0000))
-    arr = numpy.array([5, 0])
+    arr = numpy.array([5, 0],float)
     self.assertTrue(feq(rdit.InfoEntropy(arr), 0.0000))
-    arr = numpy.array([5, 5, 5])
+    arr = numpy.array([5, 5, 5],float)
     self.assertTrue(feq(rdit.InfoEntropy(arr), 1.5850))
-    arr = numpy.array([2, 5, 5])
+    arr = numpy.array([2, 5, 5],float)
     self.assertTrue(feq(rdit.InfoEntropy(arr), 1.4834))
 
-    mat2 = numpy.array([[6, 2], [3, 3]])
+    mat2 = numpy.array([[6, 2], [3, 3]],float)
     self.assertTrue(feq(rdit.InfoGain(mat2), 0.0481))
     self.assertTrue(feq(rdit.ChiSquare(mat2), 0.9333))
 
-    mat3 = numpy.array([[1, 1], [2, 1]])
+    mat3 = numpy.array([[1, 1], [2, 1]],float)
     self.assertTrue(feq(rdit.InfoGain(mat3), 0.0200))
 
-    mat4 = numpy.array([[2, 0], [1, 2]])
+    mat4 = numpy.array([[2, 0], [1, 2]],float)
     self.assertTrue(feq(rdit.InfoGain(mat4), 0.4200))
 
-    mat5 = numpy.array([[0, 0], [0, 0]])
+    mat5 = numpy.array([[0, 0], [0, 0]],float)
     self.assertTrue(feq(rdit.InfoGain(mat5), 0.0000))
 
-    mat6 = numpy.array([[1, 0], [1, 0]])
+    mat6 = numpy.array([[1, 0], [1, 0]],float)
     self.assertTrue(feq(rdit.InfoGain(mat6), 0.0000))
 
   def test1ranker(self):
diff --git a/Docs/Book/GettingStartedInPython.rst b/Docs/Book/GettingStartedInPython.rst
index 63ae8cbad..a70d92332 100644
--- a/Docs/Book/GettingStartedInPython.rst
+++ b/Docs/Book/GettingStartedInPython.rst
@@ -3500,7 +3500,7 @@ These are accessible using Python's help command:
   >>> m.GetNumAtoms()
   7
   >>> help(m.GetNumAtoms) 
-  Help on method GetNumAtoms:
+  Help on method GetNumAtoms...
   <BLANKLINE>
   GetNumAtoms(...) method of rdkit.Chem.rdchem.Mol instance
       GetNumAtoms( (Mol)self [, (int)onlyHeavy=-1 [, (bool)onlyExplicit=True]]) -> int :
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 5e6406901..54658fecd 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -32,20 +32,20 @@ jobs:
     cxx: g++-11
   steps:
   - template: .azure-pipelines/linux_build.yml
-- job: Ubuntu_x64_py311
+- job: Ubuntu_x64_py312
   timeoutInMinutes: 120
   pool:
     vmImage: ubuntu-latest
   variables:
-    python: python=3.11
-    boost_version: 1.82.0
+    python: python=3.12
+    boost_version: 1.89.0
     compiler: gxx_linux-64
     cc: gcc-13
     cxx: g++-13
     number_of_cores: nproc
-    python_name: python311
+    python_name: python312
   steps:
-  - template: .azure-pipelines/linux_build_py311.yml
+  - template: .azure-pipelines/linux_build_py312.yml
 - job: macOS_x64
   timeoutInMinutes: 120
   pool:
diff --git a/rdkit/Chem/GraphDescriptors.py b/rdkit/Chem/GraphDescriptors.py
index 52e6e18cd..fc91cf5d7 100644
--- a/rdkit/Chem/GraphDescriptors.py
+++ b/rdkit/Chem/GraphDescriptors.py
@@ -229,7 +229,7 @@ def Chi0(mol):
   deltas = [x.GetDegree() for x in mol.GetAtoms()]
   while 0 in deltas:
     deltas.remove(0)
-  deltas = numpy.array(deltas, 'd')
+  deltas = numpy.array(deltas, float)
   res = sum(numpy.sqrt(1. / deltas))
   return res
 
@@ -244,7 +244,7 @@ def Chi1(mol):
   c1s = [x.GetBeginAtom().GetDegree() * x.GetEndAtom().GetDegree() for x in mol.GetBonds()]
   while 0 in c1s:
     c1s.remove(0)
-  c1s = numpy.array(c1s, 'd')
+  c1s = numpy.array(c1s, float)
   res = sum(numpy.sqrt(1. / c1s))
   return res
 
@@ -320,7 +320,7 @@ def _pyChiNv_(mol, order=2):
                         for hkd in _hkDeltas(mol, skipHs=0)])
   accum = 0.0
   for path in Chem.FindAllPathsOfLengthN(mol, order + 1, useBonds=0):
-    accum += numpy.prod(deltas[numpy.array(path)])
+    accum += numpy.prod(deltas[numpy.array(path)], dtype=float)
   return accum
 
 
@@ -358,7 +358,7 @@ def _pyChi0n(mol):
   deltas = [_nVal(x) for x in mol.GetAtoms()]
   while deltas.count(0):
     deltas.remove(0)
-  deltas = numpy.array(deltas, 'd')
+  deltas = numpy.array(deltas, float)
   res = sum(numpy.sqrt(1. / deltas))
   return res
 
@@ -367,7 +367,7 @@ def _pyChi1n(mol):
   """  Similar to Hall Kier Chi1v, but uses nVal instead of valence
 
   """
-  delts = numpy.array([_nVal(x) for x in mol.GetAtoms()], 'd')
+  delts = numpy.array([_nVal(x) for x in mol.GetAtoms()], float)
   res = 0.0
   for bond in mol.GetBonds():
     v = delts[bond.GetBeginAtomIdx()] * delts[bond.GetEndAtomIdx()]
@@ -391,7 +391,7 @@ def _pyChiNn_(mol, order=2):
   deltas = numpy.array([(1. / numpy.sqrt(x) if x else 0.0) for x in nval])
   accum = 0.0
   for path in Chem.FindAllPathsOfLengthN(mol, order + 1, useBonds=0):
-    accum += numpy.prod(deltas[numpy.array(path)])
+    accum += numpy.prod(deltas[numpy.array(path)], dtype=float)
   return accum
 
 
@@ -578,10 +578,10 @@ def _CalculateEntropies(connectionDict, atomTypeDict, numAtoms):
   """
   connectionList = list(connectionDict.values())
   totConnections = sum(connectionList)
-  connectionIE = totConnections * (entropy.InfoEntropy(numpy.array(connectionList)) +
+  connectionIE = totConnections * (entropy.InfoEntropy(numpy.array(connectionList, float)) +
                                    math.log(totConnections) / _log2val)
   atomTypeList = list(atomTypeDict.values())
-  atomTypeIE = numAtoms * entropy.InfoEntropy(numpy.array(atomTypeList))
+  atomTypeIE = numAtoms * entropy.InfoEntropy(numpy.array(atomTypeList, float))
   return atomTypeIE + connectionIE
 
 
diff --git a/rdkit/Chem/PandasPatcher.py b/rdkit/Chem/PandasPatcher.py
index 375a162d2..a3d864c08 100644
--- a/rdkit/Chem/PandasPatcher.py
+++ b/rdkit/Chem/PandasPatcher.py
@@ -88,12 +88,13 @@ except ImportError:
   log.warning("Failed to import pandas")
   raise
 
-dataframe_applymap = pd.DataFrame.applymap
-try:
-  if tuple(map(int, (pd.__version__.split(".")))) >= (2, 1, 0):
-    dataframe_applymap = pd.DataFrame.map
-except:
-  pass
+# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1, and newer
+# pandas releases no longer provide applymap. Probe for the method rather than
+# parsing pd.__version__: int() fails on pre-release components such as "0rc1",
+# which would otherwise leave dataframe_applymap undefined.
+dataframe_applymap = getattr(pd.DataFrame, "map", None) or getattr(pd.DataFrame, "applymap", None)
+if dataframe_applymap is None:
+  log.warning("Failed to find a suitable map function for data frames")
 
 orig_to_html = getattr(to_html_class, "to_html")
 pprint_thing = pandas_formats.printing.pprint_thing
@@ -138,7 +139,7 @@ class MolFormatter:
   @classmethod
   def get_formatters(cls, df, orig_formatters):
     """Return an instance of MolFormatter for each column that contains Chem.Mol objects"""
-    df_subset = df.select_dtypes("object")
+    df_subset = df.select_dtypes(["object", "string"])
     return {
       col: cls(orig_formatters.get(col, None))
       for col in df_subset.columns[dataframe_applymap(df_subset, MolFormatter.is_mol).any()]
diff --git a/rdkit/Chem/PandasTools.py b/rdkit/Chem/PandasTools.py
index 173d8c86c..7997c092d 100644
--- a/rdkit/Chem/PandasTools.py
+++ b/rdkit/Chem/PandasTools.py
@@ -431,7 +431,7 @@ def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumer
   if allNumeric:
     properties.extend([
       dt for dt in df.dtypes.keys()
-      if (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer))
+      if not pd.api.types.is_string_dtype(df.dtypes[dt]) and (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer))
     ])
 
   if molColName in properties:
diff --git a/rdkit/Chem/UnitTestPandasTools.py b/rdkit/Chem/UnitTestPandasTools.py
index 10ffcec65..198a82f2a 100644
--- a/rdkit/Chem/UnitTestPandasTools.py
+++ b/rdkit/Chem/UnitTestPandasTools.py
@@ -212,7 +212,7 @@ class TestPandasTools(unittest.TestCase):
 
   @unittest.skipIf(not hasattr(rdMolDraw2D, 'MolDraw2DCairo'), 'Cairo not available')
   def testPandasShouldShowMoleculesWhenTruncating(self):
-    csv_data = '''"Molecule ChEMBL ID";"Molecule Name";"Molecule Max Phase";"Molecular Weight";"#RO5 Violations";"AlogP";"Compound Key";"Smiles";"Standard Type";"Standard Relation";"Standard Value";"Standard Units";"pChEMBL Value";"Data Validity Comment";"Comment";"Uo Units";"Ligand Efficiency BEI";"Ligand Efficiency LE";"Ligand Efficiency LLE";"Ligand Efficiency SEI";"Potential Duplicate";"Assay ChEMBL ID";"Assay Description";"Assay Type";"BAO Format ID";"BAO Label";"Assay Organism";"Assay Tissue ChEMBL ID";"Assay Tissue Name";"Assay Cell Type";"Assay Subcellular Fraction";"Target ChEMBL ID";"Target Name";"Target Organism";"Target Type";"Document ChEMBL ID";"Source ID";"Source Description";"Document Journal";"Document Year";"Cell ChEMBL ID"
+    csv_data = r'''"Molecule ChEMBL ID";"Molecule Name";"Molecule Max Phase";"Molecular Weight";"#RO5 Violations";"AlogP";"Compound Key";"Smiles";"Standard Type";"Standard Relation";"Standard Value";"Standard Units";"pChEMBL Value";"Data Validity Comment";"Comment";"Uo Units";"Ligand Efficiency BEI";"Ligand Efficiency LE";"Ligand Efficiency LLE";"Ligand Efficiency SEI";"Potential Duplicate";"Assay ChEMBL ID";"Assay Description";"Assay Type";"BAO Format ID";"BAO Label";"Assay Organism";"Assay Tissue ChEMBL ID";"Assay Tissue Name";"Assay Cell Type";"Assay Subcellular Fraction";"Target ChEMBL ID";"Target Name";"Target Organism";"Target Type";"Document ChEMBL ID";"Source ID";"Source Description";"Document Journal";"Document Year";"Cell ChEMBL ID"
   "CHEMBL543779";"";"0";"341.86";"0";"2.60";"1w";"CCN(CC)CCS/C(=N\O)C(=O)c1ccc(C#N)cc1.Cl";"IC50";"'='";"180000.0";"nM";"";"Outside typical range";"";"UO_0000065";"";"";"";"";"False";"CHEMBL644102";"Reversible inhibition of Human AchE";"B";"BAO_0000357";"single protein format";"None";"None";"None";"None";"None";"CHEMBL220";"Acetylcholinesterase";"Homo sapiens";"SINGLE PROTEIN";"CHEMBL1123431";"1";"Scientific Literature";"J. Med. Chem.";"1986";"None"
   '''
     try:
diff --git a/rdkit/Chem/nbtests/github4823.ipynb b/rdkit/Chem/nbtests/github4823.ipynb
index 27cea7638..68b1914b5 100644
--- a/rdkit/Chem/nbtests/github4823.ipynb
+++ b/rdkit/Chem/nbtests/github4823.ipynb
@@ -130,7 +130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "permanent-liechtenstein",
    "metadata": {
     "scrolled": true
@@ -139,36 +139,51 @@
     {
      "data": {
       "text/plain": [
-       "Index(['AMW', 'CLOGP', 'CP', 'CR', 'DAYLIGHT.FPG', 'DAYLIGHT_CLOGP', 'FP',\n",
-       "       'ISM', 'LIPINSKI_VIOLATIONS', 'NUM_HACCEPTORS', 'NUM_HDONORS',\n",
-       "       'NUM_HETEROATOMS', 'NUM_LIPINSKIHACCEPTORS', 'NUM_LIPINSKIHDONORS',\n",
-       "       'NUM_RINGS', 'NUM_ROTATABLEBONDS', 'NUM_ROTATABLEBONDS_O', 'P1',\n",
-       "       'SMILES', 'ID', 'ROMol'],\n",
-       "      dtype='object')"
+       "['AMW',\n",
+       " 'CLOGP',\n",
+       " 'CP',\n",
+       " 'CR',\n",
+       " 'DAYLIGHT.FPG',\n",
+       " 'DAYLIGHT_CLOGP',\n",
+       " 'FP',\n",
+       " 'ISM',\n",
+       " 'LIPINSKI_VIOLATIONS',\n",
+       " 'NUM_HACCEPTORS',\n",
+       " 'NUM_HDONORS',\n",
+       " 'NUM_HETEROATOMS',\n",
+       " 'NUM_LIPINSKIHACCEPTORS',\n",
+       " 'NUM_LIPINSKIHDONORS',\n",
+       " 'NUM_RINGS',\n",
+       " 'NUM_ROTATABLEBONDS',\n",
+       " 'NUM_ROTATABLEBONDS_O',\n",
+       " 'P1',\n",
+       " 'SMILES',\n",
+       " 'ID',\n",
+       " 'ROMol']"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df.columns"
+    "list(df.columns)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "careful-netherlands",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<module 'rdkit.Chem.PandasTools' from '/scratch/RDKit_git/rdkit/Chem/PandasTools.py'>"
+       "<module 'rdkit.Chem.PandasTools' from '/localhome/glandrum/RDKit_git/rdkit/Chem/PandasTools.py'>"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -181,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "identical-finder",
    "metadata": {},
    "outputs": [
@@ -261,7 +276,7 @@
        "4    223.231   2.43   1.869;-0P;4.71   6.390;-0R;4.71"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -281,7 +296,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "py312_build",
    "language": "python",
    "name": "python3"
   },
@@ -295,7 +310,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.8"
+   "version": "3.12.3"
   },
   "toc": {
    "base_numbering": 1,