Get things working with numpy 2.4 and pandas 3.0 (#9072)

* get BertzCT working with numpy 2.4 * test pass with Pandas 3.0 (on windows at least) * update testRanker too * update nb test * run win32 CI tests with different pandas versions also updates boost version * works with pandas 2.0? * update linux_build_py311 -> linux_build_py312 test both old and new pandas can't go higher with the python version yet because the older pandas and numpy are not available. * doctest fix? --------- Co-authored-by: = <=>
2026-06-03 21:44:30 +08:00 · 2026-02-04 12:06:21 +01:00
parent 56eb0dfa81
commit 3444408692
10 changed files with 92 additions and 52 deletions
--- a/.azure-pipelines/linux_build_py312.yml
+++ b/.azure-pipelines/linux_build_py312.yml
@@ -7,9 +7,11 @@ steps:
    conda update -q conda
    conda info -a
    conda create --name rdkit_build -c conda-forge --override-channels  $(python) cmake \
-        boost-cpp=$(boost_version) \
-        boost=$(boost_version) \
-        numpy=1.24.3 pillow eigen pandas=2.1 matplotlib-base=3.8 \
+        libboost-python-devel=$(boost_version) \
+        libboost-python=$(boost_version) \
+        libboost-devel=$(boost_version) \
+        libboost=$(boost_version) \
+        numpy=2.4 pillow eigen pandas=3 matplotlib-base=3.8 \
        cairo
    conda activate rdkit_build
    conda config --env --add channels conda-forge
@@ -66,6 +68,18 @@ steps:
    cd build
    ctest -j $( $(number_of_cores) ) --output-on-failure -T Test
  displayName: Run tests
+- bash: |
+    source ${CONDA}/etc/profile.d/conda.sh
+    conda activate rdkit_build
+    conda install -c conda-forge --override-channels numpy=1.26 pandas=2.2  # numpy 1.26 is the oldest release with Python 3.12 builds
+    export RDBASE=`pwd`
+    export PYTHONPATH=${RDBASE}:${PYTHONPATH}
+    export LD_LIBRARY_PATH=${RDBASE}/lib:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
+    echo "LD_LIBRARY_PATH: " $LD_LIBRARY_PATH
+    export QT_QPA_PLATFORM='offscreen'
+    cd build
+    ctest -j $( $(number_of_cores) ) --output-on-failure -T Test
+  displayName: Run tests with older numpy and pandas
 - bash: |
    source ${CONDA}/etc/profile.d/conda.sh
    conda activate rdkit_build
--- a/.azure-pipelines/vs_build.yml
+++ b/.azure-pipelines/vs_build.yml
@@ -7,11 +7,12 @@ steps:
    conda install -n base conda-libmamba-solver
    conda config --set solver libmamba
    conda create --name rdkit_build -c conda-forge --override-channels  $(python) ^
-        boost=$(boost_version) boost-cpp=$(boost_version) ^
+        libboost-python=$(boost_version) ^
        libboost-python-devel=$(boost_version) ^
        libboost=$(boost_version) ^
        libboost-devel=$(boost_version) ^
-        numpy matplotlib cairo pillow eigen pandas=2.1 ^
+        matplotlib cairo pillow eigen ^
+        numpy=2.4 pandas=3 ^
        sphinx myst-parser ipython jupyter pytest nbval cmake 
    call activate rdkit_build
    conda config --env --add channels conda-forge 
@@ -59,6 +60,15 @@ steps:
    cd build
    ctest -C Release -j $(number_of_cores) --output-on-failure -T Test
  displayName: Run tests
+- script: |
+    call activate rdkit_build
+    conda install -c conda-forge --override-channels  numpy=1.26 pandas=2.2
+    set RDBASE=%cd%
+    set PYTHONPATH=%RDBASE%;%PYTHONPATH%
+    set PATH=%RDBASE%\lib;%PATH%
+    cd build
+    ctest -C Release -j $(number_of_cores) --output-on-failure -T Test
+  displayName: Run tests with older numpy and pandas
 - script: |
    call activate rdkit_build
    conda install -c conda-forge --override-channels  sphinx myst-parser
--- a/Code/ML/InfoTheory/Wrap/testRanker.py
+++ b/Code/ML/InfoTheory/Wrap/testRanker.py
@@ -19,33 +19,33 @@ class TestCase(unittest.TestCase):
    pass

  def test0GainFuns(self):
-    arr = numpy.array([9, 5])
+    arr = numpy.array([9, 5],float)
    self.assertTrue(feq(rdit.InfoEntropy(arr), 0.9403))
-    arr = numpy.array([9, 9])
+    arr = numpy.array([9, 9],float)
    self.assertTrue(feq(rdit.InfoEntropy(arr), 1.0000))
-    arr = numpy.array([5, 5])
+    arr = numpy.array([5, 5],float)
    self.assertTrue(feq(rdit.InfoEntropy(arr), 1.0000))
-    arr = numpy.array([5, 0])
+    arr = numpy.array([5, 0],float)
    self.assertTrue(feq(rdit.InfoEntropy(arr), 0.0000))
-    arr = numpy.array([5, 5, 5])
+    arr = numpy.array([5, 5, 5],float)
    self.assertTrue(feq(rdit.InfoEntropy(arr), 1.5850))
-    arr = numpy.array([2, 5, 5])
+    arr = numpy.array([2, 5, 5],float)
    self.assertTrue(feq(rdit.InfoEntropy(arr), 1.4834))

-    mat2 = numpy.array([[6, 2], [3, 3]])
+    mat2 = numpy.array([[6, 2], [3, 3]],float)
    self.assertTrue(feq(rdit.InfoGain(mat2), 0.0481))
    self.assertTrue(feq(rdit.ChiSquare(mat2), 0.9333))

-    mat3 = numpy.array([[1, 1], [2, 1]])
+    mat3 = numpy.array([[1, 1], [2, 1]],float)
    self.assertTrue(feq(rdit.InfoGain(mat3), 0.0200))

-    mat4 = numpy.array([[2, 0], [1, 2]])
+    mat4 = numpy.array([[2, 0], [1, 2]],float)
    self.assertTrue(feq(rdit.InfoGain(mat4), 0.4200))

-    mat5 = numpy.array([[0, 0], [0, 0]])
+    mat5 = numpy.array([[0, 0], [0, 0]],float)
    self.assertTrue(feq(rdit.InfoGain(mat5), 0.0000))

-    mat6 = numpy.array([[1, 0], [1, 0]])
+    mat6 = numpy.array([[1, 0], [1, 0]],float)
    self.assertTrue(feq(rdit.InfoGain(mat6), 0.0000))

  def test1ranker(self):
--- a/Docs/Book/GettingStartedInPython.rst
+++ b/Docs/Book/GettingStartedInPython.rst
@@ -3500,7 +3500,7 @@ These are accessible using Python's help command:
  >>> m.GetNumAtoms()
  7
  >>> help(m.GetNumAtoms) 
-  Help on method GetNumAtoms:
+  Help on method GetNumAtoms...
  <BLANKLINE>
  GetNumAtoms(...) method of rdkit.Chem.rdchem.Mol instance
      GetNumAtoms( (Mol)self [, (int)onlyHeavy=-1 [, (bool)onlyExplicit=True]]) -> int :
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -32,20 +32,20 @@ jobs:
    cxx: g++-11
  steps:
  - template: .azure-pipelines/linux_build.yml
- job: Ubuntu_x64_py311
+- job: Ubuntu_x64_py312
  timeoutInMinutes: 120
  pool:
    vmImage: ubuntu-latest
  variables:
-    python: python=3.11
-    boost_version: 1.82.0
+    python: python=3.12
+    boost_version: 1.89.0
    compiler: gxx_linux-64
    cc: gcc-13
    cxx: g++-13
    number_of_cores: nproc
-    python_name: python311
+    python_name: python312
  steps:
-  - template: .azure-pipelines/linux_build_py311.yml
+  - template: .azure-pipelines/linux_build_py312.yml
 - job: macOS_x64
  timeoutInMinutes: 120
  pool:
--- a/rdkit/Chem/GraphDescriptors.py
+++ b/rdkit/Chem/GraphDescriptors.py
@@ -229,7 +229,7 @@ def Chi0(mol):
  deltas = [x.GetDegree() for x in mol.GetAtoms()]
  while 0 in deltas:
    deltas.remove(0)
-  deltas = numpy.array(deltas, 'd')
+  deltas = numpy.array(deltas, float)
  res = sum(numpy.sqrt(1. / deltas))
  return res

@@ -244,7 +244,7 @@ def Chi1(mol):
  c1s = [x.GetBeginAtom().GetDegree() * x.GetEndAtom().GetDegree() for x in mol.GetBonds()]
  while 0 in c1s:
    c1s.remove(0)
-  c1s = numpy.array(c1s, 'd')
+  c1s = numpy.array(c1s, float)
  res = sum(numpy.sqrt(1. / c1s))
  return res

@@ -320,7 +320,7 @@ def _pyChiNv_(mol, order=2):
                        for hkd in _hkDeltas(mol, skipHs=0)])
  accum = 0.0
  for path in Chem.FindAllPathsOfLengthN(mol, order + 1, useBonds=0):
-    accum += numpy.prod(deltas[numpy.array(path)])
+    accum += numpy.prod(deltas[numpy.array(path)], dtype=float)  # 2nd positional arg of numpy.prod is axis, so dtype must be a keyword
  return accum


@@ -358,7 +358,7 @@ def _pyChi0n(mol):
  deltas = [_nVal(x) for x in mol.GetAtoms()]
  while deltas.count(0):
    deltas.remove(0)
-  deltas = numpy.array(deltas, 'd')
+  deltas = numpy.array(deltas, float)
  res = sum(numpy.sqrt(1. / deltas))
  return res

@@ -367,7 +367,7 @@ def _pyChi1n(mol):
  """  Similar to Hall Kier Chi1v, but uses nVal instead of valence

  """
-  delts = numpy.array([_nVal(x) for x in mol.GetAtoms()], 'd')
+  delts = numpy.array([_nVal(x) for x in mol.GetAtoms()], float)
  res = 0.0
  for bond in mol.GetBonds():
    v = delts[bond.GetBeginAtomIdx()] * delts[bond.GetEndAtomIdx()]
@@ -391,7 +391,7 @@ def _pyChiNn_(mol, order=2):
  deltas = numpy.array([(1. / numpy.sqrt(x) if x else 0.0) for x in nval])
  accum = 0.0
  for path in Chem.FindAllPathsOfLengthN(mol, order + 1, useBonds=0):
-    accum += numpy.prod(deltas[numpy.array(path)])
+    accum += numpy.prod(deltas[numpy.array(path)], dtype=float)  # 2nd positional arg of numpy.prod is axis, so dtype must be a keyword
  return accum


@@ -578,10 +578,10 @@ def _CalculateEntropies(connectionDict, atomTypeDict, numAtoms):
  """
  connectionList = list(connectionDict.values())
  totConnections = sum(connectionList)
-  connectionIE = totConnections * (entropy.InfoEntropy(numpy.array(connectionList)) +
+  connectionIE = totConnections * (entropy.InfoEntropy(numpy.array(connectionList, float)) +
                                   math.log(totConnections) / _log2val)
  atomTypeList = list(atomTypeDict.values())
-  atomTypeIE = numAtoms * entropy.InfoEntropy(numpy.array(atomTypeList))
+  atomTypeIE = numAtoms * entropy.InfoEntropy(numpy.array(atomTypeList, float))
  return atomTypeIE + connectionIE


--- a/rdkit/Chem/PandasPatcher.py
+++ b/rdkit/Chem/PandasPatcher.py
@@ -88,12 +88,13 @@ except ImportError:
  log.warning("Failed to import pandas")
  raise

-dataframe_applymap = pd.DataFrame.applymap
 try:
-  if tuple(map(int, (pd.__version__.split(".")))) >= (2, 1, 0):
+  if not hasattr(pd.DataFrame, "map"):  # DataFrame.map replaced applymap in pandas 2.1; detect it instead of parsing the version
+    dataframe_applymap = pd.DataFrame.applymap
+  else:
    dataframe_applymap = pd.DataFrame.map
 except:
-  pass
+  log.warning("Failed to find a suitable map function for data frames")

 orig_to_html = getattr(to_html_class, "to_html")
 pprint_thing = pandas_formats.printing.pprint_thing
@@ -138,7 +139,7 @@ class MolFormatter:
  @classmethod
  def get_formatters(cls, df, orig_formatters):
    """Return an instance of MolFormatter for each column that contains Chem.Mol objects"""
-    df_subset = df.select_dtypes("object")
+    df_subset = df.select_dtypes(["object", "string"])
    return {
      col: cls(orig_formatters.get(col, None))
      for col in df_subset.columns[dataframe_applymap(df_subset, MolFormatter.is_mol).any()]
--- a/rdkit/Chem/PandasTools.py
+++ b/rdkit/Chem/PandasTools.py
@@ -431,7 +431,7 @@ def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumer
  if allNumeric:
    properties.extend([
      dt for dt in df.dtypes.keys()
-      if (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer))
+      if not pd.api.types.is_string_dtype(df.dtypes[dt]) and (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer))
    ])

  if molColName in properties:
--- a/rdkit/Chem/UnitTestPandasTools.py
+++ b/rdkit/Chem/UnitTestPandasTools.py
@@ -212,7 +212,7 @@ class TestPandasTools(unittest.TestCase):

  @unittest.skipIf(not hasattr(rdMolDraw2D, 'MolDraw2DCairo'), 'Cairo not available')
  def testPandasShouldShowMoleculesWhenTruncating(self):
-    csv_data = '''"Molecule ChEMBL ID";"Molecule Name";"Molecule Max Phase";"Molecular Weight";"#RO5 Violations";"AlogP";"Compound Key";"Smiles";"Standard Type";"Standard Relation";"Standard Value";"Standard Units";"pChEMBL Value";"Data Validity Comment";"Comment";"Uo Units";"Ligand Efficiency BEI";"Ligand Efficiency LE";"Ligand Efficiency LLE";"Ligand Efficiency SEI";"Potential Duplicate";"Assay ChEMBL ID";"Assay Description";"Assay Type";"BAO Format ID";"BAO Label";"Assay Organism";"Assay Tissue ChEMBL ID";"Assay Tissue Name";"Assay Cell Type";"Assay Subcellular Fraction";"Target ChEMBL ID";"Target Name";"Target Organism";"Target Type";"Document ChEMBL ID";"Source ID";"Source Description";"Document Journal";"Document Year";"Cell ChEMBL ID"
+    csv_data = r'''"Molecule ChEMBL ID";"Molecule Name";"Molecule Max Phase";"Molecular Weight";"#RO5 Violations";"AlogP";"Compound Key";"Smiles";"Standard Type";"Standard Relation";"Standard Value";"Standard Units";"pChEMBL Value";"Data Validity Comment";"Comment";"Uo Units";"Ligand Efficiency BEI";"Ligand Efficiency LE";"Ligand Efficiency LLE";"Ligand Efficiency SEI";"Potential Duplicate";"Assay ChEMBL ID";"Assay Description";"Assay Type";"BAO Format ID";"BAO Label";"Assay Organism";"Assay Tissue ChEMBL ID";"Assay Tissue Name";"Assay Cell Type";"Assay Subcellular Fraction";"Target ChEMBL ID";"Target Name";"Target Organism";"Target Type";"Document ChEMBL ID";"Source ID";"Source Description";"Document Journal";"Document Year";"Cell ChEMBL ID"
  "CHEMBL543779";"";"0";"341.86";"0";"2.60";"1w";"CCN(CC)CCS/C(=N\O)C(=O)c1ccc(C#N)cc1.Cl";"IC50";"'='";"180000.0";"nM";"";"Outside typical range";"";"UO_0000065";"";"";"";"";"False";"CHEMBL644102";"Reversible inhibition of Human AchE";"B";"BAO_0000357";"single protein format";"None";"None";"None";"None";"None";"CHEMBL220";"Acetylcholinesterase";"Homo sapiens";"SINGLE PROTEIN";"CHEMBL1123431";"1";"Scientific Literature";"J. Med. Chem.";"1986";"None"
  '''
    try:
--- a/rdkit/Chem/nbtests/github4823.ipynb
+++ b/rdkit/Chem/nbtests/github4823.ipynb
@@ -130,7 +130,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "id": "permanent-liechtenstein",
   "metadata": {
    "scrolled": true
@@ -139,36 +139,51 @@
    {
     "data": {
      "text/plain": [
-       "Index(['AMW', 'CLOGP', 'CP', 'CR', 'DAYLIGHT.FPG', 'DAYLIGHT_CLOGP', 'FP',\n",
-       "       'ISM', 'LIPINSKI_VIOLATIONS', 'NUM_HACCEPTORS', 'NUM_HDONORS',\n",
-       "       'NUM_HETEROATOMS', 'NUM_LIPINSKIHACCEPTORS', 'NUM_LIPINSKIHDONORS',\n",
-       "       'NUM_RINGS', 'NUM_ROTATABLEBONDS', 'NUM_ROTATABLEBONDS_O', 'P1',\n",
-       "       'SMILES', 'ID', 'ROMol'],\n",
-       "      dtype='object')"
+       "['AMW',\n",
+       " 'CLOGP',\n",
+       " 'CP',\n",
+       " 'CR',\n",
+       " 'DAYLIGHT.FPG',\n",
+       " 'DAYLIGHT_CLOGP',\n",
+       " 'FP',\n",
+       " 'ISM',\n",
+       " 'LIPINSKI_VIOLATIONS',\n",
+       " 'NUM_HACCEPTORS',\n",
+       " 'NUM_HDONORS',\n",
+       " 'NUM_HETEROATOMS',\n",
+       " 'NUM_LIPINSKIHACCEPTORS',\n",
+       " 'NUM_LIPINSKIHDONORS',\n",
+       " 'NUM_RINGS',\n",
+       " 'NUM_ROTATABLEBONDS',\n",
+       " 'NUM_ROTATABLEBONDS_O',\n",
+       " 'P1',\n",
+       " 'SMILES',\n",
+       " 'ID',\n",
+       " 'ROMol']"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "df.columns"
+    "list(df.columns)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "id": "careful-netherlands",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "<module 'rdkit.Chem.PandasTools' from '/scratch/RDKit_git/rdkit/Chem/PandasTools.py'>"
+       "<module 'rdkit.Chem.PandasTools' from '/localhome/glandrum/RDKit_git/rdkit/Chem/PandasTools.py'>"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -181,7 +196,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "id": "identical-finder",
   "metadata": {},
   "outputs": [
@@ -261,7 +276,7 @@
       "4    223.231   2.43   1.869;-0P;4.71   6.390;-0R;4.71"
      ]
     },
-     "execution_count": 6,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -281,7 +296,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "py312_build",
   "language": "python",
   "name": "python3"
  },
@@ -295,7 +310,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.8"
+   "version": "3.12.3"
  },
  "toc": {
   "base_numbering": 1,