From 34444086926ad9d550c75a996fc9b9e63c9da143 Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Wed, 4 Feb 2026 12:06:21 +0100 Subject: [PATCH] Get things working with numpy 2.4 and pandas 3.0 (#9072) * get BertzCT working with numpy 2.4 * tests pass with Pandas 3.0 (on windows at least) * update testRanker too * update nb test * run win32 CI tests with different pandas versions also updates boost version * works with pandas 2.0? * update linux_build_py311 -> linux_build_py312 test both old and new pandas can't go higher with the python version yet because the older pandas and numpy are not available. * doctest fix? --------- Co-authored-by: = <=> --- ..._build_py311.yml => linux_build_py312.yml} | 20 ++++++-- .azure-pipelines/vs_build.yml | 14 +++++- Code/ML/InfoTheory/Wrap/testRanker.py | 22 ++++----- Docs/Book/GettingStartedInPython.rst | 2 +- azure-pipelines.yml | 10 ++-- rdkit/Chem/GraphDescriptors.py | 16 +++---- rdkit/Chem/PandasPatcher.py | 9 ++-- rdkit/Chem/PandasTools.py | 2 +- rdkit/Chem/UnitTestPandasTools.py | 2 +- rdkit/Chem/nbtests/github4823.ipynb | 47 ++++++++++++------- 10 files changed, 92 insertions(+), 52 deletions(-) rename .azure-pipelines/{linux_build_py311.yml => linux_build_py312.yml} (79%) diff --git a/.azure-pipelines/linux_build_py311.yml b/.azure-pipelines/linux_build_py312.yml similarity index 79% rename from .azure-pipelines/linux_build_py311.yml rename to .azure-pipelines/linux_build_py312.yml index 5acb7ffc1..ab8ec6d97 100644 --- a/.azure-pipelines/linux_build_py311.yml +++ b/.azure-pipelines/linux_build_py312.yml @@ -7,9 +7,11 @@ steps: conda update -q conda conda info -a conda create --name rdkit_build -c conda-forge --override-channels $(python) cmake \ - boost-cpp=$(boost_version) \ - boost=$(boost_version) \ - numpy=1.24.3 pillow eigen pandas=2.1 matplotlib-base=3.8 \ + libboost-python-devel=$(boost_version) \ + libboost-python=$(boost_version) \ + libboost-devel=$(boost_version) \ + libboost=$(boost_version) \ + numpy=2.4 
pillow eigen pandas=3 matplotlib-base=3.8 \ cairo conda activate rdkit_build conda config --env --add channels conda-forge @@ -66,6 +68,18 @@ steps: cd build ctest -j $( $(number_of_cores) ) --output-on-failure -T Test displayName: Run tests +- bash: | + source ${CONDA}/etc/profile.d/conda.sh + conda activate rdkit_build + conda install -c conda-forge --override-channels numpy=1.26 pandas=2.2 + export RDBASE=`pwd` + export PYTHONPATH=${RDBASE}:${PYTHONPATH} + export LD_LIBRARY_PATH=${RDBASE}/lib:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} + echo "LD_LIBRARY_PATH: " $LD_LIBRARY_PATH + export QT_QPA_PLATFORM='offscreen' + cd build + ctest -j $( $(number_of_cores) ) --output-on-failure -T Test + displayName: Run tests with older numpy and pandas - bash: | source ${CONDA}/etc/profile.d/conda.sh conda activate rdkit_build diff --git a/.azure-pipelines/vs_build.yml b/.azure-pipelines/vs_build.yml index e06511b0f..4f59647f6 100644 --- a/.azure-pipelines/vs_build.yml +++ b/.azure-pipelines/vs_build.yml @@ -7,11 +7,12 @@ steps: conda install -n base conda-libmamba-solver conda config --set solver libmamba conda create --name rdkit_build -c conda-forge --override-channels $(python) ^ - boost=$(boost_version) boost-cpp=$(boost_version) ^ + libboost-python=$(boost_version) ^ libboost-python-devel=(boost_version) ^ libboost=$(boost_version) ^ libboost-devel=$(boost_version) ^ - numpy matplotlib cairo pillow eigen pandas=2.1 ^ + numpy matplotlib cairo pillow eigen ^ + numpy=2.4 pandas=3 ^ sphinx myst-parser ipython jupyter pytest nbval cmake call activate rdkit_build conda config --env --add channels conda-forge @@ -59,6 +60,15 @@ steps: cd build ctest -C Release -j $(number_of_cores) --output-on-failure -T Test displayName: Run tests +- script: | + call activate rdkit_build + conda install -c conda-forge --override-channels numpy=1.26 pandas=2.2 + set RDBASE=%cd% + set PYTHONPATH=%RDBASE%;%PYTHONPATH% + set PATH=%RDBASE%\lib;%PATH% + cd build + ctest -C Release -j $(number_of_cores) 
--output-on-failure -T Test + displayName: Run with older numpy and pandas - script: | call activate rdkit_build conda install -c conda-forge --override-channels sphinx myst-parser diff --git a/Code/ML/InfoTheory/Wrap/testRanker.py b/Code/ML/InfoTheory/Wrap/testRanker.py index 65f49a4d9..76502055b 100644 --- a/Code/ML/InfoTheory/Wrap/testRanker.py +++ b/Code/ML/InfoTheory/Wrap/testRanker.py @@ -19,33 +19,33 @@ class TestCase(unittest.TestCase): pass def test0GainFuns(self): - arr = numpy.array([9, 5]) + arr = numpy.array([9, 5],float) self.assertTrue(feq(rdit.InfoEntropy(arr), 0.9403)) - arr = numpy.array([9, 9]) + arr = numpy.array([9, 9],float) self.assertTrue(feq(rdit.InfoEntropy(arr), 1.0000)) - arr = numpy.array([5, 5]) + arr = numpy.array([5, 5],float) self.assertTrue(feq(rdit.InfoEntropy(arr), 1.0000)) - arr = numpy.array([5, 0]) + arr = numpy.array([5, 0],float) self.assertTrue(feq(rdit.InfoEntropy(arr), 0.0000)) - arr = numpy.array([5, 5, 5]) + arr = numpy.array([5, 5, 5],float) self.assertTrue(feq(rdit.InfoEntropy(arr), 1.5850)) - arr = numpy.array([2, 5, 5]) + arr = numpy.array([2, 5, 5],float) self.assertTrue(feq(rdit.InfoEntropy(arr), 1.4834)) - mat2 = numpy.array([[6, 2], [3, 3]]) + mat2 = numpy.array([[6, 2], [3, 3]],float) self.assertTrue(feq(rdit.InfoGain(mat2), 0.0481)) self.assertTrue(feq(rdit.ChiSquare(mat2), 0.9333)) - mat3 = numpy.array([[1, 1], [2, 1]]) + mat3 = numpy.array([[1, 1], [2, 1]],float) self.assertTrue(feq(rdit.InfoGain(mat3), 0.0200)) - mat4 = numpy.array([[2, 0], [1, 2]]) + mat4 = numpy.array([[2, 0], [1, 2]],float) self.assertTrue(feq(rdit.InfoGain(mat4), 0.4200)) - mat5 = numpy.array([[0, 0], [0, 0]]) + mat5 = numpy.array([[0, 0], [0, 0]],float) self.assertTrue(feq(rdit.InfoGain(mat5), 0.0000)) - mat6 = numpy.array([[1, 0], [1, 0]]) + mat6 = numpy.array([[1, 0], [1, 0]],float) self.assertTrue(feq(rdit.InfoGain(mat6), 0.0000)) def test1ranker(self): diff --git a/Docs/Book/GettingStartedInPython.rst 
b/Docs/Book/GettingStartedInPython.rst index 63ae8cbad..a70d92332 100644 --- a/Docs/Book/GettingStartedInPython.rst +++ b/Docs/Book/GettingStartedInPython.rst @@ -3500,7 +3500,7 @@ These are accessible using Python's help command: >>> m.GetNumAtoms() 7 >>> help(m.GetNumAtoms) - Help on method GetNumAtoms: + Help on method GetNumAtoms... GetNumAtoms(...) method of rdkit.Chem.rdchem.Mol instance GetNumAtoms( (Mol)self [, (int)onlyHeavy=-1 [, (bool)onlyExplicit=True]]) -> int : diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5e6406901..54658fecd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,20 +32,20 @@ jobs: cxx: g++-11 steps: - template: .azure-pipelines/linux_build.yml -- job: Ubuntu_x64_py311 +- job: Ubuntu_x64_py312 timeoutInMinutes: 120 pool: vmImage: ubuntu-latest variables: - python: python=3.11 - boost_version: 1.82.0 + python: python=3.12 + boost_version: 1.89.0 compiler: gxx_linux-64 cc: gcc-13 cxx: g++-13 number_of_cores: nproc - python_name: python311 + python_name: python312 steps: - - template: .azure-pipelines/linux_build_py311.yml + - template: .azure-pipelines/linux_build_py312.yml - job: macOS_x64 timeoutInMinutes: 120 pool: diff --git a/rdkit/Chem/GraphDescriptors.py b/rdkit/Chem/GraphDescriptors.py index 52e6e18cd..fc91cf5d7 100644 --- a/rdkit/Chem/GraphDescriptors.py +++ b/rdkit/Chem/GraphDescriptors.py @@ -229,7 +229,7 @@ def Chi0(mol): deltas = [x.GetDegree() for x in mol.GetAtoms()] while 0 in deltas: deltas.remove(0) - deltas = numpy.array(deltas, 'd') + deltas = numpy.array(deltas, float) res = sum(numpy.sqrt(1. / deltas)) return res @@ -244,7 +244,7 @@ def Chi1(mol): c1s = [x.GetBeginAtom().GetDegree() * x.GetEndAtom().GetDegree() for x in mol.GetBonds()] while 0 in c1s: c1s.remove(0) - c1s = numpy.array(c1s, 'd') + c1s = numpy.array(c1s, float) res = sum(numpy.sqrt(1. 
/ c1s)) return res @@ -320,7 +320,7 @@ def _pyChiNv_(mol, order=2): for hkd in _hkDeltas(mol, skipHs=0)]) accum = 0.0 for path in Chem.FindAllPathsOfLengthN(mol, order + 1, useBonds=0): - accum += numpy.prod(deltas[numpy.array(path)]) + accum += numpy.prod(deltas[numpy.array(path)], dtype=float) return accum @@ -358,7 +358,7 @@ def _pyChi0n(mol): deltas = [_nVal(x) for x in mol.GetAtoms()] while deltas.count(0): deltas.remove(0) - deltas = numpy.array(deltas, 'd') + deltas = numpy.array(deltas, float) res = sum(numpy.sqrt(1. / deltas)) return res @@ -367,7 +367,7 @@ def _pyChi1n(mol): """ Similar to Hall Kier Chi1v, but uses nVal instead of valence """ - delts = numpy.array([_nVal(x) for x in mol.GetAtoms()], 'd') + delts = numpy.array([_nVal(x) for x in mol.GetAtoms()], float) res = 0.0 for bond in mol.GetBonds(): v = delts[bond.GetBeginAtomIdx()] * delts[bond.GetEndAtomIdx()] @@ -391,7 +391,7 @@ def _pyChiNn_(mol, order=2): deltas = numpy.array([(1. / numpy.sqrt(x) if x else 0.0) for x in nval]) accum = 0.0 for path in Chem.FindAllPathsOfLengthN(mol, order + 1, useBonds=0): - accum += numpy.prod(deltas[numpy.array(path)]) + accum += numpy.prod(deltas[numpy.array(path)], dtype=float) return accum @@ -578,10 +578,10 @@ def _CalculateEntropies(connectionDict, atomTypeDict, numAtoms): """ connectionList = list(connectionDict.values()) totConnections = sum(connectionList) - connectionIE = totConnections * (entropy.InfoEntropy(numpy.array(connectionList)) + + connectionIE = totConnections * (entropy.InfoEntropy(numpy.array(connectionList, float)) + math.log(totConnections) / _log2val) atomTypeList = list(atomTypeDict.values()) - atomTypeIE = numAtoms * entropy.InfoEntropy(numpy.array(atomTypeList)) + atomTypeIE = numAtoms * entropy.InfoEntropy(numpy.array(atomTypeList, float)) return atomTypeIE + connectionIE diff --git a/rdkit/Chem/PandasPatcher.py b/rdkit/Chem/PandasPatcher.py index 375a162d2..a3d864c08 100644 --- a/rdkit/Chem/PandasPatcher.py +++ b/rdkit/Chem/PandasPatcher.py @@ 
-88,12 +88,13 @@ except ImportError: log.warning("Failed to import pandas") raise -dataframe_applymap = pd.DataFrame.applymap try: - if tuple(map(int, (pd.__version__.split(".")))) >= (2, 1, 0): + if tuple(map(int, (pd.__version__.split(".")))) < (2, 1, 0): + dataframe_applymap = pd.DataFrame.applymap + else: dataframe_applymap = pd.DataFrame.map except: - pass + log.warning("Failed to find a suitable map function for data frames") orig_to_html = getattr(to_html_class, "to_html") pprint_thing = pandas_formats.printing.pprint_thing @@ -138,7 +139,7 @@ class MolFormatter: @classmethod def get_formatters(cls, df, orig_formatters): """Return an instance of MolFormatter for each column that contains Chem.Mol objects""" - df_subset = df.select_dtypes("object") + df_subset = df.select_dtypes(["object", "string"]) return { col: cls(orig_formatters.get(col, None)) for col in df_subset.columns[dataframe_applymap(df_subset, MolFormatter.is_mol).any()] diff --git a/rdkit/Chem/PandasTools.py b/rdkit/Chem/PandasTools.py index 173d8c86c..7997c092d 100644 --- a/rdkit/Chem/PandasTools.py +++ b/rdkit/Chem/PandasTools.py @@ -431,7 +431,7 @@ def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumer if allNumeric: properties.extend([ dt for dt in df.dtypes.keys() - if (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer)) + if not pd.api.types.is_string_dtype(df.dtypes[dt]) and (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer)) ]) if molColName in properties: diff --git a/rdkit/Chem/UnitTestPandasTools.py b/rdkit/Chem/UnitTestPandasTools.py index 10ffcec65..198a82f2a 100644 --- a/rdkit/Chem/UnitTestPandasTools.py +++ b/rdkit/Chem/UnitTestPandasTools.py @@ -212,7 +212,7 @@ class TestPandasTools(unittest.TestCase): @unittest.skipIf(not hasattr(rdMolDraw2D, 'MolDraw2DCairo'), 'Cairo not available') def testPandasShouldShowMoleculesWhenTruncating(self): - csv_data = '''"Molecule ChEMBL 
ID";"Molecule Name";"Molecule Max Phase";"Molecular Weight";"#RO5 Violations";"AlogP";"Compound Key";"Smiles";"Standard Type";"Standard Relation";"Standard Value";"Standard Units";"pChEMBL Value";"Data Validity Comment";"Comment";"Uo Units";"Ligand Efficiency BEI";"Ligand Efficiency LE";"Ligand Efficiency LLE";"Ligand Efficiency SEI";"Potential Duplicate";"Assay ChEMBL ID";"Assay Description";"Assay Type";"BAO Format ID";"BAO Label";"Assay Organism";"Assay Tissue ChEMBL ID";"Assay Tissue Name";"Assay Cell Type";"Assay Subcellular Fraction";"Target ChEMBL ID";"Target Name";"Target Organism";"Target Type";"Document ChEMBL ID";"Source ID";"Source Description";"Document Journal";"Document Year";"Cell ChEMBL ID" + csv_data = r'''"Molecule ChEMBL ID";"Molecule Name";"Molecule Max Phase";"Molecular Weight";"#RO5 Violations";"AlogP";"Compound Key";"Smiles";"Standard Type";"Standard Relation";"Standard Value";"Standard Units";"pChEMBL Value";"Data Validity Comment";"Comment";"Uo Units";"Ligand Efficiency BEI";"Ligand Efficiency LE";"Ligand Efficiency LLE";"Ligand Efficiency SEI";"Potential Duplicate";"Assay ChEMBL ID";"Assay Description";"Assay Type";"BAO Format ID";"BAO Label";"Assay Organism";"Assay Tissue ChEMBL ID";"Assay Tissue Name";"Assay Cell Type";"Assay Subcellular Fraction";"Target ChEMBL ID";"Target Name";"Target Organism";"Target Type";"Document ChEMBL ID";"Source ID";"Source Description";"Document Journal";"Document Year";"Cell ChEMBL ID" "CHEMBL543779";"";"0";"341.86";"0";"2.60";"1w";"CCN(CC)CCS/C(=N\O)C(=O)c1ccc(C#N)cc1.Cl";"IC50";"'='";"180000.0";"nM";"";"Outside typical range";"";"UO_0000065";"";"";"";"";"False";"CHEMBL644102";"Reversible inhibition of Human AchE";"B";"BAO_0000357";"single protein format";"None";"None";"None";"None";"None";"CHEMBL220";"Acetylcholinesterase";"Homo sapiens";"SINGLE PROTEIN";"CHEMBL1123431";"1";"Scientific Literature";"J. Med. 
Chem.";"1986";"None" ''' try: diff --git a/rdkit/Chem/nbtests/github4823.ipynb b/rdkit/Chem/nbtests/github4823.ipynb index 27cea7638..68b1914b5 100644 --- a/rdkit/Chem/nbtests/github4823.ipynb +++ b/rdkit/Chem/nbtests/github4823.ipynb @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "permanent-liechtenstein", "metadata": { "scrolled": true @@ -139,36 +139,51 @@ { "data": { "text/plain": [ - "Index(['AMW', 'CLOGP', 'CP', 'CR', 'DAYLIGHT.FPG', 'DAYLIGHT_CLOGP', 'FP',\n", - " 'ISM', 'LIPINSKI_VIOLATIONS', 'NUM_HACCEPTORS', 'NUM_HDONORS',\n", - " 'NUM_HETEROATOMS', 'NUM_LIPINSKIHACCEPTORS', 'NUM_LIPINSKIHDONORS',\n", - " 'NUM_RINGS', 'NUM_ROTATABLEBONDS', 'NUM_ROTATABLEBONDS_O', 'P1',\n", - " 'SMILES', 'ID', 'ROMol'],\n", - " dtype='object')" + "['AMW',\n", + " 'CLOGP',\n", + " 'CP',\n", + " 'CR',\n", + " 'DAYLIGHT.FPG',\n", + " 'DAYLIGHT_CLOGP',\n", + " 'FP',\n", + " 'ISM',\n", + " 'LIPINSKI_VIOLATIONS',\n", + " 'NUM_HACCEPTORS',\n", + " 'NUM_HDONORS',\n", + " 'NUM_HETEROATOMS',\n", + " 'NUM_LIPINSKIHACCEPTORS',\n", + " 'NUM_LIPINSKIHDONORS',\n", + " 'NUM_RINGS',\n", + " 'NUM_ROTATABLEBONDS',\n", + " 'NUM_ROTATABLEBONDS_O',\n", + " 'P1',\n", + " 'SMILES',\n", + " 'ID',\n", + " 'ROMol']" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.columns" + "list(df.columns)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "careful-netherlands", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "identical-finder", "metadata": {}, "outputs": [ @@ -261,7 +276,7 @@ "4 223.231 2.43 1.869;-0P;4.71 6.390;-0R;4.71" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" 
} @@ -281,7 +296,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py312_build", "language": "python", "name": "python3" }, @@ -295,7 +310,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.3" }, "toc": { "base_numbering": 1,