Pandastools improvements (#9251)

* Added automatic parsing functionality
* Added documentation
* Slightly changed check for gzip extension
* Apply suggestions from code review

Added small changes for readability

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>

---------

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
2026-06-03 21:44:30 +08:00 · 2026-05-05 17:02:49 +01:00
parent 6d75052459
commit b54cbac151
1 changed files with 26 additions and 8 deletions
--- a/rdkit/Chem/PandasTools.py
+++ b/rdkit/Chem/PandasTools.py
@@ -240,17 +240,35 @@ else:

  def LoadSDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
              isomericSmiles=True, smilesName=None, embedProps=False, removeHs=True,
-              strictParsing=True, sanitize=True):
+              strictParsing=True, sanitize=True, autoConvertStrings=False):
    '''Read file in SDF format and return as Pandas data frame.
-      If embedProps=True all properties also get embedded in Mol objects in the molecule column.
-      If molColName=None molecules would not be present in resulting DataFrame (only properties
-      would be read).
+      
+      Arguments:
+      
+       - filename: path to the SDF file or a file-like object.
+       - idName: name of the column to be used for the molecule title. Defaults to "ID".
+       - molColName: name of the column to be used for the RDKit molecule objects. If None, molecules will not be included in resulting DataFrame. Defaults to "ROMol".
+       - includeFingerprints: if True, precompute fingerprints and store them within the molecule objects to accelerate substructure matching. Defaults to False.
+       - isomericSmiles: if True, generated SMILES will include isomeric information. Defaults to True.
+       - smilesName: if set, add a column with the specified name to the DataFrame that contains the SMILES representation of the molecule. If None, SMILES will not be included in final DataFrame. Defaults to None.
+       - embedProps: if True, properties will also be embedded in the molecule objects instead of only being added as separate columns to the DataFrame. Defaults to False.
+       - removeHs: if True, explicit hydrogens will be removed from the molecules. Defaults to True.
+       - strictParsing: if True, the molecule blocks are parsed strictly; if False, the parser is more lenient about the correctness of the file contents. Molecules that cannot be parsed are skipped in either case. Defaults to True.
+       - sanitize: if True, molecules will be sanitized during parsing. It is passed on to Chem.ForwardSDMolSupplier sanitize. Defaults to True.
+       - autoConvertStrings: if True, property values are automatically converted to numeric or boolean types where possible. Properties that cannot be converted are left as strings. Defaults to False.

-      Sanitize boolean is passed on to Chem.ForwardSDMolSupplier sanitize.
+
+      Returns:
+      
+        A pandas DataFrame containing the data from the SDF file.
+      
+      
+      Note:
+      
      If neither molColName nor smilesName are set, sanitize=false.
      '''
    if isinstance(filename, str):
-      if filename.lower()[-3:] == ".gz":
+      if filename.lower().endswith("gz"):
        import gzip
        f = gzip.open(filename, "rb")
      else:
@@ -268,7 +286,7 @@ else:
                                  strictParsing=strictParsing)):
      if mol is None:
        continue
-      row = dict((k, mol.GetProp(k)) for k in mol.GetPropNames())
+      row = mol.GetPropsAsDict(autoConvertStrings=autoConvertStrings)
      if molColName is not None and not embedProps:
        for prop in mol.GetPropNames():
          mol.ClearProp(prop)
@@ -416,7 +434,7 @@ def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumer
    '''
  close = None
  if isinstance(out, str):
-    if out.lower()[-3:] == ".gz":
+    if out.lower().endswith("gz"):
      import gzip
      out = gzip.open(out, "wt")
      close = out.close