mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Add MolFromInchiAndAuxInfo to restore original atom order from AuxInfo (#9158)
* Add MolFromInchiAndAuxInfo to restore original atom order from AuxInfo Add a new function that reconstructs molecules from InChI + AuxInfo strings, restoring the original atom ordering and 2D/3D coordinates from the /N: and /rC: AuxInfo layers. Includes comprehensive tests for round-tripping, stereo preservation, coordinate restoration, edge cases, and multi-fragment molecules. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Update rdkit/Chem/UnitTestInchi.py Co-authored-by: Greg Landrum <greg.landrum@gmail.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
This commit is contained in:
159
External/INCHI-API/python/inchi.py
vendored
159
External/INCHI-API/python/inchi.py
vendored
@@ -32,7 +32,9 @@
|
||||
INCHI_AVAILABLE = True
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from rdkit import Chem, Geometry
|
||||
from rdkit import RDLogger
|
||||
from rdkit.Chem import rdinchi
|
||||
|
||||
@@ -51,6 +53,159 @@ class InchiReadWriteError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def _parse_auxinfo_coordinates(auxinfo):
|
||||
"""Parse the rC: (coordinate) layer from an InChI AuxInfo string.
|
||||
|
||||
Returns (coords_list, is_3d) where coords_list is a list of (x, y, z) tuples
|
||||
in original input atom order, or (None, None) if parsing fails or coords are empty.
|
||||
"""
|
||||
if not auxinfo:
|
||||
return None, None
|
||||
match = re.search(r'/rC:([^/]+)', auxinfo)
|
||||
if not match:
|
||||
return None, None
|
||||
entries = match.group(1).split(';')
|
||||
# Remove trailing empty entries from trailing semicolons
|
||||
while entries and not entries[-1].strip():
|
||||
entries.pop()
|
||||
coords = []
|
||||
for entry in entries:
|
||||
entry = entry.strip()
|
||||
if not entry:
|
||||
return None, None
|
||||
parts = entry.split(',')
|
||||
try:
|
||||
x = float(parts[0]) if parts[0] else 0.0
|
||||
y = float(parts[1]) if len(parts) > 1 and parts[1] else 0.0
|
||||
z = float(parts[2]) if len(parts) > 2 and parts[2] else 0.0
|
||||
except (ValueError, IndexError):
|
||||
return None, None
|
||||
coords.append((x, y, z))
|
||||
if not coords:
|
||||
return None, None
|
||||
# All-zero coordinates means no real coords were present
|
||||
if all(x == 0.0 and y == 0.0 and z == 0.0 for x, y, z in coords):
|
||||
return None, None
|
||||
is_3d = any(z != 0.0 for _, _, z in coords)
|
||||
return coords, is_3d
|
||||
|
||||
|
||||
def _parse_auxinfo_atom_order(auxinfo):
|
||||
"""Parse the N: (atom numbering) layer from an InChI AuxInfo string.
|
||||
|
||||
Returns a list of 0-based original atom indices, or None if parsing fails.
|
||||
The returned list maps from InChI canonical order to original atom order:
|
||||
result[i] is the original atom index for InChI canonical atom i.
|
||||
"""
|
||||
if not auxinfo:
|
||||
return None
|
||||
match = re.search(r'/N:([^/]+)', auxinfo)
|
||||
if not match:
|
||||
return None
|
||||
# The N: layer contains comma-separated 1-based atom indices
|
||||
# possibly with semicolons separating disconnected fragments
|
||||
tokens = match.group(1).replace(';', ',').split(',')
|
||||
try:
|
||||
return [int(t) - 1 for t in tokens]
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
|
||||
|
||||
def _attach_conformer(mol, coords, is_3d):
|
||||
"""Attach parsed /rC: coordinates to a molecule as a conformer."""
|
||||
if coords is not None and len(coords) == mol.GetNumAtoms():
|
||||
conf = Chem.Conformer(mol.GetNumAtoms())
|
||||
conf.Set3D(is_3d)
|
||||
for i, (x, y, z) in enumerate(coords):
|
||||
conf.SetAtomPosition(i, Geometry.Point3D(x, y, z))
|
||||
mol.AddConformer(conf, assignId=True)
|
||||
return mol
|
||||
|
||||
|
||||
def _build_inverse_permutation(atom_order, size):
|
||||
"""Build the inverse permutation for RenumberAtoms.
|
||||
|
||||
atom_order[inchi_idx] = original_idx. Returns new_order where
|
||||
new_order[original_idx] = inchi_idx, or None if any index is out of range.
|
||||
"""
|
||||
new_order = [0] * size
|
||||
for inchi_idx, orig_idx in enumerate(atom_order):
|
||||
if orig_idx >= size:
|
||||
return None
|
||||
new_order[orig_idx] = inchi_idx
|
||||
return new_order
|
||||
|
||||
|
||||
def MolFromInchiAndAuxInfo(inchi, auxinfo, sanitize=True, removeHs=True, logLevel=None,
|
||||
treatWarningAsError=False):
|
||||
"""Construct a molecule from an InChI string and its AuxInfo, restoring the
|
||||
original atom ordering.
|
||||
|
||||
Keyword arguments:
|
||||
sanitize -- set to True to enable sanitization of the molecule. Default is
|
||||
True
|
||||
removeHs -- set to True to remove Hydrogens from a molecule. This only
|
||||
makes sense when sanitization is enabled
|
||||
logLevel -- the log level used for logging logs and messages from InChI
|
||||
API. set to None to diable the logging completely
|
||||
treatWarningAsError -- set to True to raise an exception in case of a
|
||||
molecule that generates warning in calling InChI API. The resultant
|
||||
molecule and error message are part of the excpetion
|
||||
|
||||
Returns:
|
||||
a rdkit.Chem.rdchem.Mol instance with atoms reordered to match the
|
||||
original atom ordering encoded in the AuxInfo
|
||||
"""
|
||||
mol = MolFromInchi(inchi, sanitize=sanitize, removeHs=removeHs, logLevel=logLevel,
|
||||
treatWarningAsError=treatWarningAsError)
|
||||
if mol is None:
|
||||
return None
|
||||
|
||||
# /rC: coordinates are in original input order; attach after reordering.
|
||||
coords, is_3d = _parse_auxinfo_coordinates(auxinfo)
|
||||
|
||||
atom_order = _parse_auxinfo_atom_order(auxinfo)
|
||||
if atom_order is None:
|
||||
return _attach_conformer(mol, coords, is_3d)
|
||||
|
||||
from rdkit.Chem import RenumberAtoms
|
||||
num_mol_atoms = mol.GetNumAtoms()
|
||||
|
||||
if removeHs:
|
||||
# N: layer lists heavy atoms only; must match molecule size after H removal
|
||||
if len(atom_order) != num_mol_atoms:
|
||||
return mol
|
||||
new_order = _build_inverse_permutation(atom_order, num_mol_atoms)
|
||||
if new_order is None:
|
||||
return mol
|
||||
return _attach_conformer(RenumberAtoms(mol, new_order), coords, is_3d)
|
||||
|
||||
if len(atom_order) > num_mol_atoms:
|
||||
return mol
|
||||
|
||||
if len(atom_order) == num_mol_atoms:
|
||||
new_order = _build_inverse_permutation(atom_order, num_mol_atoms)
|
||||
if new_order is None:
|
||||
return mol
|
||||
return _attach_conformer(RenumberAtoms(mol, new_order), coords, is_3d)
|
||||
|
||||
# More atoms in molecule than in atom_order (explicit Hs added by InChI).
|
||||
# Place heavy atoms in original order, then append Hs.
|
||||
num_heavy = len(atom_order)
|
||||
new_order = _build_inverse_permutation(atom_order, num_heavy)
|
||||
if new_order is None:
|
||||
return mol
|
||||
new_order.extend([0] * (num_mol_atoms - num_heavy))
|
||||
h_slot = num_heavy
|
||||
for i in range(num_mol_atoms):
|
||||
if i not in atom_order:
|
||||
if h_slot >= num_mol_atoms:
|
||||
return mol
|
||||
new_order[h_slot] = i
|
||||
h_slot += 1
|
||||
return _attach_conformer(RenumberAtoms(mol, new_order), coords, is_3d)
|
||||
|
||||
|
||||
def MolFromInchi(inchi, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False):
|
||||
"""Construct a molecule from a InChI string
|
||||
|
||||
@@ -237,6 +392,6 @@ GetInchiVersion = rdinchi.GetInchiVersion
|
||||
|
||||
__all__ = [
|
||||
'MolToInchiAndAuxInfo', 'MolToInchi', 'MolBlockToInchiAndAuxInfo', 'MolBlockToInchi',
|
||||
'MolFromInchi', 'InchiReadWriteError', 'InchiToInchiKey', 'MolToInchiKey', 'GetInchiVersion',
|
||||
'INCHI_AVAILABLE'
|
||||
'MolFromInchi', 'MolFromInchiAndAuxInfo', 'InchiReadWriteError', 'InchiToInchiKey',
|
||||
'MolToInchiKey', 'GetInchiVersion', 'INCHI_AVAILABLE'
|
||||
]
|
||||
|
||||
@@ -43,7 +43,8 @@ from rdkit.Chem import (INCHI_AVAILABLE, ForwardSDMolSupplier, MolFromMolBlock,
|
||||
|
||||
if INCHI_AVAILABLE:
|
||||
from rdkit.Chem import (InchiReadWriteError, InchiToInchiKey, MolBlockToInchi, MolFromInchi,
|
||||
MolToInchi, MolToInchiKey, GetInchiVersion)
|
||||
MolFromInchiAndAuxInfo, MolToInchi, MolToInchiAndAuxInfo, MolToInchiKey,
|
||||
GetInchiVersion)
|
||||
|
||||
COLOR_RED = '\033[31m'
|
||||
COLOR_GREEN = '\033[32m'
|
||||
@@ -331,6 +332,154 @@ M END"""
|
||||
self.assertGreaterEqual(version, "1.07.2")
|
||||
|
||||
|
||||
@unittest.skipUnless(INCHI_AVAILABLE, 'Inchi support not available')
|
||||
class TestMolFromInchiAndAuxInfo(unittest.TestCase):
|
||||
|
||||
def test0RoundTripAtomOrder(self):
|
||||
"""Verify that round-tripping through InChI+AuxInfo preserves atom ordering."""
|
||||
from rdkit.Chem import MolFromSmiles, MolToSmiles
|
||||
smiles_list = ['c1ccccc1O', 'CC(=O)O', 'C(=O)(N)C', 'c1cc(O)ccc1N']
|
||||
for smi in smiles_list:
|
||||
mol = MolFromSmiles(smi)
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux)
|
||||
self.assertIsNotNone(mol2)
|
||||
# The atom ordering should be restored, so atom symbols in order should match
|
||||
orig_atoms = [a.GetAtomicNum() for a in mol.GetAtoms()]
|
||||
new_atoms = [a.GetAtomicNum() for a in mol2.GetAtoms()]
|
||||
self.assertEqual(orig_atoms, new_atoms,
|
||||
f"Atom order mismatch for {smi}: {orig_atoms} vs {new_atoms}")
|
||||
|
||||
def test1StereoPreservation(self):
|
||||
"""Verify stereochemistry is preserved through round-trip."""
|
||||
from rdkit.Chem import MolFromSmiles
|
||||
smi = '[C@@H](O)(F)Cl'
|
||||
mol = MolFromSmiles(smi)
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux)
|
||||
self.assertIsNotNone(mol2)
|
||||
inchi2 = MolToInchi(mol2)
|
||||
self.assertEqual(inchi, inchi2)
|
||||
|
||||
def test2NoneAuxInfo(self):
|
||||
"""MolFromInchiAndAuxInfo with None auxinfo should still return a molecule."""
|
||||
inchi = 'InChI=1S/CH4/h1H4'
|
||||
mol = MolFromInchiAndAuxInfo(inchi, None)
|
||||
self.assertIsNotNone(mol)
|
||||
self.assertEqual(mol.GetNumAtoms(), 1)
|
||||
|
||||
def test3EmptyAuxInfo(self):
|
||||
"""MolFromInchiAndAuxInfo with empty auxinfo should still return a molecule."""
|
||||
inchi = 'InChI=1S/CH4/h1H4'
|
||||
mol = MolFromInchiAndAuxInfo(inchi, '')
|
||||
self.assertIsNotNone(mol)
|
||||
|
||||
def test4InvalidInchi(self):
|
||||
"""MolFromInchiAndAuxInfo with invalid InChI should return None."""
|
||||
mol = MolFromInchiAndAuxInfo('not_an_inchi', 'AuxInfo=1/0/N:1/')
|
||||
self.assertIsNone(mol)
|
||||
|
||||
def test5MultiFragmentAuxInfo(self):
|
||||
"""Test with a molecule that produces multi-fragment AuxInfo (semicolons)."""
|
||||
from rdkit.Chem import MolFromSmiles
|
||||
smi = 'CC.OO'
|
||||
mol = MolFromSmiles(smi)
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux)
|
||||
self.assertIsNotNone(mol2)
|
||||
self.assertEqual(mol2.GetNumAtoms(), mol.GetNumAtoms())
|
||||
|
||||
def test6CoordinateRestoration(self):
|
||||
"""Round-trip a mol block with 2D coords and verify conformer is restored and
|
||||
trailing semicolons in /rC: are handled correctly."""
|
||||
from rdkit.Chem import MolFromMolBlock
|
||||
mb = """
|
||||
Mrv1824 02111920092D
|
||||
|
||||
6 6 0 0 0 0 999 V2000
|
||||
-5.5134 3.5259 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
-6.2279 3.1134 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
-6.2279 2.2884 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
-5.5134 1.8759 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
-4.7989 2.2884 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
-4.7989 3.1134 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
1 2 1 0 0 0 0
|
||||
3 4 1 0 0 0 0
|
||||
4 5 1 0 0 0 0
|
||||
5 6 1 0 0 0 0
|
||||
1 6 1 0 0 0 0
|
||||
2 3 2 0 0 0 0
|
||||
M END"""
|
||||
mol = MolFromMolBlock(mb)
|
||||
self.assertIsNotNone(mol)
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux)
|
||||
self.assertIsNotNone(mol2)
|
||||
self.assertEqual(mol2.GetNumConformers(), 1)
|
||||
# Verify round-trip produces populated /rC: layer
|
||||
inchi2, aux2 = MolToInchiAndAuxInfo(mol2)
|
||||
self.assertIn('/rC:', aux2)
|
||||
self.assertNotIn('/rC:;', aux2)
|
||||
# Real AuxInfo has trailing semicolons in /rC: — verify our parser handles them
|
||||
rc_match = re.search(r'/rC:([^/]+)', aux)
|
||||
self.assertIsNotNone(rc_match, "AuxInfo should contain /rC: layer")
|
||||
self.assertTrue(rc_match.group(1).endswith(';'),
|
||||
"Real AuxInfo /rC: should end with trailing semicolon")
|
||||
|
||||
def test7EmptyCoordinates(self):
|
||||
"""AuxInfo with empty /rC: or no /rC: should produce no conformer, no crash."""
|
||||
from rdkit.Chem import MolFromSmiles
|
||||
mol = MolFromSmiles('C')
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
# Manually strip or replace /rC: to simulate empty coords
|
||||
aux_empty = re.sub(r'/rC:[^/]*', '/rC:;', aux)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux_empty)
|
||||
self.assertIsNotNone(mol2)
|
||||
self.assertEqual(mol2.GetNumConformers(), 0)
|
||||
|
||||
def test7bAllZeroCoordinates(self):
|
||||
"""AuxInfo with all-zero /rC: entries (,,;) should produce no conformer."""
|
||||
from rdkit.Chem import MolFromSmiles
|
||||
mol = MolFromSmiles('CCO')
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
# Replace /rC: content with all-zero entries (,,; format from InChI spec)
|
||||
num_atoms = mol.GetNumAtoms()
|
||||
zero_coords = ';'.join([',,'] * num_atoms) + ';'
|
||||
aux_zeros = re.sub(r'/rC:[^/]*', '/rC:' + zero_coords, aux)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux_zeros)
|
||||
self.assertIsNotNone(mol2)
|
||||
self.assertEqual(mol2.GetNumConformers(), 0)
|
||||
|
||||
def test8CoordinateAtomOrderMatch(self):
|
||||
"""Verify coordinates are on the correct atoms after reordering."""
|
||||
from rdkit.Chem import MolFromMolBlock
|
||||
mb = """
|
||||
Mrv1824 02111920092D
|
||||
|
||||
3 2 0 0 0 0 999 V2000
|
||||
1.0000 2.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
3.0000 4.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
5.0000 6.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
1 2 1 0 0 0 0
|
||||
2 3 1 0 0 0 0
|
||||
M END"""
|
||||
mol = MolFromMolBlock(mb)
|
||||
self.assertIsNotNone(mol)
|
||||
orig_data = list(zip([a.GetSymbol() for a in mol.GetAtoms()], mol.GetConformer().GetPositions()))
|
||||
inchi, aux = MolToInchiAndAuxInfo(mol)
|
||||
mol2 = MolFromInchiAndAuxInfo(inchi, aux)
|
||||
self.assertIsNotNone(mol2)
|
||||
self.assertEqual(mol2.GetNumConformers(), 1)
|
||||
# Each atom should have the same symbol and (approximately) same coordinates
|
||||
for i, a in enumerate(mol2.GetAtoms()):
|
||||
orig_sym, orig_pos = orig_data[i]
|
||||
self.assertEqual(a.GetSymbol(), orig_sym)
|
||||
pos = mol2.GetConformer().GetAtomPosition(i)
|
||||
for j in range(3):
|
||||
self.assertAlmostEqual([pos.x, pos.y, pos.z][j], orig_pos[j], places=4,
|
||||
msg=f"Coord mismatch for atom {i} ({a.GetSymbol()})")
|
||||
|
||||
|
||||
if __name__ == '__main__': # pragma: nocover
|
||||
# only run the test if InChI is available
|
||||
if INCHI_AVAILABLE:
|
||||
|
||||
Reference in New Issue
Block a user