Support more formats and add new method returning deleted fragments (#1432)

* Support more formats and add new method returning deleted fragments

* Reduce number of changes and make more visible what has really changed

* Add 'UnitTestSaltRemover' and perform changes suggested by review

* Add support for 'smi' file

* Improve doctest regarding new functionality

* Following suggestions made by collaborator

* Be consistent with 'ValueError' and add an example without tuple destructuring
This commit is contained in:
Christian Ribeaud
2017-06-19 17:10:10 +02:00
committed by Greg Landrum
parent e03397c664
commit e4717913cd
5 changed files with 7758 additions and 23 deletions

3
.gitignore vendored
View File

@@ -6,6 +6,9 @@
/.project
/.pydevproject
#- IDEA files
/.idea
#- Binary files
__pycache__/
*.so

View File

@@ -30,23 +30,60 @@
#
# Created by Greg Landrum, Dec 2006
#
import os
import re
from rdkit import Chem
from rdkit import RDConfig
from collections import namedtuple
from contextlib import closing
from rdkit import Chem, RDConfig
from rdkit.Chem.rdmolfiles import SDMolSupplier, SmilesMolSupplier
class InputFormat:
    """Identifiers for the formats in which salt definitions can be supplied."""

    # Definitions parsed as SMARTS patterns.
    SMARTS = 'smarts'
    # Definitions read as molecule records (SD file).
    MOL = 'mol'
    # Definitions parsed as SMILES strings.
    SMILES = 'smiles'
def _smartsFromSmartsLine(line):
"""
Converts given line into a molecule using 'Chem.MolFromSmarts'.
"""
# Name the regular expression (better than inlining it)
whitespace = re.compile(r'[\t ]+')
# Reflects the specialisation of this method to read the rather unusual
# SMARTS files with the // comments.
line = line.strip().split('//')[0]
if line:
smarts = whitespace.split(line)
salt = Chem.MolFromSmarts(smarts[0])
if salt is None:
raise ValueError(line)
return salt
def _getSmartsSaltsFromStream(stream):
"""
Yields extracted SMARTS salts from given stream.
"""
with closing(stream) as lines:
for line in lines:
smarts = _smartsFromSmartsLine(line)
if smarts:
yield smarts
def _getSmartsSaltsFromFile(filename):
    """
    Open the named file for reading and return its SMARTS salts.

    The open handle is passed to '_getSmartsSaltsFromStream', which yields the
    parsed patterns and closes the handle.
    """
    handle = open(filename, 'r')
    return _getSmartsSaltsFromStream(handle)
class SaltRemover(object):
defnFilename = os.path.join(RDConfig.RDDataDir, 'Salts.txt')
def __init__(self, defnFilename=None, defnData=None):
def __init__(self, defnFilename=None, defnData=None, defnFormat=InputFormat.SMARTS):
if defnFilename:
self.defnFilename = defnFilename
self.defnData = defnData
self.salts = None
self.defnFormat = defnFormat
self._initPatterns()
def _initPatterns(self):
@@ -56,10 +93,15 @@ class SaltRemover(object):
>>> len(remover.salts)>0
True
Default input format is SMARTS
>>> remover = SaltRemover(defnData="[Cl,Br]")
>>> len(remover.salts)
1
>>> remover = SaltRemover(defnData="[Na+]\\nCC(=O)O", defnFormat=InputFormat.SMILES)
>>> len(remover.salts)
2
>>> from rdkit import RDLogger
>>> RDLogger.DisableLog('rdApp.error')
>>> remover = SaltRemover(defnData="[Cl,fail]")
@@ -69,22 +111,31 @@ class SaltRemover(object):
>>> RDLogger.EnableLog('rdApp.error')
"""
whitespace = re.compile(r'[\t ]+')
if self.defnData:
from rdkit.six.moves import cStringIO as StringIO
inF = StringIO(self.defnData)
with closing(inF):
self.salts = []
for line in inF:
if line:
if self.defnFormat == InputFormat.SMARTS:
salt = _smartsFromSmartsLine(line)
elif self.defnFormat == InputFormat.SMILES:
salt = Chem.MolFromSmiles(line)
else:
raise ValueError('Unsupported format for supplier.')
if salt is None:
raise ValueError(line)
self.salts.append(salt)
else:
inF = open(self.defnFilename, 'r')
with closing(inF):
self.salts = []
for line in inF:
line = line.strip().split('//')[0]
if line:
splitL = whitespace.split(line)
salt = Chem.MolFromSmarts(splitL[0])
if salt is None:
raise ValueError(line)
self.salts.append(salt)
if self.defnFormat == InputFormat.SMARTS:
self.salts = [mol for mol in _getSmartsSaltsFromFile(self.defnFilename)]
elif self.defnFormat == InputFormat.MOL:
self.salts = [mol for mol in SDMolSupplier(self.defnFilename)]
elif self.defnFormat == InputFormat.SMILES:
self.salts = [mol for mol in SmilesMolSupplier(self.defnFilename)]
else:
raise ValueError('Unsupported format for supplier.')
def StripMol(self, mol, dontRemoveEverything=False):
"""
@@ -146,6 +197,51 @@ class SaltRemover(object):
2
"""
strippedMol = self._StripMol(mol, dontRemoveEverything)
return strippedMol.mol
def StripMolWithDeleted(self, mol, dontRemoveEverything=False):
"""
Strips given molecule and returns it, with the fragments which have been deleted.
>>> remover = SaltRemover(defnData="[Cl,Br]")
>>> len(remover.salts)
1
>>> mol = Chem.MolFromSmiles('CN(C)C.Cl.Br')
>>> res, deleted = remover.StripMolWithDeleted(mol)
>>> Chem.MolToSmiles(res)
'CN(C)C'
>>> [Chem.MolToSmarts(m) for m in deleted]
['[Cl,Br]']
>>> mol = Chem.MolFromSmiles('CN(C)C.Cl')
>>> res, deleted = remover.StripMolWithDeleted(mol)
>>> res.GetNumAtoms()
4
>>> len(deleted)
1
>>> deleted[0].GetNumAtoms()
1
>>> Chem.MolToSmiles(deleted[0])
'Cl'
Multiple occurrences of 'Cl' and without tuple destructuring
>>> mol = Chem.MolFromSmiles('CN(C)C.Cl.Cl')
>>> tup = remover.StripMolWithDeleted(mol)
>>> tup.mol.GetNumAtoms()
4
>>> len(tup.deleted)
1
>>> tup.deleted[0].GetNumAtoms()
1
>>> Chem.MolToSmiles(deleted[0])
'Cl'
"""
return self._StripMol(mol, dontRemoveEverything)
def _StripMol(self, mol, dontRemoveEverything=False):
def _applyPattern(m, salt, notEverything):
nAts = m.GetNumAtoms()
@@ -156,19 +252,19 @@ class SaltRemover(object):
t = Chem.DeleteSubstructs(res, salt, True)
if not t or (notEverything and t.GetNumAtoms() == 0):
return res
else:
res = t
res = t
while res.GetNumAtoms() and nAts > res.GetNumAtoms():
nAts = res.GetNumAtoms()
t = Chem.DeleteSubstructs(res, salt, True)
if notEverything and t.GetNumAtoms() == 0:
break
else:
res = t
res = t
return res
StrippedMol = namedtuple('StrippedMol', ['mol', 'deleted'])
deleted = []
if dontRemoveEverything and len(Chem.GetMolFrags(mol)) <= 1:
return mol
return StrippedMol(mol, deleted)
modified = False
natoms = mol.GetNumAtoms()
for salt in self.salts:
@@ -176,11 +272,12 @@ class SaltRemover(object):
if natoms != mol.GetNumAtoms():
natoms = mol.GetNumAtoms()
modified = True
deleted.append(salt)
if dontRemoveEverything and len(Chem.GetMolFrags(mol)) <= 1:
break
if modified and mol.GetNumAtoms() > 0:
Chem.SanitizeMol(mol)
return mol
return StrippedMol(mol, deleted)
def __call__(self, mol, dontRemoveEverything=False):
"""
@@ -188,6 +285,8 @@ class SaltRemover(object):
>>> remover = SaltRemover(defnData="[Cl,Br]")
>>> len(remover.salts)
1
>>> Chem.MolToSmiles(remover.salts[0])
'Cl'
>>> mol = Chem.MolFromSmiles('CN(C)C.Cl')
>>> res = remover(mol)

View File

@@ -0,0 +1,71 @@
import doctest
import unittest
import os
from rdkit import Chem
import Chem.SaltRemover
from Chem.SaltRemover import SaltRemover, InputFormat
def load_tests(loader, tests, ignore):
    """Extend the collected suite with the doctests of the SaltRemover module."""
    doctests = doctest.DocTestSuite(Chem.SaltRemover, optionflags=doctest.ELLIPSIS)
    tests.addTests(doctests)
    return tests
class TestCase(unittest.TestCase):
    """Unit tests for SaltRemover: definition formats and StripMolWithDeleted."""

    @staticmethod
    def _testDataFile(name):
        """Return the path of *name* inside the 'test_data' directory next to this module."""
        here = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(here, 'test_data', name)

    def test_withSmiles(self):
        # Two definition lines given as SMILES must yield two salts.
        remover = SaltRemover(defnData="[Na+]\nCC(=O)O", defnFormat=InputFormat.SMILES)
        self.assertEqual(len(remover.salts), 2)
        mol = Chem.MolFromSmiles('CC(=O)O.[Na+]')
        res = remover.StripMol(mol)
        # Both fragments are defined as salts, so no atom survives.
        self.assertEqual(res.GetNumAtoms(), 0)

    def test_withSdfFile(self):
        testFile = self._testDataFile('witch-salts.sdf')
        remover = SaltRemover(defnFilename=testFile, defnFormat=InputFormat.MOL)
        self.assertEqual(len(remover.salts), 240)
        m = Chem.MolFromSmiles("Cc1onc(-c2ccccc2)c1C([O-])=NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O.[Na+]")
        # Result is accessed by field name rather than by unpacking.
        stripped = remover.StripMolWithDeleted(m)
        self.assertEqual(
            Chem.MolToSmiles(stripped.mol),
            'Cc1onc(-c2ccccc2)c1C([O-])=NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O')
        self.assertEqual(len(stripped.deleted), 1)
        self.assertEqual(Chem.MolToSmiles(stripped.deleted[0]), '[Na+]')

    def test_withSmiFile(self):
        testFile = self._testDataFile('c6h6-cdk.smi')
        remover = SaltRemover(defnFilename=testFile, defnFormat=InputFormat.SMILES)
        self.assertEqual(len(remover.salts), 216)

    def test_withDontRemoveEverything(self):
        testFile = self._testDataFile('witch-salts.sdf')
        remover = SaltRemover(defnFilename=testFile, defnFormat=InputFormat.MOL)
        m = Chem.MolFromSmiles('Cc1ccccc1')
        mol, deleted = remover.StripMolWithDeleted(m, dontRemoveEverything=True)
        # List should be empty
        self.assertFalse(deleted)
        self.assertEqual(m, mol)

    def test_SmilesVsSmarts(self):
        # SMARTS
        remover = SaltRemover(defnData="[Cl,Br]")
        mol = Chem.MolFromSmiles('CN(Br)Cl.Cl')
        res = remover.StripMol(mol)
        self.assertEqual(res.GetNumAtoms(), 4)
        self.assertEqual(Chem.MolToSmiles(res), 'CN(Cl)Br')
        mol = Chem.MolFromSmiles('CN(C)C.Cl.Br')
        res, deleted = remover.StripMolWithDeleted(mol)
        self.assertEqual(Chem.MolToSmiles(res), 'CN(C)C')
        # The definition was read as SMARTS, so it is written back as SMARTS
        # for the comparison; writing it as SMILES would not match the input.
        self.assertListEqual([Chem.MolToSmarts(m) for m in deleted], ['[Cl,Br]'])
        # SMILES
        remover = SaltRemover(defnData="Cl", defnFormat=InputFormat.SMILES)
        mol = Chem.MolFromSmiles('CN(Br)Cl.Cl')
        res = remover.StripMol(mol)
        self.assertEqual(res.GetNumAtoms(), 4)
        self.assertEqual(Chem.MolToSmiles(res), 'CN(Cl)Br')
if __name__ == '__main__':  # pragma: nocover
    # Run the tests when this file is executed as a script.
    unittest.main()

View File

@@ -0,0 +1,217 @@
c1ccccc1
C1=CC1C2C=C2
C1=CC2C=CC12
C1=CC2C3C1C23
C12C3C1C4C2C34
C13C2C4C1C2C34
C#CC=CC=C
C#CCC1C=C1
C#CC1C=CC1
C#CC1C2CC12
C1=CCC=C=C1
C=C=CC1C=C1
C=CC1C=C=C1
C=1=CC2CC2(C=1)
C=C1C=CC=C1
C=CC=1C=CC=1
C=CC=C1C=C1
C1=CC1C2=CC2
C1=CC2C=C2(C1)
C1=CC2=CC1C2
C1=CC2CC2(=C1)
C1=CC2=CCC12
C1=CC2=CC2(C1)
C=C1C2C=CC12
C=CC1C2C=C12
C1=C2CC3C1C23
C=2C1C3CC1C=23
C=1CC2C3C=1C23
C=1C2CC3C=1C23
C=1C2C=1C3CC23
C=C1C2C3C1C23
C1=CC2(C1)(C=C2)
C1C3C1C23(C=C2)
C1=CC23(CC3(C12))
C=CC23(C1C3(C12))
C1C2C3C4C1C234
C2C1C3C4C1C234
C2C3C1C4C1C234
C#CCCC#C
C=CC#CC=C
C1=CCCC#C1
C1=CCC#CC1
C=CC1C#CC1
C=CCC1C#C1
C1#CC1C2CC2
C1#CC2CCC12
C1#CC2CC2(C1)
C1#CC2CC1C2
C#CCC=C=C
C=CC=C=C=C
C=C=CC=C=C
C=1CCC=C=C=1
C#CC(=C)C=C
C#CC=C1CC1
C#CC1=CCC1
C#CCC1=CC1
C#CC1CC1(=C)
C=C=C1C=CC1
C=C=CC1=CC1
C=CC=1CC=C=1
C=CCC=1C=C=1
C=C1CC=C=C1
C1=C=C2CCC12
C1=C=C2CC1C2
C1=C=C1C2CC2
C=1CC2CC2(=C=1)
C=C=C1C2CC12
C=C1C=CC1(=C)
C=CC1=CC1(=C)
C1=CC1=C2CC2
C1=CC=2CCC1=2
C1=CC=2CC=2(C1)
C=1C=C2CCC=12
C=1C=C2CC=1C2
C1CC=2C=C1C=2
C1C=C2C=C2(C1)
C=1CC2=CC=1C2
C1C=C2CC2(=C1)
C=1CC=1C2=CC2
C=1CC2=CCC=12
C=C1CC2=CC12
C=CC1C=2CC1=2
C=C1CC2C=C12
C=C2C1C=C2(C1)
C=C1C=C2CC12
C=C1C2=CCC12
C=CC=1C2CC=12
C=C1C2C(=C)C12
C1CC=2C3C1C=23
C1C2CC3=C1C23
C1C=2CC3C1C=23
C1C2C1C=3CC2=3
C1C2C1=C3CC23
C2C1C3CC1=C23
C1CC2C=3C1C2=3
C1C2CC=3C1C2=3
C#CC12(CC2(C1))
C=1=CC2(C=1)(CC2)
C=C2CC12(C=C1)
C=CC12(C=C2(C1))
C1CC23(C=C3(C12))
C1C3=CC23(CC12)
C1=C2CC13(CC23)
C=1C3C=1C23(CC2)
C=1C3CC23(CC=12)
C=1CC23(CC3(C=12))
C1C2CC13(C=C23)
C=C1C3C2CC123
C1=CC23(CC13(C2))
C2C1C34(CC14(C23))
C1CC34(C2C4(C123))
C1C2C14(C3CC234)
C1C2C3C14(CC234)
C1C24(CC13(C4(C23)))
C=C1C#CCC1
C=C1CC#CC1
C=C=C=C1CC1
C=C=C1CC1(=C)
C1CC2=C=C2(C1)
C1CC=2CC1=C=2
C=C1C(=C)C1(=C)
C=C1C=2CCC1=2
C=C1CC=2CC1=2
C1CC2(C1)(C#C2)
C1#CC2(C1)(CC2)
C1CC23(CC3(=C12))
C1C3=C1C23(CC2)
C1C=2CC13(CC=23)
C=C2C13(CC23(C1))
C1C24(CC34(CC123))
C#CC(C)C#C
CC#CC1C=C1
CC=CC1C#C1
CC1C=CC#C1
CC1C2C#CC12
C#CC=C=CC
CC1C=C=C=C1
C#CC1C=C1(C)
C#CC1=CC1(C)
CC=1C=CC=C=1
CC=C=C1C=C1
CC=CC=1C=C=1
CC1=CC=C=C1
CC=C1C=C=C1
CC1C2C=C=C12
CC1=CC2C=C12
CC=1C2C=CC=12
CC1=CC2=CC12
CC=C1C2C=C12
CC1C=C2C=C12
CC1C=2C=CC1=2
CC1C2=CC=C12
CC2C1=CC2(=C1)
CC=1C2C3C=1C23
CC1C2C3C1=C23
CC1C2C=3C1C2=3
C#CC1(C)(C=C1)
CC2=CC12(C=C1)
CC12(C=CC2(=C1))
CC1C3C2=CC123
CC23(C=C1C3(C12))
CC23(C1C=C3(C12))
CC2C14(C3C1C234)
CC14(C3C2C1C234)
C#CC#CCC
C#CCC#CC
C=C=CC#CC
C=C=C=C=CC
CCC1=CC#C1
CC1=CC#CC1
CCC=C1C#C1
CC=1C#CCC=1
CC#CC1=CC1
CC=C1C#CC1
C=C(C)C1C#C1
C=C1C#CC1(C)
C#CC(C)=C=C
CCC=1C=C=C=1
CC=1CC=C=C=1
C#CC=1CC=1(C)
C=C=C1C=C1(C)
C=CC1=C=C1(C)
C=C1C=C=C1(C)
C=C(C)C=1C=C=1
CCC1C2=C=C12
CC1=C=C2CC12
CC2C=1CC2(=C=1)
CC1CC2=C=C12
CCC=1C2=CC=12
CC=1CC2=CC=12
CC=2C1=CC=2(C1)
CC=1C=C2CC=12
CC1=CC=2CC1=2
CC=1C2=CCC=12
CC=C1C=2CC1=2
C=C1C2C(C)=C12
C=C1C=2C(C)C1=2
C=CC1(C)(C#C1)
CC2CC12(C#C1)
CC12(C#CC2(C1))
CC12(C=C=C2(C1))
C=C1C2=CC12(C)
CCC23(C1C3(=C12))
CC1=C3C2CC123
CC1C3=C2CC123
CC23(C1CC3(=C12))
CC23(CC=1C3(C=12))
CC23(CC1C3(=C12))
CC13(CC23(C=C12))
CC24(C1C34(CC123))
CC#CC#CC
CC(C)=C1C#C1
CC=1C#CC=1(C)
CC1=C=C=C1(C)
CC1=C2C(C)=C12
CC2(C)(C1=C=C12)
CC13(C2=C1C23(C))

File diff suppressed because it is too large Load Diff