mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* remove all of the "from __future__" imports * remove the first batch of rdkit.six imports/uses * next step of rdkit.six removal * removing xrange, range, and some maps * next round of removals * next round of cleanups * fix inchi test * last bits of "from rdkit.six" are gone * and the last of the six stuff is gone * strange importlib problem
183 lines
6.3 KiB
Python
183 lines
6.3 KiB
Python
#!/usr/bin/python
|
|
# encoding: utf-8
|
|
|
|
# Jan 2011 (markus kossner) Cleaned up the code, added some documentation
|
|
# somwhere around Aug 2008 (markus kossner) created
|
|
#
|
|
# This script extracts the molecular framework for a database of molecules.
|
|
# You can use two modes (hard coded):
|
|
# - Scaff: The molecular frame is extracted
|
|
# - RedScaff: All linking chains between rings are deleted. The rings are directly connected.
|
|
#
|
|
# You can comment in/out the code snippets indicated by the comments
|
|
# to force each atom of the frame to be a Carbon.
|
|
#
|
|
# Usage: Frames.py <database.sdf>
|
|
# Output:
|
|
# - sd files containing all molecules belonging to one frame (1.sdf, 2.sdf etc)
|
|
# - frames.smi containing the (caninical) smiles and count of occurrence
|
|
#
|
|
|
|
|
|
import os, sys
|
|
from Chem import AllChem as Chem
|
|
|
|
|
|
def flatten(x):
|
|
"""flatten(sequence) -> list
|
|
Returns a single, flat list which contains all elements retrieved
|
|
from the sequence and all nested sub-sequences (iterables).
|
|
Examples:
|
|
>>> [1, 2, [3,4], (5,6)]
|
|
[1, 2, [3, 4], (5, 6)]
|
|
>>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, MyVector(8,9,10)])
|
|
[1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]"""
|
|
result = []
|
|
for el in x:
|
|
if hasattr(el, "__iter__") and not isinstance(el, basestring):
|
|
result.extend(flatten(el))
|
|
else:
|
|
result.append(el)
|
|
return result
|
|
|
|
|
|
def GetFrame(mol, mode='Scaff'):
|
|
'''return a ganeric molecule defining the reduced scaffold of the input mol.
|
|
mode can be 'Scaff' or 'RedScaff':
|
|
|
|
Scaff -> chop off the side chains and return the scaffold
|
|
|
|
RedScaff -> remove all linking chains and connect the rings
|
|
directly at the atoms where the linker was
|
|
'''
|
|
|
|
ring = mol.GetRingInfo()
|
|
RingAtoms = flatten(ring.AtomRings())
|
|
NonRingAtoms = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetIdx() not in RingAtoms]
|
|
RingNeighbors = []
|
|
Paths = []
|
|
for NonRingAtom in NonRingAtoms:
|
|
for neighbor in mol.GetAtomWithIdx(NonRingAtom).GetNeighbors():
|
|
if neighbor.GetIdx() in RingAtoms:
|
|
RingNeighbors.append(NonRingAtom)
|
|
#The ring Atoms having a non ring Neighbor will be the start of a walk
|
|
Paths.append([neighbor.GetIdx(), NonRingAtom])
|
|
break
|
|
PosConnectors = [x for x in NonRingAtoms if x not in RingNeighbors
|
|
] #Only these Atoms are potential starting points of a Linker chain
|
|
#print 'PosConnectors:'
|
|
#print PosConnectors
|
|
Framework = [x for x in RingAtoms]
|
|
#Start a list of pathways which we will have to walk
|
|
#print 'Path atoms:'
|
|
#print Paths
|
|
Linkers = []
|
|
while len(Paths) > 0:
|
|
NewPaths = []
|
|
for P in Paths:
|
|
if P == None:
|
|
print('ooh')
|
|
else:
|
|
for neighbor in mol.GetAtomWithIdx(P[-1]).GetNeighbors():
|
|
if neighbor.GetIdx() not in P:
|
|
if neighbor.GetIdx() in NonRingAtoms:
|
|
n = P[:]
|
|
n.append(neighbor.GetIdx())
|
|
NewPaths.append(n[:])
|
|
elif neighbor.GetIdx() in RingAtoms:
|
|
#print 'adding the following path to Framework:'
|
|
#print P
|
|
n = P[:]
|
|
n.append(neighbor.GetIdx())
|
|
Linkers.append(n)
|
|
Framework = Framework + P[:]
|
|
|
|
Paths = NewPaths[:]
|
|
#print 'Linkers:',Linkers
|
|
#print 'RingAtoms:',RingAtoms
|
|
#em.AddBond(3,4,Chem.BondType.SINGLE)
|
|
if mode == 'RedScaff':
|
|
Framework = list(set(Framework))
|
|
todel = []
|
|
NonRingAtoms.sort(reverse=True)
|
|
em = Chem.EditableMol(mol)
|
|
BondsToAdd = [sorted([i[0], i[-1]]) for i in Linkers]
|
|
mem = []
|
|
for i in BondsToAdd:
|
|
if i not in mem:
|
|
em.AddBond(i[0], i[1], Chem.BondType.SINGLE)
|
|
mem.append(i)
|
|
for i in NonRingAtoms:
|
|
todel.append(i)
|
|
for i in todel:
|
|
em.RemoveAtom(i)
|
|
m = em.GetMol()
|
|
#===================================#
|
|
# Now do the flattening of atoms and bonds!
|
|
# Any heavy atom will become a carbon and any bond will become a single bond! #
|
|
#===================================#
|
|
# for atom in m.GetAtoms(): #
|
|
# atom.SetAtomicNum(6) #
|
|
# atom.SetFormalCharge(0) #
|
|
# for bond in m.GetBonds(): #
|
|
# bond.SetBondType(Chem.BondType.SINGLE) #
|
|
# Chem.SanitizeMol(m) #
|
|
#===================================#
|
|
return m
|
|
|
|
if mode == 'Scaff':
|
|
Framework = list(set(Framework))
|
|
todel = []
|
|
NonRingAtoms.sort(reverse=True)
|
|
for i in NonRingAtoms:
|
|
if i != None:
|
|
if i not in Framework:
|
|
todel.append(i)
|
|
em = Chem.EditableMol(mol)
|
|
for i in todel:
|
|
em.RemoveAtom(i)
|
|
m = em.GetMol()
|
|
#===================================#
|
|
# Now do the flattening of atoms and bonds!
|
|
# Any heavy atom will become a carbon and any bond will become a single bond!! #
|
|
#===================================#
|
|
# for atom in m.GetAtoms(): #
|
|
# atom.SetAtomicNum(6) #
|
|
# atom.SetFormalCharge(0) #
|
|
# for bond in m.GetBonds(): #
|
|
# bond.SetBondType(Chem.BondType.SINGLE) #
|
|
# Chem.SanitizeMol(m) #
|
|
#===================================#
|
|
return m
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if len(sys.argv) < 2:
|
|
print("No input file provided: Frames.py filetosprocess.ext")
|
|
sys.exit(1)
|
|
|
|
suppl = Chem.SDMolSupplier(sys.argv[1])
|
|
FrameDict = {}
|
|
|
|
for mol in suppl:
|
|
m = GetFrame(mol)
|
|
cansmiles = Chem.MolToSmiles(m, isomericSmiles=True)
|
|
if cansmiles in FrameDict:
|
|
FrameDict[cansmiles].append(mol)
|
|
else:
|
|
FrameDict[cansmiles] = [mol, ]
|
|
|
|
counter = 0
|
|
w = open('frames.smi', 'w')
|
|
for key, item in FrameDict.items():
|
|
counter += 1
|
|
d = Chem.SDWriter(str(counter) + '.sdf')
|
|
for i in item:
|
|
i.SetProp('Scaffold', key)
|
|
i.SetProp('Cluster', str(counter))
|
|
d.write(i)
|
|
print(key, len(item))
|
|
w.write(key + '\t' + str(len(item)) + '\n')
|
|
w.close
|
|
print('number of Clusters: %d' % (counter))
|