mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
efgs code (#8145)
This commit is contained in:
20
Contrib/efgs/README.md
Normal file
20
Contrib/efgs/README.md
Normal file
@@ -0,0 +1,20 @@
|
||||
Source code of EFGs, a complete an accurate implementation of Ertl's functional group (FG) detection algorithm.
|
||||
|
||||
For a RDKit molecule, it provides four outputs:
|
||||
a) a PNG binary string with an image of the molecule with color-highlighted
|
||||
functional groups;
|
||||
b) a list of sets of atom indices (idx), each set corresponding to a
|
||||
functional group;
|
||||
c) a list of pseudo-SMILES canonicalized strings for the full functional
|
||||
groups;
|
||||
d) a list of RDKit labeled mol objects, one for each full functional group
|
||||
|
||||
efgs.py: code of get_dec_fgs function and additional auxiliary functions
|
||||
|
||||
try_efgs.py: example code to execute get_dec_fgs in Python with a Pandas dataframe
|
||||
|
||||
ch33query.sql: sql query used to extract ChEMBL33 bioactive compounds
|
||||
|
||||
ChemRxiv preprint for full details on the implementation:
|
||||
EFGs: A Complete and Accurate Implementation of Ertl’s Functional Group Detection Algorithm in RDKit
|
||||
https://chemrxiv.org/engage/chemrxiv/article-details/67543afe7be152b1d03a820c
|
||||
26
Contrib/efgs/ch33query.sql
Normal file
26
Contrib/efgs/ch33query.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
select distinct
|
||||
md.chembl_id chid,
|
||||
cs.standard_inchi,
|
||||
md.pref_name molname,
|
||||
td.chembl_id tchid,
|
||||
td.pref_name tarname,
|
||||
pchembl_value pchembl,
|
||||
organism as organism,
|
||||
ac.activity_id,
|
||||
td.target_type
|
||||
from
|
||||
ch33.activities ac,
|
||||
ch33.molecule_dictionary md,
|
||||
ch33.assays ay,
|
||||
ch33.target_dictionary td,
|
||||
ch33.compound_structures cs,
|
||||
ch33.chembl_id_lookup cl
|
||||
where
|
||||
cl.entity_id = cs.molregno and
|
||||
cl.entity_type = 'COMPOUND' and
|
||||
ac.molregno = md.molregno and
|
||||
ac.molregno = cl.entity_id and
|
||||
ac.assay_id = ay.assay_id and
|
||||
pchembl_value > 5 and
|
||||
ac.pchembl_value is not null and td.tid = ay.tid and td.target_type in (
|
||||
'PROTEIN COMPLEX','SINGLE PROTEIN')
|
||||
567
Contrib/efgs/efgs.py
Normal file
567
Contrib/efgs/efgs.py
Normal file
@@ -0,0 +1,567 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Implementation of Ertl's Functional Group Algorithm in RDKit
|
||||
|
||||
Ertl, Peter.
|
||||
|
||||
An algorithm to identify functional groups in organic molecules
|
||||
|
||||
J Cheminform (2017) 9:36; DOI: 10.1186/s13321-017-0225-z
|
||||
|
||||
@author: gonzalo.colmenarejo
|
||||
"""
|
||||
|
||||
|
||||
from rdkit import Chem
|
||||
import re
|
||||
from rdkit.Chem.Draw import rdMolDraw2D
|
||||
|
||||
|
||||
def psmi_can(s):
|
||||
# Generation of canonical pseudosmiles from pseudosmiles
|
||||
s = s.replace("[R]","[Ge]").replace("[Car]", "[Pb]").replace("[Cal]", "[Sn]").replace("[Oar]", "[Po]").replace("[Nar]", "[Sb]").replace("[Sar]","[Re]")
|
||||
s = s.replace("[Nar+]", "[Sb+]").replace("[Sar+]","[Re+]").replace("[Sear]","[Bi]").replace("[Tear]","[Tl]").replace("[Oar+]", "[Po+]").replace("[Nar+]", "[Sb+]")
|
||||
s = s.replace("[Nar-]", "[Sb-]").replace("[Sar+]","[Re+]").replace("[Sar-]","[Re-]").replace("[Sear+]","[Bi+]").replace("[Sear-]","[Bi-]")
|
||||
mol = Chem.MolFromSmiles(s)
|
||||
s = Chem.MolToSmiles(mol, canonical = True, isomericSmiles = False)
|
||||
s = s.replace("[Ge]","[R]").replace("[Pb]", "[Car]").replace("[Sn]", "[Cal]").replace("[Po]", "[Oar]").replace("[Sb]", "[Nar]").replace("[Re]","[Sar]")
|
||||
s = s.replace("[Sb+]", "[Nar+]").replace("[Re+]","[Sar+]").replace("[Bi]","[Sear]").replace("[Tl]","[Tear]").replace("[Po+]", "[Oar+]").replace("[Sb+]", "[Nar+]")
|
||||
s = s.replace("[Nar-]", "[Sb-]").replace("[Re+]","[Sar+]").replace("[Re-]","[Sar-]").replace("[Bi+]","[Sear+]").replace("[Bi-]","[Sear-]")
|
||||
return s
|
||||
|
||||
|
||||
def cval_fix(fragment):
|
||||
# Fix valences of carbons
|
||||
for atom in fragment.GetAtoms():
|
||||
if atom.GetSymbol() == 'C':
|
||||
charge = atom.GetFormalCharge()
|
||||
num_bonds = sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
|
||||
implicit_h = atom.GetNumImplicitHs()
|
||||
explicit_h = atom.GetNumExplicitHs()
|
||||
total_valence = num_bonds + implicit_h + explicit_h
|
||||
if charge == 0:
|
||||
if total_valence != 4:
|
||||
explicit_h = 4 - num_bonds - implicit_h
|
||||
atom.SetNumExplicitHs(int(explicit_h))
|
||||
else:
|
||||
explicit_h = 4 - num_bonds - implicit_h + charge
|
||||
atom.SetNumExplicitHs(int(explicit_h))
|
||||
atom.UpdatePropertyCache()
|
||||
return fragment
|
||||
|
||||
|
||||
def merge(mol, marked, aset):
|
||||
# Merge initially marged atoms into FGs
|
||||
bset = set()
|
||||
for idx in aset:
|
||||
atom = mol.GetAtomWithIdx(idx)
|
||||
for nbr in atom.GetNeighbors():
|
||||
jdx = nbr.GetIdx()
|
||||
if jdx in marked:
|
||||
marked.remove(jdx)
|
||||
bset.add(jdx)
|
||||
if not bset:
|
||||
return
|
||||
merge(mol, marked, bset)
|
||||
aset.update(bset)
|
||||
|
||||
|
||||
def get_fgs(mol):
|
||||
# generation of nondecorated FGs
|
||||
het23c_pattern = Chem.MolFromSmarts('A=,#[!#6]')
|
||||
#het23c_pattern = Chem.MolFromSmarts('[C,c]=,#[!#6]')
|
||||
c23c_pattern = Chem.MolFromSmarts('C=,#[C,c]')
|
||||
acetal_pattern = Chem.MolFromSmarts('[CX4](-[O,N,S])-[O,N,S]')
|
||||
oxirane_pattern = Chem.MolFromSmarts('[O,N,S]1CC1')
|
||||
|
||||
marked = set()
|
||||
for atom in mol.GetAtoms():
|
||||
if ((atom.GetAtomicNum() not in (6,1)) and (atom.GetIsAromatic() is False)):
|
||||
marked.add(atom.GetIdx())
|
||||
|
||||
for pattern in [het23c_pattern, c23c_pattern, acetal_pattern, oxirane_pattern]:
|
||||
for path in mol.GetSubstructMatches(pattern):
|
||||
for atomindex in path:
|
||||
marked.add(atomindex)
|
||||
|
||||
groups = []
|
||||
while marked:
|
||||
grp = set([marked.pop()])
|
||||
merge(mol, marked, grp)
|
||||
groups.append(grp)
|
||||
|
||||
for atom in mol.GetAtoms():
|
||||
if ((atom.GetAtomicNum() not in (6,1)) and (atom.GetIsAromatic() is True)):
|
||||
lone_atom = True
|
||||
for nbr in atom.GetNeighbors():
|
||||
if (nbr.GetIsAromatic() == False): # Don't merge adjacent aromatic atoms into the same FG!!
|
||||
for group in groups:
|
||||
if nbr.GetIdx() in group:
|
||||
group.add(atom.GetIdx())
|
||||
lone_atom = False
|
||||
if (lone_atom):
|
||||
groups.append({atom.GetIdx()})
|
||||
return groups
|
||||
|
||||
|
||||
def get_carbonyl_envs(mol, functional_groups):
|
||||
# Find carbonyl environments
|
||||
carbonyl_pattern = Chem.MolFromSmarts('C=O')
|
||||
pats_idx = mol.GetSubstructMatches(carbonyl_pattern) # tupla of sets of matched atoms
|
||||
carbonyl_envs = []
|
||||
for group in functional_groups:
|
||||
carbonyl_env = set()
|
||||
for pat_idx in pats_idx:
|
||||
neigh_list = []
|
||||
if (len(set(pat_idx) & group) > 0): # identify the carbonyl corresponding in the group to that SMARTS match
|
||||
c_atom = mol.GetAtomWithIdx(pat_idx[0]) # Get the C of the carbonyl
|
||||
neigh_list = [((pat_idx[0], x.GetIdx()), mol.GetBondBetweenAtoms(pat_idx[0], x.GetIdx()))
|
||||
for x in c_atom.GetNeighbors() if ((x.GetIdx() not in group) and (x.GetAtomicNum() == 6))]
|
||||
if (len(neigh_list) > 0):
|
||||
carbonyl_env.update(neigh_list) #
|
||||
carbonyl_envs.append(carbonyl_env)
|
||||
return carbonyl_envs
|
||||
|
||||
|
||||
def single_ns_group(mol, group):
|
||||
# Find single N, S atom groups
|
||||
ns_count = 0
|
||||
het_count = 0
|
||||
|
||||
for idx in group:
|
||||
atom = mol.GetAtomWithIdx(idx)
|
||||
if ((atom.GetSymbol() == 'N') or (atom.GetSymbol() == 'S')):
|
||||
ns_count += 1
|
||||
if atom.GetSymbol() not in ('C', 'H'):
|
||||
het_count += 1
|
||||
|
||||
return ns_count == 1 and het_count == 1
|
||||
|
||||
|
||||
def get_freeval_envs(mol, functional_groups):
|
||||
# Find environments for heteroatoms with free valences
|
||||
oh_pattern = Chem.MolFromSmarts('[OX2H]')
|
||||
n_pattern = Chem.MolFromSmarts('[N;X3;H1,H2]')
|
||||
sh_pattern = Chem.MolFromSmarts('[SX2H]')
|
||||
|
||||
# Identify atoms to preserve (those Hs in OH, NH2, and SH groups)
|
||||
preserve_hs = set()
|
||||
for pattern in [oh_pattern, n_pattern, sh_pattern]:
|
||||
matches = mol.GetSubstructMatches(pattern)
|
||||
if (len(matches) > 0): # found matches
|
||||
for m in matches:
|
||||
group = [g for g in functional_groups if m[0] in g][0]
|
||||
if ((pattern == oh_pattern) or (single_ns_group(mol, group))): # we apply the oh pattern always but the n or s patterns only with single sn groups
|
||||
atom = mol.GetAtomWithIdx(m[0]) # Retrieve O, N, or S of the hydroxyl, amine, thiol
|
||||
for n_atom in atom.GetNeighbors():
|
||||
if (n_atom.GetAtomicNum() == 1): # Will only preserve if explicit H, if not it remains "silent"
|
||||
preserve_hs.add(n_atom.GetIdx())
|
||||
freeval_envs = []
|
||||
for group in functional_groups:
|
||||
freeval_env = set()
|
||||
for idx in group:
|
||||
atom = mol.GetAtomWithIdx(idx)
|
||||
if ((atom.GetAtomicNum() not in [6,1]) and (atom.GetIsAromatic() == False)): # heteroatom of the group
|
||||
neigh_list = [((idx, x.GetIdx()), mol.GetBondBetweenAtoms(idx, x.GetIdx()), x.GetAtomicNum())
|
||||
for x in atom.GetNeighbors() if ((x.GetIdx() not in group) and (x.GetIdx() not in preserve_hs)
|
||||
and (x.GetAtomicNum() in (6,1))
|
||||
and (mol.GetBondBetweenAtoms(idx, x.GetIdx()).GetIsAromatic() == False))]
|
||||
if (len(neigh_list) > 0):
|
||||
freeval_env.update(neigh_list)
|
||||
freeval_envs.append(freeval_env)
|
||||
return freeval_envs, preserve_hs
|
||||
|
||||
|
||||
def get_singleno_envs(mol, functional_groups):
|
||||
# Find single N, O functional groups environments (amines vs anilines, alcohols vs phenols)
|
||||
singleno_envs = []
|
||||
for group in functional_groups:
|
||||
singleno_env = set()
|
||||
if (len(group) == 1) and (mol.GetAtomWithIdx(list(group)[0]).GetAtomicNum() in [7,8]) and not (mol.GetAtomWithIdx(list(group)[0]).GetIsAromatic()):
|
||||
atom = mol.GetAtomWithIdx(list(group)[0])
|
||||
if sum([1 for neigh_atom in atom.GetNeighbors() if neigh_atom.GetAtomicNum() == 6]) == 1:
|
||||
for neigh_atom in atom.GetNeighbors():
|
||||
if (neigh_atom.GetAtomicNum() == 6) and (mol.GetBondBetweenAtoms(atom.GetIdx(), neigh_atom.GetIdx()).GetBondType() == Chem.rdchem.BondType.SINGLE):
|
||||
type_carbon = "Car" if neigh_atom.GetIsAromatic() == True else "Cal"
|
||||
singleno_env.add(((list(group)[0], neigh_atom.GetIdx()),
|
||||
mol.GetBondBetweenAtoms(list(group)[0], neigh_atom.GetIdx()), type_carbon))
|
||||
singleno_envs.append(singleno_env)
|
||||
return singleno_envs
|
||||
|
||||
|
||||
def get_dbond2car(mol, functional_groups):
|
||||
# Find atoms in functional groups double bonded to aromatic carbons
|
||||
db2car_pattern = Chem.MolFromSmarts("c=*")
|
||||
db2car_envs = []
|
||||
matches = mol.GetSubstructMatches(db2car_pattern)
|
||||
matches = tuple([x for x in matches if mol.GetBondBetweenAtoms(x[0], x[1]).GetIsAromatic() == False])
|
||||
for group in functional_groups:
|
||||
db2car_env = set()
|
||||
for matchx in matches:
|
||||
if matchx[0] in group or matchx[1] in group:
|
||||
db2car_env.add(matchx[0])
|
||||
db2car_envs.append(db2car_env)
|
||||
return db2car_envs
|
||||
|
||||
|
||||
def col_mol(mol, functional_groups, rad = 0.5, lw = 2, width = 300, height = 250):
|
||||
# Color functional groups in molecules like Ertl's paper
|
||||
cols = {"NO": (1, 0.3, 1, 0.8), # violet
|
||||
"O": (1, 0.6, 0.6, 0.9), # pink
|
||||
"N": (0.4, 0.7, 0.9), # blue
|
||||
"X": (0, 1, 0, 0.9), # green
|
||||
"har": (1, 0.65, 0, 0.9), # orange
|
||||
"S(O)": (1, 1, 0.2, 0.9), # yellow
|
||||
"NOS": (0.7, 0.7, 0, 0.9), # pistacchio
|
||||
"CC": (0.627, 0.627, 0.627, 0.9), # grey
|
||||
"P,etc": (0.5, 1, 0.8, 0.9) # cyan
|
||||
}
|
||||
|
||||
ats_hl = {}
|
||||
bds_hl = {}
|
||||
rdi_hl = {}
|
||||
lws_hl = {}
|
||||
|
||||
for fg in functional_groups:
|
||||
ar = any([mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in fg])
|
||||
els = "".join(sorted(list(set([mol.GetAtomWithIdx(idx).GetSymbol() for idx in fg]))))
|
||||
|
||||
if ar == True:
|
||||
col = [cols["har"]]
|
||||
elif any(x in els for x in ["F", "Cl", "Br", "I"]):
|
||||
col = [cols["X"]]
|
||||
elif any(x in els for x in ["P", "Se", "B", "Si", "As", "Te"]):
|
||||
col = [cols["P,etc"]]
|
||||
elif els in ["NO", "CNO"]:
|
||||
col = [cols["NO"]]
|
||||
elif els in ["O", "CO"]:
|
||||
col = [cols["O"]]
|
||||
elif els in ["N", "CN"]:
|
||||
col = [cols["N"]]
|
||||
elif els in ["S","OS","COS", "CS"]:
|
||||
col = [cols["S(O)"]]
|
||||
elif els in ["NOS", "CNOS", "NS", "CNS"]:
|
||||
col = [cols["NOS"]]
|
||||
elif els == "C":
|
||||
col = [cols["CC"]]
|
||||
|
||||
for idx in fg:
|
||||
ats_hl[idx] = col
|
||||
rdi_hl[idx] = rad
|
||||
|
||||
if len(fg) > 1:
|
||||
for bond in mol.GetBonds():
|
||||
at1_idx = bond.GetBeginAtomIdx()
|
||||
at2_idx = bond.GetEndAtomIdx()
|
||||
if at1_idx in fg and at2_idx in fg:
|
||||
bds_hl[bond.GetIdx()] = col
|
||||
lws_hl[bond.GetIdx()] = lw
|
||||
|
||||
Chem.rdDepictor.Compute2DCoords(mol)
|
||||
Chem.rdDepictor.StraightenDepiction(mol)
|
||||
d2d = rdMolDraw2D.MolDraw2DCairo(width,height)
|
||||
dopts = d2d.drawOptions()
|
||||
dopts.atomHighlightsAreCircles = False
|
||||
d2d.DrawMoleculeWithHighlights(mol, "", ats_hl, bds_hl, rdi_hl, lws_hl)
|
||||
d2d.FinishDrawing()
|
||||
return d2d.GetDrawingText()
|
||||
|
||||
|
||||
|
||||
|
||||
def get_dec_fgs(mol, verbose = False):
|
||||
# Generate functional groups decorated with carbon environments
|
||||
for atom in mol.GetAtoms():
|
||||
atom.SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
|
||||
for bond in mol.GetBonds():
|
||||
bond.SetStereo(Chem.rdchem.BondStereo.STEREONONE)
|
||||
mol_aux = Chem.Mol(mol)
|
||||
mol = Chem.RemoveHs(mol)
|
||||
mol = Chem.AddHs(mol)
|
||||
fgs = get_fgs(mol)
|
||||
cnes = get_carbonyl_envs(mol, fgs)
|
||||
fves, pr_hs = get_freeval_envs(mol, fgs)
|
||||
noes = get_singleno_envs(mol, fgs)
|
||||
db2cars = get_dbond2car(mol, fgs)
|
||||
|
||||
## Dummyze permitted free valence neighboring Hs
|
||||
for fve in fves:
|
||||
if (len(fve) > 0):
|
||||
for fv in fve:
|
||||
if (fv[2] == 1):
|
||||
#print("dummy H: " + str(fv[0][1]))
|
||||
mol.GetAtomWithIdx(fv[0][1]).SetAtomicNum(0)
|
||||
|
||||
## Remove explicit atoms
|
||||
mol = Chem.RemoveHs(mol)
|
||||
|
||||
## Update fves accordingly to account for changes in numbers of dummy atoms
|
||||
for i in range(len(fves)):
|
||||
fve = list(fves[i])
|
||||
if (len(fve) > 0):
|
||||
dummy_paired = []
|
||||
for j in range(len(fve)):
|
||||
fv = fve[j]
|
||||
if (fv[2] == 1): # this entry had a H fv
|
||||
for neighbor in mol.GetAtomWithIdx(fv[0][0]).GetNeighbors():
|
||||
if (neighbor.GetAtomicNum() == 0):
|
||||
dummy_paired.append((fv[0][0], neighbor.GetIdx()))
|
||||
# dummy_idx = neighbor.GetIdx() ## Assign the updated idx
|
||||
# dummy_bond = mol.GetBondBetweenAtoms(fv[0][0], dummy_idx) # Update the bond too
|
||||
# new_fv = ((fv[0][0], dummy_idx), dummy_bond, 0)
|
||||
# fve[j] = new_fv
|
||||
dummy_paired = list(set(dummy_paired))
|
||||
dummy_count = 0
|
||||
for j in range(len(fve)):
|
||||
fv = fve[j]
|
||||
if (fv[2] == 1):
|
||||
d_pair = dummy_paired[dummy_count]
|
||||
fve[j] = ((d_pair), mol.GetBondBetweenAtoms(d_pair[0], d_pair[1]), 0)
|
||||
dummy_count = dummy_count + 1
|
||||
fves[i] = set(fve) # reassign the possibly updated fve
|
||||
|
||||
psmis = []
|
||||
fg_mols = []
|
||||
psmi_labs = []
|
||||
pseudo_smiles = ""
|
||||
|
||||
# Loop over functional groups to generate pseudo smiles and pseudo molecules for each of them
|
||||
for i, group in enumerate(fgs):
|
||||
neigh_atoms = sorted(list(set([x[0][1] for x in cnes[i]] + [x[0][1] for x in fves[i]] + [x[0][1] for x in noes[i]])))
|
||||
cne = cnes[i]
|
||||
fve = fves[i]
|
||||
noe = noes[i]
|
||||
db2car = db2cars[i]
|
||||
R_idxs = sorted(list(set(neigh_atoms + [x for x in db2car])))
|
||||
psmi_labs = [(list(x)[0][1], list(x)[2]) for x in noe if len(noe) > 0] + [(x, "Car") for x in db2car if len(db2car) > 0]
|
||||
|
||||
if (len(neigh_atoms) > 0): # There is decoration applicable
|
||||
|
||||
## Get the bonds of the group
|
||||
bonds_idx = set()
|
||||
for j in range(mol.GetNumBonds()):
|
||||
bond = mol.GetBondWithIdx(j)
|
||||
begin_aid = bond.GetBeginAtomIdx()
|
||||
end_aid = bond.GetEndAtomIdx()
|
||||
if ((begin_aid in group) and (end_aid in group)):
|
||||
bonds_idx.add(bond.GetIdx())
|
||||
|
||||
## Add bonds to carbonyl carbons
|
||||
if(len(cne) > 0):
|
||||
for env in cne:
|
||||
bond = env[1]
|
||||
bonds_idx.add(bond.GetIdx())
|
||||
|
||||
## Add bonds to free valence atoms
|
||||
if(len(fve) > 0):
|
||||
for env in fve:
|
||||
bond = env[1]
|
||||
bonds_idx.add(bond.GetIdx())
|
||||
|
||||
## Add bonds to single n, o atoms
|
||||
if(len(noe) > 0):
|
||||
for env in noe:
|
||||
bond = env[1]
|
||||
bonds_idx.add(bond.GetIdx())
|
||||
|
||||
# Get the fragment as a new molecule from bonds in the FG + environment atoms
|
||||
amap = {}
|
||||
fragment = Chem.PathToSubmol(mol, list(bonds_idx), atomMap = amap)
|
||||
|
||||
# Create a copy of the fragment to modify
|
||||
fragment_with_r_groups = Chem.RWMol(fragment)
|
||||
|
||||
# Replace atoms at the specified positions with R group labels
|
||||
j = 0
|
||||
for idx in R_idxs:
|
||||
label = "[*:" + str(j+1) + "]"
|
||||
|
||||
# Create a dummy atom with the R group label
|
||||
r_group_atom = Chem.MolFromSmiles(label).GetAtomWithIdx(0)
|
||||
r_group_atom.SetAtomMapNum(int(label[-2])) # Set the map number to match the R group number
|
||||
|
||||
# Replace the atom at the specified position with the R group atom
|
||||
fragment_with_r_groups.ReplaceAtom(amap[R_idxs[j]], r_group_atom)
|
||||
j = j + 1
|
||||
|
||||
# Convert the modified fragment back to a Mol object
|
||||
fragment_with_r_groups = fragment_with_r_groups.GetMol()
|
||||
|
||||
## Fix C valences
|
||||
fragment_with_r_groups = cval_fix(fragment_with_r_groups)
|
||||
|
||||
# Generate the canonical pseudo-SMILES
|
||||
pseudo_smiles = Chem.MolToSmiles(fragment_with_r_groups, canonical=True, isomericSmiles = False)
|
||||
|
||||
# Generate the labeled molecule
|
||||
fragment_with_r_groups = Chem.RWMol(fragment_with_r_groups)
|
||||
for atom in fragment_with_r_groups.GetAtoms():
|
||||
idx = atom.GetIdx() # idx of atom in fragment
|
||||
map_idx = list(amap.keys())[list(amap.values()).index(idx)] # idx of atom in original molecule
|
||||
if map_idx in neigh_atoms: # Check if that position is an environment atom in original molecule
|
||||
if (map_idx in [x[0][1] for x in list(noe)]):
|
||||
type_carbon = [x[2] for x in list(noe) if x[0][1] == map_idx][0]
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', type_carbon)
|
||||
else:
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel','R')
|
||||
else:
|
||||
if atom.GetIsAromatic():
|
||||
if atom.GetAtomicNum() == 6:
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Car")
|
||||
if atom.GetAtomicNum() == 7:
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Nar")
|
||||
if atom.GetAtomicNum() == 8:
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Oar")
|
||||
if atom.GetAtomicNum() == 16:
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Sar")
|
||||
if map_idx in db2car: # check db2caromatics
|
||||
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Car")
|
||||
fragment_with_r_groups = fragment_with_r_groups.GetMol()
|
||||
|
||||
else: # No neighbors
|
||||
if(len(group) == 1): # Single-atommed FG
|
||||
atom = mol.GetAtomWithIdx(list(group)[0])
|
||||
charge = atom.GetFormalCharge()
|
||||
charge_str = "+" if charge == 1 else "-" if charge == -1 else ""
|
||||
fragment = Chem.RWMol() ## Create emtpy edtiable molecule
|
||||
fragment.AddAtom(Chem.Atom('*')) ## Add dummy atom to molecule
|
||||
if atom.GetIsAromatic():
|
||||
if atom.GetAtomicNum() == 7:
|
||||
pseudo_smiles = "n" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Nar" + charge_str)
|
||||
if atom.GetAtomicNum() == 8:
|
||||
pseudo_smiles = "o" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Oar" + charge_str)
|
||||
if atom.GetAtomicNum() == 16:
|
||||
pseudo_smiles = "s" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Sar" + charge_str)
|
||||
if atom.GetAtomicNum() == 15:
|
||||
pseudo_smiles = "p" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Par" + charge_str)
|
||||
if atom.GetAtomicNum() == 34:
|
||||
pseudo_smiles = "se" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Sear" + charge_str)
|
||||
if atom.GetAtomicNum() == 52:
|
||||
pseudo_smiles = "te" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Tear" + charge_str)
|
||||
else:
|
||||
if atom.GetAtomicNum() == 7:
|
||||
pseudo_smiles = "N" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "N" + charge_str)
|
||||
if atom.GetAtomicNum() == 8:
|
||||
pseudo_smiles = "O" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "O" + charge_str)
|
||||
if atom.GetAtomicNum() == 16:
|
||||
pseudo_smiles = "S" + charge_str
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "S" + charge_str)
|
||||
if atom.GetAtomicNum() == 15:
|
||||
pseudo_smiles = "P" + charge_str + "]"
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "P" + charge_str)
|
||||
if atom.GetAtomicNum() == 34:
|
||||
pseudo_smiles = "Se" + charge_str + "]"
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Se" + charge_str)
|
||||
if atom.GetAtomicNum() == 52:
|
||||
pseudo_smiles = "Te" + charge_str + "]"
|
||||
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Te" + charge_str)
|
||||
fragment = fragment.GetMol()
|
||||
else: # Multi-atommed FG
|
||||
bonds_idx = []
|
||||
for j in range(mol.GetNumBonds()):
|
||||
bond = mol.GetBondWithIdx(j)
|
||||
begin_aid = bond.GetBeginAtomIdx()
|
||||
end_aid = bond.GetEndAtomIdx()
|
||||
if begin_aid in group and end_aid in group:
|
||||
bonds_idx.append(j)
|
||||
|
||||
# Get the fragment as a new molecule
|
||||
amap = {}
|
||||
fragment = Chem.PathToSubmol(mol, bonds_idx, atomMap = amap)
|
||||
fragment = cval_fix(fragment)
|
||||
pseudo_smiles = Chem.MolToSmiles(fragment, canonical=True, isomericSmiles = False)
|
||||
for atom in fragment.GetAtoms():
|
||||
idx = atom.GetIdx()
|
||||
charge = atom.GetFormalCharge()
|
||||
charge_str = "\u207A" if charge == 1 else "\u207B" if charge == -1 else ""
|
||||
if atom.GetIsAromatic():
|
||||
if atom.GetAtomicNum() == 6:
|
||||
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Car" + charge_str)
|
||||
if atom.GetAtomicNum() == 7:
|
||||
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Nar" + charge_str)
|
||||
if atom.GetAtomicNum() == 8:
|
||||
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Oar" + charge_str)
|
||||
if atom.GetAtomicNum() == 16:
|
||||
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Sar" + charge_str)
|
||||
|
||||
# Parse the pseudo-smiles; first the Cars and Cals
|
||||
if (len(psmi_labs) > 0):
|
||||
for psmi_lab in list(psmi_labs):
|
||||
pseudo_smiles = pseudo_smiles.replace("[*:" + str(R_idxs.index(psmi_lab[0]) + 1) + "]", "[" + psmi_lab[1] + "]")
|
||||
|
||||
|
||||
# Define the pattern to match
|
||||
pattern = r'\[\*:\d+\]'
|
||||
|
||||
# Define the replacement string
|
||||
replacement = "[R]"
|
||||
|
||||
# Replace [R] groups
|
||||
pseudo_smiles = re.sub(pattern, replacement, pseudo_smiles)
|
||||
|
||||
# Replace ns
|
||||
pseudo_smiles = pseudo_smiles.replace("[n+]", "[Nar+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[n-]", "[Nar-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[n+]", "[Nar+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[n-]", "[Nar-]")
|
||||
pseudo_smiles = re.sub(r'(?<![MNI])n\+', '[Nar+]', pseudo_smiles)
|
||||
pseudo_smiles = re.sub(r'(?<![MNI])n\-', '[Nar-]', pseudo_smiles)
|
||||
pseudo_smiles = re.sub(r'(?<![MZI])n', '[Nar]', pseudo_smiles)
|
||||
|
||||
# Replace cs, ss, os, ps, ses, tes
|
||||
pseudo_smiles = pseudo_smiles.replace("[se+]", "[Sear+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[se-]", "[Sear-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("se+", "[Sear+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("se-", "[Sear-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("se", "[Sear]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[te+]", "[Tear+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[te-]", "[Tear-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("te", "[Tear]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[c+]", "[Car+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[c-]", "[Car-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("c", "[Car]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[s+]", "[Sar+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[s-]", "[Sar-]")
|
||||
pseudo_smiles = re.sub(r"(?<!A)(s\+)", "[Sar+]", pseudo_smiles) # To avoid issues with As
|
||||
pseudo_smiles = re.sub(r"(?<!A)(s-)", "[Sar-]", pseudo_smiles) # To avoid issues with As
|
||||
pseudo_smiles = re.sub(r"(?<!A)s", "[Sar]", pseudo_smiles) # To avoid issues with As
|
||||
pseudo_smiles = pseudo_smiles.replace("o+", "[Oar+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("o-", "[Oar-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("o", "[Oar]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[o]", "[Oar]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[o+]", "[Oar+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[o-]", "[Oar-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[p+]", "[Par+]")
|
||||
pseudo_smiles = pseudo_smiles.replace("[p-]", "[Par-]")
|
||||
pseudo_smiles = pseudo_smiles.replace("p", "[Par]")
|
||||
|
||||
|
||||
# Final canonicalization
|
||||
pseudo_smiles = psmi_can(pseudo_smiles)
|
||||
|
||||
psmis.append(pseudo_smiles)
|
||||
if (len(neigh_atoms) == 0):
|
||||
fg_mols.append(fragment)
|
||||
else:
|
||||
fg_mols.append(fragment_with_r_groups)
|
||||
if not ("H" in pseudo_smiles):
|
||||
if (verbose):
|
||||
print("Canonical Pseudo-SMILES:", pseudo_smiles)
|
||||
|
||||
img_text = col_mol(mol_aux, fgs)
|
||||
return img_text, fgs, psmis, fg_mols
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
223
Contrib/efgs/try_efgs.py
Normal file
223
Contrib/efgs/try_efgs.py
Normal file
@@ -0,0 +1,223 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Implementation of Ertl's Functional Group Algorithm in RDKit
|
||||
|
||||
Ertl, Peter.
|
||||
|
||||
An algorithm to identify functional groups in organic molecules
|
||||
|
||||
J Cheminform (2017) 9:36; DOI: 10.1186/s13321-017-0225-z
|
||||
|
||||
@author: gonzalo.colmenarejo
|
||||
"""
|
||||
|
||||
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem.Draw import IPythonConsole
|
||||
from chembl_structure_pipeline import standardizer as sdz
|
||||
from datetime import datetime
|
||||
from IPython.display import Image
|
||||
from PIL import Image as pilImage
|
||||
from PIL import ImageDraw, ImageFont
|
||||
from rdkit.Chem.Draw import rdMolDraw2D
|
||||
import efgs
|
||||
import pandas as pd
|
||||
import io
|
||||
|
||||
|
||||
|
||||
ertl_smis = [
|
||||
'Cc1nc(NS(=O)(=O)c2ccc(N)cc2)nc(C)c1',
|
||||
'NC(=N)c1ccc(C=Cc2ccc(cc2O)C(=N)N)cc1',
|
||||
'CC(=O)Nc1nnc(s1)S(=O)(=O)N',
|
||||
'NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl',
|
||||
'CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3',
|
||||
'Cc1onc(c1C(=O)NC2C3SC(C)(C)C(N3C2=O)C(=O)O)c4ccccc4',
|
||||
'Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O',
|
||||
'COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N',
|
||||
'Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl',
|
||||
'Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1',
|
||||
'FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13',
|
||||
'OCC1OC(CC1O)n2cnc3C(O)CNC=Nc32',
|
||||
'CCNC1CC(C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N',
|
||||
'CC(O)C1C2C(C)C(=C(N2C1=O)C(=O)O)SC3CNC(C3)C(=O)N(C)C',
|
||||
'CC1CN(CC(C)N1)c2c(F)c(N)c3c(=O)c(cn(C4CC4)c3c2F)C(=O)O',
|
||||
'CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C',
|
||||
'Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl',
|
||||
'CC(=O)NC1C(NC(=N)N)C=C(OC1C(O)C(O)CO)C(=O)O',
|
||||
'CC(O)C(O)C1CNc2nc(N)nc(O)c2N1',
|
||||
'NC1CCCCN(C1)c2c(Cl)cc3c(=O)c(cn(C4CC4)c3c2Cl)C(=O)O',
|
||||
]
|
||||
|
||||
|
||||
|
||||
def mindex(mol):
|
||||
# Utility function to draw molecule with idx
|
||||
for atom in mol.GetAtoms():
|
||||
try:
|
||||
atom.SetAtomMapNum(atom.GetIdx())
|
||||
except:
|
||||
None
|
||||
return mol
|
||||
|
||||
|
||||
def demindex(mol):
|
||||
# Utility function to return mindex-ed molecule to normal
|
||||
for atom in mol.GetAtoms():
|
||||
atom.ClearProp('molAtomMapNumber')
|
||||
return mol
|
||||
|
||||
|
||||
def molimg(x):
|
||||
# Utility function to draw image of labeled mol from PNG binary string
|
||||
try:
|
||||
return pilImage.open(io.BytesIO(x))
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
def img_grid(images, titles = None, num_columns = 5,font = "arial.ttf", font_size = 18, title_height = 30, cell_width = 300, cell_height = 300, bg_color = (255, 255, 255)):
|
||||
"""
|
||||
Creates a grid of images with optional titles under each image.
|
||||
|
||||
Parameters:
|
||||
- images (list): List of PIL Image objects.
|
||||
- titles (list, optional): List of titles as strings for each image. If None, no titles will be added.
|
||||
- num_columns (int): Number of columns in the grid.
|
||||
- font_sizes: Font size of titles (if provided).
|
||||
- title_height: Height of titles (if provieded).
|
||||
- cell_width (int): Width of each image cell.
|
||||
- cell_height (int): Height of each image cell (not including title space).
|
||||
- bg_color (tuple): Background color as an RGB tuple.
|
||||
|
||||
Returns:
|
||||
- PIL Image object containing the image grid.
|
||||
"""
|
||||
# Set title height if titles are provided; otherwise, set to zero
|
||||
title_height = title_height if titles else 0
|
||||
cell_height_with_title = cell_height + title_height + 2
|
||||
|
||||
# Calculate grid dimensions
|
||||
num_images = len(images)
|
||||
num_rows = (num_images + num_columns - 1) // num_columns # Round up for last row
|
||||
|
||||
# Create a blank background image for the grid
|
||||
grid_width = num_columns * cell_width
|
||||
grid_height = num_rows * cell_height_with_title
|
||||
grid_image = pilImage.new("RGB", (grid_width, grid_height), bg_color)
|
||||
|
||||
# Set up font for titles if needed
|
||||
if titles:
|
||||
try:
|
||||
font = ImageFont.truetype(font, font_size) # Change font and size as desired
|
||||
except IOError:
|
||||
font = ImageFont.load_default()
|
||||
|
||||
# Iterate through each cell in the grid
|
||||
for i in range(num_rows * num_columns):
|
||||
row = i // num_columns
|
||||
col = i % num_columns
|
||||
x = col * cell_width
|
||||
y = row * cell_height_with_title
|
||||
|
||||
if i < num_images:
|
||||
# Resize the image to fit in the cell
|
||||
img_resized = images[i].resize((cell_width, cell_height))
|
||||
grid_image.paste(img_resized, (x, y))
|
||||
|
||||
# Draw title if provided
|
||||
if titles and i < len(titles):
|
||||
draw = ImageDraw.Draw(grid_image)
|
||||
title = titles[i]
|
||||
|
||||
# Calculate title position and center it
|
||||
text_bbox = draw.textbbox((0, 0), title, font=font)
|
||||
text_width = text_bbox[2] - text_bbox[0]
|
||||
title_x = x + (cell_width - text_width) // 2 # Center title horizontally
|
||||
title_y = y + cell_height # Position directly beneath the image
|
||||
|
||||
# Draw title text
|
||||
draw.text((title_x, title_y), title, fill="black", font=font)
|
||||
else:
|
||||
# Leave blank space if no image for this cell
|
||||
blank_cell = pilImage.new("RGB", (cell_width, cell_height), bg_color)
|
||||
grid_image.paste(blank_cell, (x, y))
|
||||
|
||||
return grid_image
|
||||
|
||||
|
||||
|
||||
# ## Mols from file
|
||||
start_time = datetime.now()
|
||||
d = pd.read_csv("../data/ch33_inchis.csv", sep = ";", low_memory = False)
|
||||
d = d[pd.notna(d.inchi)].reset_index(drop = True) # make sure there are no missing inchis
|
||||
d = d[d.inchi.apply(lambda x: "." not in x.split("/")[1])].reset_index(drop = True) # make sure we have no complex molecules
|
||||
d.shape # (839063, 1)
|
||||
d = d.sample(frac=1, random_state=4210).reset_index(drop = True) # scramble the df
|
||||
n = d.shape[0] # Here one can put an alternative much smaller n
|
||||
d = d[d.index.isin(range(n))].reset_index(drop = True) # keep only n
|
||||
d["mol"] = d.inchi.apply(lambda x: sdz.standardize_mol(Chem.MolFromInchi(x)))
|
||||
d = d[pd.notna(d.mol)].reset_index(drop = True) # remove entries without molecule object
|
||||
d.shape[0] # n
|
||||
end_time = datetime.now()
|
||||
print('Duration to analyze {} molecules: {}'.format(n, end_time - start_time))
|
||||
|
||||
start_time = datetime.now()
|
||||
d[["img_text","fgs","fgsmi","fgmol"]] = d.apply(lambda x: pd.Series(efgs.get_dec_fgs(x["mol"])), axis = 1)
|
||||
end_time = datetime.now()
|
||||
print('Duration to analyze {} molecules: {}'.format(n, end_time - start_time))
|
||||
|
||||
|
||||
# Generate distribution of FGs
|
||||
fg_dist = d.explode(["fgsmi","fgmol"]).groupby("fgsmi", as_index = False).apply(lambda x:
|
||||
pd.DataFrame({"fgsmi": [x["fgsmi"].iloc[0]],
|
||||
"n": [x.shape[0]],
|
||||
"mol": [x["fgmol"].iloc[0]]})).sort_values("n", ascending = False)
|
||||
|
||||
fg_dist.shape # 5829, 3 (for all ChEMBL comps)
|
||||
fg_dist.to_csv("../results/fg_dist.csv", sep = ";", index = False)
|
||||
# fg_dist = pd.read_csv("../results/fg_dist.csv", sep = ";", index = False) # To recover data
|
||||
|
||||
mols = fg_dist.mol.iloc[:64].tolist()
|
||||
legs = fg_dist.fgsmi.iloc[:64].tolist()
|
||||
|
||||
|
||||
## High-resolution grid with top-48 FGs
|
||||
images = []
|
||||
img_width = 250
|
||||
img_height = 250
|
||||
for m in mols[:48]:
|
||||
Chem.rdDepictor.Compute2DCoords(m)
|
||||
Chem.rdDepictor.StraightenDepiction(m)
|
||||
d2d = rdMolDraw2D.MolDraw2DCairo(img_width,img_height)
|
||||
dopts = d2d.drawOptions()
|
||||
dopts.bondLineWidth = 5
|
||||
dopts.padding = 0.2
|
||||
d2d.DrawMolecule(m)
|
||||
d2d.FinishDrawing()
|
||||
images.append(molimg(d2d.GetDrawingText()))
|
||||
|
||||
fgs_grid = img_grid(images[:48], titles = legs[:48], num_columns = 8, font_size = 28, cell_width = 300, cell_height = 300, bg_color=(255, 255, 255))
|
||||
fgs_grid = fgs_grid.resize((6000, int(6000*fgs_grid.height/fgs_grid.width)))
|
||||
fgs_grid.save("../results/fg_grid.png", dpi = (300,300))
|
||||
|
||||
|
||||
# High-resolution grid with 20 first molecules with colored functional groups
|
||||
images = [molimg(x) for x in d.img_text.iloc[0:20].tolist()]
|
||||
cell_width = images[0].width
|
||||
cell_height = images[0].height
|
||||
molfg_grid = img_grid(images, num_columns = 5, cell_width = cell_width, cell_height = cell_height, bg_color=(255, 255, 255))
|
||||
molfg_grid
|
||||
molfg_grid = molfg_grid.resize((6000, int(6000*molfg_grid.height/molfg_grid.width)))
|
||||
molfg_grid.save("../RESULTS/molfg_grid.png", dpi = (300,300))
|
||||
|
||||
|
||||
# High-resolution grid with 20 Ertl molecules in Figure 1
|
||||
ertmols = [Chem.MolFromSmiles(s) for s in ertl_smis[:20]]
|
||||
images = [molimg(efgs.get_dec_fgs(ertmol)[0]) for ertmol in ertmols]
|
||||
cell_width = images[0].width
|
||||
cell_height = images[0].height
|
||||
molfg_grid = img_grid(images, num_columns = 5, cell_width = cell_width, cell_height = cell_height, bg_color=(255, 255, 255))
|
||||
molfg_grid
|
||||
molfg_grid = molfg_grid.resize((6000, int(6000*molfg_grid.height/molfg_grid.width)))
|
||||
molfg_grid.save("../RESULTS/ertl_molfg_grid.png", dpi = (300,300))
|
||||
Reference in New Issue
Block a user