This commit is contained in:
bbu-imdea
2025-01-09 07:26:53 +01:00
committed by GitHub
parent 5362427d37
commit 1185a4da2b
4 changed files with 836 additions and 0 deletions

20
Contrib/efgs/README.md Normal file
View File

@@ -0,0 +1,20 @@
Source code of EFGs, a complete an accurate implementation of Ertl's functional group (FG) detection algorithm.
For a RDKit molecule, it provides four outputs:
a) a PNG binary string with an image of the molecule with color-highlighted
functional groups;
b) a list of sets of atom indices (idx), each set corresponding to a
functional group;
c) a list of pseudo-SMILES canonicalized strings for the full functional
groups;
d) a list of RDKit labeled mol objects, one for each full functional group
efgs.py: code of get_dec_fgs function and additional auxiliary functions
try_efgs.py: example code to execute get_dec_fgs in Python with a Pandas dataframe
ch33query.sql: sql query used to extract ChEMBL33 bioactive compounds
ChemRxiv preprint for full details on the implementation:
EFGs: A Complete and Accurate Implementation of Ertls Functional Group Detection Algorithm in RDKit
https://chemrxiv.org/engage/chemrxiv/article-details/67543afe7be152b1d03a820c

View File

@@ -0,0 +1,26 @@
select distinct
md.chembl_id chid,
cs.standard_inchi,
md.pref_name molname,
td.chembl_id tchid,
td.pref_name tarname,
pchembl_value pchembl,
organism as organism,
ac.activity_id,
td.target_type
from
ch33.activities ac,
ch33.molecule_dictionary md,
ch33.assays ay,
ch33.target_dictionary td,
ch33.compound_structures cs,
ch33.chembl_id_lookup cl
where
cl.entity_id = cs.molregno and
cl.entity_type = 'COMPOUND' and
ac.molregno = md.molregno and
ac.molregno = cl.entity_id and
ac.assay_id = ay.assay_id and
pchembl_value > 5 and
ac.pchembl_value is not null and td.tid = ay.tid and td.target_type in (
'PROTEIN COMPLEX','SINGLE PROTEIN')

567
Contrib/efgs/efgs.py Normal file
View File

@@ -0,0 +1,567 @@
# -*- coding: utf-8 -*-
"""
Implementation of Ertl's Functional Group Algorithm in RDKit
Ertl, Peter.
An algorithm to identify functional groups in organic molecules
J Cheminform (2017) 9:36; DOI: 10.1186/s13321-017-0225-z
@author: gonzalo.colmenarejo
"""
from rdkit import Chem
import re
from rdkit.Chem.Draw import rdMolDraw2D
def psmi_can(s):
# Generation of canonical pseudosmiles from pseudosmiles
s = s.replace("[R]","[Ge]").replace("[Car]", "[Pb]").replace("[Cal]", "[Sn]").replace("[Oar]", "[Po]").replace("[Nar]", "[Sb]").replace("[Sar]","[Re]")
s = s.replace("[Nar+]", "[Sb+]").replace("[Sar+]","[Re+]").replace("[Sear]","[Bi]").replace("[Tear]","[Tl]").replace("[Oar+]", "[Po+]").replace("[Nar+]", "[Sb+]")
s = s.replace("[Nar-]", "[Sb-]").replace("[Sar+]","[Re+]").replace("[Sar-]","[Re-]").replace("[Sear+]","[Bi+]").replace("[Sear-]","[Bi-]")
mol = Chem.MolFromSmiles(s)
s = Chem.MolToSmiles(mol, canonical = True, isomericSmiles = False)
s = s.replace("[Ge]","[R]").replace("[Pb]", "[Car]").replace("[Sn]", "[Cal]").replace("[Po]", "[Oar]").replace("[Sb]", "[Nar]").replace("[Re]","[Sar]")
s = s.replace("[Sb+]", "[Nar+]").replace("[Re+]","[Sar+]").replace("[Bi]","[Sear]").replace("[Tl]","[Tear]").replace("[Po+]", "[Oar+]").replace("[Sb+]", "[Nar+]")
s = s.replace("[Nar-]", "[Sb-]").replace("[Re+]","[Sar+]").replace("[Re-]","[Sar-]").replace("[Bi+]","[Sear+]").replace("[Bi-]","[Sear-]")
return s
def cval_fix(fragment):
# Fix valences of carbons
for atom in fragment.GetAtoms():
if atom.GetSymbol() == 'C':
charge = atom.GetFormalCharge()
num_bonds = sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
implicit_h = atom.GetNumImplicitHs()
explicit_h = atom.GetNumExplicitHs()
total_valence = num_bonds + implicit_h + explicit_h
if charge == 0:
if total_valence != 4:
explicit_h = 4 - num_bonds - implicit_h
atom.SetNumExplicitHs(int(explicit_h))
else:
explicit_h = 4 - num_bonds - implicit_h + charge
atom.SetNumExplicitHs(int(explicit_h))
atom.UpdatePropertyCache()
return fragment
def merge(mol, marked, aset):
# Merge initially marged atoms into FGs
bset = set()
for idx in aset:
atom = mol.GetAtomWithIdx(idx)
for nbr in atom.GetNeighbors():
jdx = nbr.GetIdx()
if jdx in marked:
marked.remove(jdx)
bset.add(jdx)
if not bset:
return
merge(mol, marked, bset)
aset.update(bset)
def get_fgs(mol):
# generation of nondecorated FGs
het23c_pattern = Chem.MolFromSmarts('A=,#[!#6]')
#het23c_pattern = Chem.MolFromSmarts('[C,c]=,#[!#6]')
c23c_pattern = Chem.MolFromSmarts('C=,#[C,c]')
acetal_pattern = Chem.MolFromSmarts('[CX4](-[O,N,S])-[O,N,S]')
oxirane_pattern = Chem.MolFromSmarts('[O,N,S]1CC1')
marked = set()
for atom in mol.GetAtoms():
if ((atom.GetAtomicNum() not in (6,1)) and (atom.GetIsAromatic() is False)):
marked.add(atom.GetIdx())
for pattern in [het23c_pattern, c23c_pattern, acetal_pattern, oxirane_pattern]:
for path in mol.GetSubstructMatches(pattern):
for atomindex in path:
marked.add(atomindex)
groups = []
while marked:
grp = set([marked.pop()])
merge(mol, marked, grp)
groups.append(grp)
for atom in mol.GetAtoms():
if ((atom.GetAtomicNum() not in (6,1)) and (atom.GetIsAromatic() is True)):
lone_atom = True
for nbr in atom.GetNeighbors():
if (nbr.GetIsAromatic() == False): # Don't merge adjacent aromatic atoms into the same FG!!
for group in groups:
if nbr.GetIdx() in group:
group.add(atom.GetIdx())
lone_atom = False
if (lone_atom):
groups.append({atom.GetIdx()})
return groups
def get_carbonyl_envs(mol, functional_groups):
# Find carbonyl environments
carbonyl_pattern = Chem.MolFromSmarts('C=O')
pats_idx = mol.GetSubstructMatches(carbonyl_pattern) # tupla of sets of matched atoms
carbonyl_envs = []
for group in functional_groups:
carbonyl_env = set()
for pat_idx in pats_idx:
neigh_list = []
if (len(set(pat_idx) & group) > 0): # identify the carbonyl corresponding in the group to that SMARTS match
c_atom = mol.GetAtomWithIdx(pat_idx[0]) # Get the C of the carbonyl
neigh_list = [((pat_idx[0], x.GetIdx()), mol.GetBondBetweenAtoms(pat_idx[0], x.GetIdx()))
for x in c_atom.GetNeighbors() if ((x.GetIdx() not in group) and (x.GetAtomicNum() == 6))]
if (len(neigh_list) > 0):
carbonyl_env.update(neigh_list) #
carbonyl_envs.append(carbonyl_env)
return carbonyl_envs
def single_ns_group(mol, group):
# Find single N, S atom groups
ns_count = 0
het_count = 0
for idx in group:
atom = mol.GetAtomWithIdx(idx)
if ((atom.GetSymbol() == 'N') or (atom.GetSymbol() == 'S')):
ns_count += 1
if atom.GetSymbol() not in ('C', 'H'):
het_count += 1
return ns_count == 1 and het_count == 1
def get_freeval_envs(mol, functional_groups):
# Find environments for heteroatoms with free valences
oh_pattern = Chem.MolFromSmarts('[OX2H]')
n_pattern = Chem.MolFromSmarts('[N;X3;H1,H2]')
sh_pattern = Chem.MolFromSmarts('[SX2H]')
# Identify atoms to preserve (those Hs in OH, NH2, and SH groups)
preserve_hs = set()
for pattern in [oh_pattern, n_pattern, sh_pattern]:
matches = mol.GetSubstructMatches(pattern)
if (len(matches) > 0): # found matches
for m in matches:
group = [g for g in functional_groups if m[0] in g][0]
if ((pattern == oh_pattern) or (single_ns_group(mol, group))): # we apply the oh pattern always but the n or s patterns only with single sn groups
atom = mol.GetAtomWithIdx(m[0]) # Retrieve O, N, or S of the hydroxyl, amine, thiol
for n_atom in atom.GetNeighbors():
if (n_atom.GetAtomicNum() == 1): # Will only preserve if explicit H, if not it remains "silent"
preserve_hs.add(n_atom.GetIdx())
freeval_envs = []
for group in functional_groups:
freeval_env = set()
for idx in group:
atom = mol.GetAtomWithIdx(idx)
if ((atom.GetAtomicNum() not in [6,1]) and (atom.GetIsAromatic() == False)): # heteroatom of the group
neigh_list = [((idx, x.GetIdx()), mol.GetBondBetweenAtoms(idx, x.GetIdx()), x.GetAtomicNum())
for x in atom.GetNeighbors() if ((x.GetIdx() not in group) and (x.GetIdx() not in preserve_hs)
and (x.GetAtomicNum() in (6,1))
and (mol.GetBondBetweenAtoms(idx, x.GetIdx()).GetIsAromatic() == False))]
if (len(neigh_list) > 0):
freeval_env.update(neigh_list)
freeval_envs.append(freeval_env)
return freeval_envs, preserve_hs
def get_singleno_envs(mol, functional_groups):
# Find single N, O functional groups environments (amines vs anilines, alcohols vs phenols)
singleno_envs = []
for group in functional_groups:
singleno_env = set()
if (len(group) == 1) and (mol.GetAtomWithIdx(list(group)[0]).GetAtomicNum() in [7,8]) and not (mol.GetAtomWithIdx(list(group)[0]).GetIsAromatic()):
atom = mol.GetAtomWithIdx(list(group)[0])
if sum([1 for neigh_atom in atom.GetNeighbors() if neigh_atom.GetAtomicNum() == 6]) == 1:
for neigh_atom in atom.GetNeighbors():
if (neigh_atom.GetAtomicNum() == 6) and (mol.GetBondBetweenAtoms(atom.GetIdx(), neigh_atom.GetIdx()).GetBondType() == Chem.rdchem.BondType.SINGLE):
type_carbon = "Car" if neigh_atom.GetIsAromatic() == True else "Cal"
singleno_env.add(((list(group)[0], neigh_atom.GetIdx()),
mol.GetBondBetweenAtoms(list(group)[0], neigh_atom.GetIdx()), type_carbon))
singleno_envs.append(singleno_env)
return singleno_envs
def get_dbond2car(mol, functional_groups):
# Find atoms in functional groups double bonded to aromatic carbons
db2car_pattern = Chem.MolFromSmarts("c=*")
db2car_envs = []
matches = mol.GetSubstructMatches(db2car_pattern)
matches = tuple([x for x in matches if mol.GetBondBetweenAtoms(x[0], x[1]).GetIsAromatic() == False])
for group in functional_groups:
db2car_env = set()
for matchx in matches:
if matchx[0] in group or matchx[1] in group:
db2car_env.add(matchx[0])
db2car_envs.append(db2car_env)
return db2car_envs
def col_mol(mol, functional_groups, rad = 0.5, lw = 2, width = 300, height = 250):
# Color functional groups in molecules like Ertl's paper
cols = {"NO": (1, 0.3, 1, 0.8), # violet
"O": (1, 0.6, 0.6, 0.9), # pink
"N": (0.4, 0.7, 0.9), # blue
"X": (0, 1, 0, 0.9), # green
"har": (1, 0.65, 0, 0.9), # orange
"S(O)": (1, 1, 0.2, 0.9), # yellow
"NOS": (0.7, 0.7, 0, 0.9), # pistacchio
"CC": (0.627, 0.627, 0.627, 0.9), # grey
"P,etc": (0.5, 1, 0.8, 0.9) # cyan
}
ats_hl = {}
bds_hl = {}
rdi_hl = {}
lws_hl = {}
for fg in functional_groups:
ar = any([mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in fg])
els = "".join(sorted(list(set([mol.GetAtomWithIdx(idx).GetSymbol() for idx in fg]))))
if ar == True:
col = [cols["har"]]
elif any(x in els for x in ["F", "Cl", "Br", "I"]):
col = [cols["X"]]
elif any(x in els for x in ["P", "Se", "B", "Si", "As", "Te"]):
col = [cols["P,etc"]]
elif els in ["NO", "CNO"]:
col = [cols["NO"]]
elif els in ["O", "CO"]:
col = [cols["O"]]
elif els in ["N", "CN"]:
col = [cols["N"]]
elif els in ["S","OS","COS", "CS"]:
col = [cols["S(O)"]]
elif els in ["NOS", "CNOS", "NS", "CNS"]:
col = [cols["NOS"]]
elif els == "C":
col = [cols["CC"]]
for idx in fg:
ats_hl[idx] = col
rdi_hl[idx] = rad
if len(fg) > 1:
for bond in mol.GetBonds():
at1_idx = bond.GetBeginAtomIdx()
at2_idx = bond.GetEndAtomIdx()
if at1_idx in fg and at2_idx in fg:
bds_hl[bond.GetIdx()] = col
lws_hl[bond.GetIdx()] = lw
Chem.rdDepictor.Compute2DCoords(mol)
Chem.rdDepictor.StraightenDepiction(mol)
d2d = rdMolDraw2D.MolDraw2DCairo(width,height)
dopts = d2d.drawOptions()
dopts.atomHighlightsAreCircles = False
d2d.DrawMoleculeWithHighlights(mol, "", ats_hl, bds_hl, rdi_hl, lws_hl)
d2d.FinishDrawing()
return d2d.GetDrawingText()
def get_dec_fgs(mol, verbose = False):
# Generate functional groups decorated with carbon environments
for atom in mol.GetAtoms():
atom.SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
for bond in mol.GetBonds():
bond.SetStereo(Chem.rdchem.BondStereo.STEREONONE)
mol_aux = Chem.Mol(mol)
mol = Chem.RemoveHs(mol)
mol = Chem.AddHs(mol)
fgs = get_fgs(mol)
cnes = get_carbonyl_envs(mol, fgs)
fves, pr_hs = get_freeval_envs(mol, fgs)
noes = get_singleno_envs(mol, fgs)
db2cars = get_dbond2car(mol, fgs)
## Dummyze permitted free valence neighboring Hs
for fve in fves:
if (len(fve) > 0):
for fv in fve:
if (fv[2] == 1):
#print("dummy H: " + str(fv[0][1]))
mol.GetAtomWithIdx(fv[0][1]).SetAtomicNum(0)
## Remove explicit atoms
mol = Chem.RemoveHs(mol)
## Update fves accordingly to account for changes in numbers of dummy atoms
for i in range(len(fves)):
fve = list(fves[i])
if (len(fve) > 0):
dummy_paired = []
for j in range(len(fve)):
fv = fve[j]
if (fv[2] == 1): # this entry had a H fv
for neighbor in mol.GetAtomWithIdx(fv[0][0]).GetNeighbors():
if (neighbor.GetAtomicNum() == 0):
dummy_paired.append((fv[0][0], neighbor.GetIdx()))
# dummy_idx = neighbor.GetIdx() ## Assign the updated idx
# dummy_bond = mol.GetBondBetweenAtoms(fv[0][0], dummy_idx) # Update the bond too
# new_fv = ((fv[0][0], dummy_idx), dummy_bond, 0)
# fve[j] = new_fv
dummy_paired = list(set(dummy_paired))
dummy_count = 0
for j in range(len(fve)):
fv = fve[j]
if (fv[2] == 1):
d_pair = dummy_paired[dummy_count]
fve[j] = ((d_pair), mol.GetBondBetweenAtoms(d_pair[0], d_pair[1]), 0)
dummy_count = dummy_count + 1
fves[i] = set(fve) # reassign the possibly updated fve
psmis = []
fg_mols = []
psmi_labs = []
pseudo_smiles = ""
# Loop over functional groups to generate pseudo smiles and pseudo molecules for each of them
for i, group in enumerate(fgs):
neigh_atoms = sorted(list(set([x[0][1] for x in cnes[i]] + [x[0][1] for x in fves[i]] + [x[0][1] for x in noes[i]])))
cne = cnes[i]
fve = fves[i]
noe = noes[i]
db2car = db2cars[i]
R_idxs = sorted(list(set(neigh_atoms + [x for x in db2car])))
psmi_labs = [(list(x)[0][1], list(x)[2]) for x in noe if len(noe) > 0] + [(x, "Car") for x in db2car if len(db2car) > 0]
if (len(neigh_atoms) > 0): # There is decoration applicable
## Get the bonds of the group
bonds_idx = set()
for j in range(mol.GetNumBonds()):
bond = mol.GetBondWithIdx(j)
begin_aid = bond.GetBeginAtomIdx()
end_aid = bond.GetEndAtomIdx()
if ((begin_aid in group) and (end_aid in group)):
bonds_idx.add(bond.GetIdx())
## Add bonds to carbonyl carbons
if(len(cne) > 0):
for env in cne:
bond = env[1]
bonds_idx.add(bond.GetIdx())
## Add bonds to free valence atoms
if(len(fve) > 0):
for env in fve:
bond = env[1]
bonds_idx.add(bond.GetIdx())
## Add bonds to single n, o atoms
if(len(noe) > 0):
for env in noe:
bond = env[1]
bonds_idx.add(bond.GetIdx())
# Get the fragment as a new molecule from bonds in the FG + environment atoms
amap = {}
fragment = Chem.PathToSubmol(mol, list(bonds_idx), atomMap = amap)
# Create a copy of the fragment to modify
fragment_with_r_groups = Chem.RWMol(fragment)
# Replace atoms at the specified positions with R group labels
j = 0
for idx in R_idxs:
label = "[*:" + str(j+1) + "]"
# Create a dummy atom with the R group label
r_group_atom = Chem.MolFromSmiles(label).GetAtomWithIdx(0)
r_group_atom.SetAtomMapNum(int(label[-2])) # Set the map number to match the R group number
# Replace the atom at the specified position with the R group atom
fragment_with_r_groups.ReplaceAtom(amap[R_idxs[j]], r_group_atom)
j = j + 1
# Convert the modified fragment back to a Mol object
fragment_with_r_groups = fragment_with_r_groups.GetMol()
## Fix C valences
fragment_with_r_groups = cval_fix(fragment_with_r_groups)
# Generate the canonical pseudo-SMILES
pseudo_smiles = Chem.MolToSmiles(fragment_with_r_groups, canonical=True, isomericSmiles = False)
# Generate the labeled molecule
fragment_with_r_groups = Chem.RWMol(fragment_with_r_groups)
for atom in fragment_with_r_groups.GetAtoms():
idx = atom.GetIdx() # idx of atom in fragment
map_idx = list(amap.keys())[list(amap.values()).index(idx)] # idx of atom in original molecule
if map_idx in neigh_atoms: # Check if that position is an environment atom in original molecule
if (map_idx in [x[0][1] for x in list(noe)]):
type_carbon = [x[2] for x in list(noe) if x[0][1] == map_idx][0]
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', type_carbon)
else:
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel','R')
else:
if atom.GetIsAromatic():
if atom.GetAtomicNum() == 6:
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Car")
if atom.GetAtomicNum() == 7:
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Nar")
if atom.GetAtomicNum() == 8:
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Oar")
if atom.GetAtomicNum() == 16:
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Sar")
if map_idx in db2car: # check db2caromatics
fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Car")
fragment_with_r_groups = fragment_with_r_groups.GetMol()
else: # No neighbors
if(len(group) == 1): # Single-atommed FG
atom = mol.GetAtomWithIdx(list(group)[0])
charge = atom.GetFormalCharge()
charge_str = "+" if charge == 1 else "-" if charge == -1 else ""
fragment = Chem.RWMol() ## Create emtpy edtiable molecule
fragment.AddAtom(Chem.Atom('*')) ## Add dummy atom to molecule
if atom.GetIsAromatic():
if atom.GetAtomicNum() == 7:
pseudo_smiles = "n" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Nar" + charge_str)
if atom.GetAtomicNum() == 8:
pseudo_smiles = "o" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Oar" + charge_str)
if atom.GetAtomicNum() == 16:
pseudo_smiles = "s" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Sar" + charge_str)
if atom.GetAtomicNum() == 15:
pseudo_smiles = "p" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Par" + charge_str)
if atom.GetAtomicNum() == 34:
pseudo_smiles = "se" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Sear" + charge_str)
if atom.GetAtomicNum() == 52:
pseudo_smiles = "te" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Tear" + charge_str)
else:
if atom.GetAtomicNum() == 7:
pseudo_smiles = "N" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "N" + charge_str)
if atom.GetAtomicNum() == 8:
pseudo_smiles = "O" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "O" + charge_str)
if atom.GetAtomicNum() == 16:
pseudo_smiles = "S" + charge_str
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "S" + charge_str)
if atom.GetAtomicNum() == 15:
pseudo_smiles = "P" + charge_str + "]"
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "P" + charge_str)
if atom.GetAtomicNum() == 34:
pseudo_smiles = "Se" + charge_str + "]"
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Se" + charge_str)
if atom.GetAtomicNum() == 52:
pseudo_smiles = "Te" + charge_str + "]"
fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Te" + charge_str)
fragment = fragment.GetMol()
else: # Multi-atommed FG
bonds_idx = []
for j in range(mol.GetNumBonds()):
bond = mol.GetBondWithIdx(j)
begin_aid = bond.GetBeginAtomIdx()
end_aid = bond.GetEndAtomIdx()
if begin_aid in group and end_aid in group:
bonds_idx.append(j)
# Get the fragment as a new molecule
amap = {}
fragment = Chem.PathToSubmol(mol, bonds_idx, atomMap = amap)
fragment = cval_fix(fragment)
pseudo_smiles = Chem.MolToSmiles(fragment, canonical=True, isomericSmiles = False)
for atom in fragment.GetAtoms():
idx = atom.GetIdx()
charge = atom.GetFormalCharge()
charge_str = "\u207A" if charge == 1 else "\u207B" if charge == -1 else ""
if atom.GetIsAromatic():
if atom.GetAtomicNum() == 6:
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Car" + charge_str)
if atom.GetAtomicNum() == 7:
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Nar" + charge_str)
if atom.GetAtomicNum() == 8:
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Oar" + charge_str)
if atom.GetAtomicNum() == 16:
fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Sar" + charge_str)
# Parse the pseudo-smiles; first the Cars and Cals
if (len(psmi_labs) > 0):
for psmi_lab in list(psmi_labs):
pseudo_smiles = pseudo_smiles.replace("[*:" + str(R_idxs.index(psmi_lab[0]) + 1) + "]", "[" + psmi_lab[1] + "]")
# Define the pattern to match
pattern = r'\[\*:\d+\]'
# Define the replacement string
replacement = "[R]"
# Replace [R] groups
pseudo_smiles = re.sub(pattern, replacement, pseudo_smiles)
# Replace ns
pseudo_smiles = pseudo_smiles.replace("[n+]", "[Nar+]")
pseudo_smiles = pseudo_smiles.replace("[n-]", "[Nar-]")
pseudo_smiles = pseudo_smiles.replace("[n+]", "[Nar+]")
pseudo_smiles = pseudo_smiles.replace("[n-]", "[Nar-]")
pseudo_smiles = re.sub(r'(?<![MNI])n\+', '[Nar+]', pseudo_smiles)
pseudo_smiles = re.sub(r'(?<![MNI])n\-', '[Nar-]', pseudo_smiles)
pseudo_smiles = re.sub(r'(?<![MZI])n', '[Nar]', pseudo_smiles)
# Replace cs, ss, os, ps, ses, tes
pseudo_smiles = pseudo_smiles.replace("[se+]", "[Sear+]")
pseudo_smiles = pseudo_smiles.replace("[se-]", "[Sear-]")
pseudo_smiles = pseudo_smiles.replace("se+", "[Sear+]")
pseudo_smiles = pseudo_smiles.replace("se-", "[Sear-]")
pseudo_smiles = pseudo_smiles.replace("se", "[Sear]")
pseudo_smiles = pseudo_smiles.replace("[te+]", "[Tear+]")
pseudo_smiles = pseudo_smiles.replace("[te-]", "[Tear-]")
pseudo_smiles = pseudo_smiles.replace("te", "[Tear]")
pseudo_smiles = pseudo_smiles.replace("[c+]", "[Car+]")
pseudo_smiles = pseudo_smiles.replace("[c-]", "[Car-]")
pseudo_smiles = pseudo_smiles.replace("c", "[Car]")
pseudo_smiles = pseudo_smiles.replace("[s+]", "[Sar+]")
pseudo_smiles = pseudo_smiles.replace("[s-]", "[Sar-]")
pseudo_smiles = re.sub(r"(?<!A)(s\+)", "[Sar+]", pseudo_smiles) # To avoid issues with As
pseudo_smiles = re.sub(r"(?<!A)(s-)", "[Sar-]", pseudo_smiles) # To avoid issues with As
pseudo_smiles = re.sub(r"(?<!A)s", "[Sar]", pseudo_smiles) # To avoid issues with As
pseudo_smiles = pseudo_smiles.replace("o+", "[Oar+]")
pseudo_smiles = pseudo_smiles.replace("o-", "[Oar-]")
pseudo_smiles = pseudo_smiles.replace("o", "[Oar]")
pseudo_smiles = pseudo_smiles.replace("[o]", "[Oar]")
pseudo_smiles = pseudo_smiles.replace("[o+]", "[Oar+]")
pseudo_smiles = pseudo_smiles.replace("[o-]", "[Oar-]")
pseudo_smiles = pseudo_smiles.replace("[p+]", "[Par+]")
pseudo_smiles = pseudo_smiles.replace("[p-]", "[Par-]")
pseudo_smiles = pseudo_smiles.replace("p", "[Par]")
# Final canonicalization
pseudo_smiles = psmi_can(pseudo_smiles)
psmis.append(pseudo_smiles)
if (len(neigh_atoms) == 0):
fg_mols.append(fragment)
else:
fg_mols.append(fragment_with_r_groups)
if not ("H" in pseudo_smiles):
if (verbose):
print("Canonical Pseudo-SMILES:", pseudo_smiles)
img_text = col_mol(mol_aux, fgs)
return img_text, fgs, psmis, fg_mols

223
Contrib/efgs/try_efgs.py Normal file
View File

@@ -0,0 +1,223 @@
# -*- coding: utf-8 -*-
"""
Implementation of Ertl's Functional Group Algorithm in RDKit
Ertl, Peter.
An algorithm to identify functional groups in organic molecules
J Cheminform (2017) 9:36; DOI: 10.1186/s13321-017-0225-z
@author: gonzalo.colmenarejo
"""
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from chembl_structure_pipeline import standardizer as sdz
from datetime import datetime
from IPython.display import Image
from PIL import Image as pilImage
from PIL import ImageDraw, ImageFont
from rdkit.Chem.Draw import rdMolDraw2D
import efgs
import pandas as pd
import io
ertl_smis = [
'Cc1nc(NS(=O)(=O)c2ccc(N)cc2)nc(C)c1',
'NC(=N)c1ccc(C=Cc2ccc(cc2O)C(=N)N)cc1',
'CC(=O)Nc1nnc(s1)S(=O)(=O)N',
'NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl',
'CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3',
'Cc1onc(c1C(=O)NC2C3SC(C)(C)C(N3C2=O)C(=O)O)c4ccccc4',
'Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O',
'COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N',
'Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl',
'Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1',
'FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13',
'OCC1OC(CC1O)n2cnc3C(O)CNC=Nc32',
'CCNC1CC(C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N',
'CC(O)C1C2C(C)C(=C(N2C1=O)C(=O)O)SC3CNC(C3)C(=O)N(C)C',
'CC1CN(CC(C)N1)c2c(F)c(N)c3c(=O)c(cn(C4CC4)c3c2F)C(=O)O',
'CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C',
'Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl',
'CC(=O)NC1C(NC(=N)N)C=C(OC1C(O)C(O)CO)C(=O)O',
'CC(O)C(O)C1CNc2nc(N)nc(O)c2N1',
'NC1CCCCN(C1)c2c(Cl)cc3c(=O)c(cn(C4CC4)c3c2Cl)C(=O)O',
]
def mindex(mol):
# Utility function to draw molecule with idx
for atom in mol.GetAtoms():
try:
atom.SetAtomMapNum(atom.GetIdx())
except:
None
return mol
def demindex(mol):
# Utility function to return mindex-ed molecule to normal
for atom in mol.GetAtoms():
atom.ClearProp('molAtomMapNumber')
return mol
def molimg(x):
# Utility function to draw image of labeled mol from PNG binary string
try:
return pilImage.open(io.BytesIO(x))
except:
return None
def img_grid(images, titles = None, num_columns = 5,font = "arial.ttf", font_size = 18, title_height = 30, cell_width = 300, cell_height = 300, bg_color = (255, 255, 255)):
"""
Creates a grid of images with optional titles under each image.
Parameters:
- images (list): List of PIL Image objects.
- titles (list, optional): List of titles as strings for each image. If None, no titles will be added.
- num_columns (int): Number of columns in the grid.
- font_sizes: Font size of titles (if provided).
- title_height: Height of titles (if provieded).
- cell_width (int): Width of each image cell.
- cell_height (int): Height of each image cell (not including title space).
- bg_color (tuple): Background color as an RGB tuple.
Returns:
- PIL Image object containing the image grid.
"""
# Set title height if titles are provided; otherwise, set to zero
title_height = title_height if titles else 0
cell_height_with_title = cell_height + title_height + 2
# Calculate grid dimensions
num_images = len(images)
num_rows = (num_images + num_columns - 1) // num_columns # Round up for last row
# Create a blank background image for the grid
grid_width = num_columns * cell_width
grid_height = num_rows * cell_height_with_title
grid_image = pilImage.new("RGB", (grid_width, grid_height), bg_color)
# Set up font for titles if needed
if titles:
try:
font = ImageFont.truetype(font, font_size) # Change font and size as desired
except IOError:
font = ImageFont.load_default()
# Iterate through each cell in the grid
for i in range(num_rows * num_columns):
row = i // num_columns
col = i % num_columns
x = col * cell_width
y = row * cell_height_with_title
if i < num_images:
# Resize the image to fit in the cell
img_resized = images[i].resize((cell_width, cell_height))
grid_image.paste(img_resized, (x, y))
# Draw title if provided
if titles and i < len(titles):
draw = ImageDraw.Draw(grid_image)
title = titles[i]
# Calculate title position and center it
text_bbox = draw.textbbox((0, 0), title, font=font)
text_width = text_bbox[2] - text_bbox[0]
title_x = x + (cell_width - text_width) // 2 # Center title horizontally
title_y = y + cell_height # Position directly beneath the image
# Draw title text
draw.text((title_x, title_y), title, fill="black", font=font)
else:
# Leave blank space if no image for this cell
blank_cell = pilImage.new("RGB", (cell_width, cell_height), bg_color)
grid_image.paste(blank_cell, (x, y))
return grid_image
# ## Mols from file
start_time = datetime.now()
d = pd.read_csv("../data/ch33_inchis.csv", sep = ";", low_memory = False)
d = d[pd.notna(d.inchi)].reset_index(drop = True) # make sure there are no missing inchis
d = d[d.inchi.apply(lambda x: "." not in x.split("/")[1])].reset_index(drop = True) # make sure we have no complex molecules
d.shape # (839063, 1)
d = d.sample(frac=1, random_state=4210).reset_index(drop = True) # scramble the df
n = d.shape[0] # Here one can put an alternative much smaller n
d = d[d.index.isin(range(n))].reset_index(drop = True) # keep only n
d["mol"] = d.inchi.apply(lambda x: sdz.standardize_mol(Chem.MolFromInchi(x)))
d = d[pd.notna(d.mol)].reset_index(drop = True) # remove entries without molecule object
d.shape[0] # n
end_time = datetime.now()
print('Duration to analyze {} molecules: {}'.format(n, end_time - start_time))
start_time = datetime.now()
d[["img_text","fgs","fgsmi","fgmol"]] = d.apply(lambda x: pd.Series(efgs.get_dec_fgs(x["mol"])), axis = 1)
end_time = datetime.now()
print('Duration to analyze {} molecules: {}'.format(n, end_time - start_time))
# Generate distribution of FGs
fg_dist = d.explode(["fgsmi","fgmol"]).groupby("fgsmi", as_index = False).apply(lambda x:
pd.DataFrame({"fgsmi": [x["fgsmi"].iloc[0]],
"n": [x.shape[0]],
"mol": [x["fgmol"].iloc[0]]})).sort_values("n", ascending = False)
fg_dist.shape # 5829, 3 (for all ChEMBL comps)
fg_dist.to_csv("../results/fg_dist.csv", sep = ";", index = False)
# fg_dist = pd.read_csv("../results/fg_dist.csv", sep = ";", index = False) # To recover data
mols = fg_dist.mol.iloc[:64].tolist()
legs = fg_dist.fgsmi.iloc[:64].tolist()
## High-resolution grid with top-48 FGs
images = []
img_width = 250
img_height = 250
for m in mols[:48]:
Chem.rdDepictor.Compute2DCoords(m)
Chem.rdDepictor.StraightenDepiction(m)
d2d = rdMolDraw2D.MolDraw2DCairo(img_width,img_height)
dopts = d2d.drawOptions()
dopts.bondLineWidth = 5
dopts.padding = 0.2
d2d.DrawMolecule(m)
d2d.FinishDrawing()
images.append(molimg(d2d.GetDrawingText()))
fgs_grid = img_grid(images[:48], titles = legs[:48], num_columns = 8, font_size = 28, cell_width = 300, cell_height = 300, bg_color=(255, 255, 255))
fgs_grid = fgs_grid.resize((6000, int(6000*fgs_grid.height/fgs_grid.width)))
fgs_grid.save("../results/fg_grid.png", dpi = (300,300))
# High-resolution grid with 20 first molecules with colored functional groups
images = [molimg(x) for x in d.img_text.iloc[0:20].tolist()]
cell_width = images[0].width
cell_height = images[0].height
molfg_grid = img_grid(images, num_columns = 5, cell_width = cell_width, cell_height = cell_height, bg_color=(255, 255, 255))
molfg_grid
molfg_grid = molfg_grid.resize((6000, int(6000*molfg_grid.height/molfg_grid.width)))
molfg_grid.save("../RESULTS/molfg_grid.png", dpi = (300,300))
# High-resolution grid with 20 Ertl molecules in Figure 1
ertmols = [Chem.MolFromSmiles(s) for s in ertl_smis[:20]]
images = [molimg(efgs.get_dec_fgs(ertmol)[0]) for ertmol in ertmols]
cell_width = images[0].width
cell_height = images[0].height
molfg_grid = img_grid(images, num_columns = 5, cell_width = cell_width, cell_height = cell_height, bg_color=(255, 255, 255))
molfg_grid
molfg_grid = molfg_grid.resize((6000, int(6000*molfg_grid.height/molfg_grid.width)))
molfg_grid.save("../RESULTS/ertl_molfg_grid.png", dpi = (300,300))