efgs code (#8145)

2026-06-03 21:44:30 +08:00 · 2025-01-09 07:26:53 +01:00
parent 5362427d37
commit 1185a4da2b
4 changed files with 836 additions and 0 deletions
--- a/Contrib/efgs/README.md
+++ b/Contrib/efgs/README.md
@@ -0,0 +1,20 @@
+Source code of EFGs, a complete an accurate implementation of Ertl's functional group (FG) detection algorithm.
+
+For a RDKit molecule, it provides four outputs: 
+a) a PNG binary string with an image of the molecule with color-highlighted
+functional groups; 
+b) a list of sets of atom indices (idx), each set corresponding to a
+functional group; 
+c) a list of pseudo-SMILES canonicalized strings for the full functional
+groups; 
+d) a list of RDKit labeled mol objects, one for each full functional group 
+
+efgs.py: code of get_dec_fgs function and additional auxiliary functions
+
+try_efgs.py: example code to execute get_dec_fgs in Python with a Pandas dataframe
+
+ch33query.sql: sql query used to extract ChEMBL33 bioactive compounds
+
+ChemRxiv preprint for full details on the implementation: 
+EFGs: A Complete and Accurate Implementation of Ertl’s Functional Group Detection Algorithm in RDKit
+https://chemrxiv.org/engage/chemrxiv/article-details/67543afe7be152b1d03a820c
--- a/Contrib/efgs/ch33query.sql
+++ b/Contrib/efgs/ch33query.sql
@@ -0,0 +1,26 @@
+select distinct
+    md.chembl_id chid,
+        cs.standard_inchi,
+        md.pref_name molname,
+        td.chembl_id tchid,
+		td.pref_name tarname,
+        pchembl_value pchembl,
+        organism as organism,
+		ac.activity_id,
+        td.target_type
+        from
+        ch33.activities ac,
+        ch33.molecule_dictionary md,
+        ch33.assays ay,
+        ch33.target_dictionary td,
+        ch33.compound_structures cs,
+        ch33.chembl_id_lookup cl
+        where
+        cl.entity_id = cs.molregno and
+        cl.entity_type = 'COMPOUND' and
+        ac.molregno = md.molregno and
+        ac.molregno = cl.entity_id and
+        ac.assay_id = ay.assay_id and
+        pchembl_value > 5 and
+        ac.pchembl_value is not null and td.tid = ay.tid and td.target_type in (
+	'PROTEIN COMPLEX','SINGLE PROTEIN')
--- a/Contrib/efgs/efgs.py
+++ b/Contrib/efgs/efgs.py
@@ -0,0 +1,567 @@
+# -*- coding: utf-8 -*-
+"""
+Implementation of Ertl's Functional Group Algorithm in RDKit
+
+Ertl, Peter. 
+
+An algorithm to identify functional groups in organic molecules
+
+J Cheminform (2017) 9:36; DOI: 10.1186/s13321-017-0225-z
+
+@author: gonzalo.colmenarejo
+"""
+
+
+from rdkit import Chem
+import re
+from rdkit.Chem.Draw import rdMolDraw2D
+
+    
+def psmi_can(s):
+    # Generation of canonical pseudosmiles from pseudosmiles
+    s = s.replace("[R]","[Ge]").replace("[Car]", "[Pb]").replace("[Cal]", "[Sn]").replace("[Oar]", "[Po]").replace("[Nar]", "[Sb]").replace("[Sar]","[Re]")
+    s = s.replace("[Nar+]", "[Sb+]").replace("[Sar+]","[Re+]").replace("[Sear]","[Bi]").replace("[Tear]","[Tl]").replace("[Oar+]", "[Po+]").replace("[Nar+]", "[Sb+]")
+    s = s.replace("[Nar-]", "[Sb-]").replace("[Sar+]","[Re+]").replace("[Sar-]","[Re-]").replace("[Sear+]","[Bi+]").replace("[Sear-]","[Bi-]")
+    mol = Chem.MolFromSmiles(s)
+    s = Chem.MolToSmiles(mol, canonical = True, isomericSmiles = False)
+    s = s.replace("[Ge]","[R]").replace("[Pb]", "[Car]").replace("[Sn]", "[Cal]").replace("[Po]", "[Oar]").replace("[Sb]", "[Nar]").replace("[Re]","[Sar]")
+    s = s.replace("[Sb+]", "[Nar+]").replace("[Re+]","[Sar+]").replace("[Bi]","[Sear]").replace("[Tl]","[Tear]").replace("[Po+]", "[Oar+]").replace("[Sb+]", "[Nar+]")
+    s = s.replace("[Nar-]", "[Sb-]").replace("[Re+]","[Sar+]").replace("[Re-]","[Sar-]").replace("[Bi+]","[Sear+]").replace("[Bi-]","[Sear-]")
+    return s
+
+
+def cval_fix(fragment):
+    # Fix valences of carbons
+    for atom in fragment.GetAtoms():
+        if atom.GetSymbol() == 'C':
+            charge = atom.GetFormalCharge()
+            num_bonds = sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
+            implicit_h = atom.GetNumImplicitHs()
+            explicit_h = atom.GetNumExplicitHs()
+            total_valence = num_bonds + implicit_h + explicit_h
+            if charge == 0:
+                if total_valence != 4:
+                    explicit_h = 4 - num_bonds - implicit_h
+                    atom.SetNumExplicitHs(int(explicit_h))
+            else:
+                explicit_h = 4 - num_bonds - implicit_h + charge
+                atom.SetNumExplicitHs(int(explicit_h))
+            atom.UpdatePropertyCache()
+    return fragment
+
+
+def merge(mol, marked, aset):
+    # Merge initially marged atoms into FGs
+    bset = set()
+    for idx in aset:
+        atom = mol.GetAtomWithIdx(idx)
+        for nbr in atom.GetNeighbors():
+            jdx = nbr.GetIdx()
+            if jdx in marked:
+                marked.remove(jdx)
+                bset.add(jdx)
+    if not bset:
+        return
+    merge(mol, marked, bset)
+    aset.update(bset)
+
+
+def get_fgs(mol):
+    # generation of nondecorated FGs  
+    het23c_pattern = Chem.MolFromSmarts('A=,#[!#6]')
+    #het23c_pattern = Chem.MolFromSmarts('[C,c]=,#[!#6]')
+    c23c_pattern = Chem.MolFromSmarts('C=,#[C,c]')
+    acetal_pattern = Chem.MolFromSmarts('[CX4](-[O,N,S])-[O,N,S]')
+    oxirane_pattern = Chem.MolFromSmarts('[O,N,S]1CC1')
+    
+    marked = set()
+    for atom in mol.GetAtoms():
+        if ((atom.GetAtomicNum() not in (6,1)) and (atom.GetIsAromatic() is False)):
+            marked.add(atom.GetIdx())
+
+    for pattern in [het23c_pattern, c23c_pattern, acetal_pattern, oxirane_pattern]:
+        for path in mol.GetSubstructMatches(pattern):
+            for atomindex in path:
+                marked.add(atomindex)
+                
+    groups = []
+    while marked:
+        grp = set([marked.pop()])
+        merge(mol, marked, grp)
+        groups.append(grp)
+        
+    for atom in mol.GetAtoms():
+        if ((atom.GetAtomicNum() not in (6,1)) and (atom.GetIsAromatic() is True)): 
+            lone_atom = True
+            for nbr in atom.GetNeighbors():
+                if (nbr.GetIsAromatic() == False): # Don't merge adjacent aromatic atoms into the same FG!!
+                    for group in groups:
+                        if nbr.GetIdx() in group:
+                            group.add(atom.GetIdx())
+                            lone_atom = False
+            if (lone_atom):
+                groups.append({atom.GetIdx()})
+    return groups
+
+
+def get_carbonyl_envs(mol, functional_groups):
+    # Find carbonyl environments
+    carbonyl_pattern = Chem.MolFromSmarts('C=O')
+    pats_idx = mol.GetSubstructMatches(carbonyl_pattern) # tupla of sets of matched atoms
+    carbonyl_envs = []
+    for group in functional_groups:
+        carbonyl_env = set()
+        for pat_idx in pats_idx:
+            neigh_list = []
+            if (len(set(pat_idx) & group) > 0): # identify the carbonyl corresponding in the group to that SMARTS match
+                c_atom = mol.GetAtomWithIdx(pat_idx[0]) # Get the C of the carbonyl
+                neigh_list = [((pat_idx[0], x.GetIdx()), mol.GetBondBetweenAtoms(pat_idx[0], x.GetIdx())) 
+                                      for x in c_atom.GetNeighbors() if ((x.GetIdx() not in group) and (x.GetAtomicNum() == 6))]
+            if (len(neigh_list) > 0):
+                carbonyl_env.update(neigh_list) # 
+        carbonyl_envs.append(carbonyl_env)
+    return carbonyl_envs
+
+
+def single_ns_group(mol, group):
+    # Find single N, S atom groups
+    ns_count = 0
+    het_count = 0
+
+    for idx in group:
+        atom = mol.GetAtomWithIdx(idx)
+        if ((atom.GetSymbol() == 'N') or (atom.GetSymbol() == 'S')):
+            ns_count += 1
+        if atom.GetSymbol() not in ('C', 'H'):
+            het_count += 1
+
+    return ns_count == 1 and het_count == 1    
+
+
+def get_freeval_envs(mol, functional_groups):
+    # Find environments for heteroatoms with free valences
+    oh_pattern = Chem.MolFromSmarts('[OX2H]')
+    n_pattern = Chem.MolFromSmarts('[N;X3;H1,H2]')
+    sh_pattern = Chem.MolFromSmarts('[SX2H]')
+    
+    # Identify atoms to preserve (those Hs in OH, NH2, and SH groups)
+    preserve_hs = set()
+    for pattern in [oh_pattern, n_pattern, sh_pattern]:
+        matches = mol.GetSubstructMatches(pattern)
+        if (len(matches) > 0): # found matches
+            for m in matches:
+                group = [g for g in functional_groups if m[0] in g][0]
+                if ((pattern == oh_pattern) or (single_ns_group(mol, group))): # we apply the oh pattern always but the n or s patterns only with single sn groups
+                    atom = mol.GetAtomWithIdx(m[0]) # Retrieve O, N, or S of the hydroxyl, amine, thiol
+                    for n_atom in atom.GetNeighbors():
+                        if (n_atom.GetAtomicNum() == 1): # Will only preserve if explicit H, if not it remains "silent"
+                            preserve_hs.add(n_atom.GetIdx())           
+    freeval_envs = []
+    for group in functional_groups:
+        freeval_env = set()
+        for idx in group:
+            atom = mol.GetAtomWithIdx(idx)
+            if ((atom.GetAtomicNum() not in [6,1]) and (atom.GetIsAromatic() == False)): # heteroatom of the group
+                neigh_list = [((idx, x.GetIdx()), mol.GetBondBetweenAtoms(idx, x.GetIdx()), x.GetAtomicNum())
+                               for x in atom.GetNeighbors() if ((x.GetIdx() not in group) and (x.GetIdx() not in preserve_hs)
+                               and (x.GetAtomicNum() in (6,1)) 
+                               and (mol.GetBondBetweenAtoms(idx, x.GetIdx()).GetIsAromatic() == False))]
+                if (len(neigh_list) > 0):
+                        freeval_env.update(neigh_list)
+        freeval_envs.append(freeval_env)
+    return freeval_envs, preserve_hs
+
+
+def get_singleno_envs(mol, functional_groups):
+    # Find single N, O functional groups environments (amines vs anilines, alcohols vs phenols)
+    singleno_envs = []
+    for group in functional_groups:
+        singleno_env = set()
+        if (len(group) == 1) and (mol.GetAtomWithIdx(list(group)[0]).GetAtomicNum() in [7,8]) and not (mol.GetAtomWithIdx(list(group)[0]).GetIsAromatic()): 
+            atom = mol.GetAtomWithIdx(list(group)[0])
+            if sum([1 for neigh_atom in atom.GetNeighbors() if neigh_atom.GetAtomicNum() == 6]) == 1:
+                for neigh_atom in atom.GetNeighbors():
+                    if (neigh_atom.GetAtomicNum() == 6) and (mol.GetBondBetweenAtoms(atom.GetIdx(), neigh_atom.GetIdx()).GetBondType() == Chem.rdchem.BondType.SINGLE):
+                        type_carbon = "Car" if neigh_atom.GetIsAromatic() == True else "Cal"
+                        singleno_env.add(((list(group)[0], neigh_atom.GetIdx()),  
+                                        mol.GetBondBetweenAtoms(list(group)[0], neigh_atom.GetIdx()), type_carbon))
+        singleno_envs.append(singleno_env)
+    return singleno_envs
+
+
+def get_dbond2car(mol, functional_groups):
+    # Find atoms in functional groups double bonded to aromatic carbons
+    db2car_pattern = Chem.MolFromSmarts("c=*")
+    db2car_envs = []
+    matches = mol.GetSubstructMatches(db2car_pattern)
+    matches = tuple([x for x in matches if mol.GetBondBetweenAtoms(x[0], x[1]).GetIsAromatic() == False])
+    for group in functional_groups:
+        db2car_env = set()
+        for matchx in matches:
+            if matchx[0] in group or matchx[1] in group:
+                db2car_env.add(matchx[0])
+        db2car_envs.append(db2car_env)
+    return db2car_envs
+
+
+def col_mol(mol, functional_groups, rad = 0.5, lw = 2, width = 300, height = 250):
+    # Color functional groups in molecules like Ertl's paper
+    cols = {"NO": (1, 0.3, 1, 0.8), # violet
+            "O": (1, 0.6, 0.6, 0.9), # pink
+            "N": (0.4, 0.7, 0.9), # blue
+            "X": (0, 1, 0, 0.9), # green
+            "har": (1, 0.65, 0, 0.9), # orange
+            "S(O)": (1, 1, 0.2, 0.9), # yellow
+            "NOS": (0.7, 0.7, 0, 0.9), # pistacchio
+            "CC": (0.627, 0.627, 0.627, 0.9), # grey
+            "P,etc": (0.5, 1, 0.8, 0.9) # cyan
+            }
+    
+    ats_hl = {}
+    bds_hl = {}
+    rdi_hl = {}
+    lws_hl = {}
+    
+    for fg in functional_groups:
+        ar = any([mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in fg])
+        els = "".join(sorted(list(set([mol.GetAtomWithIdx(idx).GetSymbol() for idx in fg]))))
+        
+        if ar == True:
+            col = [cols["har"]]
+        elif any(x in els for x in ["F", "Cl", "Br", "I"]):
+            col = [cols["X"]]
+        elif any(x in els for x in ["P", "Se", "B", "Si", "As", "Te"]):
+            col = [cols["P,etc"]]
+        elif els in ["NO", "CNO"]:
+            col = [cols["NO"]]
+        elif els in ["O", "CO"]:
+            col = [cols["O"]]
+        elif els in ["N", "CN"]:
+            col = [cols["N"]]
+        elif els in ["S","OS","COS", "CS"]:
+            col = [cols["S(O)"]]
+        elif els in ["NOS", "CNOS", "NS", "CNS"]:
+            col = [cols["NOS"]]
+        elif els == "C":
+            col = [cols["CC"]]
+            
+        for idx in fg:
+            ats_hl[idx] = col
+            rdi_hl[idx] = rad
+            
+        if len(fg) > 1:
+            for bond in mol.GetBonds():
+                at1_idx = bond.GetBeginAtomIdx()
+                at2_idx = bond.GetEndAtomIdx()
+                if at1_idx in fg and at2_idx in fg:
+                    bds_hl[bond.GetIdx()] = col
+                    lws_hl[bond.GetIdx()] = lw
+
+    Chem.rdDepictor.Compute2DCoords(mol)  
+    Chem.rdDepictor.StraightenDepiction(mol)
+    d2d = rdMolDraw2D.MolDraw2DCairo(width,height)
+    dopts = d2d.drawOptions()
+    dopts.atomHighlightsAreCircles = False
+    d2d.DrawMoleculeWithHighlights(mol, "", ats_hl, bds_hl, rdi_hl, lws_hl)
+    d2d.FinishDrawing()
+    return d2d.GetDrawingText()
+
+
+    
+                
+def get_dec_fgs(mol, verbose = False):
+    # Generate functional groups decorated with carbon environments
+    for atom in mol.GetAtoms():
+        atom.SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
+    for bond in mol.GetBonds():
+        bond.SetStereo(Chem.rdchem.BondStereo.STEREONONE)
+    mol_aux = Chem.Mol(mol)
+    mol = Chem.RemoveHs(mol)
+    mol = Chem.AddHs(mol)
+    fgs = get_fgs(mol)
+    cnes = get_carbonyl_envs(mol, fgs)
+    fves, pr_hs = get_freeval_envs(mol, fgs)
+    noes = get_singleno_envs(mol, fgs)
+    db2cars = get_dbond2car(mol, fgs)
+    
+    ## Dummyze permitted free valence neighboring Hs
+    for fve in fves:
+        if (len(fve) > 0):
+            for fv in fve:
+                if (fv[2] == 1):
+                    #print("dummy H: " + str(fv[0][1]))
+                    mol.GetAtomWithIdx(fv[0][1]).SetAtomicNum(0)
+                
+    ## Remove explicit atoms
+    mol = Chem.RemoveHs(mol)   
+   
+    ## Update fves accordingly to account for changes in numbers of dummy atoms
+    for i in range(len(fves)):
+        fve = list(fves[i])
+        if (len(fve) > 0):
+            dummy_paired = []
+            for j in range(len(fve)):
+                fv = fve[j]
+                if (fv[2] == 1): # this entry had a H fv
+                    for neighbor in mol.GetAtomWithIdx(fv[0][0]).GetNeighbors():
+                        if (neighbor.GetAtomicNum() == 0):
+                            dummy_paired.append((fv[0][0], neighbor.GetIdx()))
+                            # dummy_idx = neighbor.GetIdx() ## Assign the updated idx 
+                            # dummy_bond = mol.GetBondBetweenAtoms(fv[0][0], dummy_idx) # Update the bond too
+                    # new_fv = ((fv[0][0], dummy_idx), dummy_bond, 0)
+                    # fve[j] = new_fv
+            dummy_paired = list(set(dummy_paired))
+            dummy_count = 0
+            for j in range(len(fve)):
+                fv = fve[j]
+                if (fv[2] == 1):
+                    d_pair = dummy_paired[dummy_count]
+                    fve[j] = ((d_pair), mol.GetBondBetweenAtoms(d_pair[0], d_pair[1]), 0)
+                    dummy_count = dummy_count + 1
+        fves[i] = set(fve) # reassign the possibly updated fve
+    
+    psmis = []
+    fg_mols = []
+    psmi_labs = []
+    pseudo_smiles = ""
+    
+    # Loop over functional groups to generate pseudo smiles and pseudo molecules for each of them
+    for i, group in enumerate(fgs):
+        neigh_atoms = sorted(list(set([x[0][1] for x in cnes[i]] + [x[0][1] for x in fves[i]] + [x[0][1] for x in noes[i]])))
+        cne = cnes[i]
+        fve = fves[i]
+        noe = noes[i]
+        db2car = db2cars[i]
+        R_idxs = sorted(list(set(neigh_atoms + [x for x in db2car])))
+        psmi_labs = [(list(x)[0][1], list(x)[2]) for x in noe if len(noe) > 0] + [(x, "Car") for x in db2car if len(db2car) > 0]
+       
+        if (len(neigh_atoms) > 0): # There is decoration applicable
+           
+            ## Get the bonds of the group
+            bonds_idx = set()
+            for j in range(mol.GetNumBonds()):
+                bond = mol.GetBondWithIdx(j)
+                begin_aid = bond.GetBeginAtomIdx()
+                end_aid = bond.GetEndAtomIdx()
+                if ((begin_aid in group) and (end_aid in group)):
+                    bonds_idx.add(bond.GetIdx())
+                    
+            ## Add bonds to carbonyl carbons
+            if(len(cne) > 0):
+                for env in cne:
+                    bond = env[1]
+                    bonds_idx.add(bond.GetIdx())
+                
+            ## Add bonds to free valence atoms
+            if(len(fve) > 0):
+                for env in fve:
+                    bond = env[1]
+                    bonds_idx.add(bond.GetIdx())
+                    
+            ## Add bonds to single n, o atoms
+            if(len(noe) > 0):
+                for env in noe:
+                    bond = env[1]
+                    bonds_idx.add(bond.GetIdx())
+
+            # Get the fragment as a new molecule from bonds in the FG + environment atoms
+            amap = {}
+            fragment = Chem.PathToSubmol(mol, list(bonds_idx), atomMap = amap)
+            
+            # Create a copy of the fragment to modify
+            fragment_with_r_groups = Chem.RWMol(fragment)
+    
+            # Replace atoms at the specified positions with R group labels
+            j = 0
+            for idx in R_idxs:
+                label = "[*:" + str(j+1) + "]"
+                
+                # Create a dummy atom with the R group label
+                r_group_atom = Chem.MolFromSmiles(label).GetAtomWithIdx(0)
+                r_group_atom.SetAtomMapNum(int(label[-2]))  # Set the map number to match the R group number
+
+                # Replace the atom at the specified position with the R group atom
+                fragment_with_r_groups.ReplaceAtom(amap[R_idxs[j]], r_group_atom)
+                j = j + 1
+
+            # Convert the modified fragment back to a Mol object
+            fragment_with_r_groups = fragment_with_r_groups.GetMol()
+            
+            ## Fix C valences
+            fragment_with_r_groups = cval_fix(fragment_with_r_groups)
+
+            # Generate the canonical pseudo-SMILES
+            pseudo_smiles = Chem.MolToSmiles(fragment_with_r_groups, canonical=True, isomericSmiles = False)
+            
+            # Generate the labeled molecule
+            fragment_with_r_groups = Chem.RWMol(fragment_with_r_groups)
+            for atom in fragment_with_r_groups.GetAtoms():
+                idx = atom.GetIdx() # idx of atom in fragment
+                map_idx = list(amap.keys())[list(amap.values()).index(idx)] # idx of atom in original molecule
+                if map_idx in neigh_atoms: # Check if that position is an environment atom in original molecule
+                    if (map_idx in [x[0][1] for x in list(noe)]):
+                        type_carbon = [x[2] for x in list(noe) if x[0][1] == map_idx][0]
+                        fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', type_carbon)
+                    else:
+                        fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel','R')
+                else:
+                    if atom.GetIsAromatic():
+                        if atom.GetAtomicNum() == 6:
+                            fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Car")
+                        if atom.GetAtomicNum() == 7:
+                            fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Nar")
+                        if atom.GetAtomicNum() == 8:
+                            fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Oar")
+                        if atom.GetAtomicNum() == 16:
+                            fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Sar")
+                if map_idx in db2car: # check db2caromatics
+                    fragment_with_r_groups.GetAtomWithIdx(idx).SetProp('atomLabel', "Car")
+            fragment_with_r_groups = fragment_with_r_groups.GetMol()
+                        
+        else: # No neighbors
+            if(len(group) == 1): # Single-atommed FG
+                atom = mol.GetAtomWithIdx(list(group)[0])
+                charge = atom.GetFormalCharge()
+                charge_str = "+" if charge == 1 else "-" if charge == -1 else ""
+                fragment = Chem.RWMol() ## Create emtpy edtiable molecule
+                fragment.AddAtom(Chem.Atom('*')) ## Add dummy atom to molecule
+                if atom.GetIsAromatic():
+                    if atom.GetAtomicNum() == 7:
+                        pseudo_smiles = "n" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Nar" + charge_str)
+                    if atom.GetAtomicNum() == 8:
+                        pseudo_smiles = "o" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Oar" + charge_str)
+                    if atom.GetAtomicNum() == 16:
+                        pseudo_smiles = "s" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Sar" + charge_str)
+                    if atom.GetAtomicNum() == 15:
+                        pseudo_smiles = "p" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Par" + charge_str)
+                    if atom.GetAtomicNum() == 34:
+                        pseudo_smiles = "se" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Sear" + charge_str)
+                    if atom.GetAtomicNum() == 52:
+                            pseudo_smiles = "te" + charge_str
+                            fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Tear" + charge_str)
+                else:
+                    if atom.GetAtomicNum() == 7:
+                        pseudo_smiles = "N" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "N" + charge_str)
+                    if atom.GetAtomicNum() == 8:
+                        pseudo_smiles = "O" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "O" + charge_str)
+                    if atom.GetAtomicNum() == 16:
+                        pseudo_smiles = "S" + charge_str
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "S" + charge_str)
+                    if atom.GetAtomicNum() == 15:
+                        pseudo_smiles = "P" + charge_str + "]"
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "P" + charge_str)
+                    if atom.GetAtomicNum() == 34:
+                        pseudo_smiles = "Se" + charge_str + "]"
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Se" + charge_str)
+                    if atom.GetAtomicNum() == 52:
+                        pseudo_smiles = "Te" + charge_str + "]"
+                        fragment.GetAtomWithIdx(0).SetProp('atomLabel', "Te" + charge_str)
+                fragment = fragment.GetMol()
+            else: # Multi-atommed FG
+                bonds_idx = []
+                for j in range(mol.GetNumBonds()):
+                    bond = mol.GetBondWithIdx(j)
+                    begin_aid = bond.GetBeginAtomIdx()
+                    end_aid = bond.GetEndAtomIdx()
+                    if begin_aid in group and end_aid in group:
+                        bonds_idx.append(j)
+
+                # Get the fragment as a new molecule
+                amap = {}
+                fragment = Chem.PathToSubmol(mol, bonds_idx, atomMap = amap)
+                fragment = cval_fix(fragment)
+                pseudo_smiles = Chem.MolToSmiles(fragment, canonical=True, isomericSmiles = False)
+                for atom in fragment.GetAtoms():
+                    idx = atom.GetIdx()
+                    charge = atom.GetFormalCharge()
+                    charge_str = "\u207A" if charge == 1 else "\u207B" if charge == -1 else ""
+                    if atom.GetIsAromatic():
+                        if atom.GetAtomicNum() == 6:
+                            fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Car" + charge_str)
+                        if atom.GetAtomicNum() == 7:
+                            fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Nar" + charge_str)
+                        if atom.GetAtomicNum() == 8:
+                            fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Oar" + charge_str)
+                        if atom.GetAtomicNum() == 16:
+                            fragment.GetAtomWithIdx(idx).SetProp('atomLabel', "Sar" + charge_str)      
+        
+        # Parse the pseudo-smiles; first the Cars and Cals
+        if (len(psmi_labs) > 0):
+            for psmi_lab in list(psmi_labs):
+                pseudo_smiles = pseudo_smiles.replace("[*:" + str(R_idxs.index(psmi_lab[0]) + 1) + "]", "[" + psmi_lab[1] + "]")
+
+        
+        # Define the pattern to match
+        pattern = r'\[\*:\d+\]'
+
+        # Define the replacement string
+        replacement = "[R]"
+        
+        # Replace [R] groups
+        pseudo_smiles = re.sub(pattern, replacement, pseudo_smiles)
+        
+        # Replace ns
+        pseudo_smiles = pseudo_smiles.replace("[n+]", "[Nar+]")
+        pseudo_smiles = pseudo_smiles.replace("[n-]", "[Nar-]")
+        pseudo_smiles = pseudo_smiles.replace("[n+]", "[Nar+]")
+        pseudo_smiles = pseudo_smiles.replace("[n-]", "[Nar-]")
+        pseudo_smiles = re.sub(r'(?<![MNI])n\+', '[Nar+]', pseudo_smiles)
+        pseudo_smiles = re.sub(r'(?<![MNI])n\-', '[Nar-]', pseudo_smiles)
+        pseudo_smiles = re.sub(r'(?<![MZI])n', '[Nar]', pseudo_smiles)
+        
+        # Replace cs, ss, os, ps, ses, tes
+        pseudo_smiles = pseudo_smiles.replace("[se+]", "[Sear+]")
+        pseudo_smiles = pseudo_smiles.replace("[se-]", "[Sear-]")
+        pseudo_smiles = pseudo_smiles.replace("se+", "[Sear+]")
+        pseudo_smiles = pseudo_smiles.replace("se-", "[Sear-]")
+        pseudo_smiles = pseudo_smiles.replace("se", "[Sear]")
+        pseudo_smiles = pseudo_smiles.replace("[te+]", "[Tear+]")
+        pseudo_smiles = pseudo_smiles.replace("[te-]", "[Tear-]")
+        pseudo_smiles = pseudo_smiles.replace("te", "[Tear]")
+        pseudo_smiles = pseudo_smiles.replace("[c+]", "[Car+]")
+        pseudo_smiles = pseudo_smiles.replace("[c-]", "[Car-]")
+        pseudo_smiles = pseudo_smiles.replace("c", "[Car]")
+        pseudo_smiles = pseudo_smiles.replace("[s+]", "[Sar+]")
+        pseudo_smiles = pseudo_smiles.replace("[s-]", "[Sar-]")
+        pseudo_smiles = re.sub(r"(?<!A)(s\+)", "[Sar+]", pseudo_smiles) # To avoid issues with As
+        pseudo_smiles = re.sub(r"(?<!A)(s-)", "[Sar-]", pseudo_smiles) # To avoid issues with As
+        pseudo_smiles = re.sub(r"(?<!A)s", "[Sar]", pseudo_smiles) # To avoid issues with As
+        pseudo_smiles = pseudo_smiles.replace("o+", "[Oar+]")
+        pseudo_smiles = pseudo_smiles.replace("o-", "[Oar-]")
+        pseudo_smiles = pseudo_smiles.replace("o", "[Oar]")
+        pseudo_smiles = pseudo_smiles.replace("[o]", "[Oar]")
+        pseudo_smiles = pseudo_smiles.replace("[o+]", "[Oar+]")
+        pseudo_smiles = pseudo_smiles.replace("[o-]", "[Oar-]")
+        pseudo_smiles = pseudo_smiles.replace("[p+]", "[Par+]")
+        pseudo_smiles = pseudo_smiles.replace("[p-]", "[Par-]")
+        pseudo_smiles = pseudo_smiles.replace("p", "[Par]")
+  
+        
+        # Final canonicalization
+        pseudo_smiles = psmi_can(pseudo_smiles)
+        
+        psmis.append(pseudo_smiles)
+        if (len(neigh_atoms) == 0):
+            fg_mols.append(fragment)
+        else:
+            fg_mols.append(fragment_with_r_groups)
+        if not ("H" in pseudo_smiles):
+            if (verbose):
+                print("Canonical Pseudo-SMILES:", pseudo_smiles)
+        
+    img_text = col_mol(mol_aux, fgs)
+    return img_text, fgs, psmis, fg_mols
+
+
+
+
+
+
+
--- a/Contrib/efgs/try_efgs.py
+++ b/Contrib/efgs/try_efgs.py
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+"""
+Implementation of Ertl's Functional Group Algorithm in RDKit
+
+Ertl, Peter. 
+
+An algorithm to identify functional groups in organic molecules
+
+J Cheminform (2017) 9:36; DOI: 10.1186/s13321-017-0225-z
+
+@author: gonzalo.colmenarejo
+"""
+
+
+from rdkit import Chem
+from rdkit.Chem.Draw import IPythonConsole
+from chembl_structure_pipeline import standardizer as sdz
+from datetime import datetime
+from IPython.display import Image
+from PIL import Image as pilImage
+from PIL import ImageDraw, ImageFont
+from rdkit.Chem.Draw import rdMolDraw2D
+import efgs
+import pandas as pd
+import io
+
+
+
+ertl_smis = [
+    'Cc1nc(NS(=O)(=O)c2ccc(N)cc2)nc(C)c1', 
+    'NC(=N)c1ccc(C=Cc2ccc(cc2O)C(=N)N)cc1', 
+    'CC(=O)Nc1nnc(s1)S(=O)(=O)N',
+    'NS(=O)(=O)c1cc2c(NCNS2(=O)=O)cc1Cl',
+    'CNC1=Nc2ccc(Cl)cc2C(=N(=O)C1)c3ccccc3',
+    'Cc1onc(c1C(=O)NC2C3SC(C)(C)C(N3C2=O)C(=O)O)c4ccccc4',
+    'Clc1ccccc1C2=NCC(=O)Nc3ccc(cc23)N(=O)=O',
+    'COc1cc(cc(C(=O)NCC2CCCN2CC=C)c1OC)S(=O)(=O)N',
+    'Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl',
+    'Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1',
+    'FC(F)(F)CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13',
+    'OCC1OC(CC1O)n2cnc3C(O)CNC=Nc32',
+    'CCNC1CC(C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N',
+    'CC(O)C1C2C(C)C(=C(N2C1=O)C(=O)O)SC3CNC(C3)C(=O)N(C)C',
+    'CC1CN(CC(C)N1)c2c(F)c(N)c3c(=O)c(cn(C4CC4)c3c2F)C(=O)O',
+    'CC(=CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)C',
+    'Clc1ccc2N=C3NC(=O)CN3Cc2c1Cl',
+    'CC(=O)NC1C(NC(=N)N)C=C(OC1C(O)C(O)CO)C(=O)O',
+    'CC(O)C(O)C1CNc2nc(N)nc(O)c2N1',
+    'NC1CCCCN(C1)c2c(Cl)cc3c(=O)c(cn(C4CC4)c3c2Cl)C(=O)O',
+]
+
+
+
+def mindex(mol):
+    # Utility function to draw molecule with idx
+    for atom in mol.GetAtoms():
+        try:
+            atom.SetAtomMapNum(atom.GetIdx())
+        except:
+            None
+    return mol
+
+
+def demindex(mol):
+    # Utility function to return mindex-ed molecule to normal
+    for atom in mol.GetAtoms():
+        atom.ClearProp('molAtomMapNumber')
+    return mol
+
+
+def molimg(x):
+    # Utility function to draw image of labeled mol from PNG binary string
+    try:
+        return pilImage.open(io.BytesIO(x))
+    except:
+        return None
+    
+
+def img_grid(images, titles = None, num_columns = 5,font = "arial.ttf", font_size = 18, title_height = 30, cell_width = 300, cell_height = 300, bg_color = (255, 255, 255)):
+    """
+    Creates a grid of images with optional titles under each image.
+
+    Parameters:
+    - images (list): List of PIL Image objects.
+    - titles (list, optional): List of titles as strings for each image. If None, no titles will be added.
+    - num_columns (int): Number of columns in the grid.
+    - font_sizes: Font size of titles (if provided).
+    - title_height: Height of titles (if provieded).
+    - cell_width (int): Width of each image cell.
+    - cell_height (int): Height of each image cell (not including title space).
+    - bg_color (tuple): Background color as an RGB tuple.
+
+    Returns:
+    - PIL Image object containing the image grid.
+    """
+    # Set title height if titles are provided; otherwise, set to zero
+    title_height = title_height if titles else 0
+    cell_height_with_title = cell_height + title_height + 2
+
+    # Calculate grid dimensions
+    num_images = len(images)
+    num_rows = (num_images + num_columns - 1) // num_columns  # Round up for last row
+
+    # Create a blank background image for the grid
+    grid_width = num_columns * cell_width
+    grid_height = num_rows * cell_height_with_title
+    grid_image = pilImage.new("RGB", (grid_width, grid_height), bg_color)
+
+    # Set up font for titles if needed
+    if titles:
+        try:
+            font = ImageFont.truetype(font, font_size)  # Change font and size as desired
+        except IOError:
+            font = ImageFont.load_default()
+
+    # Iterate through each cell in the grid
+    for i in range(num_rows * num_columns):
+        row = i // num_columns
+        col = i % num_columns
+        x = col * cell_width
+        y = row * cell_height_with_title
+
+        if i < num_images:
+            # Resize the image to fit in the cell
+            img_resized = images[i].resize((cell_width, cell_height))
+            grid_image.paste(img_resized, (x, y))
+
+            # Draw title if provided
+            if titles and i < len(titles):
+                draw = ImageDraw.Draw(grid_image)
+                title = titles[i]
+
+                # Calculate title position and center it
+                text_bbox = draw.textbbox((0, 0), title, font=font)
+                text_width = text_bbox[2] - text_bbox[0]
+                title_x = x + (cell_width - text_width) // 2  # Center title horizontally
+                title_y = y + cell_height  # Position directly beneath the image
+
+                # Draw title text
+                draw.text((title_x, title_y), title, fill="black", font=font)
+        else:
+            # Leave blank space if no image for this cell
+            blank_cell = pilImage.new("RGB", (cell_width, cell_height), bg_color)
+            grid_image.paste(blank_cell, (x, y))
+
+    return grid_image
+
+
+
+# ## Mols from file
+start_time = datetime.now()
+d = pd.read_csv("../data/ch33_inchis.csv", sep = ";", low_memory = False)
+d = d[pd.notna(d.inchi)].reset_index(drop = True) # make sure there are no missing inchis
+d = d[d.inchi.apply(lambda x: "." not in x.split("/")[1])].reset_index(drop = True) # make sure we have no complex molecules
+d.shape # (839063, 1)
+d = d.sample(frac=1, random_state=4210).reset_index(drop = True) # scramble the df
+n = d.shape[0] # Here one can put an alternative much smaller n 
+d = d[d.index.isin(range(n))].reset_index(drop = True) # keep only n
+d["mol"] = d.inchi.apply(lambda x: sdz.standardize_mol(Chem.MolFromInchi(x)))
+d = d[pd.notna(d.mol)].reset_index(drop = True) # remove entries without molecule object
+d.shape[0]  # n
+end_time = datetime.now()
+print('Duration to analyze {} molecules: {}'.format(n, end_time - start_time))
+
+start_time = datetime.now()
+d[["img_text","fgs","fgsmi","fgmol"]] = d.apply(lambda x: pd.Series(efgs.get_dec_fgs(x["mol"])), axis = 1)
+end_time = datetime.now()
+print('Duration to analyze {} molecules: {}'.format(n, end_time - start_time))
+
+
+# Generate distribution of FGs
+fg_dist = d.explode(["fgsmi","fgmol"]).groupby("fgsmi", as_index = False).apply(lambda x: 
+                                                                                pd.DataFrame({"fgsmi": [x["fgsmi"].iloc[0]], 
+                                                                                              "n": [x.shape[0]], 
+                                                                                              "mol": [x["fgmol"].iloc[0]]})).sort_values("n", ascending = False)
+
+fg_dist.shape  # 5829, 3 (for all ChEMBL comps)
+fg_dist.to_csv("../results/fg_dist.csv", sep = ";", index = False)  
+# fg_dist = pd.read_csv("../results/fg_dist.csv", sep = ";", index = False) # To recover data
+    
+mols = fg_dist.mol.iloc[:64].tolist()
+legs = fg_dist.fgsmi.iloc[:64].tolist()
+
+
+## High-resolution grid with top-48 FGs 
+images = []
+img_width = 250
+img_height = 250
+for m in mols[:48]:
+    Chem.rdDepictor.Compute2DCoords(m)
+    Chem.rdDepictor.StraightenDepiction(m)
+    d2d = rdMolDraw2D.MolDraw2DCairo(img_width,img_height)
+    dopts = d2d.drawOptions()
+    dopts.bondLineWidth = 5
+    dopts.padding = 0.2
+    d2d.DrawMolecule(m)
+    d2d.FinishDrawing()
+    images.append(molimg(d2d.GetDrawingText()))
+   
+fgs_grid = img_grid(images[:48], titles = legs[:48], num_columns = 8, font_size = 28, cell_width = 300, cell_height = 300, bg_color=(255, 255, 255))
+fgs_grid = fgs_grid.resize((6000, int(6000*fgs_grid.height/fgs_grid.width)))  
+fgs_grid.save("../results/fg_grid.png", dpi = (300,300))
+
+
+# High-resolution grid with 20 first molecules with colored functional groups
+images = [molimg(x) for x in d.img_text.iloc[0:20].tolist()]
+cell_width = images[0].width
+cell_height = images[0].height
+molfg_grid = img_grid(images, num_columns = 5, cell_width = cell_width, cell_height = cell_height, bg_color=(255, 255, 255))
+molfg_grid
+molfg_grid = molfg_grid.resize((6000, int(6000*molfg_grid.height/molfg_grid.width)))
+molfg_grid.save("../RESULTS/molfg_grid.png", dpi = (300,300))
+
+
+# High-resolution grid with 20 Ertl molecules in Figure 1
+ertmols = [Chem.MolFromSmiles(s) for s in ertl_smis[:20]]
+images = [molimg(efgs.get_dec_fgs(ertmol)[0]) for ertmol in ertmols] 
+cell_width = images[0].width
+cell_height = images[0].height
+molfg_grid = img_grid(images, num_columns = 5, cell_width = cell_width, cell_height = cell_height, bg_color=(255, 255, 255))
+molfg_grid
+molfg_grid = molfg_grid.resize((6000, int(6000*molfg_grid.height/molfg_grid.width)))
+molfg_grid.save("../RESULTS/ertl_molfg_grid.png", dpi = (300,300))