Delete pocket_flow/utils/.ipynb_checkpoints directory

2026-06-04 12:44:22 +08:00 · 2023-06-16 18:05:24 +08:00
parent 36617eb71a
commit 1ec5e0a77d
14 changed files with 0 additions and 4054 deletions
--- a/pocket_flow/utils/.ipynb_checkpoints/ParseFile-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/ParseFile-checkpoint.py
@@ -1,763 +0,0 @@
-import os
-import copy
-import numpy as np
-from rdkit import Chem, RDConfig
-from rdkit.Chem.rdchem import BondType
-from rdkit.Chem import ChemicalFeatures
-from .residues_base import RESIDUES_TOPO, RESIDUES_TOPO_WITH_H
-try:
-    import pymol
-except:
-    print('we can not compute the atoms on the surface of protein, '\
-          'because pymol can not be imported')
-    pass
-#from .BaseFeatures import Mol2Graph, RESIDUE_GRAPH_WITHOUT_H, RESIDUE_GRAPH_WITH_H
-#from .utils import coordinate_adjusting
-#from .Data import LigandData
-
-ATOM_TYPE_WITH_HYBIRD = [
-    'SP3_C', 'SP2_C', 'SP_C', 'SP3_N', 'SP2_N', 'SP_N', 'SP3_O', 'SP2_O', 'SP3_F', 'SP3_P',
-    'SP2_P', 'SP3D_P', 'SP3_S', 'SP2_S', 'SP3D_S', 'SP3D2_S', 'SP3_Cl', 'SP3_Br', 'SP3_I'
-    ]
-ATOM_MAP = [6, 6, 6, 7, 7, 7, 8, 8, 9, 15, 15, 15, 16, 16, 16, 16, 17, 35, 53]
-PT = Chem.GetPeriodicTable()
-BACKBONE_SYMBOL = {'N', 'CA', 'C', 'O'}
-AMINO_ACID_TYPE = {
-    'CYS':0, 'GLY':1, 'ALA':2, 'THR':3, 'LYS':4, 'PRO':5, 'VAL':6, 'SER':7, 'ASN':8, 'LEU':9,
-    'GLN':10, 'MET':11, 'ASP':12, 'TRP':13, 'HIS':14, 'GLU':15, 'ARG':16, 'ILE':17, 'PHE':18,
-    'TYR':19
-    }
-
-'''
-def coordinate_adjusting(pos, wts):
-    center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
-    pos = pos - center_of_mass
-    
-    I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
-    I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
-    I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
-    I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
-    I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
-    I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
-    I = np.array([[I_xx, I_xy, I_xz],
-                  [I_xy, I_yy, I_yz],
-                  [I_xz, I_yz, I_zz]])
-    eig_v, eig_m = np.linalg.eigh(I)  # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
-                                      # faster algorithm that takes advantage of the fact that the matrix is  
-                                      # symmetric. If you know that your matrix is symmetric, use this function.
-    am = pos@eig_m   # 用本真矢矩阵变换原子坐标，以‘摆正’分子
-    
-    rotate_around = []
-    x, y, z = am[0]
-    if x < 0:
-        am[..., 0] = am[..., 0] * -1
-        rotate_around.append(0)
-    if y < 0:
-        am[..., 1] = am[..., 1] * -1
-        rotate_around.append(1)
-    if z < 0:
-        am[..., 2] = am[..., 2] * -1
-        rotate_around.append(2)
-    return am, eig_m, center_of_mass, rotate_around
-'''
-def coordinate_adjusting(pos, wts):
-    center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
-    pos = pos - center_of_mass
-    
-    I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
-    I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
-    I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
-    I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
-    I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
-    I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
-    I = np.array([[I_xx, I_xy, I_xz],
-                  [I_xy, I_yy, I_yz],
-                  [I_xz, I_yz, I_zz]])
-    eig_v, eig_m = np.linalg.eigh(I)  # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
-                                      # faster algorithm that takes advantage of the fact that the matrix is  
-                                      # symmetric. If you know that your matrix is symmetric, use this function.
-    am = pos@eig_m   # 用本真矢矩阵变换原子坐标，以‘摆正’分子
-    return am, eig_m, center_of_mass
-
-class Dict(dict):
-    __setattr__ = dict.__setitem__
-    __getattr__ = dict.__getitem__
-
-
-class Atom(object):
-    def __init__(self, atom_info):
-        
-        self.idx = int(atom_info[6:11])
-        self.name = atom_info[12:16].strip()
-        self.res_name = atom_info[17:20].strip()  # atom_info[17:20].strip()
-        self.chain = atom_info[21:22].strip()
-        self.res_idx = int(atom_info[22:26])
-        self.coord = np.array([float(atom_info[30:38].strip()),
-                               float(atom_info[38:46].strip()),
-                               float(atom_info[46:54].strip())])
-        self.occupancy = float(atom_info[54:60])
-        self.temperature_factor = float(atom_info[60:66].strip())
-        self.seg_id = atom_info[72:76].strip()
-        self.element = atom_info[76:78].strip()
-        if self.element == 'SE':
-            self.element = 'S'
-            self.name = 'SD'
-            self.res_name = 'MET'
-        self.mass = PT.GetAtomicWeight(self.element)
-        if self.occupancy < 1.0:
-            self.is_disorder = True
-        else:
-            self.is_disorder = False
-        self.is_surf = atom_info.split()[-1] == 'surf'
-
-    @property
-    def to_string(self):
-        fmt = '{:6s}{:5d} {:^4s}{:1s}{:3s} {:1s}{:4d}{:1s}   {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}        {:<2s}{:2s}'    # https://cupnet.net/pdb-format/
-        out = fmt.format('ATOM', self.idx, self.name, '', self.res_name, 
-                       self.chain, self.res_idx,'', self.coord[0],
-                       self.coord[1], self.coord[2], self.occupancy,
-                       self.temperature_factor, self.seg_id, self.element)
-        return out
-    
-    @property
-    def to_dict(self):
-        return {
-            'element': PT.GetAtomicNumber(self.element),
-            'pos': self.coord,
-            'is_backbone': self.name in BACKBONE_SYMBOL,
-            'atom_name': self.name,
-            'atom_to_aa_type': AMINO_ACID_TYPE[self.res_name]
-             }
-    
-    def __repr__(self):
-        info = 'name={}, index={}, res={}, chain={}, is_disorder={}'.format(self.name, 
-                                                                 self.idx, 
-                                                                 self.res_name+str(self.res_idx),
-                                                                 self.chain, 
-                                                                 self.is_disorder)
-        return '{}({})'.format(self.__class__.__name__, info)
-
-
-class Residue(object):
-
-    def __init__(self, res_info):
-        self.res_info = res_info
-        #atoms_ = [Atom(i) for i in res_info]
-        self.atom_dict = {}
-        disorder = []
-        for i in res_info:   # 排除disorder原子
-            atom = Atom(i)
-            if atom.name in self.atom_dict:
-                continue
-            else:
-                self.atom_dict[atom.name] = atom
-                disorder.append(atom.is_disorder)
-            if atom.res_name == 'MSE':
-                atom.res_name = 'MET'   # 把MSE残基改成MET
-            
-        if True in disorder:
-            self.is_disorder = True
-        else:
-            self.is_disorder = False
-            
-        self.idx = self.atom_dict[atom.name].res_idx
-        self.chain = self.atom_dict[atom.name].chain
-        self.name = self.atom_dict[atom.name].res_name
-        self.is_perfect = True if len(self.get_heavy_atoms)==len(RESIDUES_TOPO[self.name]) else False
-
-    @property
-    def to_heavy_string(self):
-        return '\n'.join([a.to_string for a in self.get_heavy_atoms])
-    
-    @property
-    def to_string(self):
-        return '\n'.join([a.to_string for a in self.get_atoms])
-    
-    @property
-    def get_coords(self):
-        return np.array([a.coord for a in self.get_atoms])
-
-    @property
-    def get_atoms(self):
-        return list(self.atom_dict.values())
-    
-    @property
-    def get_heavy_atoms(self):
-        return [a for a in self.atom_dict.values() if 'H' not in a.element]
-    
-    @property
-    def get_heavy_coords(self):
-        return np.array([a.coord for a in self.get_heavy_atoms])
-
-    @property
-    def center_of_mass(self):
-        atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
-        return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
-    
-    @property
-    def bond_graph(self):
-        i, j, bt = [], [], []
-        res_graph = RESIDUES_TOPO[self.name]
-        atom_names = [i.name for i in self.get_heavy_atoms]
-        for ix, name in enumerate(atom_names):
-            for adj in res_graph[name]:
-                if adj in atom_names:
-                    idx_j = atom_names.index(adj)
-                    i.append(ix)
-                    j.append(idx_j)
-                    bt.append(res_graph[name][adj])
-        edge_index = np.stack([i,j]).astype(dtype=np.int64)
-        bt = np.array(bt, dtype=np.int64)
-        return edge_index, bt
-    
-    @property
-    def centroid(self):
-        return self.get_coords.mean(axis=0)
-    
-    def __repr__(self):
-        info = 'name={}, index={}, chain={}, is_disorder={}, is_perfect={}'.format(
-            self.name, self.idx, self.chain, self.is_disorder, self.is_perfect
-            )
-        return '{}({})'.format(self.__class__.__name__, info)
-
-
-class Chain(object):
-    def __init__(self, chain_info, ignore_incomplete_res=True, pdb_file=None):
-        self.pdb_file = pdb_file
-        self.res_dict = {}
-        '''if ignore_incomplete_res:
-            self.residues = {}
-            for i in chain_info:
-                res = Residue(chain_info[i])
-                if len(res.get_heavy_atoms) == len(RESIDUES_TOPO[res.name]):
-                    self.residues[i] = res
-        else:
-            self.residues = {i:Residue(chain_info[i]) for i in chain_info}'''
-        self.residues = {i:Residue(chain_info[i]) for i in chain_info}
-        #print(self.residues)
-        self.chain = list(self.residues.values())[0].chain
-        self.__normalized__ = False
-        self.center_of_mass_shift = None
-        self.rotate_matrix = None
-        self.ignore_incomplete_res =ignore_incomplete_res
-
-    def normalize_coord(self):
-        wts = np.array([i.mass for i in self.get_atoms])
-        wts = np.expand_dims(wts,axis=1)
-        pos = self.get_coords
-        
-        am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
-        for ix, a in enumerate(self.get_atoms):
-            a.coord = am[ix]
-        self.__normalized__ = True
-    
-    @property
-    def get_incomplete_residues(self):
-        return [i for i in self.residues.values() if i.is_perfect==False]
-
-    @property
-    def to_heavy_string(self):
-        return '\n'.join([res.to_heavy_string for res in self.get_residues])
-    
-    @property
-    def to_string(self):
-        return '\n'.join([res.to_string for res in self.get_residues])
-    
-    @property
-    def get_atoms(self):
-        atoms = []
-        for res in self.get_residues:
-            atoms.extend(res.get_atoms)
-        return atoms
-    
-    @property
-    def get_residues(self):
-        if self.ignore_incomplete_res:
-            return [i for i in self.residues.values() if i.is_perfect]
-        else:
-            return list(self.residues.values())
-        
-    @property
-    def get_heavy_atoms(self):
-        atoms = []
-        for res in self.get_residues:
-            atoms.extend(res.get_heavy_atoms)
-        return atoms
-    
-    @property
-    def get_coords(self):
-        return np.array([i.coord for i in self.get_atoms])
-
-    @property
-    def get_heavy_coords(self):
-        return np.array([i.coord for i in self.get_heavy_atoms])
-
-    @property
-    def center_of_mass(self):
-        atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
-        return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
-
-    @property
-    def centroid(self):
-        return self.get_coords.mean(axis=0)
-
-    def compute_surface_atoms(self):
-        path, filename = os.path.split(self.pdb_file)
-        chain_file_name = filename.split('.')[0] + '_' + self.chain + '.pdb'
-        chain_file = path+'/'+chain_file_name
-        with open(chain_file, 'w') as fw:
-            fw.write(self.to_heavy_string)
-
-        pymol.cmd.load(chain_file)
-        if path == '':
-            path = './'
-        sele = chain_file_name.split('.')[0]
-        pymol.cmd.remove("({}) and hydro".format(sele))
-        name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
-        save_name = path + '/' + sele + '-surface.pdb'
-        pymol.cmd.save(save_name,((name)))
-        surf_protein = Protein(save_name, ignore_incomplete_res=False)
-        surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
-        for res in self.get_residues:
-            res_idx = res.idx
-            if res_idx in surf_res_dict:
-                for a in surf_res_dict[res_idx].get_heavy_atoms:
-                    res.atom_dict[a.name].is_surf = True
-        self.has_surf_atom = True
-        os.remove(save_name)
-        os.remove(chain_file)
-
-    def get_surf_mask(self):
-        if self.has_surf_atom is False:
-            self.compute_surface_atoms()
-        return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
-
-    def get_res_by_id(self, res_id):
-        return self.residues[res_id]
-    
-    def __repr__(self):
-        tmp = 'Chain={}, NumResidues={}, NumAtoms={}, NumHeavyAtoms={}'
-        info = tmp.format(self.chain, len(self.residues), self.get_coords.shape[0], self.get_heavy_coords.shape[0])
-        return '{}({})'.format(self.__class__.__name__, info)
-
-
-class Protein(object):
-
-    def __init__(self, pdb_file, ignore_incomplete_res=True):
-        self.ignore_incomplete_res = ignore_incomplete_res
-        self.name = pdb_file.split('/')[-1].split('.')[0]
-        self.pdb_file = pdb_file
-        self.has_surf_atom = False
-        #self.pdb = [line.strip() for line in open(pdb_file).readlines()]
-        #atoms = os.popen('grep ATOM {}'.format(pdb_file)).read().strip().split('\n')
-        with open(pdb_file) as fr:
-            lines = fr.readlines()
-            surf_item = lines[0].strip().split()[-1]
-            if surf_item in {'surf', 'inner'}:
-                self.has_surf_atom = True
-            chain_info = {}
-            for line in lines:
-                if line.startswith('ATOM'):
-                    line = line.strip()
-                    chain = line[21:22].strip()
-                    res_idx = int(line[22:26].strip())
-                    if chain not in chain_info:
-                        chain_info[chain] = {}
-                        chain_info[chain][res_idx] = [line]
-                    elif res_idx not in chain_info[chain]:
-                        chain_info[chain][res_idx] = [line]
-                    else:
-                        chain_info[chain][res_idx].append(line)
-        
-        self.chains = {
-            c:Chain(chain_info[c], ignore_incomplete_res=ignore_incomplete_res, pdb_file=pdb_file) for c in chain_info
-            }
-        self.__normalized__ = False
-        self.center_of_mass_shift = None
-        self.rotate_matrix = None
-    
-    def normalize_coord(self):
-        is_chain_normalized = [c for c in self.chains if self.chains[c].__normalized__==True]
-        if len(is_chain_normalized) > 0:
-            chain_normalized = ' '.join(is_chain_normalized)
-            info = 'Warning: The Chains ({}) have been normalized '\
-                   'independently, this may lead to wrong coordinate '\
-                   'transformation for whole Protein !!!'.format(chain_normalized)
-            print(info)
-        
-        wts = np.array([i.mass for i in self.get_atoms])
-        wts = np.expand_dims(wts,axis=1)
-        pos = self.get_coords
-        
-        am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
-        for ix, a in enumerate(self.get_atoms):
-            a.coord = am[ix]
-        self.__normalized__ = True
-    
-    @property
-    def get_incomplete_residues(self):
-        res_list = []
-        for i in self.chains:
-            res_list += self.chains[i].get_incomplete_residues
-        return res_list
-
-    @property
-    def to_heavy_string(self):
-        return '\n'.join([res.to_heavy_string for res in self.get_residues])
-    
-    @property
-    def to_string(self):
-        return '\n'.join([res.to_string for res in self.get_residues])
-
-    @property
-    def get_residues(self,):
-        res_list = []
-        for i in self.chains:
-            res_list += self.chains[i].get_residues
-        return res_list
-
-    @property
-    def get_atoms(self):
-        atoms = []
-        for res in self.get_residues:
-            atoms.extend(res.get_atoms)
-        return atoms
-    
-    @property
-    def get_heavy_atoms(self):
-        atoms = []
-        for res in self.get_residues:
-            atoms.extend(res.get_heavy_atoms)
-        return atoms
-
-    @property
-    def get_coords(self):
-        return np.array([i.coord for i in self.get_atoms])
-
-    @property
-    def get_heavy_coords(self):
-        return np.array([i.coord for i in self.get_heavy_atoms])
-
-    @property
-    def center_of_mass(self):
-        atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
-        return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
-    
-    @property
-    def bond_graph(self):
-        res_list = self.get_residues
-        bond_index = []
-        bond_type = []
-        N_term_list = []
-        C_term_list = []
-        cusum = 0
-        for ix, res in enumerate(res_list):
-            e_idx, e_type = res.bond_graph
-            bond_index.append(e_idx + cusum)
-            bond_type.append(e_type)
-            N_term_ix = [i.name for i in self.get_heavy_atoms].index('N') + cusum
-            C_term_ix = [i.name for i in self.get_heavy_atoms].index('C') + cusum
-            N_term_list.append(N_term_ix)
-            C_term_list.append(C_term_ix)
-            cusum += res.get_heavy_coords.shape[0]
-            if ix != 0:
-                if res.idx-res_list[ix-1].idx == 1 and res.chain == res_list[ix-1].chain:
-                    bond_idx_between_res = np.array(
-                        [[N_term_ix, C_term_list[ix-1]],[C_term_list[ix-1],N_term_ix]],
-                        dtype=np.long
-                    )
-                    bond_index.append(bond_idx_between_res)
-                    bond_type_between_res = np.array([1,1], dtype=np.long)
-                    bond_type.append(bond_type_between_res)
-        bond_index = np.concatenate(bond_index, axis=1)
-        bond_type = np.concatenate(bond_type)
-        return bond_index, bond_type
-
-    @property
-    def centroid(self):
-        return self.get_coords.mean(axis=0)
-    
-    def get_chain(self, chain_id):
-        return self.chains[chain_id]
-    
-    def get_res_by_id(self, chain_id, res_id):
-        return self.chains[chain_id].get_res_by_id(res_id)
-    
-    def get_atom_dict(self, removeHs=True, get_surf=False):
-        atom_dict = {
-            'element': [],
-            'pos': [],
-            'is_backbone': [],
-            'atom_name': [],
-            'atom_to_aa_type': []
-             }
-        for a in self.get_atoms:
-            if a.element == 'H' and removeHs:
-                continue
-            atom_dict['element'].append(a.to_dict['element'])
-            atom_dict['pos'].append(a.to_dict['pos'])
-            atom_dict['is_backbone'].append(a.to_dict['is_backbone'])
-            atom_dict['atom_name'].append(a.to_dict['atom_name'])
-            atom_dict['atom_to_aa_type'].append(a.to_dict['atom_to_aa_type'])
-        atom_dict['element'] = np.array(atom_dict['element'], dtype=np.long)
-        atom_dict['pos'] = np.array(atom_dict['pos'], dtype=np.float32)
-        atom_dict['is_backbone'] = np.array(atom_dict['is_backbone'], dtype=np.bool)
-        #atom_dict['atom_name'] = atom_dict['atom_name']
-        if get_surf:
-            atom_dict['surface_mask'] = np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)#self.get_surf_mask()
-        
-        atom_dict['atom_to_aa_type'] = np.array(atom_dict['atom_to_aa_type'], dtype=np.long)
-        atom_dict['molecule_name'] = None
-        protein_bond_index, protein_bond_type = self.bond_graph
-        atom_dict['bond_index'] = protein_bond_index
-        atom_dict['bond_type'] = protein_bond_type
-        atom_dict['filename'] = self.pdb_file
-        return atom_dict
-
-    def get_backbone_dict(self, removeHs=True):
-        atom_dict = self.get_atom_dict(removeHs=removeHs)
-        backbone_dict = {}
-        backbone_dict['element'] = atom_dict['element'][atom_dict['is_backbone']]
-        backbone_dict['pos'] = atom_dict['pos'][atom_dict['is_backbone']]
-        backbone_dict['is_backbone'] = np.ones(atom_dict['is_backbone'].sum(), dtype=np.bool)
-        backbone_dict['atom_name'] = np.array(atom_dict['atom_name'])[atom_dict['is_backbone']].tolist()
-        backbone_dict['atom_to_aa_type'] = atom_dict['atom_to_aa_type'][atom_dict['is_backbone']]
-        backbone_dict['molecule_name'] = atom_dict['molecule_name']
-        atom_dict['bond_index'] = np.empty([0,2], dtype=np.long)
-        atom_dict['bond_type'] = np.empty(0, dtype=np.long)
-        atom_dict['filename'] = self.pdb_file
-        return backbone_dict
-
-    @property
-    def get_backbone(self):
-        atoms = []
-        for res in self.get_residues:
-            bkb = [a for a in res.get_atoms if a.name in BACKBONE_SYMBOL]
-            atoms += bkb
-        return atoms
-
-    def compute_surface_atoms(self):
-        """
-        If the pdb_file is a pocket_file, the surface atoms is not correct!
-        """
-        pymol.cmd.load(self.pdb_file)
-        path, filename = os.path.split(self.pdb_file)
-        if path == '':
-            path = './'
-        sele = filename.split('.')[0]
-        pymol.cmd.remove("({}) and hydro".format(sele))
-        name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
-        save_name = path + '/' + sele + '-surface.pdb'
-        pymol.cmd.save(save_name,((name)))
-        surf_protein = Protein(save_name, ignore_incomplete_res=False)
-        surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
-        for res in self.get_residues:
-            res_idx = res.idx
-            if res_idx in surf_res_dict:
-                for a in surf_res_dict[res_idx].get_heavy_atoms:
-                    res.atom_dict[a.name].is_surf = True
-        self.has_surf_atom = True
-        os.remove(save_name)
-
-    def get_surf_mask(self):
-        if self.has_surf_atom is False:
-            self.compute_surface_atoms()
-        return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
-        
-    @staticmethod
-    def empty_dict():
-        empty_pocket_dict = {}
-        empty_pocket_dict = Dict(empty_pocket_dict)
-        empty_pocket_dict.element = np.empty(0, dtype=np.int64)
-        empty_pocket_dict.pos = np.empty([0,3], dtype=np.float32)
-        empty_pocket_dict.is_backbone = np.empty(0, dtype=bool)
-        empty_pocket_dict.atom_name = []
-        empty_pocket_dict.atom_to_aa_type = np.empty(0, dtype=np.int64)
-        empty_pocket_dict.molecule_name = None
-        empty_pocket_dict.bond_index = np.empty([2,0], dtype=np.int64)
-        empty_pocket_dict.bond_type = np.empty(0, dtype=np.int64)
-        empty_pocket_dict.filename = None
-        return empty_pocket_dict
-    
-    def __repr__(self):
-        num_res = 0
-        num_atom = 0
-        for i in self.chains:
-            res_list = list(self.chains[i].residues.values())
-            num_res += len(res_list)
-            for i in res_list:
-                num_atom += len(i.get_heavy_atoms)
-        num_incomp = len(self.get_incomplete_residues)
-        tmp = 'Name={}, NumChains={}, NumResidues={}, NumHeavyAtoms={}, NumIncompleteRes={}'
-        info = tmp.format(
-            self.name, len(self.chains), num_res, num_atom, num_incomp
-            )
-        return '{}({})'.format(self.__class__.__name__, info)
-
-####################################################################################################################
-
-
-ATOM_FAMILIES = ['Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe', 'NegIonizable', 'PosIonizable', 'ZnBinder']
-ATOM_FAMILIES_ID = {s: i for i, s in enumerate(ATOM_FAMILIES)}
-BOND_TYPES = {t: i for i, t in enumerate(BondType.names.values())}
-BOND_NAMES = {i: t for i, t in enumerate(BondType.names.keys())}
-
-
-def is_in_ring(mol):
-    d = {a:np.array([], dtype=np.int64) for a in range(len(mol.GetAtoms()))}
-    rings = Chem.GetSymmSSSR(mol)
-    for a in d:
-        for r_idx, ring in enumerate(rings):
-            if a in ring:
-                d[a] = np.append(d[a], r_idx+1)
-            else:
-                d[a] = np.append(d[a], -a)
-    return d
-
-def parse_sdf_to_dict(mol_file):
-    fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
-    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
-    rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
-    Chem.Kekulize(rdmol)
-    ring_info = is_in_ring(rdmol)
-    conformer = rdmol.GetConformer()
-    feat_mat = np.zeros([rdmol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
-    for feat in factory.GetFeaturesForMol(rdmol):
-        feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
-    
-    element, pos, atom_mass = [], [], []
-    for a in rdmol.GetAtoms():
-        element.append(a.GetAtomicNum())
-        pos.append(conformer.GetAtomPosition(a.GetIdx()))
-        atom_mass.append(a.GetMass())
-    element = np.array(element, dtype=np.int64)
-    pos = np.array(pos, dtype=np.float32)
-    atom_mass = np.array(atom_mass, np.float32)
-    center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
-    
-    edge_index, edge_type = [], []
-    for b in rdmol.GetBonds():
-        row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
-        col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
-        edge_index.extend([row, col])
-        edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
-    edge_index = np.array(edge_index)
-    edge_index_perm = edge_index[:,0].argsort()
-    edge_index = edge_index[edge_index_perm].T
-    edge_type = np.array(edge_type)[edge_index_perm]
-    
-    return {'element': element,
-            'pos': pos,
-            'bond_index': edge_index,
-            'bond_type': edge_type,
-            'center_of_mass': center_of_mass,
-            'atom_feature': feat_mat,
-            'ring_info': ring_info,
-            'filename':mol_file
-           }
-
-
-from scipy.spatial.distance import pdist
-from scipy.spatial.distance import squareform
-
-
-class Ligand(object):
-    def __init__(self, mol_file, removeHs=True, sanitize=True):
-        if isinstance(mol_file, Chem.rdchem.Mol):
-            mol = mol_file
-            self.name = None
-            self.lig_file = None
-        else:
-            mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=sanitize)
-            if mol is None:
-                mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=False)
-                mol.UpdatePropertyCache(strict=False)
-                Chem.SanitizeMol(
-                    mol,  ## 如果报错可以尝试：Chem.rdmolops.SanitizeFlags.SANITIZE_FINDRADICALS
-                    Chem.SanitizeFlags.SANITIZE_FINDRADICALS|\
-                    Chem.SanitizeFlags.SANITIZE_KEKULIZE|\
-                    Chem.SanitizeFlags.SANITIZE_SETAROMATICITY| \
-                    Chem.SanitizeFlags.SANITIZE_SETCONJUGATION| \
-                    Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION| \
-                    Chem.SanitizeFlags.SANITIZE_SYMMRINGS,
-                    catchErrors=True
-                    )
-            self.name = mol_file.split('/')[-1].split('.')[0]
-            self.lig_file = mol_file
-            
-        Chem.Kekulize(mol)
-        self.mol = mol
-        #self.self_loop = self_loop
-        self.num_atoms = len(self.mol.GetAtoms())
-        self.normalized_coords = None
-    
-    def normalize_pos(self, shift_vector, rotate_matrix):
-        conformer = self.mol.GetConformer()
-        coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in self.mol.GetAtoms()])
-        coords = (coords - shift_vector)@rotate_matrix
-        for ix, pos in enumerate(coords):
-            conformer.SetAtomPosition(ix, pos)
-        self.normalized_coords = coords
-    
-    def mol_block(self):
-        return Chem.MolToMolBlock(self.mol)
-    
-    def to_dict(self):
-        fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
-        factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
-        #rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
-        ring_info = is_in_ring(self.mol)
-        conformer = self.mol.GetConformer()
-        feat_mat = np.zeros([self.mol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
-        for feat in factory.GetFeaturesForMol(self.mol):
-            feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
-        
-        element, pos, atom_mass = [], [], []
-        for a in self.mol.GetAtoms():
-            element.append(a.GetAtomicNum())
-            pos.append(conformer.GetAtomPosition(a.GetIdx()))
-            atom_mass.append(a.GetMass())
-        element = np.array(element, dtype=np.int64)
-        pos = np.array(pos, dtype=np.float32)
-        atom_mass = np.array(atom_mass, np.float32)
-        center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
-        
-        edge_index, edge_type = [], []
-        for b in self.mol.GetBonds():
-            row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
-            col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
-            edge_index.extend([row, col])
-            edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
-        edge_index = np.array(edge_index)
-        edge_index_perm = edge_index[:,0].argsort()
-        edge_index = edge_index[edge_index_perm].T
-        edge_type = np.array(edge_type)[edge_index_perm]
-        
-        return {'element': element,
-                'pos': pos,
-                'bond_index': edge_index,
-                'bond_type': edge_type,
-                'center_of_mass': center_of_mass,
-                'atom_feature': feat_mat,
-                'ring_info': ring_info,
-                'filename':self.lig_file
-            }
-
-    @staticmethod
-    def empty_dict():
-        empty_ligand_dict = {}
-        empty_ligand_dict = Dict(empty_ligand_dict)
-        empty_ligand_dict.element = np.empty(0, dtype=np.int64)
-        empty_ligand_dict.pos = np.empty([0,3], dtype=np.float32)
-        empty_ligand_dict.bond_index = np.empty([2,0], dtype=np.int64)
-        empty_ligand_dict.bond_type = np.empty(0, dtype=np.int64)
-        empty_ligand_dict.center_of_mass = np.empty([0,3], dtype=np.float32)
-        empty_ligand_dict.atom_feature = np.empty([0,8], dtype=np.float32)
-        empty_ligand_dict.ring_info = {}
-        empty_ligand_dict.filename = None
-        return empty_ligand_dict
-
-    def __repr__(self):
-        tmp = 'Name={}, NumAtoms={}'
-        info = tmp.format(self.name, self.num_atoms)
-        return '{}({})'.format(self.__class__.__name__, info)
--- a/pocket_flow/utils/.ipynb_checkpoints/init-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/init-checkpoint.py
@@ -1,13 +0,0 @@
-from .data import *
-from .generate_utils import *
-from .ParseFile import *
-from .train import *
-from .transform_utils import *
-from .transform import *
-from .load_dataset import *
-from .residues_base import *
-from .process_raw import *
-from .metrics import *
-from .mol_filter import *
-from . import sascorer
-from .draw import *
--- a/pocket_flow/utils/.ipynb_checkpoints/bond_lenth_distribution-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/bond_lenth_distribution-checkpoint.py
@@ -1,133 +0,0 @@
-from rdkit import Chem
-import numpy as np
-import matplotlib.pyplot as plt
-from scipy.stats import gaussian_kde
-from tqdm.auto import tqdm
-from multiprocessing import Pool
-#from pocket_flow.utils import CrossDocked2020
-
-from matplotlib.collections import PathCollection
-from matplotlib.legend_handler import HandlerLine2D
-
-
-BOND_SYMBOL = {'SINGLE':'-', 'DOUBLE':'=', 'TRIPLE':'#', 'AROMATIC':'@'}
-def compute_bond_length(mol, atomatic=True):
-    #Chem.Kekulize(mol)
-    conformer = mol.GetConformer()
-    coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in mol.GetAtoms()])
-    bond_dict = {}
-    for b in mol.GetBonds():
-        bond_type = b.GetBondType().__str__()
-        if b.GetIsAromatic() and atomatic:
-            bond_type = 'AROMATIC'
-        bond_symbol = BOND_SYMBOL[bond_type]
-        a_ix_start = b.GetBeginAtomIdx()
-        a_ix_end = b.GetEndAtomIdx()
-        bond_lenth = np.linalg.norm(coords[a_ix_start]-coords[a_ix_end])
-        #string_bond_lenth = '{:.3f}'.format(bond_lenth)
-        
-        symbol_start = b.GetBeginAtom().GetSymbol()
-        symbol_end = b.GetEndAtom().GetSymbol()
-        bond_key = bond_symbol.join(sorted([symbol_start, symbol_end]))
-        if bond_key not in bond_dict:
-            bond_dict[bond_key] = [bond_lenth]
-        else:
-            bond_dict[bond_key].append(bond_lenth)
-    return bond_dict
-
-
-def bond_statistic(inputs, from_file=True, interval=5000, n_p=16, sanitize=True):
-    BOND_DICT = {}
-
-    for idx in tqdm(range(0, len(inputs), interval)):
-        if idx+interval >= len(inputs):
-            raw_files = inputs[idx:]
-        else:
-            raw_files = inputs[idx:idx+interval]
-
-        if from_file:
-            mol_list = []
-            for sdf in raw_files:
-                mol = Chem.MolFromMolFile(sdf[1], sanitize=True)
-                if mol:
-                    mol_list.append(mol)
-        else:
-            mol_list = raw_files
-        pool = Pool(processes=n_p)
-        dict_list = pool.map(compute_bond_length, mol_list)
-        for d in dict_list:
-            for k in d:
-                if k in BOND_DICT:
-                    BOND_DICT[k] += d[k]
-                else:
-                    BOND_DICT[k] = d[k]
-    return BOND_DICT
-
-'''
-import pickle
-
-BOND_DICT = bond_statistic(cs2020.index, from_file=True, interval=5000, n_p=16)
-with open('BondLenthStatistic.pkl','wb') as fw:
-    pickle.dump(BOND_DICT, fw)
-del BOND_DICT
-
-with open('BondLenthStatistic.pkl', 'rb') as fr:
-    BOND_DICT = pickle.load(fr)
-'''
-
-class VizBondDensity(object):
-    
-    def __init__(self, data_bond_dict, gen_bond_dict, linewidth=0.1, color_data="#BC5F6A", color_gen="#19B3B1", 
-                 transparency=0.2, legend_size=25, axis_label_size=30, tick_size=25, is_fill=True, 
-                 figsize=(14, 9)):
-        self.data_bond_dict = data_bond_dict
-        self.gen_bond_dict = gen_bond_dict
-        self.linewidth = linewidth
-        self.color_data = color_data
-        self.color_gen = color_gen
-        self.transparency = transparency
-        self.legend_size = legend_size
-        self.axis_label_size = axis_label_size
-        self.tick_size = tick_size
-        self.is_fill = is_fill
-        self.figsize = figsize
-    
-    def draw(self, bond_type, save_path=None, dpi=300):
-        #bond_type = 'O=S'
-        density = gaussian_kde(np.array(self.data_bond_dict[bond_type]))
-        density.covariance_factor = lambda : .25
-        density._compute_covariance()
-
-        density_gen = gaussian_kde(np.array(self.gen_bond_dict[bond_type]))
-        density_gen.covariance_factor = lambda : .25
-        density_gen._compute_covariance()
-
-        # Create a vector of 200 values going from 0 to 8:
-        xs = np.linspace(1, 2, 200)
-       
-        # Set the figure size
-        plt.figure(figsize=self.figsize)
-        
-        #设置图片的右边框和上边框为不显示
-        ax=plt.gca()  #gca:get current axis得到当前轴
-        ax.spines['right'].set_color('none')
-        ax.spines['top'].set_color('none')
-        
-        # 开始画图
-        plt.plot(xs, density(xs), color=self.color_data, alpha=self.transparency, linewidth=self.linewidth)
-        if self.is_fill:
-            plt.fill_between(xs, density(xs), color=self.color_data, alpha=self.transparency, label='Dataset')
-
-        plt.plot(xs, density_gen(xs), color=self.color_gen, alpha=self.transparency, linewidth=self.linewidth)
-        if self.is_fill:
-            plt.fill_between(xs, density_gen(xs), color=self.color_gen, alpha=self.transparency, label='Generate')
-        plt.legend(prop={'size': self.legend_size})#framealpha=1, handler_map={plt.Line2D: HandlerLine2D(update_func=updateline)},
-                      #frameon=True, fontsize=20)
-        plt.yticks(size=self.tick_size) # fontproperties = 'Times New Roman', 
-        plt.xticks(size=self.tick_size) # fontproperties = 'Times New Roman', 
-        plt.xlabel('Bond Length (Å)', fontdict={'size':self.axis_label_size}) # 'family': Arial
-        plt.ylabel('Density', fontdict={'size':self.axis_label_size})
-        if save_path:
-            plt.savefig(save_path + '/' + '{}.png'.format(bond_type), dpi=dpi)
-        #plt.show()
-        return plt
--- a/pocket_flow/utils/.ipynb_checkpoints/docking-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/docking-checkpoint.py
@@ -1,183 +0,0 @@
-import os
-import subprocess
-import random
-import string
-from easydict import EasyDict
-from rdkit import Chem
-from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
-
-from .reconstruct import reconstruct_from_generated
-
-
-def get_random_id(length=30):
-    letters = string.ascii_lowercase
-    return ''.join(random.choice(letters) for i in range(length)) 
-
-
-def load_pdb(path):
-    with open(path, 'r') as f:
-        return f.read()
-
-
-def parse_qvina_outputs(docked_sdf_path):
-
-    suppl = Chem.SDMolSupplier(docked_sdf_path)
-    results = []
-    for i, mol in enumerate(suppl):
-        if mol is None:
-            continue
-        line = mol.GetProp('REMARK').splitlines()[0].split()[2:]
-        results.append(EasyDict({
-            'rdmol': mol,
-            'mode_id': i,
-            'affinity': float(line[0]),
-            'rmsd_lb': float(line[1]),
-            'rmsd_ub': float(line[2]),
-        }))
-
-    return results
-
-class BaseDockingTask(object):
-
-    def __init__(self, pdb_block, ligand_rdmol):
-        super().__init__()
-        self.pdb_block = pdb_block
-        self.ligand_rdmol = ligand_rdmol
-
-    def run(self):
-        raise NotImplementedError()
-    
-    def get_results(self):
-        raise NotImplementedError()
-
-
-class QVinaDockingTask(BaseDockingTask):
-
-    @classmethod
-    def from_generated_data(cls, data, protein_root='./data/crossdocked', **kwargs):
-        protein_fn = os.path.join(
-            os.path.dirname(data.ligand_filename),
-            os.path.basename(data.ligand_filename)[:10] + '.pdb'
-        )
-        protein_path = os.path.join(protein_root, protein_fn)
-        with open(protein_path, 'r') as f:
-            pdb_block = f.read()
-        ligand_rdmol = reconstruct_from_generated(data)
-        return cls(pdb_block, ligand_rdmol, **kwargs)
-
-    @classmethod
-    def from_original_data(cls, data, ligand_root='./data/crossdocked_pocket10', protein_root='./data/crossdocked', **kwargs):
-        protein_fn = os.path.join(
-            os.path.dirname(data.ligand_filename),
-            os.path.basename(data.ligand_filename)[:10] + '.pdb'
-        )
-        protein_path = os.path.join(protein_root, protein_fn)
-        with open(protein_path, 'r') as f:
-            pdb_block = f.read()
-
-        ligand_path = os.path.join(ligand_root, data.ligand_filename)
-        ligand_rdmol = next(iter(Chem.SDMolSupplier(ligand_path)))
-        return cls(pdb_block, ligand_rdmol, **kwargs)
-
-    def __init__(self, pdb_block, ligand_rdmol, conda_env='adt', tmp_dir='./tmp', use_uff=True, center=None):
-        super().__init__(pdb_block, ligand_rdmol)
-        self.conda_env = conda_env
-        self.tmp_dir = os.path.realpath(tmp_dir)
-        os.makedirs(tmp_dir, exist_ok=True)
-
-        self.task_id = get_random_id()
-        self.receptor_id = self.task_id + '_receptor'
-        self.ligand_id = self.task_id + '_ligand'
-
-        self.receptor_path = os.path.join(self.tmp_dir, self.receptor_id + '.pdb')
-        self.ligand_path = os.path.join(self.tmp_dir, self.ligand_id + '.sdf')
-
-        with open(self.receptor_path, 'w') as f:
-            f.write(pdb_block)
-
-        ligand_rdmol = Chem.AddHs(ligand_rdmol, addCoords=True)
-        if use_uff:
-            UFFOptimizeMolecule(ligand_rdmol)
-        sdf_writer = Chem.SDWriter(self.ligand_path)
-        sdf_writer.write(ligand_rdmol)
-        sdf_writer.close()
-        self.ligand_rdmol = ligand_rdmol
-
-        pos = ligand_rdmol.GetConformer(0).GetPositions()
-        if center is None:
-            self.center = (pos.max(0) + pos.min(0)) / 2
-        else:
-            self.center = center
-
-        self.proc = None
-        self.results = None
-        self.output = None
-        self.docked_sdf_path = None
-
-    def run(self, exhaustiveness=16):
-        commands = """
-eval "$(conda shell.bash hook)"
-conda activate {env}
-cd {tmp}
-# Prepare receptor (PDB->PDBQT)
-prepare_receptor4.py -r {receptor_id}.pdb
-# Prepare ligand
-obabel {ligand_id}.sdf -O {ligand_id}.pdbqt
-qvina2.1 \
-    --receptor {receptor_id}.pdbqt \
-    --ligand {ligand_id}.pdbqt \
-    --center_x {center_x:.4f} \
-    --center_y {center_y:.4f} \
-    --center_z {center_z:.4f} \
-    --size_x 20 --size_y 20 --size_z 20 \
-    --exhaustiveness {exhaust}
-obabel {ligand_id}_out.pdbqt -O{ligand_id}_out.sdf -h
-        """.format(
-            receptor_id = self.receptor_id,
-            ligand_id = self.ligand_id,
-            env = self.conda_env, 
-            tmp = self.tmp_dir, 
-            exhaust = exhaustiveness,
-            center_x = self.center[0],
-            center_y = self.center[1],
-            center_z = self.center[2],
-        )
-
-        self.docked_sdf_path = os.path.join(self.tmp_dir, '%s_out.sdf' % self.ligand_id)
-
-        self.proc = subprocess.Popen(
-            '/bin/bash', 
-            shell=False, 
-            stdin=subprocess.PIPE, 
-            stdout=subprocess.PIPE, 
-            stderr=subprocess.PIPE
-        )
-
-        self.proc.stdin.write(commands.encode('utf-8'))
-        self.proc.stdin.close()
-
-        # return commands
-
-    def run_sync(self):
-        self.run()
-        while self.get_results() is None:
-            pass
-        results = self.get_results()
-        print('Best affinity:', results[0]['affinity'])
-        return results
-
-    def get_results(self):
-        if self.proc is None:   # Not started
-            return None
-        elif self.proc.poll() is None:  # In progress
-            return None
-        else:
-            if self.output is None:
-                self.output = self.proc.stdout.readlines()
-                try:
-                    self.results = parse_qvina_outputs(self.docked_sdf_path)
-                except:
-                    print('[Error] Vina output error: %s' % self.docked_sdf_path)
-                    return []
-            return self.results
-
--- a/pocket_flow/utils/.ipynb_checkpoints/draw-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/draw-checkpoint.py
@@ -1,147 +0,0 @@
-import os
-from rdkit import Chem
-from rdkit.Chem import AllChem, Descriptors
-from rdkit.Chem import  Draw
-from rdkit.Chem.Draw import rdMolDraw2D
-from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
-from .train import verify_dir_exists
-
-
-
-
-def draw_docked_mol_list(mol_list, molsPerRow=5, subImgSize=(300,300)):
-    opts = DrawingOptions()
-    opts.atomLabelFontSize = 30
-    opts.bondLineWidth = 1.5
-    opts.colorBonds = False
-    
-    legends = []
-    for m in mol_list:
-        AllChem.Compute2DCoords(m)
-        docking_score = 'Docking Score: {:.3f}'.format(float(m.GetProp('r_i_docking_score')))
-        mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(m))
-        name = m.GetProp('_Name')
-        legend = name + '\n' + mw + '\n' + docking_score
-        legends.append(legend)
-
-    img = Draw.MolsToGridImage(
-        mol_list,
-        molsPerRow=molsPerRow,
-        subImgSize=subImgSize,
-        legends=legends,
-        returnPNG=False
-    )
-    return img
-'''
-def verify_dir_exists(dirname):
-    if os.path.isdir(os.path.dirname(dirname)) == False:
-    #if os.path.isdir(dirname) == False:
-        os.makedirs(os.path.dirname(dirname))'''
-
-
-def HighlightAtomByWeights(mol,         # rdkit.Molecule的instance
-                           #weights,     # 注意力系数，注意weights的顺序应该与rdkit.Molecule原子idx的顺序一致
-                           save=None, 
-                           size=(300,300), 
-                           colors=['#FFFFFF','#FF0000'],  # 颜色变化，可以定义多个颜色
-                           bondLineWidth=5,   # 化学键显示的宽度
-                           FontSize=3,
-                           fixedBondLength=50,    # 化学键显示的长度
-                           legend=None,
-                           legendFontSize=5,
-                           elemColor=False,
-                           withIsomeric=True):  # 是否按元素给原子着色
-    
-    draw = rdMolDraw2D.MolDraw2DCairo(size[0], size[1])
-    option = rdMolDraw2D.MolDrawOptions()
-    option.bondLineWidth = bondLineWidth
-    option.fixedBondLength = fixedBondLength
-    option.setHighlightColour((0.95,0.7,0.95))
-    option.baseFontSize = FontSize
-    option.legendFontSize = legendFontSize
-    if elemColor == False:
-        option.updateAtomPalette({k: (0, 0, 0) for k in DrawingOptions.elemDict.keys()})
-    draw.SetDrawOptions(option)
-    if withIsomeric:
-        AllChem.Compute2DCoords(mol)
-    else:
-        smi = Chem.MolToSmiles(mol, isomericSmiles=False)
-        mol = Chem.MolFromSmiles(smi)
-    rdMolDraw2D.PrepareAndDrawMolecule(draw, mol, legend=legend)
-    draw.FinishDrawing()
-    if save:
-        if '.png' in save:
-            n = 0
-            path = '/'.join(save.split('/')[0:-1])
-            while os.path.exists(save):
-                n += 1
-                name = save.split('/')[-1].split('.')[0]
-                name = name + '_' + str(n) + '.png'
-                save = path + '/' + name
-            else:
-                draw.WriteDrawingText(save)
-    else:
-        return draw
-
-'''
-supp = list(iter(Chem.SDMolSupplier('dockpose-idscore2500-first50.sdf', removeHs=True, sanitize=1)))
-save_path = './img_docking_results/idscore_filtered/'
-verify_dir_exists(save_path)
-for ix,mol in enumerate(supp[:50]):
-    docking_score = 'Docking Score: {:.3f}'.format(float(mol.GetProp('r_i_docking_score')))
-    mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(mol))
-    legend = mw + '\n' + docking_score
-    name = save_path + '/' + 'No-' + str(ix) + '-' + mol.GetProp('_Name') + '.png'
-    HighlightAtomByWeights(mol, save=name, bondLineWidth=2, FontSize=1, elemColor=True, legend=legend, legendFontSize=100)
-'''
-
-from PIL import Image
-import matplotlib.pyplot as plt
-import numpy as np
-
-def CombineImages(img_file_list, col_num=7, save_dir='./', title='image', img_size=None):
-    
-    num_img = len(img_file_list)
-    num_row = num_img//col_num+1 if num_img%col_num != 0 else num_img//col_num
-    
-    if img_size is None:
-        img_size = Image.open(img_file_list[0]).size
-    #print((img_size[1]*col_num, img_size[0]*num_row))
-    toImage = Image.new('RGB', (img_size[1]*col_num, img_size[0]*num_row), color=(255,255,255))
-    #print(toImage.size)
-    '''for y in range(1, num_row+1):
-        for x in range(1, num_per_row):'''
-    x_cusum = 0
-    y_cusum = 0
-    num_has_paste = 0
-    for img_file in img_file_list:
-        img = Image.open(img_file)
-        #print((x_cusum, y_cusum))
-        toImage.paste(img, (x_cusum, y_cusum))
-        num_has_paste += 1
-        #print(num_has_paste)
-        if num_has_paste%col_num==0:
-            x_cusum = 0
-            y_cusum += img_size[1]
-        else:
-            x_cusum += img_size[0]
-            
-    #toImage.save("merged.png")
-    plt.xticks([])
-    plt.yticks([])
-    plt.axis('off')  # 取消所有边框，坐标轴
-    plt.imshow(toImage)
-    #plt.title(title, y=0.02) # title默认设置在图像上方，通过设置y改变位置
-    #plt.show()
-    verify_dir_exists(save_dir)
-    toImage.save(save_dir+'/'+title+'.png')
-    #plt.savefig(fname=save_dir+'/'+title)
-    plt.clf()    # 清除当前图形。如果不清除，当使用循环大量作图，机器会为plt分配越来越多的内存，速度会逐渐变慢。
-
-
-'''
-import glob
-
-l = sorted([(int(i.split('/')[-1].split('-')[1]), i) for i in glob.glob('img_docking_results/simlarity_filtered/*.png')], key=lambda x:x[0])
-CombineImages([i[1] for i in l])
-'''
--- a/pocket_flow/utils/.ipynb_checkpoints/generate_utils-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/generate_utils-checkpoint.py
@@ -1,482 +0,0 @@
-import torch
-import torch.nn.functional as F
-from rdkit import Chem, Geometry
-from rdkit.Chem import Descriptors
-import numpy as np
-import copy
-import itertools
-
-
-max_valence_dict = {
-                6:torch.LongTensor([4]), 7:torch.LongTensor([3]), 8:torch.LongTensor([2]),
-                9:torch.LongTensor([1]), 15:torch.LongTensor([5]), 16:torch.LongTensor([6]),
-                17:torch.LongTensor([1]), 35:torch.LongTensor([1]), 53:torch.LongTensor([1]),
-                1:torch.LongTensor([1])
-                }
-def add_ligand_atom_to_data(
-    old_data, pos, element, bond_index, bond_type, type_map=[1,6,7,8,9,15,16,17,35,53],
-    max_valence_dict=max_valence_dict):
-    data = old_data.clone()
-    if 'max_atom_valence' not in data.__dict__['_store']:
-        data.max_atom_valence = torch.empty(0, dtype=torch.long)
-    # add position of new atom to context
-    data.ligand_context_pos = torch.cat([
-        data.ligand_context_pos, pos.view(1, 3).to(data.ligand_context_pos)
-    ], dim=0)
-
-    # add feature of new atom to context
-    data.ligand_context_feature_full = torch.cat([
-        data.ligand_context_feature_full,
-        torch.cat([
-            F.one_hot(element.view(1), len(type_map)).to(data.ligand_context_feature_full), # (1, num_elements)
-            torch.tensor([[1, 0, 0]]).to(data.ligand_context_feature_full),  # is_mol_atom, num_neigh (placeholder), valence (placeholder)
-            torch.tensor([[0, 0, 0]]).to(data.ligand_context_feature_full)  # num_of_bonds 1, 2, 3(placeholder)
-        ], dim=1)
-    ], dim=0)
-    data.context_idx = torch.arange(data.context_idx.size(0)+1)
-    # 
-    idx_num_neigh = len(type_map) + 1
-    idx_valence = idx_num_neigh + 1
-    idx_num_of_bonds = idx_valence + 1
-
-    # add type of new atom to context
-    element = torch.LongTensor([type_map[element.item()]])
-    data.ligand_context_element = torch.cat([
-        data.ligand_context_element, element.view(1).to(data.ligand_context_element)
-    ])
-    max_new_atom_valence = max_valence_dict[element.item()].to(data.max_atom_valence)
-    data.max_atom_valence = torch.cat([data.max_atom_valence, max_new_atom_valence])
-    
-    # change the feature of new atom to context according to ligand context
-    if len(bond_type) != 0:
-        bond_index, bond_type = remove_triangle(
-            pos, data.ligand_context_pos, data.ligand_context_bond_index, data.ligand_context_bond_type, 
-                    bond_index, bond_type)
-        bond_index[0, :] = len(data.ligand_context_pos) - 1
-        '''bond_type = check_double_bond(
-            data.ligand_context_bond_index, 
-            data.ligand_context_bond_type, 
-            bond_index, 
-            bond_type
-            )'''
-        bond_type = check_valence_is_2(
-                            bond_index, bond_type,
-                            data.ligand_context_element, data.ligand_context_valence
-                            )
-        bond_vec = data.ligand_context_pos[bond_index[0]] - data.ligand_context_pos[bond_index[1]]
-        bond_lengths = torch.norm(bond_vec, dim=-1, p=2)
-        if (bond_lengths > 3).any():
-            print(bond_lengths)
-        
-        bond_index_all = torch.cat([bond_index, torch.stack([bond_index[1, :], bond_index[0, :]], dim=0)], dim=1)
-        bond_type_all = torch.cat([bond_type, bond_type], dim=0)
-
-        data.ligand_context_bond_index = torch.cat([
-            data.ligand_context_bond_index, bond_index_all.to(data.ligand_context_bond_index)
-        ], dim=1)
-
-        data.ligand_context_bond_type = torch.cat([
-            data.ligand_context_bond_type,
-            bond_type_all
-        ])
-        # modify atom features related to bonds
-        # previous atom
-        data.ligand_context_feature_full[bond_index[1, :], idx_num_neigh] += 1 # num of neigh of previous nodes
-        data.ligand_context_feature_full[bond_index[1, :], idx_valence] += bond_type # valence of previous nodes
-        data.ligand_context_feature_full[bond_index[1, :], idx_num_of_bonds + bond_type - 1] += 1  # num of bonds of 
-        # the new atom
-        data.ligand_context_feature_full[-1, idx_num_neigh] += len(bond_index[1]) # num of neigh of last node
-        data.ligand_context_feature_full[-1, idx_valence] += torch.sum(bond_type) # valence of last node
-        for bond in [1, 2, 3]:
-            data.ligand_context_feature_full[-1, idx_num_of_bonds + bond - 1] += (bond_type == bond).sum()  # num of bonds of last node
-    data.ligand_context_valence = data.ligand_context_feature_full[:,idx_valence]
-    del old_data
-    return data
-
-
-def data2mol(data, raise_error=True, sanitize=True):
-    element = data.ligand_context_element.clone().cpu().tolist()
-    bond_index = data.ligand_context_bond_index.clone().cpu().tolist()
-    bond_type = data.ligand_context_bond_type.clone().cpu().tolist()
-    pos = data.ligand_context_pos.clone().cpu().tolist()
-    n_atoms = len(pos)
-    rd_mol = Chem.RWMol()
-    rd_conf = Chem.Conformer(n_atoms)
-
-    # add atoms and coordinates
-    for i, atom in enumerate(element):
-        rd_atom = Chem.Atom(atom)
-        rd_mol.AddAtom(rd_atom)
-        rd_coords = Geometry.Point3D(*pos[i])
-        rd_conf.SetAtomPosition(i, rd_coords)
-    rd_mol.AddConformer(rd_conf)
-    # add atoms and coordinates
-    # add atoms and coordinates
-    for i, type_this in enumerate(bond_type):
-        node_i, node_j = bond_index[0][i], bond_index[1][i]
-        if node_i < node_j:
-            if type_this == 1:
-                rd_mol.AddBond(node_i, node_j, Chem.BondType.SINGLE)
-            elif type_this == 2:
-                rd_mol.AddBond(node_i, node_j, Chem.BondType.DOUBLE)
-            elif type_this == 3:
-                rd_mol.AddBond(node_i, node_j, Chem.BondType.TRIPLE)
-            elif type_this == 12:
-                rd_mol.AddBond(node_i, node_j, Chem.BondType.AROMATIC)
-            else:
-                raise Exception('unknown bond order {}'.format(type_this))
-    # modify
-    try:
-        rd_mol = modify_submol(rd_mol)
-    except:
-        if raise_error:
-            raise MolReconsError()
-        else:
-            print('MolReconsError')
-    # check valid
-    rd_mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(rd_mol))
-    if rd_mol_check is None:
-        if raise_error:
-            raise MolReconsError()
-        else:
-            print('MolReconsError')
-    rd_mol = rd_mol.GetMol()
-    if 12 in bond_type:  # mol may directlu come from ture mols and contains aromatic bonds
-        Chem.Kekulize(rd_mol, clearAromaticFlags=True)
-    if sanitize:
-        Chem.SanitizeMol(rd_mol, Chem.SANITIZE_ALL^Chem.SANITIZE_KEKULIZE^Chem.SANITIZE_SETAROMATICITY)
-    #rd_mol = modify(rd_mol)
-    return rd_mol
-
-
-def modify_submol(mol):  # modify mols containing C=N(C)O
-    submol = Chem.MolFromSmiles('C=N(C)O', sanitize=False)
-    sub_fragments = mol.GetSubstructMatches(submol)
-    for fragment in sub_fragments:
-        atomic_nums = np.array([mol.GetAtomWithIdx(atom).GetAtomicNum() for atom in fragment])
-        idx_atom_N = fragment[np.where(atomic_nums == 7)[0][0]]
-        idx_atom_O = fragment[np.where(atomic_nums == 8)[0][0]]
-        mol.GetAtomWithIdx(idx_atom_N).SetFormalCharge(1)  # set N to N+
-        mol.GetAtomWithIdx(idx_atom_O).SetFormalCharge(-1)  # set O to O-
-    return mol
-
-
-class MolReconsError(Exception):
-    pass
-
-def add_context(data):
-    data.ligand_context_pos = data.ligand_pos
-    data.ligand_context_element = data.ligand_element
-    data.ligand_context_bond_index = data.ligand_bond_index
-    data.ligand_context_bond_type = data.ligand_bond_type
-    return data
-
-
-def check_valency(mol):
-    try:
-        Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
-        return True
-    except ValueError:
-        return False
-
-
-def check_double_bond(
-    ligand_context_bond_index, ligand_context_bond_type, bond_index_to_add, bond_type_to_add
-    ):
-    bond_type_in_place = [
-        ligand_context_bond_type[ix==ligand_context_bond_index[0]]\
-             for ix in bond_index_to_add[1]
-             ]
-    has_double = torch.BoolTensor([(bt==2).sum()>0 for bt in bond_type_in_place])
-    bond_type_to_add_has_double = bond_type_to_add >= 2
-    mask = torch.stack([has_double,bond_type_to_add_has_double]).all(dim=0)
-    bond_type_to_add = bond_type_to_add - mask.long()
-    return bond_type_to_add
-
-
-def check_valence_is_2(
-    bond_index_to_add, bond_type_to_add,
-    ligand_context_element, ligand_context_valence
-    ):
-    atom_type_to_add = ligand_context_element[-1]
-    new_atom_type_is_C = (atom_type_to_add == 6).view(-1)
-    if new_atom_type_is_C:
-        atom_valence_in_place = ligand_context_valence[bond_index_to_add[1]] >= 2
-        atom_type_in_place_is_C = ligand_context_element[bond_index_to_add[1]] == 6
-        bond_type_to_add_mask = bond_type_to_add >= 2
-
-        mask = torch.stack([atom_valence_in_place, atom_type_in_place_is_C, bond_type_to_add_mask]).all(dim=0)
-        bond_type_to_add = bond_type_to_add - mask.long()
-    return bond_type_to_add
-    
-
-def remove_triangle(pos_to_add, ligand_context_pos, ligand_context_bond_index, ligand_context_bond_type, 
-                    bond_index_to_add, bond_type_to_add):
-    new_j = bond_index_to_add[1]
-    atom_in_place_adjs = [
-        ligand_context_bond_index[1][ligand_context_bond_index[0]==i] for i in new_j
-        ]
-    '''atom_in_place_bond_type = [
-        ligand_context_bond_type[1][ligand_context_bond_index[0]==i] for i in new_j
-    ]'''
-    L = []
-    for j in new_j:
-        l = []
-        for i in atom_in_place_adjs:
-            if j in i:
-                l.append(True)
-            else:
-                l.append(False)
-        L.append(l)
-    adj_mask = torch.LongTensor(L).any(-1)
-    if adj_mask.sum() > 0:
-        dist = torch.norm(pos_to_add.view(-1,3) - ligand_context_pos[new_j], dim=-1, p=2)
-        dist_mask_idx = torch.nonzero(adj_mask).view(-1)
-        max_dist_idx_to_remove = dist_mask_idx[dist[dist_mask_idx].argmax()]
-        mask = (torch.arange(len(bond_type_to_add)) == max_dist_idx_to_remove) == False
-        bond_index_to_add = bond_index_to_add[:, mask]
-        bond_type_to_add = bond_type_to_add[mask]
-    return bond_index_to_add, bond_type_to_add
-
-
-PATTERNS = [
-        #Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
-        Chem.MolFromSmarts('[N]1~&@[N]~&@[C]~&@[C]~&@[C]~&@[C]~&@1'),   # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
-        Chem.MolFromSmarts('[N]1~&@[C]~&@[N]~&@[C]~&@[C]~&@[C]~&@1'),
-        Chem.MolFromSmarts('[N]1~&@[C]~&@[C]~&@[N]~&@[C]~&@[C]~&@1'),
-        Chem.MolFromSmarts('[N]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]~&@1'),
-        Chem.MolFromSmarts('[C]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]~&@1')
-    ]
-PATTERNS_1 = [
-        [Chem.MolFromSmarts('[#6,7,8]-[#7]1~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1'),
-        Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')],
-        [Chem.MolFromSmarts('[#6,7,8]-[#6]1(-[#6,7,8])~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1'),
-        Chem.MolFromSmarts('[C,N,O]-[C]1(-[C,N,O])~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')]
-        #Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C]~&@[C]~&@[N]~&@[C]~&@[C]-1'),
-        #Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]-1'),
-    ]
-MAX_VALENCE = {'C':4, 'N':3}
-def modify(mol, max_double_in_6ring=0):
-    #atoms = mol.GetAtoms()
-    mol_copy = copy.deepcopy(mol)
-    mw = Chem.RWMol(mol)
-
-    p1 = Chem.MolFromSmarts('[#6,7]=[#6]1-[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]-1')
-    p1_ = Chem.MolFromSmarts('[C,N]=[C]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
-    subs = set(list(mw.GetSubstructMatches(p1)) + list(mw.GetSubstructMatches(p1_)))
-    subs_set_1 = [set(s) for s in subs]
-    for sub in subs:
-        comb = itertools.combinations(sub, 2)
-        #b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
-        change_double = False
-        r_b_double = 0
-        b_list = []
-        for ix,c in enumerate(comb):
-            b = mw.GetBondBetweenAtoms(*c)
-            if ix == 0:
-                bt = b.GetBondType().__str__()
-                is_r = b.IsInRingSize(6)
-                b_list.append((c, bt, is_r))
-                continue
-            if b is not None:
-                bt = b.GetBondType().__str__()
-                is_r = b.IsInRing()
-                b_list.append((c, bt, is_r))
-                if is_r is True and bt == 'DOUBLE':
-                    r_b_double += 1
-                    if r_b_double > max_double_in_6ring:
-                        change_double = True
-        if change_double:
-            for ix,b in enumerate(b_list):
-                if ix == 0:
-                    if b[-1] is False:
-                        mw.RemoveBond(*b[0])
-                        mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
-                    else:
-                        continue
-                if b[1] == 'DOUBLE' and b[-1] is False:
-                    mw.RemoveBond(*b[0])
-                    mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
-                    break
-    
-    #p2 = Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
-    for p2 in PATTERNS_1:
-        Chem.GetSSSR(mw)
-        subs2 = set(list(mw.GetSubstructMatches(p2[0])) + list(mw.GetSubstructMatches(p2[1])))
-        for sub in subs2:
-            comb = itertools.combinations(sub, 2)
-            b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
-            for b in b_list:
-                if b[-1].GetBondType().__str__() == 'DOUBLE':
-                    mw.RemoveBond(*b[0])
-                    mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
-
-    Chem.GetSSSR(mw)
-    p3 = Chem.MolFromSmarts('[#8]=[#6]1-[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]-1')
-    p3_ = Chem.MolFromSmarts('[O]=[C]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
-    subs = set(list(mw.GetSubstructMatches(p3)) + list(mw.GetSubstructMatches(p3_)))
-    subs_set_2 = [set(s) for s in subs]
-    for sub in subs:
-        comb = itertools.combinations(sub, 2)
-        b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
-        for b in b_list:
-            if b[-1].GetBondType().__str__() == 'DOUBLE' and b[-1].IsInRing() is True:
-                mw.RemoveBond(*b[0])
-                mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
-
-    p = Chem.MolFromSmarts('[#6,7]1~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1')
-    p_ = Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')
-    Chem.GetSSSR(mw)
-    subs = set(list(mw.GetSubstructMatches(p)) + list(mw.GetSubstructMatches(p_)))
-    subs_set_3 = [set(s) for s in subs]
-    for sub in subs:
-        pass_sub = False
-        if subs_set_2:
-            for s in subs_set_2:
-                if len(s-set(sub)) == 1:
-                    pass_sub = True
-                    break
-        if pass_sub:
-            continue
-
-        bond_list = [(i,sub[0]) if ix+1==len(sub) else (i, sub[ix+1]) for ix,i in enumerate(sub)]
-        if len(bond_list) == 0:
-            continue
-        atoms = [mw.GetAtomWithIdx(i) for i in sub]
-        for a in atoms:
-            if a.GetExplicitValence()==MAX_VALENCE[a.GetSymbol()] and a.GetHybridization().__str__()=='SP3':
-                break
-        else:
-            bond_type = [mw.GetBondBetweenAtoms(*b).GetBondType().__str__() for b in bond_list]
-            if bond_type.count('DOUBLE') > max_double_in_6ring:
-                for b in bond_list:
-                    mw.RemoveBond(*b)
-                    mw.AddBond(*b, Chem.rdchem.BondType.AROMATIC)
-    
-    # get new mol from modified mol
-    conf = mw.GetConformer()
-    rd_mol = Chem.RWMol()
-    rd_conf = Chem.Conformer(mw.GetNumAtoms())
-    for i, atom in enumerate(mw.GetAtoms()):
-        rd_atom = Chem.Atom(atom.GetAtomicNum())
-        rd_mol.AddAtom(rd_atom)
-        rd_coords = conf.GetAtomPosition(i)
-        rd_conf.SetAtomPosition(i, rd_coords)
-    rd_mol.AddConformer(rd_conf)
-    
-    for i, bond in enumerate(mw.GetBonds()):
-        bt = bond.GetBondType()
-        node_i = bond.GetBeginAtomIdx()
-        node_j = bond.GetEndAtomIdx()
-        rd_mol.AddBond(node_i, node_j, bt)
-    out_mol = rd_mol.GetMol()
-
-    # check validility of the new mol
-    mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(out_mol))
-    if mol_check:
-        try:
-            Chem.Kekulize(out_mol)
-            del mol_copy
-            return out_mol
-        except:
-            del mol
-            return mol_copy
-    else:
-        del mol
-        return mol_copy
-
-
-'''
-PATTERNS = [
-        Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
-        #Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')   # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
-    ]
-def modify(mol):
-    #atoms = mol.GetAtoms()
-    mol_copy = copy.deepcopy(mol)
-    mw = Chem.RWMol(mol)
-    p1 = Chem.MolFromSmarts('[#6]=[#6]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
-    subs = mw.GetSubstructMatches(p1)
-    for sub in subs:
-        comb = itertools.combinations(sub, 2)
-        b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
-        for b in b_list:
-            if b[-1].GetBondType().__str__() == 'DOUBLE' and b[-1].IsInRing() is False:
-                mw.RemoveBond(*b[0])
-                mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
-    
-    Chem.GetSSSR(mw)
-    for p in PATTERNS:
-        subs = list(mw.GetSubstructMatches(p))
-        for sub in subs:
-            bond_list = [(i,sub[0]) if ix+1==len(sub) else (i, sub[ix+1]) for ix,i in enumerate(sub)]
-            if len(bond_list) == 0:
-                continue
-            bond_type = [mw.GetBondBetweenAtoms(*b).GetBondType().__str__() for b in bond_list]
-            if bond_type.count('DOUBLE') > 0:
-                for b in bond_list:
-                    mw.RemoveBond(*b)
-                    mw.AddBond(*b, Chem.rdchem.BondType.AROMATIC)
-
-    # get new mol from modified mol
-    conf = mw.GetConformer()
-    rd_mol = Chem.RWMol()
-    rd_conf = Chem.Conformer(mw.GetNumAtoms())
-    for i, atom in enumerate(mw.GetAtoms()):
-        rd_atom = Chem.Atom(atom.GetAtomicNum())
-        rd_mol.AddAtom(rd_atom)
-        rd_coords = conf.GetAtomPosition(i)
-        rd_conf.SetAtomPosition(i, rd_coords)
-    rd_mol.AddConformer(rd_conf)
-    
-    for i, bond in enumerate(mw.GetBonds()):
-        bt = bond.GetBondType()
-        node_i = bond.GetBeginAtomIdx()
-        node_j = bond.GetEndAtomIdx()
-        rd_mol.AddBond(node_i, node_j, bt)
-    out_mol = rd_mol.GetMol()
-
-    # check validility of the new mol
-    mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(out_mol))
-    if mol_check:
-        try:
-            Chem.Kekulize(out_mol)
-            return out_mol
-        except:
-            return mol_copy
-    else:
-        return mol_copy
-'''
-
-def save_sdf(mol_list, save_name='mol_gen.sdf'):
-    writer = Chem.SDWriter(save_name)
-    writer.SetProps(['LOGP', 'MW'])
-    for i, mol in enumerate(mol_list):
-        mw = Descriptors.ExactMolWt(mol)
-        logp = Descriptors.MolLogP(mol)
-        mol.SetProp('MW', '%.2f' %(mw))
-        mol.SetProp('LOGP', '%.2f' %(logp))
-        mol.SetProp('_Name', 'No_%s' %(i))
-        Chem.Kekulize(mol)
-        writer.write(mol)
-    writer.close()
-
-
-def check_alert_structure(mol, alert_smarts):
-    Chem.GetSSSR(mol)
-    pattern = Chem.MolFromSmarts(alert_smarts)
-    subs = mol.GetSubstructMatches(pattern)
-    if len(subs) == 0:
-        return False
-    else:
-        return True
-
-def check_alert_structures(mol, alert_smarts_list):
-    Chem.GetSSSR(mol)
-    patterns = [Chem.MolFromSmarts(sma) for sma in alert_smarts_list]
-    for p in patterns:
-        subs = mol.GetSubstructMatches(p)
-        if len(subs) != 0:
-            return True
-    else:
-        return False
--- a/pocket_flow/utils/.ipynb_checkpoints/load_dataset-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/load_dataset-checkpoint.py
@@ -1,292 +0,0 @@
-import os
-import pickle
-import glob
-import lmdb
-import random
-from rdkit import Chem
-from multiprocessing import Pool
-import numpy as np
-import torch
-from torch.utils.data import Dataset, Subset
-#from torch_geometric.loader import DataLoader
-from tqdm.auto import tqdm
-from .data import ComplexData, torchify_dict
-from .ParseFile import Protein, Ligand, parse_sdf_to_dict
- 
-
-
-class LoadDataset(Dataset):
-
-    def __init__(self, dataset_path, transform=None, map_size=10*(1024*1024*1024)):
-        super().__init__()
-        self.dataset_path = dataset_path
-        self.transform = transform
-        self.map_size = map_size
-        self.db = None
-        self.keys = None
-
-    def _connect_db(self):
-        self.db = lmdb.open(
-            self.dataset_path,
-            map_size=self.map_size,   # 10GB
-            create=False,
-            subdir=False,
-            readonly=True,
-            lock=False,
-            readahead=False,
-            meminit=False,
-        )
-        with self.db.begin() as txn:
-            self.keys = list(txn.cursor().iternext(values=False))
-    
-    def _close_db(self):
-        self.db.close()
-        self.db = None
-        self.keys = None
-    
-    def __len__(self):
-        if self.db is None:
-            self._connect_db()
-        return len(self.keys)
-
-    def __getitem__(self, idx):
-        if self.db is None:
-            self._connect_db()
-        key = self.keys[idx]
-        data = pickle.loads(self.db.begin().get(key))
-        #data.id = idx
-        #assert data.protein_pos.size(0) > 0
-        if self.transform is not None:
-            data = self.transform(data)
-        return data
-    
-    def __connect_db_edit__(self):
-        self.db = lmdb.open(
-            self.dataset_path,
-            map_size=self.map_size,   # 10GB
-            create=False,
-            subdir=False,
-            readonly=False,
-            lock=False,
-            readahead=False,
-            meminit=False,
-        )
-        with self.db.begin() as txn:
-            self.keys = list(txn.cursor().iternext(values=False))
-    
-    def remove(self, idx):
-        if self.db is None:
-            self.__connect_db_edit__()
-        txn = self.db.begin(write=True)
-        txn.delete(self.keys[idx])
-        txn.commit()
-        self._close_db()
-
-    @staticmethod
-    def split(dataset, val_num=None, shuffle=True, random_seed=0):
-        index = list(range(len(dataset)))
-        if shuffle:
-            random.seed(random_seed)
-            random.shuffle(index)
-        index = torch.LongTensor(index)
-        split_dic = {'valid':index[:val_num], 'train':index[val_num:]}
-        subsets = {k:Subset(dataset, indices=v) for k, v in split_dic.items()}
-        train_set, val_set = subsets['train'], subsets['valid']
-        return train_set, val_set
-    
-    @staticmethod
-    def split_by_name(dataset, test_key_set, name2id_path=None):
-        #self.name2id_path = '/'.join(self.dataset_path.split('/')[:-1])
-        #self.dataset_name = self.dataset_path.split('/')[-1].split('.')[0]
-        if name2id_path is None:
-            name2id_path = '/'.join(dataset.dataset_path.split('/')[:-1])
-            dataset_name = dataset.dataset_path.split('/')[-1].split('.')[0]
-            name2id_file= name2id_path+'/'+dataset_name+'_name2id.pt'
-            if os.path.exists(name2id_file):
-                name2id = torch.load(name2id_file)
-            else:
-                name2id = {}
-                for i in tqdm(range(len(dataset)), 'Indexing Dataset'):
-                    try:
-                        data = dataset[i]
-                    except AssertionError as e:
-                        print(i, e)
-                        continue
-                    name = (
-                        '/'.join(data.protein_filename.split('/')[-2:]), 
-                        '/'.join(data.ligand_filename.split('/')[-2:])
-                            )
-                    name2id[name] = i
-                torch.save(name2id, name2id_file)
-        else:
-            name2id = torch.load(name2id_file)
-        
-        train_idx = []
-        test_idx = []
-        for k,v in tqdm(name2id.items(), 'Spliting'):
-            if k in test_key_set:
-                test_idx.append(v)
-            else:
-                train_idx.append(v)
-        split_dict = {
-            'valid':torch.LongTensor(test_idx), 'train':torch.LongTensor(train_idx)
-            }
-        subsets = {k:Subset(dataset, indices=v) for k, v in split_dict.items()}
-        train_set, val_set = subsets['train'], subsets['valid']
-        return train_set, val_set
-        
-#config = load_config('./configs/train.yml')
-#dataset = LoadDataset('../Pocket2Mol-learning/data/crossdocked_pocket10_processed.lmdb')
-#print(len(dataset))
-
-
-class CrossDocked2020(object):
-    def __init__(self, raw_path, index_path, unexpected_sample=[],
-                 atomic_numbers=[6,7,8,9,15,16,17,35,53]):
-        self.raw_path = raw_path
-        self.file_dirname = os.path.dirname(raw_path)
-        self.index_path = index_path
-        self.unexpected_sample = unexpected_sample
-        self.index = self.get_file(index_path, raw_path)
-        self.atomic_numbers = set(atomic_numbers)
-
-    @staticmethod
-    def get_file(index_dirname, index_path):
-        with open(index_dirname, 'rb') as f:
-            index = pickle.load(f)
-        file_list = []
-        for i in index:
-            if i[0] is None:
-                continue
-            else:
-                pdb = os.path.join(index_path, i[0])
-            sdf = os.path.join(index_path, i[1])
-            file_list.append([pdb,sdf])
-        return file_list
-
-    def process(self, raw_file_info):
-        try:
-            pocket_file, ligand_file = raw_file_info
-            lig = Ligand(ligand_file, removeHs=True, sanitize=True)
-            for a in lig.mol.GetAtoms():
-                if a.GetAtomicNum() not in self.atomic_numbers:
-                    return None
-            else:
-                ligand_dict = lig.to_dict()
-            if self.only_backbone:
-                pocket_dict = Protein(pocket_file).get_backbone_dict(removeHs=True)
-            else:
-                pocket_dict = Protein(pocket_file).get_atom_dict()
-            #ligand_dict = parse_sdf_to_dict(ligand_fn)
-            data = ComplexData.from_protein_ligand_dicts(
-                protein_dict=torchify_dict(pocket_dict),
-                ligand_dict=torchify_dict(ligand_dict),
-            )
-            data.protein_filename = '/'.join(pocket_file.split('/')[-2:])
-            data.ligand_filename = '/'.join(ligand_file.split('/')[-2:])
-            return data
-        except:
-            return None
-    
-    def run(self, dataset_name='crossdocked_pocket10_processed.lmdb', lmdb_path=None, 
-            max_ligand_atom=50, only_backbone=False, n_process=16, interval=2000):
-        #file_dirname = os.path.dirname(self.index_path)
-        self.only_backbone = only_backbone
-        if lmdb_path:
-            lmdb_path = os.path.join(lmdb_path, dataset_name.split('/')[-1])
-        else:
-            lmdb_path = dataset_name #os.path.join(self.file_dirname,dataset_name.split('/')[-1])
-        if os.path.exists(lmdb_path):
-            raise FileExistsError(lmdb_path + ' has been existed !')
-        db = lmdb.open(
-                    lmdb_path,
-                    map_size=200*(1024*1024*1024),   # 200GB
-                    create=True,
-                    subdir=False,
-                    readonly=False, # Writable
-                )
-        data_ix_list = []
-        exception_list = []
-        data_ix = 0
-        for idx in tqdm(range(0, len(self.index), interval)):
-            if idx+interval >= len(self.index):
-                raw_files = self.index[idx:]
-            else:
-                raw_files = self.index[idx:idx+interval]
-            val_raw_files = []
-            for items in raw_files:
-                '''if items[0] is None:
-                    continue'''
-                if 'ATOM' not in open(items[0]).read():
-                    continue
-                elif items[1] in self.unexpected_sample:
-                    continue
-                elif Chem.MolFromMolFile(items[1])!=None:
-                    val_raw_files.append(items)
-                else:
-                    exception_list.append(items)
-            torch.multiprocessing.set_sharing_strategy('file_system')
-            pool = Pool(processes=n_process)
-            data_list = pool.map(self.process, val_raw_files)
-            with db.begin(write=True, buffers=True) as txn:
-                for data in data_list:
-                    if data is None: continue
-                    if data.protein_pos.size(0) < 50: continue
-                    if len(data.ligand_nbh_list) > max_ligand_atom: continue
-                    key = str(data_ix).encode()
-                    txn.put(
-                        key = key,
-                        value = pickle.dumps(data)
-                    )
-                    data_ix_list.append(key)
-                    data_ix += 1
-        db.close()
-        data_ix_list = np.array(data_ix_list)
-        np.save(lmdb_path.split('.')[0]+'_Keys', data_ix_list)
-        with open(lmdb_path.split('.')[0]+'_invalid.list','w') as fw:
-            fw.write(str(exception_list))
-'''
-unexpected_sample = [
-    line.split()[-1] for line in open('/export/home/jyy/PocketFlow/data/unexcept_element_sample_new.csv').read().split('\n')
-    ]
-cs2020 = CrossDocked2020(
-    '/export/home/jyy/Pocket2Mol-learning/data/crossdocked_pocket10',
-    '/export/home/jyy/Pocket2Mol-learning/data/crossdocked_pocket10/index.pkl',
-    unexpected_sample=unexpected_sample
-    )
-cs2020.run(
-    dataset_name='crossdocked_pocket10_processed_35Atoms_backbone.lmdb',
-    max_ligand_atom=35,
-    only_backbone=True,
-    lmdb_path=None
-    )
-'''
-
-
-class PDBBind2020(CrossDocked2020):
-    def __init__(self, raw_file_list, unexpected_sample=[], atomic_numbers=[6,7,8,9,15,16,17,35,53]):
-        self.index = self.get_file(raw_file_list)
-        self.unexpected_sample = unexpected_sample
-        self.file_dirname = os.path.split(os.path.split(self.index[0][0])[0])
-        self.atomic_numbers = set(atomic_numbers)
-
-    @staticmethod
-    def get_file(raw_file_list):
-        file_list = []
-        for i in raw_file_list:
-            sdf = glob.glob(i + '/*.sdf')
-            pdb = glob.glob(i + '/*pocket.pdb')
-            if pdb and sdf:
-                file_list.append([pdb[0], sdf[0]])
-        return file_list
-
-
-'''
-refine_set_dir_list = glob.glob('/export/home/jyy/PDBBind2020/refined-set/*/')
-other_pl_dir_list = glob.glob('/export/home/jyy/PDBBind2020/v2020-other-PL/*/')
-
-raw_file_list = refine_set_dir_list + other_pl_dir_list
-pdbbind_2020 = PDBBind2020(raw_file_list, atomic_numbers=[6,7,8,9,15,16,17,35,53])
-pdbbind_2020.run(dataset_name='PDBBind_2020.lmdb', max_ligand_atom=35,
-            n_process=6, interval=2000, only_backbone=False, lmdb_path=None)
-'''
--- a/pocket_flow/utils/.ipynb_checkpoints/metrics-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/metrics-checkpoint.py
@@ -1,316 +0,0 @@
-from rdkit import Chem
-
-
-
-FUSED_QUA_RING_PATTERN = [
-    Chem.MolFromSmarts(i) for i in[
-        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R](~&@[R]~&@1~&@4)~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]2~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]2~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]34~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@4',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@1)~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R@@H](~&@[R@H]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R@H]~&@4~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@2',
-         '[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@[R]~&@[R]~&@3',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@4~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@13~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]2~&@[R]~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@1'
-            ]
-        ]
-
-def has_fused4ring(mol):
-    for pat in FUSED_QUA_RING_PATTERN:
-        if mol.HasSubstructMatch(pat):
-            return True
-    else:
-        return False
-
-
-PATTERNS_1 = [Chem.MolFromSmarts(i) for i in [
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
-                         '[R]12~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-                         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
-                         '[R]12~&@[R]~&@[R]~&@1~&@[R]~&@2',
-                         '[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-                         '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
-                         '[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
-                         '[R]1~&@[R]~&@[R]~&@12~&@[R]~&@[R]~&@2'
-                        ]
-           ]
-
-def judge_unexpected_ring(mol):
-    for pat in PATTERNS_1:
-        subs = mol.GetSubstructMatches(pat)
-        if len(subs) > 0:
-            return True
-    else:
-        return False
-
-
-PATTERNS = [Chem.MolFromSmarts(i) for i in [
-                        '[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-                        '[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-                        '[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1']
-           ]
-
-def judge_fused_ring(mol):
-    for pat in PATTERNS+FUSED_QUA_RING_PATTERN:
-        if mol.HasSubstructMatch(pat):
-            return True
-    else:
-        return False
-
-def substructure(mol_lib):
-    total_num = sum([len(i) for i in mol_lib])
-    ring_size_statis = {
-            'tri_ring':{'num':0},
-            'qua_ring':{'num':0},
-            'fif_ring':{'num':0},
-            'hex_ring':{'num':0},
-            'hep_ring':{'num':0},
-            'oct_ring':{'num':0},
-            'big_ring':{'num':0},
-            'fused_ring':{'num':0},
-            'unexpected_ring':{'num':0},
-            'sssr':{}
-                       }
-    for s in mol_lib:
-        for mol in s:
-            if mol is None: continue
-            mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))
-            sssr = Chem.GetSSSR(mol)
-            if sssr in ring_size_statis['sssr']:
-                ring_size_statis['sssr'][sssr]['num'] += 1
-            else:
-                ring_size_statis['sssr'][sssr] = {}
-                ring_size_statis['sssr'][sssr]['num'] = 1
-            ring = mol.GetRingInfo()
-            has_ring_size_3 = False
-            has_ring_size_4 = False
-            has_ring_size_5 = False
-            has_ring_size_6 = False
-            has_ring_size_7 = False
-            has_ring_size_8 = False
-            has_big_ring = False
-            has_fused_ring = False
-            has_unexpected_ring = False
-            for r in mol.GetRingInfo().AtomRings():
-                if len(r) == 3 and has_ring_size_3 is False:
-                    has_ring_size_3 = True
-                    ring_size_statis['tri_ring']['num'] += 1
-                if len(r) == 4 and has_ring_size_4 is False:
-                    has_ring_size_4 = True
-                    ring_size_statis['qua_ring']['num'] += 1
-                if len(r) == 5 and has_ring_size_5 is False:
-                    has_ring_size_5 = True
-                    ring_size_statis['fif_ring']['num'] += 1
-                if len(r) == 6 and has_ring_size_6 is False:
-                    has_ring_size_6 = True
-                    ring_size_statis['hex_ring']['num'] += 1
-                if len(r) == 7 and has_ring_size_7 is False:
-                    has_ring_size_7 = True
-                    ring_size_statis['hep_ring']['num'] += 1
-                if len(r) == 8 and has_ring_size_8 is False:
-                    has_ring_size_8 = True
-                    ring_size_statis['oct_ring']['num'] += 1
-                if len(r) > 8 and has_big_ring is False:
-                    has_big_ring = True
-                    ring_size_statis['big_ring']['num'] += 1
-            '''for i in range(mol.GetNumAtoms()):
-                if ring.IsAtomInRingOfSize(i,3) and has_ring_size_3 is False:
-                    has_ring_size_3 = True
-                    ring_size_statis['tri_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,4) and has_ring_size_4 is False:
-                    has_ring_size_4 = True
-                    ring_size_statis['qua_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,5) and has_ring_size_5 is False:
-                    has_ring_size_5 = True
-                    ring_size_statis['fif_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,6) and has_ring_size_6 is False:
-                    has_ring_size_6 = True
-                    ring_size_statis['hex_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,7) and has_ring_size_7 is False:
-                    has_ring_size_7 = True
-                    ring_size_statis['hep_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,8) and has_ring_size_8 is False:
-                    has_ring_size_8 = True
-                    ring_size_statis['oct_ring']['num'] += 1'''
-            if judge_fused_ring(mol) and has_fused_ring is False:
-                has_fused_ring = True
-                ring_size_statis['fused_ring']['num'] += 1
-            if judge_unexpected_ring(mol) and has_unexpected_ring is False:
-                ring_size_statis['unexpected_ring']['num'] += 1
-    ring_size_statis['tri_ring']['rate'] = ring_size_statis['tri_ring']['num']/total_num
-    ring_size_statis['qua_ring']['rate'] = ring_size_statis['qua_ring']['num']/total_num
-    ring_size_statis['fif_ring']['rate'] = ring_size_statis['fif_ring']['num']/total_num
-    ring_size_statis['hex_ring']['rate'] = ring_size_statis['hex_ring']['num']/total_num
-    ring_size_statis['hep_ring']['rate'] = ring_size_statis['hep_ring']['num']/total_num
-    ring_size_statis['oct_ring']['rate'] = ring_size_statis['oct_ring']['num']/total_num
-    ring_size_statis['big_ring']['rate'] = ring_size_statis['big_ring']['num']/total_num
-    ring_size_statis['fused_ring']['rate'] = ring_size_statis['fused_ring']['num']/total_num
-    ring_size_statis['unexpected_ring']['rate'] = ring_size_statis['unexpected_ring']['num']/total_num
-    for k in ring_size_statis['sssr']:
-        ring_size_statis['sssr'][k]['rate'] = ring_size_statis['sssr'][k]['num']/total_num
-    return ring_size_statis
-
-
-def smoothing(scalars, weight=0.8):  # Weight between 0 and 1
-    last = scalars[0]  # First value in the plot (first timestep)
-    smoothed = list()
-    for point in scalars:
-        smoothed_val = last * weight + (1 - weight) * point  # Calculate smoothed value
-        smoothed.append(smoothed_val)                        # Save it
-        last = smoothed_val                                  # Anchor the last smoothed value
-    return smoothed
-
-
-
-
-"""PATTERNS = [Chem.MolFromSmarts(i) for i in [
-                        '[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
-                        '[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
-                        '[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
-                        '[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1']
-           ]
-
-def judge_fused_ring(mol):
-    for pat in PATTERNS:
-        subs = mol.GetSubstructMatches(pat)
-        if len(subs) > 0:
-            return True
-    else:
-        return False
-
-def substructure(mol_lib):
-    total_num = sum([len(i) for i in mol_lib])
-    ring_size_statis = {
-            'tri_ring':{'num':0},
-            'qua_ring':{'num':0},
-            'fif_ring':{'num':0},
-            'hex_ring':{'num':0},
-            'hep_ring':{'num':0},
-            'oct_ring':{'num':0},
-            'fused_ring':{'num':0},
-            'sssr':{}
-                       }
-    for s in mol_lib:
-        for mol in s:
-            if mol is None: continue
-            mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))
-            sssr = Chem.GetSSSR(mol)
-            if sssr in ring_size_statis['sssr']:
-                ring_size_statis['sssr'][sssr]['num'] += 1
-            else:
-                ring_size_statis['sssr'][sssr] = {}
-                ring_size_statis['sssr'][sssr]['num'] = 1
-            ring = mol.GetRingInfo()
-            has_ring_size_3 = False
-            has_ring_size_4 = False
-            has_ring_size_5 = False
-            has_ring_size_6 = False
-            has_ring_size_7 = False
-            has_ring_size_8 = False
-            has_fused_ring = False
-            for i in range(mol.GetNumAtoms()):
-                if ring.IsAtomInRingOfSize(i,3) and has_ring_size_3 is False:
-                    has_ring_size_3 = True
-                    ring_size_statis['tri_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,4) and has_ring_size_4 is False:
-                    has_ring_size_4 = True
-                    ring_size_statis['qua_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,5) and has_ring_size_5 is False:
-                    has_ring_size_5 = True
-                    ring_size_statis['fif_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,6) and has_ring_size_6 is False:
-                    has_ring_size_6 = True
-                    ring_size_statis['hex_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,7) and has_ring_size_7 is False:
-                    has_ring_size_7 = True
-                    ring_size_statis['hep_ring']['num'] += 1
-                if ring.IsAtomInRingOfSize(i,8) and has_ring_size_8 is False:
-                    has_ring_size_8 = True
-                    ring_size_statis['oct_ring']['num'] += 1 
-            if judge_fused_ring(mol) and has_fused_ring is False:
-                has_fused_ring = True
-                ring_size_statis['fused_ring']['num'] += 1
-    ring_size_statis['tri_ring']['rate'] = ring_size_statis['tri_ring']['num']/total_num
-    ring_size_statis['qua_ring']['rate'] = ring_size_statis['qua_ring']['num']/total_num
-    ring_size_statis['fif_ring']['rate'] = ring_size_statis['fif_ring']['num']/total_num
-    ring_size_statis['hex_ring']['rate'] = ring_size_statis['hex_ring']['num']/total_num
-    ring_size_statis['hep_ring']['rate'] = ring_size_statis['hep_ring']['num']/total_num
-    ring_size_statis['oct_ring']['rate'] = ring_size_statis['oct_ring']['num']/total_num
-    ring_size_statis['fused_ring']['rate'] = ring_size_statis['fused_ring']['num']/total_num
-    for k in ring_size_statis['sssr']:
-        ring_size_statis['sssr'][k]['rate'] = ring_size_statis['sssr'][k]['num']/total_num
-    return ring_size_statis
-
-
-def smoothing(scalars, weight=0.8):  # Weight between 0 and 1
-    last = scalars[0]  # First value in the plot (first timestep)
-    smoothed = list()
-    for point in scalars:
-        smoothed_val = last * weight + (1 - weight) * point  # Calculate smoothed value
-        smoothed.append(smoothed_val)                        # Save it
-        last = smoothed_val                                  # Anchor the last smoothed value
-    return smoothed"""
--- a/pocket_flow/utils/.ipynb_checkpoints/mol_filter-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/mol_filter-checkpoint.py
@@ -1,439 +0,0 @@
-from rdkit import Chem
-from rdkit.Chem import QED, Descriptors, Crippen, Lipinski
-from multiprocessing import Pool
-from tqdm.auto import tqdm
-from . import sascorer
-
-
-# from https://doi.org/10.1016/j.bmc.2019.115192
-ALERT_STRUCTURES = [
-    'C#CI', 'C[C&D2]=!@N[!O]', 'CC([C,H])=!@N[!O]', 'N#CC(=O)', 'C(=[O,S])[F,Cl,Br,I]', '[#6][CH1]=O', 
-    'COS(=O)(=O)C(F)(F)F', '[C;H1$(C([#6;!$(C=O)])),H0$(C([#6;!$(C=O)])[#6;!$(C=O)])]=[CH1]!@N([#6;!$(C(=O))])[#6;!$(C(=O))]', 
-    '[CX4][Cl,Br,I]', '*=C=*', 'c1(N)ccc(C(=O)NC3(=O))c(c3ccc2)c21', 'N!@C=C-C=[C!r]([OH])', 'NC#N', 
-    '[#6]C(=O)-!@OC(=[O,N])', 'O=*N=[N+]=[N-]', 'N=[N+]([O-])C', 'c1ccccc1[N!r]=[N!r]c2ccccc2', 
-    '[N;R0]=[N;R0]C#N', 'C(Cl)(Cl)(Cl)C([O,S])[$([NX4+]),$([NX3]);!$(*=*)&!$(*:*)]', 'N[C!r](=O)S', 
-    '[Cl]C([C&R0])=N', 'SC(=[!r])S', 'O1CCOCCOCCOCC1', 'O1CCOCCOCCOCCOCC1', 'O1CCOCCOCCOCCOCCOCC1', 
-    'N#CC[OH1]', 'c1c([N;!R](~[O;X1])~[O;X1])cc([N;!R](~[O;X1])~[O;X1])cc1', 
-    'c1c([N;!R](~[O;X1])~[O;X1])c([N;!R](~[O;X1])~[O;X1])ccc1', 
-    'c1c([N;!R](~[O;X1])~[O;X1])ccc([N;!R](~[O;X1])~[O;X1])c1', 'C=[N+]=[N-]', '[N+]#N', 
-    '[C!r]=[C&D2][C!r]=[C&D2][C&D3]=O', 'NC(=S)S', 'S1SC=CC1=S', 'S1C=CSC1=C', 'S[C;!$(C=*)]S', 
-    'c1cccc(NC(=NC(=[N,S,O])NC(=O)3)C3=N2)c12', 'O=C1NC(=O)c2nc3ccccc3nc2N1', 
-    'c1cc(O)cc(OC(=CC(=O)C=C3)C3=C2)c12', 
-    '[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1([$([OX1]),$([NH]);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]([$([OX1]),$(N);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1', 
-    '[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1([$([OX1]),$([NH]);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]([$([OX1]),$(N);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1', 
-    '[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)](O[$(C=O),$(C=N)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)](O[$(C=O),$(C=N)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1', 'C(=O)Onnn', 'c1[n+]([#6])ccn1([#6])', 
-    '[#6,#8,#16]-[CH1]=[NH1]', 'C=N([C,S])[C&v4]', 'N=[C!R]=O', 'N=[C!R]=S', 'C=[C!R]=O', 'P(=S)(S)S', 
-    '[#3,#11,#12,#13,#19,#20,#26,#27,#28,#29,#30]', 
-    '[#6;$([#6]~[#3,#11,#12,#13,#19,#20,#26,#27,#28,#29,#30])]', '[#16]1cc[#16][#6]1=[#16]', 
-    '[C&D1]=C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]', 
-    '[C&D1]#C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]', 
-    '[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*),$([C&X4])]-[C&D2]=!@C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]', 
-    '[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*),$([C&X4])]-[C&D2]#!@C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]', 
-    '[C&X4]-[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]', 
-    '[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]-[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-[C&X4]', 
-    'C1[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-A=A1', 
-    'A1[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-A1', 
-    'c1ccccc1C(=O)C=!@CC(=O)!@*', '[F,I,Cl,Br;!v1;!-]', 'N[F,I,Cl,Br]', '[N+!$(N=O)][O-X1]', 
-    'c1ccc(n[o,s]n2)c2c1[N;!R](~[O;X1])~[O;X1]', 'c1c([N;!R](~[O;X1])~[O;X1])cc(n[o,s]n2)c2c1', 
-    '[#7v5]', '[N!$([N+][O-])]=O', 'C(O)(O)[OH]', '[#8v3]', 'OO', 'c12cccc3c1c4c(cc3)cccc4cc2', 'C=P', 
-    'C=!@CC=!@C', 'c1cccc(cc(cccc2)c2c3)c13', 'c1cccc(c(cccc2)c2cc3)c13', 
-    '*[SX2][SX2][SX2]*', '[#16+1!$([#16]~[O-])]', 'c1ccc[o+]c1', 'O=S(=O)O[C,c]', 'COS(=O)O[C,c]', 
-    'S(=O)(=O)C#N', '[#16][F,Cl,Br,I]', '[SX2H0]!@[N]', 'C1NNC=NN1', '*[CH1](=!@S)', 'SC#N', 
-    'P(=S)(-[S;H1,H0$(S(P)C)])(-[O;H1,H0$(O(P)C)])(-N(C)C)', '*1[O,S]*1', '[#16v3,#16v5]', 
-    'C(=O)N(C(=O))OC(=O)', 'B(c1ccccc1)(c2ccccc2)c3ccccc3', 'P(c1ccccc1)(c2ccccc2)(c3ccccc3)', 
-    '[Si](c1ccccc1)(c2ccccc2)(c3ccccc3)', 'O=C([C!r]=[C!r]C)[!O&!N]', 'O=C([!O&!N])[C!r]=[C&D1!r]', 
-    'N=!@N', 'CC(=O)[C&X2]-!@Oc', 'C=[N!r][N!r]=C', 'S1(CCN4(C(CC41)=O))', 'S1(CC=CN5(C(CC51)=O))', 
-    '[S;X2]1[CH2][CH]2[CH]([CH]1)[NH]C([NH]2)=O', 'C(=N)-!@N-!@C(=N)', '*[S&D2!r][S&D2!r]*', 
-    '*~[N!H0]!@[N!H0]~*', '[C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&v4&D1]', 
-    'Nc1aaa(N!@=N)aa1', 'C[n+1]2ccccc2', '[N+1&X4;H0]', 'CC(=[S,O])-!@[S&D2]*', 'CC(=S)-!@O*', 
-    '[SX2H1]', '[3#1]', 'N1C(=S)SC(=C)C1=O', 'C1([#6](~[!#6]~[#6][!#6][#6]1=,:[!#6])~[!#1;!#6])=[C;!R;H,H2]', 
-    'c1ccc(c(c1)[OH])C=N[#7]', '[#6]C(=[!C;!R])C(=[!C;!R])[#6,$(S(=O)=O)]', 
-    'O=C2C(=!@N[#7])c1c(cccc1)N2', 'N([C;H2,H3])([C;H2,H3])c1oc([#6]=,:[#7][#7;H][#6]=,:[!#6])[cH][cH]1', 
-    'c1(C(=O)[OH])c([NH]N=C)cccc1', 'c1c(c(ccc1)C(cc)=O)[NH2,$([NH1](c)[!$(C=O)])]', 
-    'C(C#N)(C#N)C([$(C#N),$(C=N)])C#N', 'C(C#N)(C#N)C([NH2])=CC#N', 'C(C#N)(C#N)=N[NH]c1ccccc1', 
-    '[NH2]C1=C(C#N)[CH](cc)C(C#N)=C([NH2])S1', '[#7+](cc)=,:[#6][CH]=CN([CX4])[#6]', 
-    'C(C#N)(C#N)=Cc1ccccc1', 'C1(=C)C(N=CS1)=O', 'C1(C(C=C[!C]1)=C)=[!C]', 'C1(=C)C(=O)NNC1=O', 
-    '[#7]C=!@C2C(=O)c1c(cccc1)[!C]2', 'c1(ccccc1)[CH]=!@C3C(=O)c2c(cccc2)S3', 'c12ccccc1C(=O)C(=C)C2=O', 
-    'C=!@C([!#1])-@C(=!@[!C])-@C(=!@C)[!#1]', '[#6]C(=O)[CH]=C([NH][#6])C(=O)O[#6]', 
-    '[#7]~c1[$(nn),$(n[cH]),$(nc[#7;H,H2]),$(c([#7])n),$(c([#7])[cH]),$(c([#7])c[#7;H,H2])][n,cH,$(c[#7;H,H2])]c([#7H,#7H2,$(O[C;H2,H3])])nn1', 
-    'c12c([cH][cH][cH][cH]1)[cH]c(c([cH]2)[OH])C(=O)[NH]N=C', 
-    '[C;H2,H3]N([C;H2,H3])c1[cH][cH]c([CH]=NN[$(C(=O)[CH2]Scn),$(C(=O)[CH2]aan),$(C(=O)cc[OH]),$(cn),$([CH2][C;H,H2][OH])])[cH][cH]1', 
-    '[#6]N1[CH2][CH2]N([CH2][CH2]1)N=[CH]ca', 'C1C(=[!C;!R])C(=O)[!C]*=1', 'C1(C(=O)NC(=O)NC(=O)1)=N', 
-    'c12ccccc1C(=O)[CX4]C2=O', 'c1ccc3c2c(cccc12)[NH][CX4][NH]3', 'n2(c1acccc1)c([CX4])c[cH]c2[CX4]', 
-    'N1C(=S)S[CX4]C1=O', 'c1c(ccc(c1)[NH]S(=O)=O)[OH]', 
-    'c1([CH2,$(cc)])[cH,$(c[CH2,CH3]),$(cC=O)]sc([nH,$(n[CH,CH2,CH3]),$(n(cc)cc)]1)=[N;!R]', 
-    'c1([$([#7][#6](=O)cc),NH2])c(-!@C(=O)N[C;H2,H3])sc(n1[$([CH2][CH]=[CH2]),$(cc)])=S', 
-    'o1c(sc2cc(ccc12)[#7,#8])=[O,S]', 'S=c1cc[!#6]cc1', '[#6][#6](=S)[#6]', 
-    '[NH2]c1sc([!#1])c([!#1])c1C=O', 'c1cccc2c1cc3c(n2)nc4c(c3[#7])cccc4', 
-    '[NH2]c1c([NH]S(=O)=O)[cH]c([NH][C;H2,H3])c([F,Cl,Br,I])[cH]1', 
-    'c13[cH][cH][cH][cH]c1oc2[cH]c([NH][C;H2,H3])c(O[C;H2,H3])[cH]c23', 
-    'c1ccccc1N3C(=O)C([NH]c2ccc(O[C;H2,H3])cc2)=C([F,Cl,Br,I])C3=O', 
-    '[C;H2,H3]Oc1[cH][cH][cH][cH]c1[NH][CH](C=O)S', '[NH2]c1c([OH])cc(S(=O)(=O)[OH])cc1', 
-    'c2([NH2])c([OH])c(C=O)[cH]c1[cH][cH][cH][cH]c12', 'c13c(C(Nc2c1cccc2)([#6])[#6])ssc3=*', 
-    'c1c(ccc([NH2,$([NH][CX4]),$([N]([CX4])[CX4])])c1)[CX4]c2ccc(cc2)[NH2,$([NH][CX4]),$([N]([CX4])[CX4])]', 
-    '[C;H2,H3]N([C;H2,H3])c1[cH][cH]c([CH]=NN=C([#6])cc)[cH][cH]1', 
-    '[cH]1c([cH]nc2c1[cH][cH][cH][cH]2)[CH2]N3c4c([CH2][CH2]3)[cH][cH][cH][cH]4', 
-    '[cH]2c([NH]C(=S)[NH]c1ccccc1)[cH]c(N([C;H2,H3])[C;H2,H3])[cH][cH]2', 
-    'N3C=C(C=O)C(c1ccc(N([C;H2,H3])[C;H2,H3])cc1)[#6]2=,:[#6]3~[#7]~[#6](~[#16])~[#7]~[#6](~[#7])~2', 
-    '[C;H2,H3]N([C;H2,H3])c1[cH][cH]c(o1)[CH]=CC#N', 
-    '[NH2]c2c(N=C1[CH]=[#6]~[#6]~[#6]=C1)[cH][cH][cH][cH]2', 
-    '[NH2]c1[cH][cH]c([cH][cH]1)c2[cH]c(C=O)c([C;H2,H3])o2', 
-    'c1(O[C;H2,H3])[cH,$(c[C;H2,H3])][cH]c(O[C;H2,H3])c([CH2][$([NH]C(=O)[CH2][CH2][C;H2,H3]),$([CH]([C;H2,H3])[NH]C(=S)[N;H,H2])])[cH]1', 
-    'c2(c([NH2])n(c1c(cccc1)C(=O)[OH])nc2C=O)[$(C#N),$(C=S)]', 'c12ncccc1c([NH2])c(C(=O)~[OX1])s2', 
-    'C3(=O)C(=[CH][NH]c1c(C(=O)[OH])cccc1)N=C(c2ccccc2)O3', 'c1([OH])ccc([NH]C(=O)cc)c(c1)C(=O)[OH]', 
-    'c1(oc([#6])[cH][cH]1)C(=O)[NH]c2[cH][cH][cH][cH]c2C(=O)[OH]', 
-    'c1ccccc1C(=O)[NH]c2ccccc2C(=O)[NH][NH]c3nccs3', 'c1ccc2c(cc1)ccc2', 'c1(c(ccccc1)[N;H,H2])=N[#6]', 
-    'c1(ccc(cc1)c2cc(c3c(c(c2)[OH])coc3)=O)S[C;H2,H3]', 'c13ccccc1sc(=NN=c2cccccc2)n3[C;H2,H3]', 
-    'c23c([Br])cc1c(cco1)c2[cH]cc(=O)o3', 
-    'c12c([F,Cl,Br,I])cc([F,Cl,Br,I])cc1[cH]c(C(=O)[NH2])c(=[NH])o2', 
-    'c1(c(nc2c(c1C#N)c(c(cc2)C#N)[NH2])[NH2])C#N', 
-    'n3c([SX2]c1c([NH2])cccc1)c(C#N)c(c2ccccc2)c(C#N)c3[NH2]', 'C1(C#N)(C#N)[CH](C(=O)[#6])[CH]1', 
-    'C=CC(C#N)(C#N)C(C#N)=C[NH2]', 'ccC(=O)[NH]C(=O)C(C#N)=[CH][NH]cc', 'O=S(=O)C(C#N)=N[N;H,H2]', 
-    'c2(c1ccccc1nnc2)C(cc)C#N', 'C1(C([C;H2,H3])=C(C#N)[#6](~[#8])~[#7]~[#6]1~[#8])=[CH]cc', 
-    'O=c1c(cc(nn1)C=O)C#N', 'n2(c1ccccc1)c(=O)c(C#N)cc(C#N)n2', 
-    '[NH2]C1=C(C#N)[C;H,H2](cc)C([C;H2,H3])=C(C=C)O1', 
-    '[NH2,$([OH])]C2=C(C#N)[CH](cc)c1c(n([#6])nc1)O2', '[NH2]C1=C(C#N)[CH](cc)C(C#N)=C(cc)O1', 
-    '[NH2]C2=C(C#N)[CH](cc)c1c(ccs1)O2', '[C;H2,H3][SX2]C1=C(C#N)[CH](cc)C(C#N)C(=O)N1', 
-    '[NH2]C2=C(C#N)[CH](c1cccs1)C(C(=O)O[#6])=C([C;H2,H3])O2', 
-    '[SX2]1C=C(C#N)C([#6])(C=O)C([$(C=O),$(C#N)])=C1[NH2]', '[NH2]C1=C(C#N)[CH](cc)S[CX4]S1', 
-    '[NH,$([N][CX4])]1[#6]:,=[#6]([#6](=O)ccc)C([#6])C([$(C=O),$(C#N)])=C1[CH3]', 
-    'N2=Nc1n[!#6]nc1N=Ncc2', '[CH]2([OH])c1n[!#6]nc1[CH]([OH])C=C2', 
-    'c1ccccc1N([C;H2,H3])[CH]=[CH]C=!@[CH][CH]=CC=@Nc2ccccc2', 'C2(c1c(cc(c(n1)[NH2])C#N)C(C=2)=C)C#N', 
-    'C(C#N)(C#N)=C(S)S', '[OH]C(=O)c1c([OH])[cH]c([cH][cH]1)c2[cH][cH]c(o2)[CH]=C(C#N)c3nccn3', 
-    'n2([#6])[cH][cH][cH]c2[CH]=C(C#N)c1nccs1', 
-    'C3(=O)C(=[CH][$(c1ccccc1),$(c2ccc[!#6]2)])N=C(aaa)[S,$([N]aa)]3', 'N1=CC(C(N1)=S)=C', 
-    'c1(ccco1)[CH]=!@C3C(=O)c2c(cccc2)[!C]3', 'S=c1[nH]ccc2c1C(=O)OC2=[C;H,H2]', 
-    '[#6]1[O,S]C(C([#6]=,:1)=O)=CC=O', 'C2(=O)C(=[CH]c1cc([F,Cl,Br,I])ccc1O[C;H2,H3])N=C(S[C;H2,H3])S2', 
-    '[cH]2[cH][cH][cH]c1[CH2]C(=O)C(=C([C;H2,H3])[C;H2,H3])c12', 'C1C(OC(C=1)(O)[#6])O', 
-    'c12[cH]c(O[C;H2,H3])c(O[C;H2,H3])[cH]c1C([#6])=C([#6])S[CH2]2', 
-    'c2(O[C;H2,H3])c(O[C;H2,H3])[cH]c1C=C[CH]Sc1[cH]2', 'C1(=O)C(C(C#N)=[CH][#7])C([#7])C=C1', 
-    'c23ccc1ccccc1c2[C;H,H2][CX4]NC3=[CH]C(=O)N([C;H2,H3])[C;H2,H3]', 'O=CC1=C(SC(=[CH][#6])S1)C=O', 
-    'C1(C(=O)[CH2]C[CH2]C(=O)1)=C([N;H,H2])C=O', 'C#CC(=O)C#C', 
-    'aa[CH,$(CC#N)]=C1[#6]:,=[#6]C(=[O,$([N;!R])])[#6]:,=[#6]1', 
-    'S1C(=Ncc)[NH,NH2,$(N[CH2][CH2]O),$(Ncc)]C(=O)C1=[CH][$(ccc[Cl]),$(c[!#6])]', 
-    'S1C(=O)NC(=S)C1=[CH]cc', 'n2ccc([CH]=C1C(=O)NC(=[!C])N1)c2[C;H2,H3]', 
-    '[OH]C(=O)c1cccc(c1)cac[CH]=C2C(=[!C])NC(=[!C])[!C]2', 'n12c(nc(=O)c([C;H2,H3])n1)sc(=[CH]cc)c2=O', 
-    'N2([C;H2,H3])C(=S)[NH]C(=[CH]c1cc(Br)ccc1)C2=O', 
-    '[C;H2,H3]N2C(=[S,N])[!C]C(=C1[CH]=[CH]ccN1[C;H2,H3])C2=O', 
-    'c1ccccc1C(=O)[CH]=c3c(=O)[nH]c(=O)c(=[CH]c2ccccc2)[nH]3', 'O=C1cc[CH2]NC1=[C;H,H2]', 
-    'c1(oc([C;H2,H3])[cH][cH]1)[CH]([OH])C#C[CX4]', 'ccn2nc1[CH2][S;X2][CH2]c1c2[NH]C(=O)[CH]=[C;H,H2]', 
-    '[cH]1[cH]n([C;H2,H3])c3c1c2[cH][cH]n(c2c(c3O[C;H2,H3])O[C;H2,H3])[C;H2,H3]', 
-    'C2(=O)C(=C([C;H2,H3])[NH][CH2][CH2][C;H2,H3])N=C(c1ccccc1)O2', 
-    'n4(c1ccccc1)[cH][n+](c2ccccc2)c(=Nc3ccccc3)n4', 'n1nc(c2-c(c1C)ccc2)C', 
-    's1c2c(c([C;H2,H3])c1[C;H2,H3])c(ncn2)NN=Cc3ccco3', 'C2([#6])[CH2]C(=O)n1nc([NH2])c([NH2])c1N=2', 
-    'c2(c1n(c(ccn1)=O)nc2c3nccc3)C#N', 'n2nnnc1cccc-12', 
-    'c2([CH2][C;H2,H3])c([C;H2,H3])c1c(=[X1])n([C;H,H2][$(C(=O)O),$(cc)])[cH,$(cS[C;H2,H3])]nc1[a;X2]2', 
-    'c1ccc2c(c1)nc3c(n2)ccc4c3cccc4', 'n1c3c(c(=N)c2ccccc12)cccc3', 
-    'c23cc1ccccc1cc2n([C;H2,H3])c(=O)c(cc[NH][C;H2,H3])n3', 
-    'c12acccc1nc([CH]=C([OH])[#6])c([CH]=C([OH])[#6])n2', 
-    'c1ccc2c(c1)nc(c(n2)[CH2]C(=O)cc)[CH2]C(=O)cc', 'c1ccc2c(c1)nc(c(n2)c3ccccc3)c4c(cccc4)[OH]', 
-    '[C;H2,H3]Oc1[cH][cH][cH][cH]c1[NH]c3c2c(O[C;H2,H3])ccc(O[C;H2,H3])c2ncc3', 
-    'N=c1[nH]c([N;H,H2,H3])c([N;H,H2])nn1', '[NH](c1[cH][cH][cH][cH]c1[OH])c2nnc(=N)oc2[NH2]', 
-    's1[cH][cH][cH]c1C4[NH]N=C(c2ccncc2)c3c(cccc3)N=4', 
-    'C([F])([F])C(=O)[NH]c1[cH]n([CH2][CH2]O[CH2]cc)n[cH]1', 'c12cccca1c([!#1])a[n+](~cc)a2', 
-    '[#6]13~[#7](cc)~[#6]~[#6]~[#6]~[#6]~1~[#6]2~[#7]~[#6]~[#6]~[#6]~[#7+]~2~[#7]~3', 
-    'C1(C=O)(cc)[SX2]C=N[NH]1', 'S=c2[nH]nc(c1[cH][cH]c(O[C;H2,H3])[cH][cH]1)o2', 'N=C1SC(=N)N=C1', 
-    'N1([C;H2,H3])C(=S)N(cc)C(=Ncc)C1=Ncc', 'S1C(=N[NH])SC(=Ncc)C1=Ncc', 
-    'c1ccccc1C4=Nn3c2ccccc2[n+]c3S[CX4]4', 'n2c1ccccc1n([C;H2,H3])c2S[CH2]C(=O)[NH]N=[CH][CH]=[C;H,H2]', 
-    'c12ccccc1[CH2][CH2]N=C2[SX2][CH2]C(=O)c3ccccc3', 
-    'c1ccc3c(c1)Sc2[cH][cH][cH,$(cO),$(c[SX2]),$(c[CX4]),$(c[NH2,$([NH][CX4]),$([N]([CX4])[CX4])])]([cH]c2C(C3)[NH2,$([NH][CX4]),$([N]([CX4])[CX4])])', 
-    '[C;H2,H3][SX2]c2nc1OC=Nccc1nn2', '[C;H2,H3][SX2]c3nc(c1o[cH][cH][cH]1)c(c2o[cH][cH][cH]2)nn3', 
-    '[C;H,H2,H3]C3=NN(c1ccccc1)S2[!C]*=CC=23', 'N=c1ncns1', 'cc[NH]C(=O)c1nnsc1[NH]cc', 'n2sc1CNccc1c2=S',
-    'C1(SC(=[CH]cc)N(cc)N=1)C=O', 'c1([OH])ccc([OH])c(c1)C(=!@C[#7])C=O', 
-    'c14[cH][cH]c(O[C;H2,H3])[cH]c1C(=N[NH]c2[cH][cH]c(C(=O)[OH])[cH][cH]2)c3[cH]c(O[C;H2,H3])[cH][cH]c34', 
-    '[OH]C(=O)c1ccc(cc1)NN=[CH]c2ac([cH][cH]2)c3ccccc3', 
-    'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)C(=O)[NH]N=[CH,$(C[C;H2,H3])]c3cc(***c2occc2)ccc3', 
-    'c1cccc2c1cc4c(c2C=N[NH]c3ccccc3)cccc4', 
-    'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]=N[NH]c2nccs2', 
-    'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]=N[NH]c2ccncc2', 
-    'c1ccccc1N(c2ccccc2)N=[CH]c3ac([cH][cH]3)c4cc(C(=O)[OH])ccc4', 
-    '[OH]C(=O)c1cc(ccc1)cacC=N[NH]C(=O)[CH2]O', 
-    'C(c1[cH]c[cH,$(c[CX4])][cH][cH]1)(c2[cH][cH][cH,$(c[Cl])][cH][cH]2)=[$(NO[CH2][CH2][CH2]N([C;H2,H3])[C;H2,H3]),$(NO[CH2][CH2]N([C;H2,H3])[C;H2,H3]),$(N[NH]C(=[NH])[NH2]),$([CH][#7])]', 
-    'c12[cH][cH][cH][cH]c1[!#6][cH,$(c[OH]),$(c[C;H2,H3])]c2[CH]=N[NH][$(c3nc[cH]s3),$(c[cH][cH]),$(cncncn),$(cnnnn)]', 
-    'c1(s[cH][cH][cH,$(c[C;H2,H3])]1)[CH]=N[NH]c2ccccc2', 
-    '[CH]4(n1[cH]n[cH][cH]1)c2[cH]c([Br])[cH][cH]c2[CH2][CH2]c3[cH][cH][cH][cH]c34', 
-    'n3c(c1ccccc1)c(c2ccccc2)n(N=!@C)c3[NH2]', 'cc[CH]=[CH][CH]=NN([CX4])[CX4]', 
-    'C1(C=Nc2c(N1)cccc2)=[CH]C=O', 'c1cC(=O)C=C1N=[CH]N([CX4])[CX4]', 'c1ccc2c(c1)C(C=N2)=[N;!R]', 
-    'cc[CH]=[CH][CH]=NN=C', 'N([C;H2,H3])([C;H2,H3])[CH]=NC([C;H2,H3])=NN([C;H2,H3])cc', 
-    'c12c([cH][cH][cH][cH]1)c(c([cH][cH]2)C(=Ncc)[C;H2,H3])[OH]', 
-    '[NH](c1ccccc1)N=C(C(=O)[C;H2,H3])[NH][NH,NH2,NH3,$(cc)]', '[N;!R]=C2C(=O)c1c(cccc1)S2', 
-    'cc[N;!R]=C2C(=[!C])c1c(cccc1)N2', 'C1(=[!C])C(N=CS1)=O', 'C=[N;!R]c1c([OH])cccc1', 
-    '[CX4]1C(=O)NNC1=O', 'O=CC=[CH][OH]', '[C;H2,H3]C([OH])=C(C(=O)[C;H2,H3])[CH]C#C', 
-    'cc[NH]N=C([C;H2,H3])[CH2]C([C;H2,H3])=N[NH]cc', 'c1cccc2c1C(c3c2nacc3)=O', 
-    'c1cccc2c1C(C3C2=N*=CC3)=O', 'c1(cc3c(c2ccccc12)c4c(C3=O)cccc4)[OH]', 
-    'c1c(cccc1)C(=O)[NH]N=C3c2c(cccc2)c4c3cccc4', 'c12c(O[CH2]N(ccO[C;H2,H3])[CH2]1)[cH]cc[cH]2', 
-    'c1([NH]C(=O)[CH2][CH2]cc)[cH][cH]c([C;H2,H3])c([NH]C(=O)[CH2][CH2]cc)[cH]1', 
-    '[N;H,H2](c1[cH][cH]c(O[CH3])c(O[C;H,H2,H3])[cH]1)C(=O)[NH][CH2][CH2][CH2]N([CH3])cc', 
-    'c1c3c(cc2c1cccc2)nc(n3)COC(=O)c4cc(cc(c4)[NH2])[NH2]', 
-    'n2(c1[cH][cH][cH][cH]c1C(=O)[NH][CH]([C;H2,H3])[CH2]Occ)[cH][cH][cH][cH]2', 
-    'C1(cc)[CH2][CH](C(=O)[#6])[CH](C(=O)[OH])[CH2]C(cc)=1', 'C(cc)(cc)(cc)SccC(=O)[OH]', 
-    'c1(C([CX4])([CX4])[NH]C(=O)N([CH2][C;H2,H3])[CH2][CH2][C;H,H2][CH2]cc)[cH][cH][cH]c(C(=[CH2])[C;H2,H3])[cH]1', 
-    'c1cc3c2c(c1)cccc2C(=CC3=O)O[C;H2,H3]', 'c13ccc(c2c1c(ccc2)C(C=C3C([F])([F])[F])=O)[#7]', 
-    'c1(ccc3c2c(cccc12)C(C=C3)=N)[#7]', 'C(c1ccc([OH])cc1)(c2ccc([OH])cc2)OS(=O)=O', 
-    'n12cccc2C=N([#6])CC1', 'n2([C;H2,H3])c1c(ccC(=O)1)cc2[C;H2,H3]', 
-    'c1(c2c(c(n1C(O)=O)[C;H2,H3])S[CH2]S2)[C;H2,H3]', 'n2(c1ccccc1)[cH][cH][cH]c2C=N[OH]', 
-    'c1ccc2c(c1)c4c3c(C2=O)cccc3no4', 'O=C2c1ccccc1c3c([OH])c(=O)nc4cccc2c34', 
-    'C1([#6]:,=[#6][#6]:,=[#6]C1=[!C])=[!C]', 'N2(c1ccccc1)C(=NC=O)S[CH2]C2=O', 
-    'N4(c1ccccc1)C(=O)S[CH]([NH]c3c2ccccc2ccc3)C4=O', 'N2(c1ccccc1)C(=O)S[CH2]C2=S', 
-    'N3(C(=O)c1ccccc1)C(=Nc2ccccc2)S[CH2]C3=O', 'O=C4CCC3C2C(=O)CC1CCCC1C2CCC3=C4', 
-    'C2Cc1ccccc1C(c3c2cccc3)=C[#6]', 'c1ccc3c(c1)[SX2,CX4]c2cc[cH,$(c[Cl]),$(c[CX4])]cc2C3=C[#6]', 
-    'c1ccc2c(c1)C(c3c(SC2)scc3)=C', 'c1ccc2c(c1)C(c3c2ccc(c3)[NH2])=[CH][#6]', 
-    '[CH3]c2nc([NH]S(c1[cH][cH]c([cH][cH]1)O[CH2][CH2][C;H2,H3])(=O)=O)[cH][cH][cH]2', 
-    'c1([cH][cH]c([cH][cH]1)[NH2])S(=O)(=O)[NH]c2[cH][cH][cH]nn2', 
-    '[CH3]C([CH3])([CH3])c1[cH]c(C([CH3])([CH3])[CH3])c(O[C;H,H2]N)c[cH]1', 
-    'a1aaa(aa1)[CH]=[CH]C([NH][NH]c2n(nnn2)[#6])=O', 'c1([F,Cl,Br,I,$([n+](c)c)])c(-!@C=N)sc(n1)=O', 
-    'c1(S[C;R])c([CH]([#6])[#6])sc([nH,$(n[C;H2,H3])]1)=O', 
-    's2c([NH]c1c([CH2,CH3,$(cc)])cccc1)[n+]([C;H2,H3])c([#6])[cH]2', 'c1nc(sc1)NNS(=O)=O', 
-    'c1c(cc(C(=O)[OH])cc1)[NH]c2nc([cH]s2)c3ccc([CH]([C;H,H2,H3])[C;H,H2,H3])cc3', 
-    '[C;H2,H3][NH]C=N[NH]c1nc(cc)[cH]s1', 'O=S(=O)(cc)[NH][NH]c1nc(cs1)cc', 
-    'n1c3c(cc2c1nc(s2)[#7])sc(n3)[#7]', 's2ccc(c1csc([NH2])n1)[cH]2', 
-    'c2([NH]cc[C;H2,H3])nc(c1ccncc1)[cH]s2', 
-    '[C;H2,H3]Oc1[cH][cH]c(O[C;H2,H3])[cH]c1[NH]c3scc(c2ccc(O[C;H2,H3])cc2)n3', '[#6][CH](=S)', 
-    '[cH]1coc([cH]1)C(=S)N2[CH2][CH2]*[CH2][CH2]2', '[CX4][NH]C(cc)=[CH]C(=S)[NH]c1ccccc1', 
-    '[CH](c1ccccc1)(c2ccccc2)C(=S)[N;H,H2]', 'c(N([C;H,H2,H3])[C;H,H2,H3])c[NH]C(=S)[C;H,H2,H3]', 
-    'c1cccnc1C(=S)[NH]c2ccccc2O[C;H2,H3]', 'acC(=S)[NH][NH]ca', '[C;H2,H3]SC(=S)[NH][CH2]cc', 
-    'C1[CX4]SC(NC=1)=S', 'O=c2sc1cccc(O[C;H2,H3])c1o2', '[NH]1C(=S)[CH](C#N)[CH](cc)[CH]=C1cc', 
-    'S=CC([C;H2,H3])=C([C;H2,H3])N([C;H2,H3])[C;H2,H3]', 'C1=CN(C(c2ccccc12)(C#N)C(=S)S)C=O', 
-    'c1c(sc(cc1)=S)[#7]', 'ccC(=[SX1])[SX2][C;H,H2][CH2,CH3,$(cc)]', 'n1c(=O)[cH]c([#6])sc1=S', 
-    'c1ccccc1N3C(=O)C(Sc2ccccc2)=[CH]C(=O)3', '[CX4][N+]([CX4][OH])=CS[C;H,H2,H3]', 'S=Cc2n1ccccc1cc2', 
-    'c2(nanc(c1o[cH][cH][cH]1)n2)S[CX4]', 'C(=O)(N1CCSCC1)c2c([cH][cH][cH][cH]2)S[C;H2,H3]', 
-    'c2c([NH]C(=S)[NH][CH2][CH2][CH2]N([C;H2,H3])c1ccccc1)cccc2', 
-    'c2c([NH]C(=S)[NH][CH2][CH2]N([C;H2,H3])c1ccccc1)cccc2', 
-    'c1(ccccc1)[NH]C(=S)N[NH]C(=O)c2a[!#6]cc2', 'c1(ccccc1)[NH]C(=S)N[NH]c2ccccc2', 
-    'C1[NH][NH]C(=S)N[NH]1', 'c1(ccccc1)[NH]C(=S)N[NH][#6](=,:[#7;R])[#7;R]', 
-    'c1(ccccc1)[NH]C(=S)[NH]N=Cc2ccnc2', 'c1(ccoc1[C;H,H2,H3])C=N[NH]C(=S)[N;H,H2]', 
-    'c2(=S)n1ccnnc1n[nH]2', '[CX4][SX2]C(=N-aaaa)[NH]N=C', 
-    'ccN([C;H2,H3])[CH2][CH2][CH2][NH]C(=S)[NH]c1c([F,Cl,Br,I])[cH]c([C;H2,H3])[cH][cH]1', 
-    '[cH]3c([NH]C(=S)[NH][C;H,H2]c1oc([C;H,H2,H3])[cH][cH]1)[cH]c2c(O[CH2]O2)[cH]3', 
-    'O=C-!@n2ccc1c2[nH]c(=S)[nH]1', 
-    'c12c([cH][cH][cH][cH]1)c([cH][cH][cH]2)C([C;H2,H3])=N[NH]C(=S)[NH]ccc', 
-    'c1(ccccc1)[#7;H][#6](=S)[#7][#7;H][#6;H]=,:[#6;H][#6]=O', 
-    '[C;H2,H3]N([C;H2,H3])[CH]=CC(=O)c1c([SX2])sc([$(C#N),$(C=O)])c1', 
-    's1[cH][cH]c(O[C;H2,H3])c1C(=O)[NH][N;H,H2]', 
-    'c1(s[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]C(=O)[NH]c2nccs2', 
-    'c1csc(c1[NH2])[CH]=[CH]c2cccs2', '[NH2]c3sc([NH]C(=O)c1ccccc1)c(C#N)c3c2aaaaa2'
-    ]  # '([#7+1!r].[#7+1!r].[#7+1!r])'
-
-'''l = []
-lines = open('/export/home/jyy/gbp_pocket_flow_with_edge/pocket_flow/utils/alert_struct.csv').read().split('\n')
-for line in lines[2:]:
-    if ',"' in line:
-        l.append(line.split(',"')[-1].strip('"'))
-    else:
-        l.append(line.split(',')[-1])'''
-
-HAT1_REF_MOL = [
-    'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=NC=CO4)C=C3)=O)=C2)CCC1C5=CC=CC=C5',
-    'CN1C2=C(C=CC(S(=O)(NC3=CC=C(Br)C=C3)=O)=C2)CCC1C4=CC=CC=C4',
-    'CN1C2=C(C=CC(S(=O)(NC3=C(C=CC=C4)C4=C(Br)C=N3)=O)=C2)CCC1C5=CC=CC=C5',
-    'O=S(C1=CC2=C(C=C1)C=CC(C3=CC=CC=C3)=N2)(NC4=CC=C(Br)C=C4)=O',
-    'O=S(C1=CC2=C(C(F)=C1)C=CC(C3=CC=CC=C3)=N2)(NC4=CC=C(Br)C=C4)=O',
-    'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=COC=N4)C=C3)=O)=C2)CCC1C5=CC=C(Cl)C=C5',
-    'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=CSC=N4)C=C3)=O)=C2)CCC1C5=CC=C(Cl)C=C5',
-    'BrC1=C(O)C=CC(/C=C2CCC/C(C\2=O)=C\C3=CC=C(O)C(Br)=C3)=C1',
-    'O=S(C1=CC2=C(NC(C3C2C=CC3)C4=C(C=CC=C4)F)C=C1)(NC5=CC=C(C(O)=O)C=C5)=O',
-    'BrC1=C(O)C=CC(/C=C2CC(CC)C/C(C\2=O)=C\C3=CC=C(O)C(Br)=C3)=C1'
-    'O=C1/C(CC/C1=C\C2=CC(Br)=C(O)C=C2)=C/C3=CC=C(O)C(Br)=C3',
-    'O=C(/C=C/C1=CC(Br)=C(O)C=C1)/C=C/C2=CC=C(O)C(Br)=C2'
-            ]
-
-def check_alert_struct(mol):
-    Chem.GetSSSR(mol)
-    for alert_smarts in ALERT_STRUCTURES:
-        p = Chem.MolFromSmarts(alert_smarts)
-        if mol.HasSubstructMatch(p):
-            return False
-    else:
-        return True
-
-
-def check_lipinski(mol):
-    mw = Descriptors.MolWt(mol)
-    logp = Crippen.MolLogP(mol)
-    #tpsa = Descriptors.TPSA(mol) # tpsa<=140
-    hbd = Lipinski.NumHDonors(mol)
-    hba = Lipinski.NumHAcceptors(mol)
-    #rb = Descriptors.NumRotatableBonds(mol)
-    if sum([mw<=700, logp<=5, hba<=10, hbd<=5]) != 4:
-        return False
-    else:
-        return True
-
-
-class Filter(object):
-    def __init__(self, sa_threshhold=5.0, qed_threshhold=0.3, mw_threshhold=300):
-        self.sa_threshhold = sa_threshhold
-        self.qed_threshhold = qed_threshhold
-        self.mw_threshhold = mw_threshhold
-    
-    def _filter(self, mol):
-        try:
-            qed_check =  QED.qed(mol) > self.qed_threshhold
-            sa_check = sascorer.calculateScore(mol) < self.sa_threshhold
-            mw_check = Descriptors.MolWt(mol) > self.mw_threshhold
-            lip_check = check_lipinski(mol)
-            if qed_check and sa_check and mw_check and lip_check:
-                struct_check = check_alert_struct(mol)
-                if struct_check:
-                    return (Chem.MolToSmiles(mol), mol)
-                else:
-                    return False
-            else:
-                return False
-        except:
-            return False
-    
-    def run(self, mol_list, save_name='Filter_Result.sdf', mol_name=None):
-        num_mol = 0
-        smi_dict = {}
-        #filter(lambda x:x!=False, filter_list)
-        for ix, m in tqdm(enumerate(mol_list)):
-            m_ = Chem.MolFromSmiles(Chem.MolToSmiles(m))
-            out = self._filter(m_)
-            if out != False and out[0] not in smi_dict:
-                smi_dict.setdefault(out[0])
-                with open(save_name, 'a') as sdf_writer:
-                    #mol = out[1]
-                    if mol_name:
-                        m.SetProp('_Name', mol_name)
-                    mol_block = Chem.MolToMolBlock(m)
-                    sdf_writer.write(mol_block + '\n$$$$\n')
-                    num_mol += 1
-    
-    def run_file_list(self, sdf_list, save_name='Filter_Result.sdf'):
-        num_mol = 0
-        smi_dict = {}
-        for sdf in sdf_list:
-            supp = Chem.SDMolSupplier(sdf)
-            for mol in supp:
-                if mol is None: continue
-                mol_ = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
-                out = self._filter(mol_)
-                if out != False and out[0] not in smi_dict:
-                    mol_name = 'No_{}-{}'.format(num_mol, sdf)
-                    smi_dict.setdefault(out[0])
-                    with open(save_name, 'a') as sdf_writer:
-                        #mol = out[1]
-                        mol.SetProp('_Name', mol_name)
-                        mol_block = Chem.MolToMolBlock(mol)
-                        sdf_writer.write(mol_block + '\n$$$$\n')
-                        num_mol += 1
-        return num_mol
-    
-    def np_run(self, mol_list, save_name='Filter_Result.sdf', n_process=10):
-        smi_dict = {}
-        pool = Pool(processes=n_process)
-        filter_list = pool.map(self._filter, mol_list)
-        #filter(lambda x:x!=False, filter_list)
-        for ix, i in enumerate(filter_list):
-            if i:
-                if i[0] not in smi_dict:
-                    smi_dict.setdefault(i[0])
-                    with open(save_name, 'a') as sdf_writer:
-                        sdf_writer.write(i[1] + '\n$$$$\n')
-
-                        
-
-from PIL import Image
-import matplotlib.pyplot as plt
-from .train import verify_dir_exists
-from rdkit import Chem
-from rdkit.Chem import AllChem, Descriptors, Draw
-from rdkit.Chem.Draw import rdMolDraw2D
-from rdkit.Chem.Draw.MolDrawing import DrawingOptions
-import os
-
-
-def draw_docked_mol_list(mol_list, molsPerRow=5, subImgSize=(300,300)):
-    opts = DrawingOptions()
-    opts.atomLabelFontSize = 30
-    opts.bondLineWidth = 1.5
-    opts.colorBonds = False
-    
-    legends = []
-    for m in mol_list:
-        AllChem.Compute2DCoords(m)
-        docking_score = 'Docking Score: {:.3f}'.format(float(m.GetProp('r_i_docking_score')))
-        mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(m))
-        legend = mw + '\n' + docking_score
-        legends.append(legend)
-
-    img = Draw.MolsToGridImage(
-        mol_list,
-        molsPerRow=molsPerRow,
-        subImgSize=subImgSize,
-        legends=legends
-    )
-    return img
-
-
-def DrawDockedMol(mol,         # rdkit.Molecule的instance
-            save=None, 
-            size=(300,300), 
-            bondLineWidth=5,   # 化学键显示的宽度
-            FontSize=3,
-            fixedBondLength=50,    # 化学键显示的长度
-            legend=None,
-            legendFontSize=5,
-            elemColor=True):  # 是否按元素给原子着色
-    
-    if legend is None:
-        try:
-            docking_score = 'Docking Score: {:.3f}'.format(float(mol.GetProp('r_i_docking_score')))
-            mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(mol))
-            legend = mw + '\n' + docking_score
-        except:
-            pass
-    draw = rdMolDraw2D.MolDraw2DCairo(size[0], size[1])
-    option = rdMolDraw2D.MolDrawOptions()
-    option.bondLineWidth = bondLineWidth
-    option.fixedBondLength = fixedBondLength
-    option.setHighlightColour((0.95,0.7,0.95))
-    option.baseFontSize = FontSize
-    option.legendFontSize = legendFontSize
-    if elemColor == False:
-        option.updateAtomPalette({k: (0, 0, 0) for k in DrawingOptions.elemDict.keys()})
-    draw.SetDrawOptions(option)
-    AllChem.Compute2DCoords(mol)
-    rdMolDraw2D.PrepareAndDrawMolecule(draw, mol, legend=legend)
-    draw.FinishDrawing()
-    if save:
-        if '.png' in save:
-            n = 0
-            path = '/'.join(save.split('/')[0:-1])
-            while os.path.exists(save):
-                n += 1
-                name = save.split('/')[-1].split('.')[0]
-                name = name + '_' + str(n) + '.png'
-                save = path + '/' + name
-            else:
-                draw.WriteDrawingText(save)
-    else:
-        return draw
-    
-
-def CombineImages(img_file_list, col_num=7, save_dir='./', title='merged_image.png', img_size=None):
-    num_img = len(img_file_list)
-    num_row = num_img//col_num+1 if num_img%col_num != 0 else num_img//col_num
-    if img_size is None:
-        img_size = Image.open(img_file_list[0]).size
-    toImage = Image.new('RGB', (img_size[1]*col_num, img_size[0]*num_row), color=(255,255,255))
-    x_cusum = 0
-    y_cusum = 0
-    num_has_paste = 0
-    for img_file in img_file_list:
-        img = Image.open(img_file)
-        toImage.paste(img, (x_cusum, y_cusum))
-        num_has_paste += 1
-        if num_has_paste%col_num==0:
-            x_cusum = 0
-            y_cusum += img_size[1]
-        else:
-            x_cusum += img_size[0]
-    verify_dir_exists(save_dir)
-    toImage.save(save_dir+'/'+title)
-    plt.xticks([])
-    plt.yticks([])
-    plt.axis('off')  # 取消所有边框，坐标轴
-    plt.imshow(toImage)
-    plt.clf()    # 清除当前图形。如果不清除，当使用循环大量作图，机器会为plt分配越来越多的内存，速度会逐渐变慢。
--- a/pocket_flow/utils/.ipynb_checkpoints/out_table-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/out_table-checkpoint.py
@@ -1,218 +0,0 @@
-from weakref import ref
-from rdkit import Chem
-from rdkit.Chem import AllChem
-from rdkit.Chem import QED, Descriptors, Draw
-from rdkit import DataStructs
-from . import sascorer
-import xlsxwriter
-from easydict import EasyDict
-from io import BytesIO
-import glob
-
-
-SIMILARITY_METRIC = ['DataStructs.TanimotoSimilarity',
-              'DataStructs.DiceSimilarity',
-              'DataStructs.CosineSimilarity',
-              'DataStructs.SokalSimilarity',
-              'DataStructs.RusselSimilarity',
-              'DataStructs.KulczynskiSimilarity',
-              'DataStructs.McConnaugheySimilarity']
-
-
-class ProcessSmilesFile(object):
-    def __init__(self, sdf_file):# refer_mol=None):
-        suppl = Chem.SDMolSupplier(sdf_file)
-        self.smiles = {}
-        #self.refer_mol = refer_mol
-        for idx, mol in enumerate(suppl):
-            if mol is None:
-                continue
-            smi = Chem.MolToSmiles(mol)
-            self.smiles[smi] = {}
-            self.smiles[smi]['mol'] = mol
-            self.smiles[smi]['index'] = idx
-            self.smiles[smi]['name'] = mol.GetProp('_Name')
-            self.smiles[smi]['sdf'] = sdf_file
-    
-    def _QedScore(self,):
-        for smi in self.smiles:
-            #mol = Chem.MolFromSmiles(smi)
-            #print(self.smiles[smi])
-            self.smiles[smi]['QED'] = QED.qed(self.smiles[smi]['mol'])
-    
-    def _SAScore(self,):
-        for smi in self.smiles:
-            #mol = Chem.MolFromSmiles(smi)
-            try:
-                self.smiles[smi]['SA'] = sascorer.calculateScore(self.smiles[smi]['mol'])
-            except:
-                self.smiles[smi]['SA'] = 10.0
-    
-    def _Lipinski(self,):
-        for smi in self.smiles:
-            m = self.smiles[smi]['mol']
-            #wt = sum([a.GetMass() for a in m.GetAtoms()])
-            #self.smiles[smi]['MolWt'] = wt
-            self.smiles[smi]['MolWt'] = Descriptors.MolWt(m)
-            self.smiles[smi]['LogP'] = Descriptors.MolLogP(m)
-            self.smiles[smi]['TPSA'] = Descriptors.TPSA(m)
-            self.smiles[smi]['HBD'] = Descriptors.NumHDonors(m)
-            self.smiles[smi]['HBA'] = Descriptors.NumHAcceptors(m)
-            self.smiles[smi]['RB'] = Descriptors.NumRotatableBonds(m)
-            self.smiles[smi]['Lipinski'] = [self.smiles[smi]['MolWt'],
-                                            self.smiles[smi]['LogP'],
-                                            self.smiles[smi]['TPSA'],
-                                            self.smiles[smi]['HBD'],
-                                            self.smiles[smi]['HBA'],
-                                            self.smiles[smi]['RB']]
-    
-    def _Similarity(self, refer_mol, metric='DataStructs.TanimotoSimilarity'):
-        metric = eval(metric)
-        refer_fp = AllChem.GetMorganFingerprint(refer_mol, 2)
-        for smi in self.smiles:
-            fp = AllChem.GetMorganFingerprint(self.smiles[smi]['mol'],2)
-            '''similarity = DataStructs.FingerprintSimilarity(
-                fp, refer_fp, metric=metric
-                )'''
-            similarity = metric(fp, refer_fp)
-            self.smiles[smi]['similarity'] = similarity
-
-    def Filter(self, molwt=200, lp_rule=False, refer_mol=None, 
-               metric='DataStructs.TanimotoSimilarity', similarity=0.5):
-        self._QedScore()
-        self._SAScore()
-        self._Lipinski()
-        if refer_mol:
-            self._Similarity(refer_mol, metric=metric)
-        #smiles = EasyDict(self.smiles)
-        out = {}
-        for s in self.smiles:
-            smi = EasyDict(self.smiles[s])
-            if smi.MolWt < molwt:
-                continue
-            if lp_rule:
-                l = [smi.MolWt<500, smi.LogP<5, smi.RB<10, 
-                        smi.TPSA<140, smi.HBA<10, smi.HBD<5]
-                #lenth = len(set(l))
-                if sum(l) != 6:
-                    continue
-            if refer_mol:
-                if smi.similarity < similarity:
-                    continue
-            else:
-                smi.similarity = '--'
-            out[s] = smi
-        return out
-
-
-def Mol2Excel(mol_items, work_book=None, name='result.xlsx', worksheet_name='worksheet'):
-    #add_worksheet_name = path.split('/')[-2].split('_')[0].split('sample-')[1]
-    if work_book is None:
-        workbook = xlsxwriter.Workbook(name)
-    else:
-        workbook = work_book
-    worksheet = workbook.add_worksheet(worksheet_name)
-    header = ['Image', 'SMILES', 'QED Score', 'SA Score', 'Similarity', 
-                'MolWt', 'LogP', 'TPSA', 'HBD', 'HBA', 'RB', 'sdf file',
-                'index']
-    header_style = {'bold':1,'valign':'vcenter','align':'center','top':2,
-                    'left':2,'right':2,'bottom':2}
-    HeaderStyle = workbook.add_format(header_style)
-    
-    item_style = {'align':'center','valign': 'vcenter','top':2,'left':2,
-                    'right':2,'bottom':2,'text_wrap':1}
-    ItemStyle = workbook.add_format(item_style)
-    worksheet.set_column('A:A', 39)
-    worksheet.set_column('B:B', 39)
-    worksheet.set_column('C:C', 10)
-    worksheet.set_column('D:D', 10)
-    worksheet.set_column('E:E', 10)
-    worksheet.set_column('F:F', 10)
-    worksheet.set_column('G:G', 10)
-    worksheet.set_column('H:H', 10)
-    worksheet.set_column('I:I', 10)
-    worksheet.set_column('J:J', 10)
-    worksheet.set_column('K:K', 10)
-    worksheet.set_column('L:L', 30)
-    worksheet.set_column('M:M', 10)
-
-    for ix_, i in enumerate(header):
-        worksheet.write(0, ix_, i, HeaderStyle)
-
-    for ix, items in enumerate(mol_items):
-        smi = items[-2]
-        info = EasyDict(items[-1])
-        img_data = BytesIO()
-        AllChem.Compute2DCoords(info.mol)
-        img = Draw.MolToImage(info.mol)
-        img.save(img_data, format='PNG')
-        worksheet.set_row(ix+1, 185)
-        worksheet.insert_image(ix+1, 0, 'f', {'x_scale': 0.9, 'y_scale': 0.8, 'image_data':img_data, 'positioning':1})
-        worksheet.write(ix+1, 1, smi, ItemStyle)
-        worksheet.write(ix+1, 2, info.QED, ItemStyle)
-        worksheet.write(ix+1, 3, info.SA, ItemStyle)
-        worksheet.write(ix+1, 4, info.similarity, ItemStyle)
-        worksheet.write(ix+1, 5, info.MolWt, ItemStyle)
-        worksheet.write(ix+1, 6, info.LogP, ItemStyle)
-        worksheet.write(ix+1, 7, info.TPSA, ItemStyle)
-        worksheet.write(ix+1, 8, info.HBD, ItemStyle)
-        worksheet.write(ix+1, 9, info.HBA, ItemStyle)
-        worksheet.write(ix+1, 10, info.RB, ItemStyle)
-        worksheet.write(ix+1, 11, info.sdf, ItemStyle)
-        worksheet.write(ix+1, 12, info.index, ItemStyle)
-    if work_book is None:
-        workbook.close()
-
-def WriteExcel(result_dir_list, name='result.xlsx', molwt=200, lp_rule=False, sorted_idx=0, refer_mol=None):
-    '''
-    sorted_ix: 用于排序的属性的index, 0:SA; 1:QED; 2:SA*QED
-    '''
-    #创建一个工作簿并添加一张工作表，工作表是可以命名的
-    #workbook = xlsxwriter.Workbook(name)
-    for path in result_dir_list:
-        mol_dict = ProcessSmilesFile(path).Filter(
-            molwt=molwt, lp_rule=lp_rule, refer_mol=refer_mol, 
-            metric='DataStructs.TanimotoSimilarity', similarity=0.2
-            )
-        #print(mol_dict)
-        #L = [(mol_dict[i]['SA'], mol_dict[i]['QED'], mol_dict[i]['SA']*mol_dict[i]['QED'], i, mol_dict[i]) for i in mol_dict]
-        L = [(mol_dict[i]['SA'], mol_dict[i]['QED'], mol_dict[i]['similarity'], i, mol_dict[i]) for i in mol_dict]
-        if sorted_idx==0:
-            L = sorted(L, reverse=False, key=lambda x: x[sorted_idx])
-        else:
-            L = sorted(L, reverse=True, key=lambda x: x[sorted_idx])
-        #print(L)
-        #add_worksheet_name = path.split('/')[-2]
-        #worksheet = workbook.add_worksheet(add_worksheet_name)
-        Mol2Excel(L, name=name)#, worksheet_name=add_worksheet_name)
-    #workbook.close()
-
-
-def WriteExcelAll(result_dir_list, name='result.xlsx', molwt=200, lp_rule=False, 
-                  sorted_ix=0, refer_mol=None, similarity=0.2):
-    all_mol = {}
-    for path in result_dir_list:
-        mol_dict = ProcessSmilesFile(path).Filter(
-            molwt=molwt, lp_rule=lp_rule, refer_mol=refer_mol, 
-            metric='DataStructs.TanimotoSimilarity', similarity=similarity
-            )
-        for smi in mol_dict:
-            if smi not in all_mol:
-                all_mol[smi] = mol_dict[smi]
-    
-    L = [(all_mol[i]['SA'], all_mol[i]['QED'], all_mol[i]['similarity'], i, all_mol[i]) for i in all_mol]
-    if sorted_ix==0:
-        L = sorted(L, reverse=False, key=lambda x: x[sorted_ix])
-    else:
-        L = sorted(L, reverse=True, key=lambda x: x[sorted_ix])
-    Mol2Excel(L, name=name)
-
-
-
-'''refmol_1 = Chem.MolFromSmiles('O=C(/C=C/C1=CC=CC=C1)/C=C/C2=CC=CC=C2')
-refmol_2 = Chem.MolFromSmiles('O=C(/C(CCC/1)=C/C2=CC=CC=C2)C1=C\C3=CC=CC=C3')
-refmol_3 = Chem.MolFromSmiles('O=S(C1=CC2=NC(C3=CC=CC=C3)=CC=C2C=C1)(NC4=CC=CC=C4)=O')
-path_list = [i for i in glob.glob('./outputs/HAT1_2022_08_25*/SMILES.txt') if 'sample-1' not in i]
-path_list += ['HAT1_2022_08_26__18_59_39/SMILES,txt']
-print(path_list)
-WriteExcelAll(path_list, molwt=100, name='HAT1_Wt_more_than_100.xlsx', sorted_ix=0, refer_mol=None)'''
--- a/pocket_flow/utils/.ipynb_checkpoints/process_raw-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/process_raw-checkpoint.py
@@ -1,166 +0,0 @@
-import os 
-import glob
-import numpy as np
-from multiprocessing import Pool
-from .ParseFile import Protein, parse_sdf_to_dict, Ligand
-from .residues_base import RESIDUES_TOPO
-
-
-def verify_dir_exists(dirname):
-    if os.path.isdir(os.path.dirname(dirname)) == False:
-        os.makedirs(os.path.dirname(dirname))
-
-def ComputeDistMat(m1, m2):
-    #m1_square = np.sum(m1*m1, axis=1, keepdims=True)
-    m1_square = np.expand_dims(np.einsum('ij,ij->i', m1, m1), axis=1)
-    if m1 is m2:
-        m2_square = m1_square.T
-    else:
-        #m2_square = np.sum(m2*m2, axis=1, keepdims=True).T
-        m2_square = np.expand_dims(np.einsum('ij,ij->i', m2, m2), axis=0)
-    dist_mat = m1_square + m2_square - np.dot(m1, m2.T)*2
-    # result maybe less than 0 due to floating point rounding errors.
-    dist_mat = np.maximum(dist_mat, 0, dist_mat)
-    if m1 is m2:
-        # Ensure that distances between vectors and themselves are set to 0.0.
-        # This may not be the case due to floating point rounding errors.
-        dist_mat.flat[::dist_mat.shape[0] + 1] = 0.0
-    dist_mat = np.sqrt(dist_mat)
-    return dist_mat
-
-class SplitPocket(object):
-    
-    def __init__(self, 
-                 main_path='/export/home/jyy/CrossDocked2020/',
-                 sample_path='/Refine_Positive_Samples/',
-                 new_sapmle_path='/Refine_Positive_Samples_Pocket/',
-                 type_file='/export/home/jyy/CrossDocked2020/Refine_Positive_Samples_New.types',
-                 dist_cutoff=10,
-                 get_surface_atom=True):
-        
-        self.main_path = main_path
-        self.sample_path = sample_path
-        self.new_sapmle_path = new_sapmle_path
-        self.dist_cutoff = dist_cutoff
-        self.type_file = type_file
-        self.types = [line.strip().split() for line in open(type_file).readlines()]
-        self.exceptions = []
-        self.get_surface_atom = get_surface_atom
-
-    @staticmethod  
-    def _split_pocket(protein, ligand, dist_cutoff):
-        if protein.__normalized__ == False:
-            protein.normalize_coord()
-        rm = protein.rotate_matrix
-        shift = protein.center_of_mass_shift
-        ligand.normalize_pos(shift, rm)
-        res = np.array(protein.get_residues)
-        cm_res = np.array([r.center_of_mass for r in res])
-        dist_mat = ComputeDistMat(ligand.normalized_coords, cm_res)
-        bool_dist_mat = dist_mat < dist_cutoff
-        pocket_res = res[bool_dist_mat.sum(axis=0)>0]
-        pocket_block = '\n'.join(
-            [i.to_heavy_string for i in pocket_res if len(i.get_heavy_atoms)==len(RESIDUES_TOPO[i.name])]
-            )
-        return pocket_block, ligand.mol_block()
-
-    @staticmethod  
-    def _split_pocket_with_surface_atoms(protein, ligand, dist_cutoff):
-        if protein.__normalized__ == False:
-            protein.normalize_coord()
-        rm = protein.rotate_matrix
-        shift = protein.center_of_mass_shift
-        ligand.normalize_pos(shift, rm)
-
-        protein.compute_surface_atoms()
-
-        res = np.array(protein.get_residues)
-        cm_res = np.array([r.center_of_mass for r in res])
-        dist_mat = ComputeDistMat(ligand.normalized_coords, cm_res)
-        bool_dist_mat = dist_mat < dist_cutoff
-        pocket_res = res[bool_dist_mat.sum(axis=0)>0]
-        pocket_block = []
-        for r in pocket_res:
-            for a in r.get_heavy_atoms:
-                if a.is_surf is True:
-                    pocket_block.append(a.to_string + ' surf')
-                else:
-                    pocket_block.append(a.to_string + ' inner')
-        pocket_block = '\n'.join(pocket_block)
-        return pocket_block, ligand.mol_block()
-
-    def _do_split(self, items):
-        try:
-            sub_path = items[4].split('/')[0]
-            ligand_name = items[4].split('/')[1].split('.')[0]
-            chain_id = items[3].split('.')[0].split('_')[-2]
-
-            protein = Protein(self.main_path+self.sample_path+items[3])
-            chain = protein.get_chain(chain_id)
-            ligand = Ligand(self.main_path+self.sample_path+items[4], sanitize=True)
-            
-            if self.get_surface_atom:
-                pocket_block, ligand_block = self._split_pocket_with_surface_atoms(chain, ligand, self.dist_cutoff)
-            else:
-                pocket_block, ligand_block = self._split_pocket(chain, ligand, self.dist_cutoff)
-
-            save_path = '{}/{}/{}/'.format(self.main_path, self.new_sapmle_path, sub_path)
-            verify_dir_exists(save_path)
-            pocket_file_name = '{}/{}_pocket{}.pdb'.format(save_path, ligand_name, self.dist_cutoff)
-
-            open(pocket_file_name, 'w').write(pocket_block)
-            open(save_path+'/{}.mol'.format(ligand_name), 'w').write(ligand_block)
-        except:
-            protein_file = self.main_path+self.sample_path+items[3]
-            ligand_file = self.main_path+self.sample_path+items[4]
-            print('[Exception]', protein_file, ligand_file)
-
-    @staticmethod
-    def split_pocket_from_site_map(site_map, protein_file, dist_cutoff):
-        '''
-        if target dosn't have ligand, we can use the site_map.pdb computed by schrodinger or other software
-        to split potential pocket.
-        '''
-        site_coords = []
-        with open(site_map) as fr:
-            lines = fr.readlines()
-            for line in lines:
-                if line.startswith('HETATM'):
-                    line = line.strip()
-                    xyz = [float(line[30:38].strip()),
-                            float(line[38:46].strip()),
-                            float(line[46:54].strip())]
-                    site_coords.append(xyz)
-        site_coords = np.array(site_coords)
-        
-        protein = Protein(protein_file)
-        res = np.array(protein.get_residues)
-        cm_res = np.array([r.center_of_mass for r in res])
-        dist_mat = ComputeDistMat(site_coords, cm_res)
-        bool_dist_mat = dist_mat < dist_cutoff
-        pocket_res = res[bool_dist_mat.sum(axis=0)>0]
-        pocket_block = '\n'.join(
-            [i.to_heavy_string for i in pocket_res if len(i.get_heavy_atoms)==len(RESIDUES_TOPO[i.name])]
-            )
-        return pocket_block
-
-    def __call__(self, np=10):
-        pool = Pool(processes=np)    # 创建进程池对象，进程数与multiprocessing.cpu_count()相同
-        data_pool = pool.map(self._do_split, self.types) # 并行执行函数
-        pool.close()
-        pool.join()
-        print('Done !')
-
-'''
-from utils import SplitPocket
-split_pocket = SplitPocket(
-    main_path='/export/home/jyy/CrossDocked2020/',
-    sample_path = '/Refine_Positive_Samples/',
-    new_sapmle_path = './crossdocked2020_pocket-surf/',
-    type_file = '/export/home/jyy/CrossDocked2020/Refine_Positive_Samples_New.types',
-    dist_cutoff = 10,
-    get_surface_atom = True
-    )
-split_pocket(np=10)
-'''
-
--- a/pocket_flow/utils/.ipynb_checkpoints/sascorer-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/sascorer-checkpoint.py
@@ -1,173 +0,0 @@
-#
-# calculation of synthetic accessibility score as described in:
-#
-# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
-# Peter Ertl and Ansgar Schuffenhauer
-# Journal of Cheminformatics 1:8 (2009)
-# http://www.jcheminf.com/content/1/1/8
-#
-# several small modifications to the original paper are included
-# particularly slightly different formula for marocyclic penalty
-# and taking into account also molecule symmetry (fingerprint density)
-#
-# for a set of 10k diverse molecules the agreement between the original method
-# as implemented in PipelinePilot and this implementation is r2 = 0.97
-#
-# peter ertl & greg landrum, september 2013
-#
-
-
-from rdkit import Chem
-from rdkit.Chem import rdMolDescriptors
-import pickle
-
-import math
-from collections import defaultdict
-
-import os.path as op
-
-_fscores = None
-
-
-def readFragmentScores(name='fpscores'):
-    import gzip
-    global _fscores
-    # generate the full path filename:
-    if name == "fpscores":
-        name = op.join(op.dirname(__file__), name)
-    data = pickle.load(gzip.open('%s.pkl.gz' % name))
-    outDict = {}
-    for i in data:
-        for j in range(1, len(i)):
-            outDict[i[j]] = float(i[0])
-    _fscores = outDict
-
-
-def numBridgeheadsAndSpiro(mol, ri=None):
-    nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
-    nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
-    return nBridgehead, nSpiro
-
-
-def calculateScore(m):
-    if _fscores is None:
-        readFragmentScores()
-
-    # fragment score
-    fp = rdMolDescriptors.GetMorganFingerprint(m,
-                                               2)  # <- 2 is the *radius* of the circular fingerprint
-    fps = fp.GetNonzeroElements()
-    score1 = 0.
-    nf = 0
-    for bitId, v in fps.items():
-        nf += v
-        sfp = bitId
-        score1 += _fscores.get(sfp, -4) * v
-    score1 /= nf
-
-    # features score
-    nAtoms = m.GetNumAtoms()
-    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
-    ri = m.GetRingInfo()
-    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
-    nMacrocycles = 0
-    for x in ri.AtomRings():
-        if len(x) > 8:
-            nMacrocycles += 1
-
-    sizePenalty = nAtoms**1.005 - nAtoms
-    stereoPenalty = math.log10(nChiralCenters + 1)
-    spiroPenalty = math.log10(nSpiro + 1)
-    bridgePenalty = math.log10(nBridgeheads + 1)
-    macrocyclePenalty = 0.
-    # ---------------------------------------
-    # This differs from the paper, which defines:
-    #  macrocyclePenalty = math.log10(nMacrocycles+1)
-    # This form generates better results when 2 or more macrocycles are present
-    if nMacrocycles > 0:
-        macrocyclePenalty = math.log10(2)
-
-    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
-
-    # correction for the fingerprint density
-    # not in the original publication, added in version 1.1
-    # to make highly symmetrical molecules easier to synthetise
-    score3 = 0.
-    if nAtoms > len(fps):
-        score3 = math.log(float(nAtoms) / len(fps)) * .5
-
-    sascore = score1 + score2 + score3
-
-    # need to transform "raw" value into scale between 1 and 10
-    min = -4.0
-    max = 2.5
-    sascore = 11. - (sascore - min + 1) / (max - min) * 9.
-    # smooth the 10-end
-    if sascore > 8.:
-        sascore = 8. + math.log(sascore + 1. - 9.)
-    if sascore > 10.:
-        sascore = 10.0
-    elif sascore < 1.:
-        sascore = 1.0
-
-    return sascore
-
-
-def processMols(mols):
-    print('smiles\tName\tsa_score')
-    for i, m in enumerate(mols):
-        if m is None:
-            continue
-
-        s = calculateScore(m)
-
-        smiles = Chem.MolToSmiles(m)
-        print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
-
-
-if __name__ == '__main__':
-    import sys
-    import time
-
-    t1 = time.time()
-    readFragmentScores("fpscores")
-    t2 = time.time()
-
-    suppl = Chem.SmilesMolSupplier(sys.argv[1])
-    t3 = time.time()
-    processMols(suppl)
-    t4 = time.time()
-
-    print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
-          file=sys.stderr)
-
-#
-#  Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
-#  All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-#       notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-#       copyright notice, this list of conditions and the following
-#       disclaimer in the documentation and/or other materials provided
-#       with the distribution.
-#     * Neither the name of Novartis Institutes for BioMedical Research Inc.
-#       nor the names of its contributors may be used to endorse or promote
-#       products derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
--- a/pocket_flow/utils/.ipynb_checkpoints/train-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/train-checkpoint.py
@@ -1,389 +0,0 @@
-import torch
-import torch_geometric as pyg
-from torch_geometric.loader import DataLoader, DataListLoader
-from torch.nn.utils import clip_grad_norm_
-#from torch.utils.data import Dataset, Subset, DataLoader
-from torch.cuda import amp
-import numpy as np
-import os
-import time
-from collections.abc import Iterable
-from torch.utils import tensorboard
-
-
-def inf_iterator(iterable):
-    iterator = iterable.__iter__()
-    while True:
-        try:
-            yield iterator.__next__()
-        except StopIteration:
-            iterator = iterable.__iter__()
-
-
-def get_parameter_number(model):
-    total_num = sum(p.numel() for p in model.parameters())
-    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    return {'Total': total_num, 'Trainable': trainable_num}
-
-def timewait(time_gap):
-    d = time_gap//(24*3600)
-    d_h = time_gap%(24*3600)
-    h = d_h//3600
-    h_m = d_h%3600
-    m = h_m//60
-    s = h_m%60
-    if d > 0:
-        out = '{}d {}h {}m {}s'.format(int(d),int(h),int(m),round(s,2))
-    elif h > 0:
-        out = '{}h {}m {}s'.format(int(h),int(m),round(s,2))
-    elif m > 0:
-        out = '{}m {}s'.format(int(m),round(s,2))
-    else:
-        out = '{}s'.format(round(s,2))
-    return out
-
-def verify_dir_exists(dirname):
-    if os.path.isdir(os.path.dirname(dirname)) == False:
-        os.makedirs(os.path.dirname(dirname))
-
-class Experiment(object):
-    def __init__(self, model, train_set, optimizer, scheduler=None, device='cuda',
-                 valid_set=None, clip_grad=True, max_norm=5, norm_type=2, 
-                 pos_noise_std=0.1, data_parallel=False, use_amp=False):
-        if data_parallel:
-            self.model = pyg.nn.DataParallel(model)
-        else:
-            self.model = model#.to(device)
-        if 'config' in model.__dict__:
-            self.model_config = model.config
-        else:
-            self.model_config = None
-        self.train_set = train_set
-        self.valid_set = valid_set
-        self.optimizer = optimizer
-        self.scheduler = scheduler
-        self.num_train_data = len(train_set)
-        self.data_parallel = data_parallel
-        #self.train_batch_size = train_loader.batch_size
-        if valid_set:
-            self.num_valid_data = len(valid_set)
-        else:
-            self.num_valid_data = None
-            #self.valid_batch_size = valid_loader.batch_size
-
-        self.clip_grad = clip_grad
-        self.max_norm = max_norm
-        self.norm_type = norm_type
-        self.with_tb = False
-        self.device = device
-        self.pos_noise_std = pos_noise_std
-        self.use_amp = use_amp
-        if self.use_amp:
-            self.grad_scaler = amp.GradScaler()
-
-    @staticmethod
-    def get_log(out_dict, key_word, it, time_gap=None):
-        log = []
-        for key, value in out_dict.items():
-            log.append(' {}:{:.5f} |'.format(key, value.item()))
-        log.insert(0, '[{} {}]'.format(key_word, it))
-        if time_gap:
-            log.append(' Time: {}'.format(time_gap))
-        return ''.join(log)
-
-    @staticmethod
-    def write_summary(out_dict, writer, key_word, num_iter, scheduler=None, optimizer=None):
-        for key, value in out_dict.items():
-            writer.add_scalar('{}/{}'.format(key_word, key), value, num_iter)
-        if scheduler is not None:
-            writer.add_scalar('{}/lr'.format(key_word), optimizer.param_groups[0]['lr'], num_iter)
-        writer.flush()
-
-    @staticmethod
-    def get_num_iter(num_data, batch_size):
-        if num_data % batch_size == 0:
-            n_iter = int(num_data/batch_size)
-        else:
-            n_iter = int(num_data/batch_size) + 1
-        return n_iter
-    
-    @property
-    def parameter_number(self):
-        return get_parameter_number(self.model)
-    
-    def _train_step(self, batch, it=0, print_log=False):
-        start = time.time()
-        self.model.train()
-        self.optimizer.zero_grad()
-        if self.use_amp:
-            with amp.autocast():
-                out_dict = self.model.get_loss(batch)
-            self.grad_scaler.scale(out_dict['loss']).backward()
-        else:
-            out_dict = self.model.get_loss(batch)
-            out_dict['loss'].backward()
-        #out_dict = self.model(batch)
-
-        #with torch.autograd.detect_anomaly():
-        
-        #print(out_dict)
-        if self.clip_grad:
-            # 梯度裁剪
-            orig_grad_norm = clip_grad_norm_(
-                self.model.parameters(), 
-                max_norm=self.max_norm, 
-                norm_type=self.norm_type,
-                error_if_nonfinite=True)   # error_if_nonfinite=True：如果梯度存在nan, inf将抛出error
-        if self.use_amp:
-            self.grad_scaler.step(self.optimizer)
-            self.grad_scaler.update()
-        else:
-            self.optimizer.step()
-
-        if self.with_tb:
-            if self.clip_grad:
-                self.writer.add_scalar('train/step/grad', orig_grad_norm, it)
-            self.write_summary(out_dict, self.writer, 'train/step', it)
-
-        end = time.time()
-        time_gap = end - start
-        log = self.get_log(out_dict, 'Step', it, time_gap='{:.3f}'.format(time_gap))
-        with open(self.logdir+'training.log', 'a') as log_writer:
-            log_writer.write(log + '\n')
-        if print_log:
-            print(log)
-        return out_dict
-    
-    def train_epoch(self, n_iter, n_epoch, print_log=False):
-        start = time.time()
-        log_dict = {}
-        #train_loader = inf_iterator(self.train_loader)
-        for i in range(n_iter):
-            if self.data_parallel:
-                batch = next(self.train_loader).cuda()#.to(self.device)
-            else:
-                batch = next(self.train_loader).to(self.device)
-            ## 计算噪音
-            compose_noise = torch.randn_like(batch.compose_pos) * self.pos_noise_std
-            batch.compose_pos = batch.compose_pos + compose_noise
-            out_dict = self._train_step(batch, it=i+n_epoch*n_iter, print_log=print_log)
-            for key, value in out_dict.items():
-                if key not in log_dict:
-                    log_dict[key] = value if 'acc' in key else value * batch.num_graphs
-                else:
-                    log_dict[key] += value if 'acc' in key else value * batch.num_graphs
-        #print(log_dict) 
-        for key, value in log_dict.items():
-            if 'acc' in key:
-                log_dict[key] = value / n_iter
-            else:
-                log_dict[key] = value / self.num_train_data
-
-        if self.with_tb:
-            self.write_summary(log_dict, self.writer, 'train/epoch', n_epoch, 
-                               scheduler=self.scheduler, optimizer=self.optimizer)
-        end = time.time()
-        time_gap = timewait(end - start)
-        log = self.get_log(log_dict, 'Epoch', n_epoch, time_gap=time_gap)
-        with open(self.logdir+'training.log', 'a') as log_writer:
-            log_writer.write(log + '\n')
-        if print_log:
-            print(log)
-        
-
-    def validate(self, n_iter, n_epoch, print_log=False, schedule_key='loss'):
-        start = time.time()
-        log_dict = {}
-        with torch.no_grad():
-            self.model.eval()
-            for _ in range(n_iter):
-                batch = next(self.valid_loader).to(self.device)
-                out_dict = self.model.get_loss(batch)
-                for key, value in out_dict.items():
-                    if key not in log_dict:
-                        log_dict[key] = value if 'acc' in key else value * batch.num_graphs
-                    else:
-                        log_dict[key] += value if 'acc' in key else value * batch.num_graphs
-
-        for key, value in log_dict.items():
-            if 'acc' in key:
-                log_dict[key] = value / n_iter
-            else:
-                log_dict[key] = value / self.num_valid_data
-        if self.scheduler:
-            self.scheduler.step(log_dict[schedule_key])
-        if self.with_tb:
-            self.write_summary(log_dict, self.writer, 'val/epoch', n_epoch, 
-                               scheduler=self.scheduler, optimizer=self.optimizer)
-        end = time.time()
-        time_gap = timewait(end - start)
-        log = self.get_log(log_dict, 'Validate', n_epoch, time_gap=time_gap)
-        with open(self.logdir+'training.log', 'a') as log_writer:
-            log_writer.write(log + '\n')
-        if print_log:
-            print(log)
-        return log_dict['loss']
-    
-    def fit(self, num_epoch, train_batch_size=4, valid_batch_size=16, print_log=True,
-            with_tb=True, logdir='./training_log', early_stop=None, schedule_key='loss',
-            follow_batch=[], exclude_keys=[], collate_fn=None, num_workers=0, pin_memory=False):
-        
-        self.train_loader = inf_iterator(DataLoader(
-                self.train_set, 
-                batch_size = train_batch_size, #config.train.batch_size, 
-                shuffle = True,
-                num_workers = num_workers,
-                pin_memory = pin_memory,
-                follow_batch = follow_batch,
-                exclude_keys = exclude_keys,
-                collate_fn = collate_fn
-            ))
-        self.train_batch_size = train_batch_size
-
-        if self.valid_set:
-            self.valid_loader = inf_iterator(DataLoader(
-                self.valid_set, 
-                batch_size = valid_batch_size, #config.train.batch_size, 
-                shuffle = False,
-                num_workers = num_workers,
-                pin_memory = pin_memory,
-                follow_batch = follow_batch,
-                exclude_keys = exclude_keys,
-                collate_fn = collate_fn
-            ))
-            self.valid_batch_size = valid_batch_size
-            
-        self.n_iter_train = self.get_num_iter(self.num_train_data, self.train_batch_size)
-        self.n_iter_valid = self.get_num_iter(self.num_valid_data, self.valid_batch_size)
-        ## config log
-        date = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
-        self.logdir = logdir+'/'+date+'/'
-        verify_dir_exists(self.logdir)
-        open(self.logdir+'model_config.dir','w').write(str(self.model_config))
-        self.with_tb = with_tb
-        if self.with_tb:
-            self.writer = tensorboard.SummaryWriter(self.logdir)
-        log_writer = open(self.logdir+'training.log', 'w')
-        log_writer.write('\n######## {}; batch_size ########\n'.format(self.parameter_number, train_batch_size))
-        log_writer.close()
-        if print_log:
-            print('\n######## {} ########\n'.format(self.parameter_number))
-        ## training/saving model
-        loss_best = float('inf')
-        early_stop_count = 0
-        for epoch in range(num_epoch):
-            self.train_epoch(self.n_iter_train, epoch, print_log=print_log)
-            val_loss = self.validate(self.n_iter_valid, epoch, schedule_key=schedule_key, print_log=print_log)
-            if val_loss < loss_best:
-                torch.save(self.model.state_dict(), self.logdir+'/model.pth')
-                loss_best = val_loss
-                early_stop_count = 0
-            else:
-                early_stop_count += 1
-            if isinstance(early_stop, int):
-                if early_stop_count == early_stop:
-                    if print_log:
-                        print('Early stopping ......')
-                    with open(self.logdir+'training.log', 'a') as log_writer:
-                        log_writer.write('Early stopping ......\n')
-                    break
-    
-    def fit_step(self, num_step, valid_per_step=5000, train_batch_size=4, valid_batch_size=16, print_log=True,
-                 with_tb=True, logdir='./training_log', schedule_key='loss', num_workers=0, pin_memory=False,
-                 follow_batch=[], exclude_keys=[], collate_fn=None, max_edge_num_in_batch=900000):
-        #torch.multiprocessing.set_sharing_strategy('file_system')
-        if self.data_parallel:
-            self.train_loader = inf_iterator(DataListLoader(
-                self.train_set, 
-                batch_size = train_batch_size, #config.train.batch_size, 
-                shuffle = True,
-                num_workers = num_workers,
-                pin_memory = pin_memory,
-                #follow_batch = follow_batch,
-                #exclude_keys = exclude_keys,
-                collate_fn = collate_fn
-            ))
-            self.train_batch_size = train_batch_size
-
-            if self.valid_set:
-                self.valid_loader = inf_iterator(DataListLoader(
-                    self.valid_set, 
-                    batch_size = valid_batch_size, #config.train.batch_size, 
-                    shuffle = False,
-                    num_workers = num_workers,
-                    pin_memory = pin_memory,
-                    #follow_batch = follow_batch,
-                    #exclude_keys = exclude_keys,
-                    collate_fn = collate_fn
-                ))
-                self.valid_batch_size = valid_batch_size
-        else:
-            self.train_loader = inf_iterator(DataLoader(
-                    self.train_set, 
-                    batch_size = train_batch_size, #config.train.batch_size, 
-                    shuffle = False,
-                    num_workers = num_workers,
-                    pin_memory = pin_memory,
-                    follow_batch = follow_batch,
-                    exclude_keys = exclude_keys,
-                    collate_fn = collate_fn
-                ))
-            self.train_batch_size = train_batch_size
-
-            if self.valid_set:
-                self.valid_loader = inf_iterator(DataLoader(
-                    self.valid_set, 
-                    batch_size = valid_batch_size, #config.train.batch_size, 
-                    shuffle = False,
-                    num_workers = num_workers,
-                    pin_memory = pin_memory,
-                    follow_batch = follow_batch,
-                    exclude_keys = exclude_keys,
-                    collate_fn = collate_fn
-                ))
-                self.valid_batch_size = valid_batch_size
-            
-        self.n_iter_train = self.get_num_iter(self.num_train_data, self.train_batch_size)
-        if self.num_valid_data:
-            self.n_iter_valid = self.get_num_iter(self.num_valid_data, self.valid_batch_size)
-        ## config log
-        date = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
-        self.logdir = logdir+'/'+date+'/'
-        verify_dir_exists(self.logdir)
-        open(self.logdir+'model_config.dir','w').write(str(self.model_config))
-        self.with_tb = with_tb
-        if self.with_tb:
-            self.writer = tensorboard.SummaryWriter(self.logdir)
-        log_writer = open(self.logdir+'training.log', 'w')
-        log_writer.write('\n######## {}; batch_size ########\n'.format(self.parameter_number, train_batch_size))
-        log_writer.close()
-        if print_log:
-            print('\n######## {} ########\n'.format(self.parameter_number))
-        for step in range(1, num_step+1):
-            #batch = next(self.train_loader)#.to(self.device)
-            ## 计算噪音
-            if self.data_parallel:
-                batch = next(self.train_loader)#.to(self.device)
-                for d in batch:
-                    d.cpx_pos = d.cpx_pos + torch.randn_like(d.cpx_pos) * self.pos_noise_std
-            else:
-                batch = next(self.train_loader).to(self.device)
-                #print(batch.cpx_edge_index.shape)
-                cpx_noise = torch.randn_like(batch.cpx_pos) * self.pos_noise_std
-                batch.cpx_pos = batch.cpx_pos + cpx_noise
-            if batch.cpx_edge_index.size(1) > max_edge_num_in_batch:
-                continue
-            out_dict = self._train_step(batch, it=step, print_log=print_log)
-            if (step % valid_per_step == 0 or step == num_step):
-                if self.num_valid_data:
-                    val_loss = self.validate(self.n_iter_valid, step, schedule_key=schedule_key, print_log=print_log)
-                #torch.save(self.model.state_dict(), self.logdir+'/model.pth')
-                ckpt_path = self.logdir + '/ckpt/'
-                verify_dir_exists(ckpt_path)
-                torch.save({    
-                    'config': self.model_config,
-                    'model': self.model.state_dict(),
-                    'optimizer': self.optimizer.state_dict(),
-                    'scheduler': self.scheduler.state_dict(),
-                    'iteration': step, 
-                }, ckpt_path+'/{}.pt'.format(step))
--- a/pocket_flow/utils/.ipynb_checkpoints/transform_utils-checkpoint.py
+++ b/pocket_flow/utils/.ipynb_checkpoints/transform_utils-checkpoint.py
@@ -1,340 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch_geometric.nn.pool import knn_graph, radius_graph
-from torch_geometric.transforms import Compose
-from torch_geometric.utils.subgraph import subgraph
-from torch_geometric.nn import knn, radius
-from torch_geometric.utils.num_nodes import maybe_num_nodes
-from torch_scatter import scatter_add
-import numpy as np
-import random
-import copy
-
-
-def count_neighbors(edge_index, symmetry, valence=None, num_nodes=None):
-    assert symmetry == True, 'Only support symmetrical edges.'
-
-    if num_nodes is None:
-        num_nodes = maybe_num_nodes(edge_index)   # 如果没有指定配体原子数，根据edge_index来推测
-
-    if valence is None:
-        valence = torch.ones([edge_index.size(1)], device=edge_index.device)
-    valence = valence.view(edge_index.size(1))
-    return scatter_add(valence, index=edge_index[0], dim=0, dim_size=num_nodes).long()
-
-
-def change_features_of_neigh(ligand_feature_full, new_num_neigh, new_num_valence, ligand_atom_num_bonds, num_atom_type=7):
-    idx_n_neigh = num_atom_type + 1
-    idx_n_valence = idx_n_neigh + 1
-    idx_n_bonds = idx_n_valence + 1
-    ligand_feature_full[:, idx_n_neigh] = new_num_neigh.long()
-    ligand_feature_full[:, idx_n_valence] = new_num_valence.long()
-    ligand_feature_full[:, idx_n_bonds:idx_n_bonds+3] = ligand_atom_num_bonds.long()
-    return ligand_feature_full
-
-
-def get_rfs_perm(nbh_list, ring_info):
-    num_nodes = len(nbh_list)
-    node0 = random.randint(0, num_nodes-1)
-    queue,order,not_ring_queue,edge_index = [],[],[],[]
-    if (ring_info[node0]>0).sum():
-        queue.append(node0)
-    else:
-        not_ring_queue.append(node0)
-    while queue or not_ring_queue:
-        if queue:
-            v = queue.pop()
-            order.append(v)
-        elif not_ring_queue:
-            v = not_ring_queue.pop()
-            order.append(v)
-        adj_in_ring, adj_not_ring, edge_idx_step = [], [], []
-        for nbh in nbh_list[v]:
-            if (ring_info[nbh]>0).sum():
-                adj_in_ring.append(nbh)
-            else:
-                adj_not_ring.append(nbh)
-            if nbh in order:
-                edge_idx_step.append((v,nbh))
-                edge_idx_step.append((nbh,v))
-        edge_index.append(edge_idx_step)
-        if adj_not_ring:
-            for w in adj_not_ring:
-                if w not in order and w not in not_ring_queue:
-                    not_ring_queue.append(w)
-        # Preferential access to atoms in the same ring of w
-        same_ring_pool = []
-        if adj_in_ring:
-            for w in adj_in_ring:
-                if (w not in order and w not in queue):
-                    if (ring_info[w] == ring_info[v]).sum()>0:
-                        same_ring_pool.append(w)
-                    else:
-                        queue.append(w)
-                elif w not in order and w in queue:
-                    if (ring_info[w] == ring_info[v]).sum()>0:
-                        same_ring_pool.append(w)
-                        queue.remove(w)
-                elif w not in order and (ring_info[w]>0).sum()>=1:
-                    queue.remove(w)
-                    queue.append(w)
-            queue += same_ring_pool
-    return torch.LongTensor(order), edge_index
-
-
-def get_bfs_perm(nbh_list):
-    num_nodes = len(nbh_list)
-    # 每个节点的邻居数量
-    num_neighbors = torch.LongTensor([len(nbh_list[i]) for i in range(num_nodes)])
-    #print(num_neighbors)
-    # 随机选择一个原子作为起始点
-    bfs_queue = [random.randint(0, num_nodes-1)]
-    bfs_perm, edge_index = [], []
-    num_remains = [num_neighbors.clone()]
-    bfs_next_list = {}
-    visited = {bfs_queue[0]}
-    num_nbh_remain = num_neighbors.clone()
-    while len(bfs_queue) > 0:
-        current = bfs_queue.pop(0) # Remove and return item at index (default last)
-        for nbh in nbh_list[current]:
-            num_nbh_remain[nbh] -= 1
-        bfs_perm.append(current)
-        num_remains.append(num_nbh_remain.clone())
-        next_candid, edge_idx_step = [], []
-        for nxt in nbh_list[current]:
-            if nxt in visited: continue
-            next_candid.append(nxt)
-            visited.add(nxt)
-            for adj in nbh_list[nxt]:
-                if adj in bfs_perm:
-                    edge_idx_step.append((adj,nxt))
-                    edge_idx_step.append((nxt,adj))
-        edge_index.append(edge_idx_step)
-        random.shuffle(next_candid)
-        bfs_queue += next_candid
-        bfs_next_list[current] = copy.copy(bfs_queue)
-    return torch.LongTensor(bfs_perm), edge_index
-
-
-def mask_node(data, context_idx, masked_idx, num_atom_type=10, y_pos_std=0.05):
-    data.context_idx = context_idx  # for change bond index
-    data.masked_idx = masked_idx
-    # masked ligand atom element/feature/pos.
-    data.ligand_masked_element = data.ligand_element[masked_idx]
-    # data.ligand_masked_feature = data.ligand_atom_feature[masked_idx]   # For Prediction. these features are chem properties
-    data.ligand_masked_pos = data.ligand_pos[masked_idx]
-
-    # context ligand atom elment/full features/pos. Note: num_neigh and num_valence features should be changed
-    data.ligand_context_element = data.ligand_element[context_idx]
-    data.ligand_context_feature_full = data.ligand_atom_feature_full[context_idx]   # For Input
-    data.ligand_context_pos = data.ligand_pos[context_idx]
-
-    # new bond with ligand context atoms
-    if data.ligand_bond_index.size(1) != 0:
-        data.ligand_context_bond_index, data.ligand_context_bond_type = subgraph(
-            context_idx,
-            data.ligand_bond_index,
-            edge_attr = data.ligand_bond_type,
-            relabel_nodes = True,
-        )
-    else:
-        data.ligand_context_bond_index = torch.empty([2, 0], dtype=torch.long)
-        data.ligand_context_bond_type = torch.empty([0], dtype=torch.long)
-    # re-calculate atom features that relate to bond
-    data.ligand_context_num_neighbors = count_neighbors(
-        data.ligand_context_bond_index,
-        symmetry=True,
-        num_nodes = context_idx.size(0),
-    )
-    data.ligand_context_valence = count_neighbors(
-        data.ligand_context_bond_index,
-        symmetry=True,
-        valence=data.ligand_context_bond_type,
-        num_nodes=context_idx.size(0)
-    )
-    data.ligand_context_num_bonds = torch.stack([
-        count_neighbors(
-            data.ligand_context_bond_index,
-            symmetry=True,
-            valence=data.ligand_context_bond_type == i,
-            num_nodes=context_idx.size(0),
-        ) for i in [1, 2, 3]
-    ], dim = -1)
-    # re-calculate ligand_context_featrure_full
-    data.ligand_context_feature_full = change_features_of_neigh(
-        data.ligand_context_feature_full,
-        data.ligand_context_num_neighbors,
-        data.ligand_context_valence,
-        data.ligand_context_num_bonds,
-        num_atom_type=num_atom_type
-    )
-    if data.ligand_masked_pos.size(0) == 0:
-        data.y_pos = torch.empty([0,3], dtype=torch.float32)
-    else:
-        data.y_pos = data.ligand_masked_pos[0].view(-1,3)
-        #data.y_pos = data.y_pos.repeat(5, 1)
-        data.y_pos += torch.randn_like(data.y_pos) * y_pos_std
-    data.ligand_frontier = (data.ligand_context_num_neighbors < data.ligand_num_neighbors[context_idx])
-    '''new_step_atom_idx = data.masked_idx[0]
-    candidate_focal_idx_in_context = torch.LongTensor(data.ligand_nbh_list[new_step_atom_idx.item()])
-    focal_idx_in_context_mask = (data.context_idx.unsqueeze(1) == candidate_focal_idx_in_context).any(1)
-    data.focal_idx_in_context = torch.nonzero(focal_idx_in_context_mask).view(-1)
-    if data.focal_idx_in_context.size(0) == 0:
-        data.focal_idx_in_context_ = data.focal_idx_in_context
-    else:
-        data.focal_idx_in_context_ = data.focal_idx_in_context[torch.multinomial(torch.ones_like(data.focal_idx_in_context).float(), 1)]
-    data.focal_label = torch.zeros_like(data.context_idx)
-    data.focal_label[data.focal_idx_in_context] = 1
-    data.atom_label = data.ligand_element[data.masked_idx[0]].unsqueeze(0)
-    # get edge label
-    focal_idx_in_ligand = data.context_idx[focal_idx_in_context_mask]
-    edge_type = []
-    for i in focal_idx_in_ligand:
-        edge_type_mask = torch.stack([data.ligand_bond_index[1]==i, data.ligand_bond_index[0]==new_step_atom_idx], dim=1).all(1)
-        edge_type.append(torch.unique(data.ligand_bond_type[edge_type_mask]))
-    data.edge_label = torch.zeros_like(data.context_idx)
-    data.edge_label[data.focal_idx_in_context] = torch.LongTensor(edge_type)
-    data.edge_query_index_0 = torch.zeros_like(data.edge_label)
-    data.edge_query_index_1 = torch.arange(data.edge_label.size(0))'''
-    return data
-
-
-def make_pos_label(data, num_real_pos=5, num_fake_pos=5, pos_real_std=0.05, pos_fake_std=2.0, k=16):
-    ligand_context_pos = data.ligand_context_pos
-    ligand_masked_pos = data.ligand_masked_pos
-    protein_pos = data.protein_pos
-    
-    if ligand_context_pos.size(0) == 0:
-        # fake position
-        fake_mode = protein_pos[data.candidate_focal_label_in_protein]
-        p = np.ones(fake_mode.size(0), dtype=np.float32)/fake_mode.size(0)
-        pos_fake_idx = np.random.choice(np.arange(fake_mode.size(0)), size=num_fake_pos, p=p)
-        pos_fake = fake_mode[pos_fake_idx]
-        pos_fake += torch.randn_like(pos_fake) * pos_fake_std / 2.
-        # real position
-        p = np.ones(ligand_masked_pos.size(0), dtype=np.float32)/ligand_masked_pos.size(0)
-        pos_real_idx = np.random.choice(np.arange(ligand_masked_pos.size(0)), size=num_real_pos, p=p)
-        pos_real = ligand_masked_pos[pos_real_idx]
-        pos_real += torch.randn_like(pos_real) * pos_real_std
-    else:
-        # fake position
-        fake_mode = ligand_context_pos[data.ligand_frontier]
-        p = np.ones(fake_mode.size(0), dtype=np.float32)/fake_mode.size(0)
-        pos_fake_idx = np.random.choice(np.arange(fake_mode.size(0)), size=num_fake_pos, p=p)
-        pos_fake = fake_mode[pos_fake_idx]
-        pos_fake += torch.randn_like(pos_fake) * pos_fake_std / 2.
-        # real position
-        p = np.ones(ligand_masked_pos.size(0), dtype=np.float32)/ligand_masked_pos.size(0)
-        pos_real_idx = np.random.choice(np.arange(ligand_masked_pos.size(0)), size=num_real_pos-1, p=p)
-        pos_real = ligand_masked_pos[pos_real_idx]
-        pos_real += torch.randn_like(pos_real) * pos_real_std
-        pos_real = torch.cat([pos_real, data.y_pos], dim=0)
-
-    data.pos_fake = pos_fake
-    pos_fake_knn_edge_idx = knn(x=data.cpx_pos, y=pos_fake, k=k, num_workers=16)
-    data.pos_fake_knn_edge_idx_0, data.pos_fake_knn_edge_idx_1 = pos_fake_knn_edge_idx
-
-    data.pos_real = pos_real
-    pos_real_knn_edge_idx = knn(x=data.cpx_pos, y=pos_real, k=k, num_workers=16)
-    data.pos_real_knn_edge_idx_0, data.pos_real_knn_edge_idx_1 = pos_real_knn_edge_idx
-    return data
-
-
-def get_complex_graph(data, len_ligand_ctx, len_compose, num_workers=1, graph_type='knn', knn=16, radius=10.0):
-    if graph_type == 'rad':
-        data.cpx_edge_index = radius_graph(data.cpx_pos, radius, flow='target_to_source', num_workers=num_workers)
-    elif graph_type == 'knn':
-        data.cpx_edge_index = knn_graph(data.cpx_pos, knn, flow='target_to_source', num_workers=num_workers)
-    # compose_knn_edge_index: 蛋白和配体原子组合在一起计算knn图的邻接矩阵
-    id_cpx_edge = data.cpx_edge_index[0, :len_ligand_ctx*knn] * len_compose + data.cpx_edge_index[1, :len_ligand_ctx*knn]
-    # data.cpx_edge_index[0, :len_ligand_ctx*knn] 将knn的edge中属于ligand_ctx的atom的index选出来
-    id_ligand_ctx_edge = data.ligand_context_bond_index[0] * len_compose + data.ligand_context_bond_index[1]
-    idx_edge = [torch.nonzero(id_cpx_edge == id_) for id_ in id_ligand_ctx_edge]  # idx_edge是id_ligand_ctx_edge中的元素在id_cpx_edge中的索引
-    # torch.nonzero(id_ligand_ctx_edge.unsqueeze(1) == id_cpx_edge.unsqueeze(0))[:,1]
-    idx_edge = torch.tensor([a.squeeze() if len(a) > 0 else torch.tensor(-1) for a in idx_edge], dtype=torch.long)
-    data.cpx_edge_type = torch.zeros(len(data.cpx_edge_index[0]), dtype=torch.long)  # for encoder edge embedding
-    data.cpx_edge_type[idx_edge[idx_edge>=0]] = data.ligand_context_bond_type[idx_edge>=0]
-    data.cpx_edge_feature = torch.cat([
-        torch.ones([len(data.cpx_edge_index[0]), 1], dtype=torch.long),
-        torch.zeros([len(data.cpx_edge_index[0]), 3], dtype=torch.long),
-    ], dim=-1)
-    data.cpx_edge_feature[idx_edge[idx_edge>=0]] = F.one_hot(data.ligand_context_bond_type[idx_edge>=0], num_classes=4)    # 0 (1,2,3)-onehot
-    return data
-
-def get_knn_graph(pos, k=16, edge_feat=None, edge_feat_index=None, num_workers=8, graph_type='knn', radius=5.5):
-    if graph_type == 'rad':
-        cpx_edge_index = radius_graph(pos, radius, flow='target_to_source', num_workers=num_workers)
-    if graph_type == 'knn':
-        cpx_edge_index = knn_graph(pos, k, flow='target_to_source', num_workers=num_workers).long()
-    if isinstance(edge_feat, torch.Tensor) and isinstance(edge_feat_index, torch.Tensor):
-        adj_feat_mat = torch.zeros([pos.size(0), pos.size(0)], dtype=torch.long)
-        adj_feat_mat[edge_feat_index[0],edge_feat_index[1]] = edge_feat
-        cpx_edge_type = adj_feat_mat[cpx_edge_index[0],cpx_edge_index[1]]
-    else:
-        cpx_edge_type = None
-    return cpx_edge_index, cpx_edge_type
-
-def get_complex_graph_(data, knn=16, num_workers=8, graph_type='knn', radius=5.5):
-    edge_feat = torch.cat([data.ligand_context_bond_type, data.protein_bond_type]).long()
-    edge_feat_index = torch.cat(
-        [data.ligand_context_bond_index, data.protein_bond_index+data.ligand_context_pos.size(0)], dim=1
-        ).long()
-    knn_edge_index, knn_edge_type = get_knn_graph(
-        data.cpx_pos, k=knn, edge_feat=edge_feat, edge_feat_index=edge_feat_index, 
-        graph_type=graph_type, num_workers=num_workers, radius=radius
-        )
-    data.cpx_edge_index = knn_edge_index
-    data.cpx_edge_type = knn_edge_type
-    data.cpx_edge_feature = F.one_hot(knn_edge_type, num_classes=4)
-    return data
-
-def sample_edge_with_radius(data, r=4.0):
-    y_pos = data.y_pos
-    ligand_context_pos = data.ligand_context_pos
-    context_idx = data.context_idx
-    masked_idx = data.masked_idx
-    ligand_bond_index = data.ligand_bond_index
-    ligand_bond_type = data.ligand_bond_type
-    # select the atoms whose distance < r between pos_query as edge samples
-    edge_index_radius = radius(ligand_context_pos, y_pos, r=r, num_workers=16)
-    # get the labels of edge samples
-    mask = [i in masked_idx[0] and j in context_idx[edge_index_radius[1]] for i,j in zip(*ligand_bond_index)]
-    new_idx_1 = torch.nonzero((ligand_bond_index[:,mask][1].view(-1,1) == context_idx).any(0)).view(-1)
-    real_bond_type_in_edge_index_radius = torch.nonzero((new_idx_1.view(-1,1) == edge_index_radius[1]).any(0)).view(-1)
-    edge_label = torch.zeros(edge_index_radius.size(1), dtype=torch.long)
-    edge_label[real_bond_type_in_edge_index_radius] = ligand_bond_type[mask]
-    data.edge_query_index_0, data.edge_query_index_1 = edge_index_radius
-    data.edge_label = edge_label
-    return data
-
-def get_tri_edges(edge_index_query, pos_query, idx_ligand, ligand_bond_index, ligand_bond_type):
-    row, col = edge_index_query
-    acc_num_edges = 0
-    index_real_cps_edge_i_list, index_real_cps_edge_j_list = [], []  # index of real-ctx edge (for attention)
-    for node in torch.arange(pos_query.size(0)):
-        num_edges = (row == node).sum()
-        index_edge_i = torch.arange(num_edges, dtype=torch.long, ) + acc_num_edges
-        index_edge_i, index_edge_j = torch.meshgrid(index_edge_i, index_edge_i, indexing=None)
-        index_edge_i, index_edge_j = index_edge_i.flatten(), index_edge_j.flatten()
-        index_real_cps_edge_i_list.append(index_edge_i)
-        index_real_cps_edge_j_list.append(index_edge_j)
-        acc_num_edges += num_edges
-    index_real_cps_edge_i = torch.cat(index_real_cps_edge_i_list, dim=0).to(pos_query.device)  # add len(real_compose_edge_index) in the dataloader for batch
-    index_real_cps_edge_j = torch.cat(index_real_cps_edge_j_list, dim=0).to(pos_query.device)
-
-    node_a_cps_tri_edge = col[index_real_cps_edge_i]  # the node of tirangle edge for the edge attention (in the compose)
-    node_b_cps_tri_edge = col[index_real_cps_edge_j]
-    n_context = len(idx_ligand)
-    adj_mat = (torch.zeros([n_context, n_context], dtype=torch.long) - torch.eye(n_context, dtype=torch.long))
-    adj_mat = adj_mat.to(ligand_bond_index.device)
-    adj_mat[ligand_bond_index[0], ligand_bond_index[1]] = ligand_bond_type
-    tri_edge_type = adj_mat[node_a_cps_tri_edge, node_b_cps_tri_edge]
-    tri_edge_feat = (tri_edge_type.view([-1, 1]) == torch.tensor([[-1, 0, 1, 2, 3]]).to(tri_edge_type.device)).long()
-
-    index_real_cps_edge_for_atten = torch.stack([
-        index_real_cps_edge_i, index_real_cps_edge_j  # plus len(real_compose_edge_index_0) for dataloader batch
-    ], dim=0)
-    tri_edge_index = torch.stack([
-        node_a_cps_tri_edge, node_b_cps_tri_edge  # plus len(compose_pos) for dataloader batch
-    ], dim=0)
-    return index_real_cps_edge_for_atten, tri_edge_index, tri_edge_feat