mirror of
https://github.com/Saoge123/PocketFlow.git
synced 2026-06-04 12:44:22 +08:00
Delete pocket_flow/utils/.ipynb_checkpoints directory
This commit is contained in:
@@ -1,763 +0,0 @@
|
||||
import os
|
||||
import copy
|
||||
import numpy as np
|
||||
from rdkit import Chem, RDConfig
|
||||
from rdkit.Chem.rdchem import BondType
|
||||
from rdkit.Chem import ChemicalFeatures
|
||||
from .residues_base import RESIDUES_TOPO, RESIDUES_TOPO_WITH_H
|
||||
try:
|
||||
import pymol
|
||||
except:
|
||||
print('we can not compute the atoms on the surface of protein, '\
|
||||
'because pymol can not be imported')
|
||||
pass
|
||||
#from .BaseFeatures import Mol2Graph, RESIDUE_GRAPH_WITHOUT_H, RESIDUE_GRAPH_WITH_H
|
||||
#from .utils import coordinate_adjusting
|
||||
#from .Data import LigandData
|
||||
|
||||
ATOM_TYPE_WITH_HYBIRD = [
|
||||
'SP3_C', 'SP2_C', 'SP_C', 'SP3_N', 'SP2_N', 'SP_N', 'SP3_O', 'SP2_O', 'SP3_F', 'SP3_P',
|
||||
'SP2_P', 'SP3D_P', 'SP3_S', 'SP2_S', 'SP3D_S', 'SP3D2_S', 'SP3_Cl', 'SP3_Br', 'SP3_I'
|
||||
]
|
||||
ATOM_MAP = [6, 6, 6, 7, 7, 7, 8, 8, 9, 15, 15, 15, 16, 16, 16, 16, 17, 35, 53]
|
||||
PT = Chem.GetPeriodicTable()
|
||||
BACKBONE_SYMBOL = {'N', 'CA', 'C', 'O'}
|
||||
AMINO_ACID_TYPE = {
|
||||
'CYS':0, 'GLY':1, 'ALA':2, 'THR':3, 'LYS':4, 'PRO':5, 'VAL':6, 'SER':7, 'ASN':8, 'LEU':9,
|
||||
'GLN':10, 'MET':11, 'ASP':12, 'TRP':13, 'HIS':14, 'GLU':15, 'ARG':16, 'ILE':17, 'PHE':18,
|
||||
'TYR':19
|
||||
}
|
||||
|
||||
'''
|
||||
def coordinate_adjusting(pos, wts):
|
||||
center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
|
||||
pos = pos - center_of_mass
|
||||
|
||||
I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
|
||||
I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
|
||||
I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
|
||||
I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
|
||||
I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
|
||||
I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
|
||||
I = np.array([[I_xx, I_xy, I_xz],
|
||||
[I_xy, I_yy, I_yz],
|
||||
[I_xz, I_yz, I_zz]])
|
||||
eig_v, eig_m = np.linalg.eigh(I) # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
|
||||
# faster algorithm that takes advantage of the fact that the matrix is
|
||||
# symmetric. If you know that your matrix is symmetric, use this function.
|
||||
am = pos@eig_m # 用本真矢矩阵变换原子坐标,以‘摆正’分子
|
||||
|
||||
rotate_around = []
|
||||
x, y, z = am[0]
|
||||
if x < 0:
|
||||
am[..., 0] = am[..., 0] * -1
|
||||
rotate_around.append(0)
|
||||
if y < 0:
|
||||
am[..., 1] = am[..., 1] * -1
|
||||
rotate_around.append(1)
|
||||
if z < 0:
|
||||
am[..., 2] = am[..., 2] * -1
|
||||
rotate_around.append(2)
|
||||
return am, eig_m, center_of_mass, rotate_around
|
||||
'''
|
||||
def coordinate_adjusting(pos, wts):
|
||||
center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
|
||||
pos = pos - center_of_mass
|
||||
|
||||
I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
|
||||
I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
|
||||
I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
|
||||
I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
|
||||
I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
|
||||
I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
|
||||
I = np.array([[I_xx, I_xy, I_xz],
|
||||
[I_xy, I_yy, I_yz],
|
||||
[I_xz, I_yz, I_zz]])
|
||||
eig_v, eig_m = np.linalg.eigh(I) # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
|
||||
# faster algorithm that takes advantage of the fact that the matrix is
|
||||
# symmetric. If you know that your matrix is symmetric, use this function.
|
||||
am = pos@eig_m # 用本真矢矩阵变换原子坐标,以‘摆正’分子
|
||||
return am, eig_m, center_of_mass
|
||||
|
||||
class Dict(dict):
|
||||
__setattr__ = dict.__setitem__
|
||||
__getattr__ = dict.__getitem__
|
||||
|
||||
|
||||
class Atom(object):
|
||||
def __init__(self, atom_info):
|
||||
|
||||
self.idx = int(atom_info[6:11])
|
||||
self.name = atom_info[12:16].strip()
|
||||
self.res_name = atom_info[17:20].strip() # atom_info[17:20].strip()
|
||||
self.chain = atom_info[21:22].strip()
|
||||
self.res_idx = int(atom_info[22:26])
|
||||
self.coord = np.array([float(atom_info[30:38].strip()),
|
||||
float(atom_info[38:46].strip()),
|
||||
float(atom_info[46:54].strip())])
|
||||
self.occupancy = float(atom_info[54:60])
|
||||
self.temperature_factor = float(atom_info[60:66].strip())
|
||||
self.seg_id = atom_info[72:76].strip()
|
||||
self.element = atom_info[76:78].strip()
|
||||
if self.element == 'SE':
|
||||
self.element = 'S'
|
||||
self.name = 'SD'
|
||||
self.res_name = 'MET'
|
||||
self.mass = PT.GetAtomicWeight(self.element)
|
||||
if self.occupancy < 1.0:
|
||||
self.is_disorder = True
|
||||
else:
|
||||
self.is_disorder = False
|
||||
self.is_surf = atom_info.split()[-1] == 'surf'
|
||||
|
||||
@property
|
||||
def to_string(self):
|
||||
fmt = '{:6s}{:5d} {:^4s}{:1s}{:3s} {:1s}{:4d}{:1s} {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f} {:<2s}{:2s}' # https://cupnet.net/pdb-format/
|
||||
out = fmt.format('ATOM', self.idx, self.name, '', self.res_name,
|
||||
self.chain, self.res_idx,'', self.coord[0],
|
||||
self.coord[1], self.coord[2], self.occupancy,
|
||||
self.temperature_factor, self.seg_id, self.element)
|
||||
return out
|
||||
|
||||
@property
|
||||
def to_dict(self):
|
||||
return {
|
||||
'element': PT.GetAtomicNumber(self.element),
|
||||
'pos': self.coord,
|
||||
'is_backbone': self.name in BACKBONE_SYMBOL,
|
||||
'atom_name': self.name,
|
||||
'atom_to_aa_type': AMINO_ACID_TYPE[self.res_name]
|
||||
}
|
||||
|
||||
def __repr__(self):
|
||||
info = 'name={}, index={}, res={}, chain={}, is_disorder={}'.format(self.name,
|
||||
self.idx,
|
||||
self.res_name+str(self.res_idx),
|
||||
self.chain,
|
||||
self.is_disorder)
|
||||
return '{}({})'.format(self.__class__.__name__, info)
|
||||
|
||||
|
||||
class Residue(object):
|
||||
|
||||
def __init__(self, res_info):
|
||||
self.res_info = res_info
|
||||
#atoms_ = [Atom(i) for i in res_info]
|
||||
self.atom_dict = {}
|
||||
disorder = []
|
||||
for i in res_info: # 排除disorder原子
|
||||
atom = Atom(i)
|
||||
if atom.name in self.atom_dict:
|
||||
continue
|
||||
else:
|
||||
self.atom_dict[atom.name] = atom
|
||||
disorder.append(atom.is_disorder)
|
||||
if atom.res_name == 'MSE':
|
||||
atom.res_name = 'MET' # 把MSE残基改成MET
|
||||
|
||||
if True in disorder:
|
||||
self.is_disorder = True
|
||||
else:
|
||||
self.is_disorder = False
|
||||
|
||||
self.idx = self.atom_dict[atom.name].res_idx
|
||||
self.chain = self.atom_dict[atom.name].chain
|
||||
self.name = self.atom_dict[atom.name].res_name
|
||||
self.is_perfect = True if len(self.get_heavy_atoms)==len(RESIDUES_TOPO[self.name]) else False
|
||||
|
||||
@property
|
||||
def to_heavy_string(self):
|
||||
return '\n'.join([a.to_string for a in self.get_heavy_atoms])
|
||||
|
||||
@property
|
||||
def to_string(self):
|
||||
return '\n'.join([a.to_string for a in self.get_atoms])
|
||||
|
||||
@property
|
||||
def get_coords(self):
|
||||
return np.array([a.coord for a in self.get_atoms])
|
||||
|
||||
@property
|
||||
def get_atoms(self):
|
||||
return list(self.atom_dict.values())
|
||||
|
||||
@property
|
||||
def get_heavy_atoms(self):
|
||||
return [a for a in self.atom_dict.values() if 'H' not in a.element]
|
||||
|
||||
@property
|
||||
def get_heavy_coords(self):
|
||||
return np.array([a.coord for a in self.get_heavy_atoms])
|
||||
|
||||
@property
|
||||
def center_of_mass(self):
|
||||
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
|
||||
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
|
||||
|
||||
@property
|
||||
def bond_graph(self):
|
||||
i, j, bt = [], [], []
|
||||
res_graph = RESIDUES_TOPO[self.name]
|
||||
atom_names = [i.name for i in self.get_heavy_atoms]
|
||||
for ix, name in enumerate(atom_names):
|
||||
for adj in res_graph[name]:
|
||||
if adj in atom_names:
|
||||
idx_j = atom_names.index(adj)
|
||||
i.append(ix)
|
||||
j.append(idx_j)
|
||||
bt.append(res_graph[name][adj])
|
||||
edge_index = np.stack([i,j]).astype(dtype=np.int64)
|
||||
bt = np.array(bt, dtype=np.int64)
|
||||
return edge_index, bt
|
||||
|
||||
@property
|
||||
def centroid(self):
|
||||
return self.get_coords.mean(axis=0)
|
||||
|
||||
def __repr__(self):
|
||||
info = 'name={}, index={}, chain={}, is_disorder={}, is_perfect={}'.format(
|
||||
self.name, self.idx, self.chain, self.is_disorder, self.is_perfect
|
||||
)
|
||||
return '{}({})'.format(self.__class__.__name__, info)
|
||||
|
||||
|
||||
class Chain(object):
|
||||
def __init__(self, chain_info, ignore_incomplete_res=True, pdb_file=None):
|
||||
self.pdb_file = pdb_file
|
||||
self.res_dict = {}
|
||||
'''if ignore_incomplete_res:
|
||||
self.residues = {}
|
||||
for i in chain_info:
|
||||
res = Residue(chain_info[i])
|
||||
if len(res.get_heavy_atoms) == len(RESIDUES_TOPO[res.name]):
|
||||
self.residues[i] = res
|
||||
else:
|
||||
self.residues = {i:Residue(chain_info[i]) for i in chain_info}'''
|
||||
self.residues = {i:Residue(chain_info[i]) for i in chain_info}
|
||||
#print(self.residues)
|
||||
self.chain = list(self.residues.values())[0].chain
|
||||
self.__normalized__ = False
|
||||
self.center_of_mass_shift = None
|
||||
self.rotate_matrix = None
|
||||
self.ignore_incomplete_res =ignore_incomplete_res
|
||||
|
||||
def normalize_coord(self):
|
||||
wts = np.array([i.mass for i in self.get_atoms])
|
||||
wts = np.expand_dims(wts,axis=1)
|
||||
pos = self.get_coords
|
||||
|
||||
am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
|
||||
for ix, a in enumerate(self.get_atoms):
|
||||
a.coord = am[ix]
|
||||
self.__normalized__ = True
|
||||
|
||||
@property
|
||||
def get_incomplete_residues(self):
|
||||
return [i for i in self.residues.values() if i.is_perfect==False]
|
||||
|
||||
@property
|
||||
def to_heavy_string(self):
|
||||
return '\n'.join([res.to_heavy_string for res in self.get_residues])
|
||||
|
||||
@property
|
||||
def to_string(self):
|
||||
return '\n'.join([res.to_string for res in self.get_residues])
|
||||
|
||||
@property
|
||||
def get_atoms(self):
|
||||
atoms = []
|
||||
for res in self.get_residues:
|
||||
atoms.extend(res.get_atoms)
|
||||
return atoms
|
||||
|
||||
@property
|
||||
def get_residues(self):
|
||||
if self.ignore_incomplete_res:
|
||||
return [i for i in self.residues.values() if i.is_perfect]
|
||||
else:
|
||||
return list(self.residues.values())
|
||||
|
||||
@property
|
||||
def get_heavy_atoms(self):
|
||||
atoms = []
|
||||
for res in self.get_residues:
|
||||
atoms.extend(res.get_heavy_atoms)
|
||||
return atoms
|
||||
|
||||
@property
|
||||
def get_coords(self):
|
||||
return np.array([i.coord for i in self.get_atoms])
|
||||
|
||||
@property
|
||||
def get_heavy_coords(self):
|
||||
return np.array([i.coord for i in self.get_heavy_atoms])
|
||||
|
||||
@property
|
||||
def center_of_mass(self):
|
||||
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
|
||||
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
|
||||
|
||||
@property
|
||||
def centroid(self):
|
||||
return self.get_coords.mean(axis=0)
|
||||
|
||||
def compute_surface_atoms(self):
|
||||
path, filename = os.path.split(self.pdb_file)
|
||||
chain_file_name = filename.split('.')[0] + '_' + self.chain + '.pdb'
|
||||
chain_file = path+'/'+chain_file_name
|
||||
with open(chain_file, 'w') as fw:
|
||||
fw.write(self.to_heavy_string)
|
||||
|
||||
pymol.cmd.load(chain_file)
|
||||
if path == '':
|
||||
path = './'
|
||||
sele = chain_file_name.split('.')[0]
|
||||
pymol.cmd.remove("({}) and hydro".format(sele))
|
||||
name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
|
||||
save_name = path + '/' + sele + '-surface.pdb'
|
||||
pymol.cmd.save(save_name,((name)))
|
||||
surf_protein = Protein(save_name, ignore_incomplete_res=False)
|
||||
surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
|
||||
for res in self.get_residues:
|
||||
res_idx = res.idx
|
||||
if res_idx in surf_res_dict:
|
||||
for a in surf_res_dict[res_idx].get_heavy_atoms:
|
||||
res.atom_dict[a.name].is_surf = True
|
||||
self.has_surf_atom = True
|
||||
os.remove(save_name)
|
||||
os.remove(chain_file)
|
||||
|
||||
def get_surf_mask(self):
|
||||
if self.has_surf_atom is False:
|
||||
self.compute_surface_atoms()
|
||||
return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
|
||||
|
||||
def get_res_by_id(self, res_id):
|
||||
return self.residues[res_id]
|
||||
|
||||
def __repr__(self):
|
||||
tmp = 'Chain={}, NumResidues={}, NumAtoms={}, NumHeavyAtoms={}'
|
||||
info = tmp.format(self.chain, len(self.residues), self.get_coords.shape[0], self.get_heavy_coords.shape[0])
|
||||
return '{}({})'.format(self.__class__.__name__, info)
|
||||
|
||||
|
||||
class Protein(object):
|
||||
|
||||
def __init__(self, pdb_file, ignore_incomplete_res=True):
|
||||
self.ignore_incomplete_res = ignore_incomplete_res
|
||||
self.name = pdb_file.split('/')[-1].split('.')[0]
|
||||
self.pdb_file = pdb_file
|
||||
self.has_surf_atom = False
|
||||
#self.pdb = [line.strip() for line in open(pdb_file).readlines()]
|
||||
#atoms = os.popen('grep ATOM {}'.format(pdb_file)).read().strip().split('\n')
|
||||
with open(pdb_file) as fr:
|
||||
lines = fr.readlines()
|
||||
surf_item = lines[0].strip().split()[-1]
|
||||
if surf_item in {'surf', 'inner'}:
|
||||
self.has_surf_atom = True
|
||||
chain_info = {}
|
||||
for line in lines:
|
||||
if line.startswith('ATOM'):
|
||||
line = line.strip()
|
||||
chain = line[21:22].strip()
|
||||
res_idx = int(line[22:26].strip())
|
||||
if chain not in chain_info:
|
||||
chain_info[chain] = {}
|
||||
chain_info[chain][res_idx] = [line]
|
||||
elif res_idx not in chain_info[chain]:
|
||||
chain_info[chain][res_idx] = [line]
|
||||
else:
|
||||
chain_info[chain][res_idx].append(line)
|
||||
|
||||
self.chains = {
|
||||
c:Chain(chain_info[c], ignore_incomplete_res=ignore_incomplete_res, pdb_file=pdb_file) for c in chain_info
|
||||
}
|
||||
self.__normalized__ = False
|
||||
self.center_of_mass_shift = None
|
||||
self.rotate_matrix = None
|
||||
|
||||
def normalize_coord(self):
|
||||
is_chain_normalized = [c for c in self.chains if self.chains[c].__normalized__==True]
|
||||
if len(is_chain_normalized) > 0:
|
||||
chain_normalized = ' '.join(is_chain_normalized)
|
||||
info = 'Warning: The Chains ({}) have been normalized '\
|
||||
'independently, this may lead to wrong coordinate '\
|
||||
'transformation for whole Protein !!!'.format(chain_normalized)
|
||||
print(info)
|
||||
|
||||
wts = np.array([i.mass for i in self.get_atoms])
|
||||
wts = np.expand_dims(wts,axis=1)
|
||||
pos = self.get_coords
|
||||
|
||||
am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
|
||||
for ix, a in enumerate(self.get_atoms):
|
||||
a.coord = am[ix]
|
||||
self.__normalized__ = True
|
||||
|
||||
@property
|
||||
def get_incomplete_residues(self):
|
||||
res_list = []
|
||||
for i in self.chains:
|
||||
res_list += self.chains[i].get_incomplete_residues
|
||||
return res_list
|
||||
|
||||
@property
|
||||
def to_heavy_string(self):
|
||||
return '\n'.join([res.to_heavy_string for res in self.get_residues])
|
||||
|
||||
@property
|
||||
def to_string(self):
|
||||
return '\n'.join([res.to_string for res in self.get_residues])
|
||||
|
||||
@property
|
||||
def get_residues(self,):
|
||||
res_list = []
|
||||
for i in self.chains:
|
||||
res_list += self.chains[i].get_residues
|
||||
return res_list
|
||||
|
||||
@property
|
||||
def get_atoms(self):
|
||||
atoms = []
|
||||
for res in self.get_residues:
|
||||
atoms.extend(res.get_atoms)
|
||||
return atoms
|
||||
|
||||
@property
|
||||
def get_heavy_atoms(self):
|
||||
atoms = []
|
||||
for res in self.get_residues:
|
||||
atoms.extend(res.get_heavy_atoms)
|
||||
return atoms
|
||||
|
||||
@property
|
||||
def get_coords(self):
|
||||
return np.array([i.coord for i in self.get_atoms])
|
||||
|
||||
@property
|
||||
def get_heavy_coords(self):
|
||||
return np.array([i.coord for i in self.get_heavy_atoms])
|
||||
|
||||
@property
|
||||
def center_of_mass(self):
|
||||
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
|
||||
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
|
||||
|
||||
@property
|
||||
def bond_graph(self):
|
||||
res_list = self.get_residues
|
||||
bond_index = []
|
||||
bond_type = []
|
||||
N_term_list = []
|
||||
C_term_list = []
|
||||
cusum = 0
|
||||
for ix, res in enumerate(res_list):
|
||||
e_idx, e_type = res.bond_graph
|
||||
bond_index.append(e_idx + cusum)
|
||||
bond_type.append(e_type)
|
||||
N_term_ix = [i.name for i in self.get_heavy_atoms].index('N') + cusum
|
||||
C_term_ix = [i.name for i in self.get_heavy_atoms].index('C') + cusum
|
||||
N_term_list.append(N_term_ix)
|
||||
C_term_list.append(C_term_ix)
|
||||
cusum += res.get_heavy_coords.shape[0]
|
||||
if ix != 0:
|
||||
if res.idx-res_list[ix-1].idx == 1 and res.chain == res_list[ix-1].chain:
|
||||
bond_idx_between_res = np.array(
|
||||
[[N_term_ix, C_term_list[ix-1]],[C_term_list[ix-1],N_term_ix]],
|
||||
dtype=np.long
|
||||
)
|
||||
bond_index.append(bond_idx_between_res)
|
||||
bond_type_between_res = np.array([1,1], dtype=np.long)
|
||||
bond_type.append(bond_type_between_res)
|
||||
bond_index = np.concatenate(bond_index, axis=1)
|
||||
bond_type = np.concatenate(bond_type)
|
||||
return bond_index, bond_type
|
||||
|
||||
@property
|
||||
def centroid(self):
|
||||
return self.get_coords.mean(axis=0)
|
||||
|
||||
def get_chain(self, chain_id):
|
||||
return self.chains[chain_id]
|
||||
|
||||
def get_res_by_id(self, chain_id, res_id):
|
||||
return self.chains[chain_id].get_res_by_id(res_id)
|
||||
|
||||
def get_atom_dict(self, removeHs=True, get_surf=False):
|
||||
atom_dict = {
|
||||
'element': [],
|
||||
'pos': [],
|
||||
'is_backbone': [],
|
||||
'atom_name': [],
|
||||
'atom_to_aa_type': []
|
||||
}
|
||||
for a in self.get_atoms:
|
||||
if a.element == 'H' and removeHs:
|
||||
continue
|
||||
atom_dict['element'].append(a.to_dict['element'])
|
||||
atom_dict['pos'].append(a.to_dict['pos'])
|
||||
atom_dict['is_backbone'].append(a.to_dict['is_backbone'])
|
||||
atom_dict['atom_name'].append(a.to_dict['atom_name'])
|
||||
atom_dict['atom_to_aa_type'].append(a.to_dict['atom_to_aa_type'])
|
||||
atom_dict['element'] = np.array(atom_dict['element'], dtype=np.long)
|
||||
atom_dict['pos'] = np.array(atom_dict['pos'], dtype=np.float32)
|
||||
atom_dict['is_backbone'] = np.array(atom_dict['is_backbone'], dtype=np.bool)
|
||||
#atom_dict['atom_name'] = atom_dict['atom_name']
|
||||
if get_surf:
|
||||
atom_dict['surface_mask'] = np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)#self.get_surf_mask()
|
||||
|
||||
atom_dict['atom_to_aa_type'] = np.array(atom_dict['atom_to_aa_type'], dtype=np.long)
|
||||
atom_dict['molecule_name'] = None
|
||||
protein_bond_index, protein_bond_type = self.bond_graph
|
||||
atom_dict['bond_index'] = protein_bond_index
|
||||
atom_dict['bond_type'] = protein_bond_type
|
||||
atom_dict['filename'] = self.pdb_file
|
||||
return atom_dict
|
||||
|
||||
def get_backbone_dict(self, removeHs=True):
|
||||
atom_dict = self.get_atom_dict(removeHs=removeHs)
|
||||
backbone_dict = {}
|
||||
backbone_dict['element'] = atom_dict['element'][atom_dict['is_backbone']]
|
||||
backbone_dict['pos'] = atom_dict['pos'][atom_dict['is_backbone']]
|
||||
backbone_dict['is_backbone'] = np.ones(atom_dict['is_backbone'].sum(), dtype=np.bool)
|
||||
backbone_dict['atom_name'] = np.array(atom_dict['atom_name'])[atom_dict['is_backbone']].tolist()
|
||||
backbone_dict['atom_to_aa_type'] = atom_dict['atom_to_aa_type'][atom_dict['is_backbone']]
|
||||
backbone_dict['molecule_name'] = atom_dict['molecule_name']
|
||||
atom_dict['bond_index'] = np.empty([0,2], dtype=np.long)
|
||||
atom_dict['bond_type'] = np.empty(0, dtype=np.long)
|
||||
atom_dict['filename'] = self.pdb_file
|
||||
return backbone_dict
|
||||
|
||||
@property
|
||||
def get_backbone(self):
|
||||
atoms = []
|
||||
for res in self.get_residues:
|
||||
bkb = [a for a in res.get_atoms if a.name in BACKBONE_SYMBOL]
|
||||
atoms += bkb
|
||||
return atoms
|
||||
|
||||
def compute_surface_atoms(self):
|
||||
"""
|
||||
If the pdb_file is a pocket_file, the surface atoms is not correct!
|
||||
"""
|
||||
pymol.cmd.load(self.pdb_file)
|
||||
path, filename = os.path.split(self.pdb_file)
|
||||
if path == '':
|
||||
path = './'
|
||||
sele = filename.split('.')[0]
|
||||
pymol.cmd.remove("({}) and hydro".format(sele))
|
||||
name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
|
||||
save_name = path + '/' + sele + '-surface.pdb'
|
||||
pymol.cmd.save(save_name,((name)))
|
||||
surf_protein = Protein(save_name, ignore_incomplete_res=False)
|
||||
surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
|
||||
for res in self.get_residues:
|
||||
res_idx = res.idx
|
||||
if res_idx in surf_res_dict:
|
||||
for a in surf_res_dict[res_idx].get_heavy_atoms:
|
||||
res.atom_dict[a.name].is_surf = True
|
||||
self.has_surf_atom = True
|
||||
os.remove(save_name)
|
||||
|
||||
def get_surf_mask(self):
|
||||
if self.has_surf_atom is False:
|
||||
self.compute_surface_atoms()
|
||||
return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
|
||||
|
||||
@staticmethod
|
||||
def empty_dict():
|
||||
empty_pocket_dict = {}
|
||||
empty_pocket_dict = Dict(empty_pocket_dict)
|
||||
empty_pocket_dict.element = np.empty(0, dtype=np.int64)
|
||||
empty_pocket_dict.pos = np.empty([0,3], dtype=np.float32)
|
||||
empty_pocket_dict.is_backbone = np.empty(0, dtype=bool)
|
||||
empty_pocket_dict.atom_name = []
|
||||
empty_pocket_dict.atom_to_aa_type = np.empty(0, dtype=np.int64)
|
||||
empty_pocket_dict.molecule_name = None
|
||||
empty_pocket_dict.bond_index = np.empty([2,0], dtype=np.int64)
|
||||
empty_pocket_dict.bond_type = np.empty(0, dtype=np.int64)
|
||||
empty_pocket_dict.filename = None
|
||||
return empty_pocket_dict
|
||||
|
||||
def __repr__(self):
|
||||
num_res = 0
|
||||
num_atom = 0
|
||||
for i in self.chains:
|
||||
res_list = list(self.chains[i].residues.values())
|
||||
num_res += len(res_list)
|
||||
for i in res_list:
|
||||
num_atom += len(i.get_heavy_atoms)
|
||||
num_incomp = len(self.get_incomplete_residues)
|
||||
tmp = 'Name={}, NumChains={}, NumResidues={}, NumHeavyAtoms={}, NumIncompleteRes={}'
|
||||
info = tmp.format(
|
||||
self.name, len(self.chains), num_res, num_atom, num_incomp
|
||||
)
|
||||
return '{}({})'.format(self.__class__.__name__, info)
|
||||
|
||||
####################################################################################################################
|
||||
|
||||
|
||||
ATOM_FAMILIES = ['Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe', 'NegIonizable', 'PosIonizable', 'ZnBinder']
|
||||
ATOM_FAMILIES_ID = {s: i for i, s in enumerate(ATOM_FAMILIES)}
|
||||
BOND_TYPES = {t: i for i, t in enumerate(BondType.names.values())}
|
||||
BOND_NAMES = {i: t for i, t in enumerate(BondType.names.keys())}
|
||||
|
||||
|
||||
def is_in_ring(mol):
|
||||
d = {a:np.array([], dtype=np.int64) for a in range(len(mol.GetAtoms()))}
|
||||
rings = Chem.GetSymmSSSR(mol)
|
||||
for a in d:
|
||||
for r_idx, ring in enumerate(rings):
|
||||
if a in ring:
|
||||
d[a] = np.append(d[a], r_idx+1)
|
||||
else:
|
||||
d[a] = np.append(d[a], -a)
|
||||
return d
|
||||
|
||||
def parse_sdf_to_dict(mol_file):
|
||||
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
|
||||
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
|
||||
rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
|
||||
Chem.Kekulize(rdmol)
|
||||
ring_info = is_in_ring(rdmol)
|
||||
conformer = rdmol.GetConformer()
|
||||
feat_mat = np.zeros([rdmol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
|
||||
for feat in factory.GetFeaturesForMol(rdmol):
|
||||
feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
|
||||
|
||||
element, pos, atom_mass = [], [], []
|
||||
for a in rdmol.GetAtoms():
|
||||
element.append(a.GetAtomicNum())
|
||||
pos.append(conformer.GetAtomPosition(a.GetIdx()))
|
||||
atom_mass.append(a.GetMass())
|
||||
element = np.array(element, dtype=np.int64)
|
||||
pos = np.array(pos, dtype=np.float32)
|
||||
atom_mass = np.array(atom_mass, np.float32)
|
||||
center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
|
||||
|
||||
edge_index, edge_type = [], []
|
||||
for b in rdmol.GetBonds():
|
||||
row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
|
||||
col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
|
||||
edge_index.extend([row, col])
|
||||
edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
|
||||
edge_index = np.array(edge_index)
|
||||
edge_index_perm = edge_index[:,0].argsort()
|
||||
edge_index = edge_index[edge_index_perm].T
|
||||
edge_type = np.array(edge_type)[edge_index_perm]
|
||||
|
||||
return {'element': element,
|
||||
'pos': pos,
|
||||
'bond_index': edge_index,
|
||||
'bond_type': edge_type,
|
||||
'center_of_mass': center_of_mass,
|
||||
'atom_feature': feat_mat,
|
||||
'ring_info': ring_info,
|
||||
'filename':mol_file
|
||||
}
|
||||
|
||||
|
||||
from scipy.spatial.distance import pdist
|
||||
from scipy.spatial.distance import squareform
|
||||
|
||||
|
||||
class Ligand(object):
|
||||
def __init__(self, mol_file, removeHs=True, sanitize=True):
|
||||
if isinstance(mol_file, Chem.rdchem.Mol):
|
||||
mol = mol_file
|
||||
self.name = None
|
||||
self.lig_file = None
|
||||
else:
|
||||
mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=sanitize)
|
||||
if mol is None:
|
||||
mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=False)
|
||||
mol.UpdatePropertyCache(strict=False)
|
||||
Chem.SanitizeMol(
|
||||
mol, ## 如果报错可以尝试:Chem.rdmolops.SanitizeFlags.SANITIZE_FINDRADICALS
|
||||
Chem.SanitizeFlags.SANITIZE_FINDRADICALS|\
|
||||
Chem.SanitizeFlags.SANITIZE_KEKULIZE|\
|
||||
Chem.SanitizeFlags.SANITIZE_SETAROMATICITY| \
|
||||
Chem.SanitizeFlags.SANITIZE_SETCONJUGATION| \
|
||||
Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION| \
|
||||
Chem.SanitizeFlags.SANITIZE_SYMMRINGS,
|
||||
catchErrors=True
|
||||
)
|
||||
self.name = mol_file.split('/')[-1].split('.')[0]
|
||||
self.lig_file = mol_file
|
||||
|
||||
Chem.Kekulize(mol)
|
||||
self.mol = mol
|
||||
#self.self_loop = self_loop
|
||||
self.num_atoms = len(self.mol.GetAtoms())
|
||||
self.normalized_coords = None
|
||||
|
||||
def normalize_pos(self, shift_vector, rotate_matrix):
|
||||
conformer = self.mol.GetConformer()
|
||||
coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in self.mol.GetAtoms()])
|
||||
coords = (coords - shift_vector)@rotate_matrix
|
||||
for ix, pos in enumerate(coords):
|
||||
conformer.SetAtomPosition(ix, pos)
|
||||
self.normalized_coords = coords
|
||||
|
||||
def mol_block(self):
|
||||
return Chem.MolToMolBlock(self.mol)
|
||||
|
||||
def to_dict(self):
|
||||
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
|
||||
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
|
||||
#rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
|
||||
ring_info = is_in_ring(self.mol)
|
||||
conformer = self.mol.GetConformer()
|
||||
feat_mat = np.zeros([self.mol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
|
||||
for feat in factory.GetFeaturesForMol(self.mol):
|
||||
feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
|
||||
|
||||
element, pos, atom_mass = [], [], []
|
||||
for a in self.mol.GetAtoms():
|
||||
element.append(a.GetAtomicNum())
|
||||
pos.append(conformer.GetAtomPosition(a.GetIdx()))
|
||||
atom_mass.append(a.GetMass())
|
||||
element = np.array(element, dtype=np.int64)
|
||||
pos = np.array(pos, dtype=np.float32)
|
||||
atom_mass = np.array(atom_mass, np.float32)
|
||||
center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
|
||||
|
||||
edge_index, edge_type = [], []
|
||||
for b in self.mol.GetBonds():
|
||||
row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
|
||||
col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
|
||||
edge_index.extend([row, col])
|
||||
edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
|
||||
edge_index = np.array(edge_index)
|
||||
edge_index_perm = edge_index[:,0].argsort()
|
||||
edge_index = edge_index[edge_index_perm].T
|
||||
edge_type = np.array(edge_type)[edge_index_perm]
|
||||
|
||||
return {'element': element,
|
||||
'pos': pos,
|
||||
'bond_index': edge_index,
|
||||
'bond_type': edge_type,
|
||||
'center_of_mass': center_of_mass,
|
||||
'atom_feature': feat_mat,
|
||||
'ring_info': ring_info,
|
||||
'filename':self.lig_file
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def empty_dict():
|
||||
empty_ligand_dict = {}
|
||||
empty_ligand_dict = Dict(empty_ligand_dict)
|
||||
empty_ligand_dict.element = np.empty(0, dtype=np.int64)
|
||||
empty_ligand_dict.pos = np.empty([0,3], dtype=np.float32)
|
||||
empty_ligand_dict.bond_index = np.empty([2,0], dtype=np.int64)
|
||||
empty_ligand_dict.bond_type = np.empty(0, dtype=np.int64)
|
||||
empty_ligand_dict.center_of_mass = np.empty([0,3], dtype=np.float32)
|
||||
empty_ligand_dict.atom_feature = np.empty([0,8], dtype=np.float32)
|
||||
empty_ligand_dict.ring_info = {}
|
||||
empty_ligand_dict.filename = None
|
||||
return empty_ligand_dict
|
||||
|
||||
def __repr__(self):
|
||||
tmp = 'Name={}, NumAtoms={}'
|
||||
info = tmp.format(self.name, self.num_atoms)
|
||||
return '{}({})'.format(self.__class__.__name__, info)
|
||||
@@ -1,13 +0,0 @@
|
||||
from .data import *
|
||||
from .generate_utils import *
|
||||
from .ParseFile import *
|
||||
from .train import *
|
||||
from .transform_utils import *
|
||||
from .transform import *
|
||||
from .load_dataset import *
|
||||
from .residues_base import *
|
||||
from .process_raw import *
|
||||
from .metrics import *
|
||||
from .mol_filter import *
|
||||
from . import sascorer
|
||||
from .draw import *
|
||||
@@ -1,133 +0,0 @@
|
||||
from rdkit import Chem
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy.stats import gaussian_kde
|
||||
from tqdm.auto import tqdm
|
||||
from multiprocessing import Pool
|
||||
#from pocket_flow.utils import CrossDocked2020
|
||||
|
||||
from matplotlib.collections import PathCollection
|
||||
from matplotlib.legend_handler import HandlerLine2D
|
||||
|
||||
|
||||
BOND_SYMBOL = {'SINGLE':'-', 'DOUBLE':'=', 'TRIPLE':'#', 'AROMATIC':'@'}
|
||||
def compute_bond_length(mol, atomatic=True):
|
||||
#Chem.Kekulize(mol)
|
||||
conformer = mol.GetConformer()
|
||||
coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in mol.GetAtoms()])
|
||||
bond_dict = {}
|
||||
for b in mol.GetBonds():
|
||||
bond_type = b.GetBondType().__str__()
|
||||
if b.GetIsAromatic() and atomatic:
|
||||
bond_type = 'AROMATIC'
|
||||
bond_symbol = BOND_SYMBOL[bond_type]
|
||||
a_ix_start = b.GetBeginAtomIdx()
|
||||
a_ix_end = b.GetEndAtomIdx()
|
||||
bond_lenth = np.linalg.norm(coords[a_ix_start]-coords[a_ix_end])
|
||||
#string_bond_lenth = '{:.3f}'.format(bond_lenth)
|
||||
|
||||
symbol_start = b.GetBeginAtom().GetSymbol()
|
||||
symbol_end = b.GetEndAtom().GetSymbol()
|
||||
bond_key = bond_symbol.join(sorted([symbol_start, symbol_end]))
|
||||
if bond_key not in bond_dict:
|
||||
bond_dict[bond_key] = [bond_lenth]
|
||||
else:
|
||||
bond_dict[bond_key].append(bond_lenth)
|
||||
return bond_dict
|
||||
|
||||
|
||||
def bond_statistic(inputs, from_file=True, interval=5000, n_p=16, sanitize=True):
|
||||
BOND_DICT = {}
|
||||
|
||||
for idx in tqdm(range(0, len(inputs), interval)):
|
||||
if idx+interval >= len(inputs):
|
||||
raw_files = inputs[idx:]
|
||||
else:
|
||||
raw_files = inputs[idx:idx+interval]
|
||||
|
||||
if from_file:
|
||||
mol_list = []
|
||||
for sdf in raw_files:
|
||||
mol = Chem.MolFromMolFile(sdf[1], sanitize=True)
|
||||
if mol:
|
||||
mol_list.append(mol)
|
||||
else:
|
||||
mol_list = raw_files
|
||||
pool = Pool(processes=n_p)
|
||||
dict_list = pool.map(compute_bond_length, mol_list)
|
||||
for d in dict_list:
|
||||
for k in d:
|
||||
if k in BOND_DICT:
|
||||
BOND_DICT[k] += d[k]
|
||||
else:
|
||||
BOND_DICT[k] = d[k]
|
||||
return BOND_DICT
|
||||
|
||||
'''
|
||||
import pickle
|
||||
|
||||
BOND_DICT = bond_statistic(cs2020.index, from_file=True, interval=5000, n_p=16)
|
||||
with open('BondLenthStatistic.pkl','wb') as fw:
|
||||
pickle.dump(BOND_DICT, fw)
|
||||
del BOND_DICT
|
||||
|
||||
with open('BondLenthStatistic.pkl', 'rb') as fr:
|
||||
BOND_DICT = pickle.load(fr)
|
||||
'''
|
||||
|
||||
class VizBondDensity(object):
|
||||
|
||||
def __init__(self, data_bond_dict, gen_bond_dict, linewidth=0.1, color_data="#BC5F6A", color_gen="#19B3B1",
|
||||
transparency=0.2, legend_size=25, axis_label_size=30, tick_size=25, is_fill=True,
|
||||
figsize=(14, 9)):
|
||||
self.data_bond_dict = data_bond_dict
|
||||
self.gen_bond_dict = gen_bond_dict
|
||||
self.linewidth = linewidth
|
||||
self.color_data = color_data
|
||||
self.color_gen = color_gen
|
||||
self.transparency = transparency
|
||||
self.legend_size = legend_size
|
||||
self.axis_label_size = axis_label_size
|
||||
self.tick_size = tick_size
|
||||
self.is_fill = is_fill
|
||||
self.figsize = figsize
|
||||
|
||||
def draw(self, bond_type, save_path=None, dpi=300):
|
||||
#bond_type = 'O=S'
|
||||
density = gaussian_kde(np.array(self.data_bond_dict[bond_type]))
|
||||
density.covariance_factor = lambda : .25
|
||||
density._compute_covariance()
|
||||
|
||||
density_gen = gaussian_kde(np.array(self.gen_bond_dict[bond_type]))
|
||||
density_gen.covariance_factor = lambda : .25
|
||||
density_gen._compute_covariance()
|
||||
|
||||
# Create a vector of 200 values going from 0 to 8:
|
||||
xs = np.linspace(1, 2, 200)
|
||||
|
||||
# Set the figure size
|
||||
plt.figure(figsize=self.figsize)
|
||||
|
||||
#设置图片的右边框和上边框为不显示
|
||||
ax=plt.gca() #gca:get current axis得到当前轴
|
||||
ax.spines['right'].set_color('none')
|
||||
ax.spines['top'].set_color('none')
|
||||
|
||||
# 开始画图
|
||||
plt.plot(xs, density(xs), color=self.color_data, alpha=self.transparency, linewidth=self.linewidth)
|
||||
if self.is_fill:
|
||||
plt.fill_between(xs, density(xs), color=self.color_data, alpha=self.transparency, label='Dataset')
|
||||
|
||||
plt.plot(xs, density_gen(xs), color=self.color_gen, alpha=self.transparency, linewidth=self.linewidth)
|
||||
if self.is_fill:
|
||||
plt.fill_between(xs, density_gen(xs), color=self.color_gen, alpha=self.transparency, label='Generate')
|
||||
plt.legend(prop={'size': self.legend_size})#framealpha=1, handler_map={plt.Line2D: HandlerLine2D(update_func=updateline)},
|
||||
#frameon=True, fontsize=20)
|
||||
plt.yticks(size=self.tick_size) # fontproperties = 'Times New Roman',
|
||||
plt.xticks(size=self.tick_size) # fontproperties = 'Times New Roman',
|
||||
plt.xlabel('Bond Length (Å)', fontdict={'size':self.axis_label_size}) # 'family': Arial
|
||||
plt.ylabel('Density', fontdict={'size':self.axis_label_size})
|
||||
if save_path:
|
||||
plt.savefig(save_path + '/' + '{}.png'.format(bond_type), dpi=dpi)
|
||||
#plt.show()
|
||||
return plt
|
||||
@@ -1,183 +0,0 @@
|
||||
import os
|
||||
import subprocess
|
||||
import random
|
||||
import string
|
||||
from easydict import EasyDict
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
|
||||
|
||||
from .reconstruct import reconstruct_from_generated
|
||||
|
||||
|
||||
def get_random_id(length=30):
|
||||
letters = string.ascii_lowercase
|
||||
return ''.join(random.choice(letters) for i in range(length))
|
||||
|
||||
|
||||
def load_pdb(path):
|
||||
with open(path, 'r') as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def parse_qvina_outputs(docked_sdf_path):
|
||||
|
||||
suppl = Chem.SDMolSupplier(docked_sdf_path)
|
||||
results = []
|
||||
for i, mol in enumerate(suppl):
|
||||
if mol is None:
|
||||
continue
|
||||
line = mol.GetProp('REMARK').splitlines()[0].split()[2:]
|
||||
results.append(EasyDict({
|
||||
'rdmol': mol,
|
||||
'mode_id': i,
|
||||
'affinity': float(line[0]),
|
||||
'rmsd_lb': float(line[1]),
|
||||
'rmsd_ub': float(line[2]),
|
||||
}))
|
||||
|
||||
return results
|
||||
|
||||
class BaseDockingTask(object):
|
||||
|
||||
def __init__(self, pdb_block, ligand_rdmol):
|
||||
super().__init__()
|
||||
self.pdb_block = pdb_block
|
||||
self.ligand_rdmol = ligand_rdmol
|
||||
|
||||
def run(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_results(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class QVinaDockingTask(BaseDockingTask):
|
||||
|
||||
@classmethod
|
||||
def from_generated_data(cls, data, protein_root='./data/crossdocked', **kwargs):
|
||||
protein_fn = os.path.join(
|
||||
os.path.dirname(data.ligand_filename),
|
||||
os.path.basename(data.ligand_filename)[:10] + '.pdb'
|
||||
)
|
||||
protein_path = os.path.join(protein_root, protein_fn)
|
||||
with open(protein_path, 'r') as f:
|
||||
pdb_block = f.read()
|
||||
ligand_rdmol = reconstruct_from_generated(data)
|
||||
return cls(pdb_block, ligand_rdmol, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_original_data(cls, data, ligand_root='./data/crossdocked_pocket10', protein_root='./data/crossdocked', **kwargs):
|
||||
protein_fn = os.path.join(
|
||||
os.path.dirname(data.ligand_filename),
|
||||
os.path.basename(data.ligand_filename)[:10] + '.pdb'
|
||||
)
|
||||
protein_path = os.path.join(protein_root, protein_fn)
|
||||
with open(protein_path, 'r') as f:
|
||||
pdb_block = f.read()
|
||||
|
||||
ligand_path = os.path.join(ligand_root, data.ligand_filename)
|
||||
ligand_rdmol = next(iter(Chem.SDMolSupplier(ligand_path)))
|
||||
return cls(pdb_block, ligand_rdmol, **kwargs)
|
||||
|
||||
def __init__(self, pdb_block, ligand_rdmol, conda_env='adt', tmp_dir='./tmp', use_uff=True, center=None):
|
||||
super().__init__(pdb_block, ligand_rdmol)
|
||||
self.conda_env = conda_env
|
||||
self.tmp_dir = os.path.realpath(tmp_dir)
|
||||
os.makedirs(tmp_dir, exist_ok=True)
|
||||
|
||||
self.task_id = get_random_id()
|
||||
self.receptor_id = self.task_id + '_receptor'
|
||||
self.ligand_id = self.task_id + '_ligand'
|
||||
|
||||
self.receptor_path = os.path.join(self.tmp_dir, self.receptor_id + '.pdb')
|
||||
self.ligand_path = os.path.join(self.tmp_dir, self.ligand_id + '.sdf')
|
||||
|
||||
with open(self.receptor_path, 'w') as f:
|
||||
f.write(pdb_block)
|
||||
|
||||
ligand_rdmol = Chem.AddHs(ligand_rdmol, addCoords=True)
|
||||
if use_uff:
|
||||
UFFOptimizeMolecule(ligand_rdmol)
|
||||
sdf_writer = Chem.SDWriter(self.ligand_path)
|
||||
sdf_writer.write(ligand_rdmol)
|
||||
sdf_writer.close()
|
||||
self.ligand_rdmol = ligand_rdmol
|
||||
|
||||
pos = ligand_rdmol.GetConformer(0).GetPositions()
|
||||
if center is None:
|
||||
self.center = (pos.max(0) + pos.min(0)) / 2
|
||||
else:
|
||||
self.center = center
|
||||
|
||||
self.proc = None
|
||||
self.results = None
|
||||
self.output = None
|
||||
self.docked_sdf_path = None
|
||||
|
||||
def run(self, exhaustiveness=16):
|
||||
commands = """
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda activate {env}
|
||||
cd {tmp}
|
||||
# Prepare receptor (PDB->PDBQT)
|
||||
prepare_receptor4.py -r {receptor_id}.pdb
|
||||
# Prepare ligand
|
||||
obabel {ligand_id}.sdf -O {ligand_id}.pdbqt
|
||||
qvina2.1 \
|
||||
--receptor {receptor_id}.pdbqt \
|
||||
--ligand {ligand_id}.pdbqt \
|
||||
--center_x {center_x:.4f} \
|
||||
--center_y {center_y:.4f} \
|
||||
--center_z {center_z:.4f} \
|
||||
--size_x 20 --size_y 20 --size_z 20 \
|
||||
--exhaustiveness {exhaust}
|
||||
obabel {ligand_id}_out.pdbqt -O{ligand_id}_out.sdf -h
|
||||
""".format(
|
||||
receptor_id = self.receptor_id,
|
||||
ligand_id = self.ligand_id,
|
||||
env = self.conda_env,
|
||||
tmp = self.tmp_dir,
|
||||
exhaust = exhaustiveness,
|
||||
center_x = self.center[0],
|
||||
center_y = self.center[1],
|
||||
center_z = self.center[2],
|
||||
)
|
||||
|
||||
self.docked_sdf_path = os.path.join(self.tmp_dir, '%s_out.sdf' % self.ligand_id)
|
||||
|
||||
self.proc = subprocess.Popen(
|
||||
'/bin/bash',
|
||||
shell=False,
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE
|
||||
)
|
||||
|
||||
self.proc.stdin.write(commands.encode('utf-8'))
|
||||
self.proc.stdin.close()
|
||||
|
||||
# return commands
|
||||
|
||||
def run_sync(self):
|
||||
self.run()
|
||||
while self.get_results() is None:
|
||||
pass
|
||||
results = self.get_results()
|
||||
print('Best affinity:', results[0]['affinity'])
|
||||
return results
|
||||
|
||||
def get_results(self):
|
||||
if self.proc is None: # Not started
|
||||
return None
|
||||
elif self.proc.poll() is None: # In progress
|
||||
return None
|
||||
else:
|
||||
if self.output is None:
|
||||
self.output = self.proc.stdout.readlines()
|
||||
try:
|
||||
self.results = parse_qvina_outputs(self.docked_sdf_path)
|
||||
except:
|
||||
print('[Error] Vina output error: %s' % self.docked_sdf_path)
|
||||
return []
|
||||
return self.results
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
import os
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import AllChem, Descriptors
|
||||
from rdkit.Chem import Draw
|
||||
from rdkit.Chem.Draw import rdMolDraw2D
|
||||
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
|
||||
from .train import verify_dir_exists
|
||||
|
||||
|
||||
|
||||
|
||||
def draw_docked_mol_list(mol_list, molsPerRow=5, subImgSize=(300,300)):
|
||||
opts = DrawingOptions()
|
||||
opts.atomLabelFontSize = 30
|
||||
opts.bondLineWidth = 1.5
|
||||
opts.colorBonds = False
|
||||
|
||||
legends = []
|
||||
for m in mol_list:
|
||||
AllChem.Compute2DCoords(m)
|
||||
docking_score = 'Docking Score: {:.3f}'.format(float(m.GetProp('r_i_docking_score')))
|
||||
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(m))
|
||||
name = m.GetProp('_Name')
|
||||
legend = name + '\n' + mw + '\n' + docking_score
|
||||
legends.append(legend)
|
||||
|
||||
img = Draw.MolsToGridImage(
|
||||
mol_list,
|
||||
molsPerRow=molsPerRow,
|
||||
subImgSize=subImgSize,
|
||||
legends=legends,
|
||||
returnPNG=False
|
||||
)
|
||||
return img
|
||||
'''
|
||||
def verify_dir_exists(dirname):
|
||||
if os.path.isdir(os.path.dirname(dirname)) == False:
|
||||
#if os.path.isdir(dirname) == False:
|
||||
os.makedirs(os.path.dirname(dirname))'''
|
||||
|
||||
|
||||
def HighlightAtomByWeights(mol, # rdkit.Molecule的instance
|
||||
#weights, # 注意力系数,注意weights的顺序应该与rdkit.Molecule原子idx的顺序一致
|
||||
save=None,
|
||||
size=(300,300),
|
||||
colors=['#FFFFFF','#FF0000'], # 颜色变化,可以定义多个颜色
|
||||
bondLineWidth=5, # 化学键显示的宽度
|
||||
FontSize=3,
|
||||
fixedBondLength=50, # 化学键显示的长度
|
||||
legend=None,
|
||||
legendFontSize=5,
|
||||
elemColor=False,
|
||||
withIsomeric=True): # 是否按元素给原子着色
|
||||
|
||||
draw = rdMolDraw2D.MolDraw2DCairo(size[0], size[1])
|
||||
option = rdMolDraw2D.MolDrawOptions()
|
||||
option.bondLineWidth = bondLineWidth
|
||||
option.fixedBondLength = fixedBondLength
|
||||
option.setHighlightColour((0.95,0.7,0.95))
|
||||
option.baseFontSize = FontSize
|
||||
option.legendFontSize = legendFontSize
|
||||
if elemColor == False:
|
||||
option.updateAtomPalette({k: (0, 0, 0) for k in DrawingOptions.elemDict.keys()})
|
||||
draw.SetDrawOptions(option)
|
||||
if withIsomeric:
|
||||
AllChem.Compute2DCoords(mol)
|
||||
else:
|
||||
smi = Chem.MolToSmiles(mol, isomericSmiles=False)
|
||||
mol = Chem.MolFromSmiles(smi)
|
||||
rdMolDraw2D.PrepareAndDrawMolecule(draw, mol, legend=legend)
|
||||
draw.FinishDrawing()
|
||||
if save:
|
||||
if '.png' in save:
|
||||
n = 0
|
||||
path = '/'.join(save.split('/')[0:-1])
|
||||
while os.path.exists(save):
|
||||
n += 1
|
||||
name = save.split('/')[-1].split('.')[0]
|
||||
name = name + '_' + str(n) + '.png'
|
||||
save = path + '/' + name
|
||||
else:
|
||||
draw.WriteDrawingText(save)
|
||||
else:
|
||||
return draw
|
||||
|
||||
'''
|
||||
supp = list(iter(Chem.SDMolSupplier('dockpose-idscore2500-first50.sdf', removeHs=True, sanitize=1)))
|
||||
save_path = './img_docking_results/idscore_filtered/'
|
||||
verify_dir_exists(save_path)
|
||||
for ix,mol in enumerate(supp[:50]):
|
||||
docking_score = 'Docking Score: {:.3f}'.format(float(mol.GetProp('r_i_docking_score')))
|
||||
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(mol))
|
||||
legend = mw + '\n' + docking_score
|
||||
name = save_path + '/' + 'No-' + str(ix) + '-' + mol.GetProp('_Name') + '.png'
|
||||
HighlightAtomByWeights(mol, save=name, bondLineWidth=2, FontSize=1, elemColor=True, legend=legend, legendFontSize=100)
|
||||
'''
|
||||
|
||||
from PIL import Image
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
def CombineImages(img_file_list, col_num=7, save_dir='./', title='image', img_size=None):
|
||||
|
||||
num_img = len(img_file_list)
|
||||
num_row = num_img//col_num+1 if num_img%col_num != 0 else num_img//col_num
|
||||
|
||||
if img_size is None:
|
||||
img_size = Image.open(img_file_list[0]).size
|
||||
#print((img_size[1]*col_num, img_size[0]*num_row))
|
||||
toImage = Image.new('RGB', (img_size[1]*col_num, img_size[0]*num_row), color=(255,255,255))
|
||||
#print(toImage.size)
|
||||
'''for y in range(1, num_row+1):
|
||||
for x in range(1, num_per_row):'''
|
||||
x_cusum = 0
|
||||
y_cusum = 0
|
||||
num_has_paste = 0
|
||||
for img_file in img_file_list:
|
||||
img = Image.open(img_file)
|
||||
#print((x_cusum, y_cusum))
|
||||
toImage.paste(img, (x_cusum, y_cusum))
|
||||
num_has_paste += 1
|
||||
#print(num_has_paste)
|
||||
if num_has_paste%col_num==0:
|
||||
x_cusum = 0
|
||||
y_cusum += img_size[1]
|
||||
else:
|
||||
x_cusum += img_size[0]
|
||||
|
||||
#toImage.save("merged.png")
|
||||
plt.xticks([])
|
||||
plt.yticks([])
|
||||
plt.axis('off') # 取消所有边框,坐标轴
|
||||
plt.imshow(toImage)
|
||||
#plt.title(title, y=0.02) # title默认设置在图像上方,通过设置y改变位置
|
||||
#plt.show()
|
||||
verify_dir_exists(save_dir)
|
||||
toImage.save(save_dir+'/'+title+'.png')
|
||||
#plt.savefig(fname=save_dir+'/'+title)
|
||||
plt.clf() # 清除当前图形。如果不清除,当使用循环大量作图,机器会为plt分配越来越多的内存,速度会逐渐变慢。
|
||||
|
||||
|
||||
'''
|
||||
import glob
|
||||
|
||||
l = sorted([(int(i.split('/')[-1].split('-')[1]), i) for i in glob.glob('img_docking_results/simlarity_filtered/*.png')], key=lambda x:x[0])
|
||||
CombineImages([i[1] for i in l])
|
||||
'''
|
||||
@@ -1,482 +0,0 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from rdkit import Chem, Geometry
|
||||
from rdkit.Chem import Descriptors
|
||||
import numpy as np
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
|
||||
max_valence_dict = {
|
||||
6:torch.LongTensor([4]), 7:torch.LongTensor([3]), 8:torch.LongTensor([2]),
|
||||
9:torch.LongTensor([1]), 15:torch.LongTensor([5]), 16:torch.LongTensor([6]),
|
||||
17:torch.LongTensor([1]), 35:torch.LongTensor([1]), 53:torch.LongTensor([1]),
|
||||
1:torch.LongTensor([1])
|
||||
}
|
||||
def add_ligand_atom_to_data(
|
||||
old_data, pos, element, bond_index, bond_type, type_map=[1,6,7,8,9,15,16,17,35,53],
|
||||
max_valence_dict=max_valence_dict):
|
||||
data = old_data.clone()
|
||||
if 'max_atom_valence' not in data.__dict__['_store']:
|
||||
data.max_atom_valence = torch.empty(0, dtype=torch.long)
|
||||
# add position of new atom to context
|
||||
data.ligand_context_pos = torch.cat([
|
||||
data.ligand_context_pos, pos.view(1, 3).to(data.ligand_context_pos)
|
||||
], dim=0)
|
||||
|
||||
# add feature of new atom to context
|
||||
data.ligand_context_feature_full = torch.cat([
|
||||
data.ligand_context_feature_full,
|
||||
torch.cat([
|
||||
F.one_hot(element.view(1), len(type_map)).to(data.ligand_context_feature_full), # (1, num_elements)
|
||||
torch.tensor([[1, 0, 0]]).to(data.ligand_context_feature_full), # is_mol_atom, num_neigh (placeholder), valence (placeholder)
|
||||
torch.tensor([[0, 0, 0]]).to(data.ligand_context_feature_full) # num_of_bonds 1, 2, 3(placeholder)
|
||||
], dim=1)
|
||||
], dim=0)
|
||||
data.context_idx = torch.arange(data.context_idx.size(0)+1)
|
||||
#
|
||||
idx_num_neigh = len(type_map) + 1
|
||||
idx_valence = idx_num_neigh + 1
|
||||
idx_num_of_bonds = idx_valence + 1
|
||||
|
||||
# add type of new atom to context
|
||||
element = torch.LongTensor([type_map[element.item()]])
|
||||
data.ligand_context_element = torch.cat([
|
||||
data.ligand_context_element, element.view(1).to(data.ligand_context_element)
|
||||
])
|
||||
max_new_atom_valence = max_valence_dict[element.item()].to(data.max_atom_valence)
|
||||
data.max_atom_valence = torch.cat([data.max_atom_valence, max_new_atom_valence])
|
||||
|
||||
# change the feature of new atom to context according to ligand context
|
||||
if len(bond_type) != 0:
|
||||
bond_index, bond_type = remove_triangle(
|
||||
pos, data.ligand_context_pos, data.ligand_context_bond_index, data.ligand_context_bond_type,
|
||||
bond_index, bond_type)
|
||||
bond_index[0, :] = len(data.ligand_context_pos) - 1
|
||||
'''bond_type = check_double_bond(
|
||||
data.ligand_context_bond_index,
|
||||
data.ligand_context_bond_type,
|
||||
bond_index,
|
||||
bond_type
|
||||
)'''
|
||||
bond_type = check_valence_is_2(
|
||||
bond_index, bond_type,
|
||||
data.ligand_context_element, data.ligand_context_valence
|
||||
)
|
||||
bond_vec = data.ligand_context_pos[bond_index[0]] - data.ligand_context_pos[bond_index[1]]
|
||||
bond_lengths = torch.norm(bond_vec, dim=-1, p=2)
|
||||
if (bond_lengths > 3).any():
|
||||
print(bond_lengths)
|
||||
|
||||
bond_index_all = torch.cat([bond_index, torch.stack([bond_index[1, :], bond_index[0, :]], dim=0)], dim=1)
|
||||
bond_type_all = torch.cat([bond_type, bond_type], dim=0)
|
||||
|
||||
data.ligand_context_bond_index = torch.cat([
|
||||
data.ligand_context_bond_index, bond_index_all.to(data.ligand_context_bond_index)
|
||||
], dim=1)
|
||||
|
||||
data.ligand_context_bond_type = torch.cat([
|
||||
data.ligand_context_bond_type,
|
||||
bond_type_all
|
||||
])
|
||||
# modify atom features related to bonds
|
||||
# previous atom
|
||||
data.ligand_context_feature_full[bond_index[1, :], idx_num_neigh] += 1 # num of neigh of previous nodes
|
||||
data.ligand_context_feature_full[bond_index[1, :], idx_valence] += bond_type # valence of previous nodes
|
||||
data.ligand_context_feature_full[bond_index[1, :], idx_num_of_bonds + bond_type - 1] += 1 # num of bonds of
|
||||
# the new atom
|
||||
data.ligand_context_feature_full[-1, idx_num_neigh] += len(bond_index[1]) # num of neigh of last node
|
||||
data.ligand_context_feature_full[-1, idx_valence] += torch.sum(bond_type) # valence of last node
|
||||
for bond in [1, 2, 3]:
|
||||
data.ligand_context_feature_full[-1, idx_num_of_bonds + bond - 1] += (bond_type == bond).sum() # num of bonds of last node
|
||||
data.ligand_context_valence = data.ligand_context_feature_full[:,idx_valence]
|
||||
del old_data
|
||||
return data
|
||||
|
||||
|
||||
def data2mol(data, raise_error=True, sanitize=True):
|
||||
element = data.ligand_context_element.clone().cpu().tolist()
|
||||
bond_index = data.ligand_context_bond_index.clone().cpu().tolist()
|
||||
bond_type = data.ligand_context_bond_type.clone().cpu().tolist()
|
||||
pos = data.ligand_context_pos.clone().cpu().tolist()
|
||||
n_atoms = len(pos)
|
||||
rd_mol = Chem.RWMol()
|
||||
rd_conf = Chem.Conformer(n_atoms)
|
||||
|
||||
# add atoms and coordinates
|
||||
for i, atom in enumerate(element):
|
||||
rd_atom = Chem.Atom(atom)
|
||||
rd_mol.AddAtom(rd_atom)
|
||||
rd_coords = Geometry.Point3D(*pos[i])
|
||||
rd_conf.SetAtomPosition(i, rd_coords)
|
||||
rd_mol.AddConformer(rd_conf)
|
||||
# add atoms and coordinates
|
||||
# add atoms and coordinates
|
||||
for i, type_this in enumerate(bond_type):
|
||||
node_i, node_j = bond_index[0][i], bond_index[1][i]
|
||||
if node_i < node_j:
|
||||
if type_this == 1:
|
||||
rd_mol.AddBond(node_i, node_j, Chem.BondType.SINGLE)
|
||||
elif type_this == 2:
|
||||
rd_mol.AddBond(node_i, node_j, Chem.BondType.DOUBLE)
|
||||
elif type_this == 3:
|
||||
rd_mol.AddBond(node_i, node_j, Chem.BondType.TRIPLE)
|
||||
elif type_this == 12:
|
||||
rd_mol.AddBond(node_i, node_j, Chem.BondType.AROMATIC)
|
||||
else:
|
||||
raise Exception('unknown bond order {}'.format(type_this))
|
||||
# modify
|
||||
try:
|
||||
rd_mol = modify_submol(rd_mol)
|
||||
except:
|
||||
if raise_error:
|
||||
raise MolReconsError()
|
||||
else:
|
||||
print('MolReconsError')
|
||||
# check valid
|
||||
rd_mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(rd_mol))
|
||||
if rd_mol_check is None:
|
||||
if raise_error:
|
||||
raise MolReconsError()
|
||||
else:
|
||||
print('MolReconsError')
|
||||
rd_mol = rd_mol.GetMol()
|
||||
if 12 in bond_type: # mol may directlu come from ture mols and contains aromatic bonds
|
||||
Chem.Kekulize(rd_mol, clearAromaticFlags=True)
|
||||
if sanitize:
|
||||
Chem.SanitizeMol(rd_mol, Chem.SANITIZE_ALL^Chem.SANITIZE_KEKULIZE^Chem.SANITIZE_SETAROMATICITY)
|
||||
#rd_mol = modify(rd_mol)
|
||||
return rd_mol
|
||||
|
||||
|
||||
def modify_submol(mol): # modify mols containing C=N(C)O
|
||||
submol = Chem.MolFromSmiles('C=N(C)O', sanitize=False)
|
||||
sub_fragments = mol.GetSubstructMatches(submol)
|
||||
for fragment in sub_fragments:
|
||||
atomic_nums = np.array([mol.GetAtomWithIdx(atom).GetAtomicNum() for atom in fragment])
|
||||
idx_atom_N = fragment[np.where(atomic_nums == 7)[0][0]]
|
||||
idx_atom_O = fragment[np.where(atomic_nums == 8)[0][0]]
|
||||
mol.GetAtomWithIdx(idx_atom_N).SetFormalCharge(1) # set N to N+
|
||||
mol.GetAtomWithIdx(idx_atom_O).SetFormalCharge(-1) # set O to O-
|
||||
return mol
|
||||
|
||||
|
||||
class MolReconsError(Exception):
|
||||
pass
|
||||
|
||||
def add_context(data):
|
||||
data.ligand_context_pos = data.ligand_pos
|
||||
data.ligand_context_element = data.ligand_element
|
||||
data.ligand_context_bond_index = data.ligand_bond_index
|
||||
data.ligand_context_bond_type = data.ligand_bond_type
|
||||
return data
|
||||
|
||||
|
||||
def check_valency(mol):
|
||||
try:
|
||||
Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def check_double_bond(
|
||||
ligand_context_bond_index, ligand_context_bond_type, bond_index_to_add, bond_type_to_add
|
||||
):
|
||||
bond_type_in_place = [
|
||||
ligand_context_bond_type[ix==ligand_context_bond_index[0]]\
|
||||
for ix in bond_index_to_add[1]
|
||||
]
|
||||
has_double = torch.BoolTensor([(bt==2).sum()>0 for bt in bond_type_in_place])
|
||||
bond_type_to_add_has_double = bond_type_to_add >= 2
|
||||
mask = torch.stack([has_double,bond_type_to_add_has_double]).all(dim=0)
|
||||
bond_type_to_add = bond_type_to_add - mask.long()
|
||||
return bond_type_to_add
|
||||
|
||||
|
||||
def check_valence_is_2(
|
||||
bond_index_to_add, bond_type_to_add,
|
||||
ligand_context_element, ligand_context_valence
|
||||
):
|
||||
atom_type_to_add = ligand_context_element[-1]
|
||||
new_atom_type_is_C = (atom_type_to_add == 6).view(-1)
|
||||
if new_atom_type_is_C:
|
||||
atom_valence_in_place = ligand_context_valence[bond_index_to_add[1]] >= 2
|
||||
atom_type_in_place_is_C = ligand_context_element[bond_index_to_add[1]] == 6
|
||||
bond_type_to_add_mask = bond_type_to_add >= 2
|
||||
|
||||
mask = torch.stack([atom_valence_in_place, atom_type_in_place_is_C, bond_type_to_add_mask]).all(dim=0)
|
||||
bond_type_to_add = bond_type_to_add - mask.long()
|
||||
return bond_type_to_add
|
||||
|
||||
|
||||
def remove_triangle(pos_to_add, ligand_context_pos, ligand_context_bond_index, ligand_context_bond_type,
|
||||
bond_index_to_add, bond_type_to_add):
|
||||
new_j = bond_index_to_add[1]
|
||||
atom_in_place_adjs = [
|
||||
ligand_context_bond_index[1][ligand_context_bond_index[0]==i] for i in new_j
|
||||
]
|
||||
'''atom_in_place_bond_type = [
|
||||
ligand_context_bond_type[1][ligand_context_bond_index[0]==i] for i in new_j
|
||||
]'''
|
||||
L = []
|
||||
for j in new_j:
|
||||
l = []
|
||||
for i in atom_in_place_adjs:
|
||||
if j in i:
|
||||
l.append(True)
|
||||
else:
|
||||
l.append(False)
|
||||
L.append(l)
|
||||
adj_mask = torch.LongTensor(L).any(-1)
|
||||
if adj_mask.sum() > 0:
|
||||
dist = torch.norm(pos_to_add.view(-1,3) - ligand_context_pos[new_j], dim=-1, p=2)
|
||||
dist_mask_idx = torch.nonzero(adj_mask).view(-1)
|
||||
max_dist_idx_to_remove = dist_mask_idx[dist[dist_mask_idx].argmax()]
|
||||
mask = (torch.arange(len(bond_type_to_add)) == max_dist_idx_to_remove) == False
|
||||
bond_index_to_add = bond_index_to_add[:, mask]
|
||||
bond_type_to_add = bond_type_to_add[mask]
|
||||
return bond_index_to_add, bond_type_to_add
|
||||
|
||||
|
||||
PATTERNS = [
|
||||
#Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
|
||||
Chem.MolFromSmarts('[N]1~&@[N]~&@[C]~&@[C]~&@[C]~&@[C]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
|
||||
Chem.MolFromSmarts('[N]1~&@[C]~&@[N]~&@[C]~&@[C]~&@[C]~&@1'),
|
||||
Chem.MolFromSmarts('[N]1~&@[C]~&@[C]~&@[N]~&@[C]~&@[C]~&@1'),
|
||||
Chem.MolFromSmarts('[N]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]~&@1'),
|
||||
Chem.MolFromSmarts('[C]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]~&@1')
|
||||
]
|
||||
PATTERNS_1 = [
|
||||
[Chem.MolFromSmarts('[#6,7,8]-[#7]1~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1'),
|
||||
Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')],
|
||||
[Chem.MolFromSmarts('[#6,7,8]-[#6]1(-[#6,7,8])~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1'),
|
||||
Chem.MolFromSmarts('[C,N,O]-[C]1(-[C,N,O])~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')]
|
||||
#Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C]~&@[C]~&@[N]~&@[C]~&@[C]-1'),
|
||||
#Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]-1'),
|
||||
]
|
||||
MAX_VALENCE = {'C':4, 'N':3}
|
||||
def modify(mol, max_double_in_6ring=0):
|
||||
#atoms = mol.GetAtoms()
|
||||
mol_copy = copy.deepcopy(mol)
|
||||
mw = Chem.RWMol(mol)
|
||||
|
||||
p1 = Chem.MolFromSmarts('[#6,7]=[#6]1-[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]-1')
|
||||
p1_ = Chem.MolFromSmarts('[C,N]=[C]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
|
||||
subs = set(list(mw.GetSubstructMatches(p1)) + list(mw.GetSubstructMatches(p1_)))
|
||||
subs_set_1 = [set(s) for s in subs]
|
||||
for sub in subs:
|
||||
comb = itertools.combinations(sub, 2)
|
||||
#b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
|
||||
change_double = False
|
||||
r_b_double = 0
|
||||
b_list = []
|
||||
for ix,c in enumerate(comb):
|
||||
b = mw.GetBondBetweenAtoms(*c)
|
||||
if ix == 0:
|
||||
bt = b.GetBondType().__str__()
|
||||
is_r = b.IsInRingSize(6)
|
||||
b_list.append((c, bt, is_r))
|
||||
continue
|
||||
if b is not None:
|
||||
bt = b.GetBondType().__str__()
|
||||
is_r = b.IsInRing()
|
||||
b_list.append((c, bt, is_r))
|
||||
if is_r is True and bt == 'DOUBLE':
|
||||
r_b_double += 1
|
||||
if r_b_double > max_double_in_6ring:
|
||||
change_double = True
|
||||
if change_double:
|
||||
for ix,b in enumerate(b_list):
|
||||
if ix == 0:
|
||||
if b[-1] is False:
|
||||
mw.RemoveBond(*b[0])
|
||||
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
|
||||
else:
|
||||
continue
|
||||
if b[1] == 'DOUBLE' and b[-1] is False:
|
||||
mw.RemoveBond(*b[0])
|
||||
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
|
||||
break
|
||||
|
||||
#p2 = Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
|
||||
for p2 in PATTERNS_1:
|
||||
Chem.GetSSSR(mw)
|
||||
subs2 = set(list(mw.GetSubstructMatches(p2[0])) + list(mw.GetSubstructMatches(p2[1])))
|
||||
for sub in subs2:
|
||||
comb = itertools.combinations(sub, 2)
|
||||
b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
|
||||
for b in b_list:
|
||||
if b[-1].GetBondType().__str__() == 'DOUBLE':
|
||||
mw.RemoveBond(*b[0])
|
||||
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
|
||||
|
||||
Chem.GetSSSR(mw)
|
||||
p3 = Chem.MolFromSmarts('[#8]=[#6]1-[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]-1')
|
||||
p3_ = Chem.MolFromSmarts('[O]=[C]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
|
||||
subs = set(list(mw.GetSubstructMatches(p3)) + list(mw.GetSubstructMatches(p3_)))
|
||||
subs_set_2 = [set(s) for s in subs]
|
||||
for sub in subs:
|
||||
comb = itertools.combinations(sub, 2)
|
||||
b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
|
||||
for b in b_list:
|
||||
if b[-1].GetBondType().__str__() == 'DOUBLE' and b[-1].IsInRing() is True:
|
||||
mw.RemoveBond(*b[0])
|
||||
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
|
||||
|
||||
p = Chem.MolFromSmarts('[#6,7]1~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1')
|
||||
p_ = Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')
|
||||
Chem.GetSSSR(mw)
|
||||
subs = set(list(mw.GetSubstructMatches(p)) + list(mw.GetSubstructMatches(p_)))
|
||||
subs_set_3 = [set(s) for s in subs]
|
||||
for sub in subs:
|
||||
pass_sub = False
|
||||
if subs_set_2:
|
||||
for s in subs_set_2:
|
||||
if len(s-set(sub)) == 1:
|
||||
pass_sub = True
|
||||
break
|
||||
if pass_sub:
|
||||
continue
|
||||
|
||||
bond_list = [(i,sub[0]) if ix+1==len(sub) else (i, sub[ix+1]) for ix,i in enumerate(sub)]
|
||||
if len(bond_list) == 0:
|
||||
continue
|
||||
atoms = [mw.GetAtomWithIdx(i) for i in sub]
|
||||
for a in atoms:
|
||||
if a.GetExplicitValence()==MAX_VALENCE[a.GetSymbol()] and a.GetHybridization().__str__()=='SP3':
|
||||
break
|
||||
else:
|
||||
bond_type = [mw.GetBondBetweenAtoms(*b).GetBondType().__str__() for b in bond_list]
|
||||
if bond_type.count('DOUBLE') > max_double_in_6ring:
|
||||
for b in bond_list:
|
||||
mw.RemoveBond(*b)
|
||||
mw.AddBond(*b, Chem.rdchem.BondType.AROMATIC)
|
||||
|
||||
# get new mol from modified mol
|
||||
conf = mw.GetConformer()
|
||||
rd_mol = Chem.RWMol()
|
||||
rd_conf = Chem.Conformer(mw.GetNumAtoms())
|
||||
for i, atom in enumerate(mw.GetAtoms()):
|
||||
rd_atom = Chem.Atom(atom.GetAtomicNum())
|
||||
rd_mol.AddAtom(rd_atom)
|
||||
rd_coords = conf.GetAtomPosition(i)
|
||||
rd_conf.SetAtomPosition(i, rd_coords)
|
||||
rd_mol.AddConformer(rd_conf)
|
||||
|
||||
for i, bond in enumerate(mw.GetBonds()):
|
||||
bt = bond.GetBondType()
|
||||
node_i = bond.GetBeginAtomIdx()
|
||||
node_j = bond.GetEndAtomIdx()
|
||||
rd_mol.AddBond(node_i, node_j, bt)
|
||||
out_mol = rd_mol.GetMol()
|
||||
|
||||
# check validility of the new mol
|
||||
mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(out_mol))
|
||||
if mol_check:
|
||||
try:
|
||||
Chem.Kekulize(out_mol)
|
||||
del mol_copy
|
||||
return out_mol
|
||||
except:
|
||||
del mol
|
||||
return mol_copy
|
||||
else:
|
||||
del mol
|
||||
return mol_copy
|
||||
|
||||
|
||||
'''
|
||||
PATTERNS = [
|
||||
Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
|
||||
#Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1') # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
|
||||
]
|
||||
def modify(mol):
|
||||
#atoms = mol.GetAtoms()
|
||||
mol_copy = copy.deepcopy(mol)
|
||||
mw = Chem.RWMol(mol)
|
||||
p1 = Chem.MolFromSmarts('[#6]=[#6]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
|
||||
subs = mw.GetSubstructMatches(p1)
|
||||
for sub in subs:
|
||||
comb = itertools.combinations(sub, 2)
|
||||
b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
|
||||
for b in b_list:
|
||||
if b[-1].GetBondType().__str__() == 'DOUBLE' and b[-1].IsInRing() is False:
|
||||
mw.RemoveBond(*b[0])
|
||||
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
|
||||
|
||||
Chem.GetSSSR(mw)
|
||||
for p in PATTERNS:
|
||||
subs = list(mw.GetSubstructMatches(p))
|
||||
for sub in subs:
|
||||
bond_list = [(i,sub[0]) if ix+1==len(sub) else (i, sub[ix+1]) for ix,i in enumerate(sub)]
|
||||
if len(bond_list) == 0:
|
||||
continue
|
||||
bond_type = [mw.GetBondBetweenAtoms(*b).GetBondType().__str__() for b in bond_list]
|
||||
if bond_type.count('DOUBLE') > 0:
|
||||
for b in bond_list:
|
||||
mw.RemoveBond(*b)
|
||||
mw.AddBond(*b, Chem.rdchem.BondType.AROMATIC)
|
||||
|
||||
# get new mol from modified mol
|
||||
conf = mw.GetConformer()
|
||||
rd_mol = Chem.RWMol()
|
||||
rd_conf = Chem.Conformer(mw.GetNumAtoms())
|
||||
for i, atom in enumerate(mw.GetAtoms()):
|
||||
rd_atom = Chem.Atom(atom.GetAtomicNum())
|
||||
rd_mol.AddAtom(rd_atom)
|
||||
rd_coords = conf.GetAtomPosition(i)
|
||||
rd_conf.SetAtomPosition(i, rd_coords)
|
||||
rd_mol.AddConformer(rd_conf)
|
||||
|
||||
for i, bond in enumerate(mw.GetBonds()):
|
||||
bt = bond.GetBondType()
|
||||
node_i = bond.GetBeginAtomIdx()
|
||||
node_j = bond.GetEndAtomIdx()
|
||||
rd_mol.AddBond(node_i, node_j, bt)
|
||||
out_mol = rd_mol.GetMol()
|
||||
|
||||
# check validility of the new mol
|
||||
mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(out_mol))
|
||||
if mol_check:
|
||||
try:
|
||||
Chem.Kekulize(out_mol)
|
||||
return out_mol
|
||||
except:
|
||||
return mol_copy
|
||||
else:
|
||||
return mol_copy
|
||||
'''
|
||||
|
||||
def save_sdf(mol_list, save_name='mol_gen.sdf'):
|
||||
writer = Chem.SDWriter(save_name)
|
||||
writer.SetProps(['LOGP', 'MW'])
|
||||
for i, mol in enumerate(mol_list):
|
||||
mw = Descriptors.ExactMolWt(mol)
|
||||
logp = Descriptors.MolLogP(mol)
|
||||
mol.SetProp('MW', '%.2f' %(mw))
|
||||
mol.SetProp('LOGP', '%.2f' %(logp))
|
||||
mol.SetProp('_Name', 'No_%s' %(i))
|
||||
Chem.Kekulize(mol)
|
||||
writer.write(mol)
|
||||
writer.close()
|
||||
|
||||
|
||||
def check_alert_structure(mol, alert_smarts):
|
||||
Chem.GetSSSR(mol)
|
||||
pattern = Chem.MolFromSmarts(alert_smarts)
|
||||
subs = mol.GetSubstructMatches(pattern)
|
||||
if len(subs) == 0:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def check_alert_structures(mol, alert_smarts_list):
|
||||
Chem.GetSSSR(mol)
|
||||
patterns = [Chem.MolFromSmarts(sma) for sma in alert_smarts_list]
|
||||
for p in patterns:
|
||||
subs = mol.GetSubstructMatches(p)
|
||||
if len(subs) != 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
@@ -1,292 +0,0 @@
|
||||
import os
|
||||
import pickle
|
||||
import glob
|
||||
import lmdb
|
||||
import random
|
||||
from rdkit import Chem
|
||||
from multiprocessing import Pool
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import Dataset, Subset
|
||||
#from torch_geometric.loader import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from .data import ComplexData, torchify_dict
|
||||
from .ParseFile import Protein, Ligand, parse_sdf_to_dict
|
||||
|
||||
|
||||
|
||||
class LoadDataset(Dataset):
|
||||
|
||||
def __init__(self, dataset_path, transform=None, map_size=10*(1024*1024*1024)):
|
||||
super().__init__()
|
||||
self.dataset_path = dataset_path
|
||||
self.transform = transform
|
||||
self.map_size = map_size
|
||||
self.db = None
|
||||
self.keys = None
|
||||
|
||||
def _connect_db(self):
|
||||
self.db = lmdb.open(
|
||||
self.dataset_path,
|
||||
map_size=self.map_size, # 10GB
|
||||
create=False,
|
||||
subdir=False,
|
||||
readonly=True,
|
||||
lock=False,
|
||||
readahead=False,
|
||||
meminit=False,
|
||||
)
|
||||
with self.db.begin() as txn:
|
||||
self.keys = list(txn.cursor().iternext(values=False))
|
||||
|
||||
def _close_db(self):
|
||||
self.db.close()
|
||||
self.db = None
|
||||
self.keys = None
|
||||
|
||||
def __len__(self):
|
||||
if self.db is None:
|
||||
self._connect_db()
|
||||
return len(self.keys)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
if self.db is None:
|
||||
self._connect_db()
|
||||
key = self.keys[idx]
|
||||
data = pickle.loads(self.db.begin().get(key))
|
||||
#data.id = idx
|
||||
#assert data.protein_pos.size(0) > 0
|
||||
if self.transform is not None:
|
||||
data = self.transform(data)
|
||||
return data
|
||||
|
||||
def __connect_db_edit__(self):
|
||||
self.db = lmdb.open(
|
||||
self.dataset_path,
|
||||
map_size=self.map_size, # 10GB
|
||||
create=False,
|
||||
subdir=False,
|
||||
readonly=False,
|
||||
lock=False,
|
||||
readahead=False,
|
||||
meminit=False,
|
||||
)
|
||||
with self.db.begin() as txn:
|
||||
self.keys = list(txn.cursor().iternext(values=False))
|
||||
|
||||
def remove(self, idx):
|
||||
if self.db is None:
|
||||
self.__connect_db_edit__()
|
||||
txn = self.db.begin(write=True)
|
||||
txn.delete(self.keys[idx])
|
||||
txn.commit()
|
||||
self._close_db()
|
||||
|
||||
@staticmethod
|
||||
def split(dataset, val_num=None, shuffle=True, random_seed=0):
|
||||
index = list(range(len(dataset)))
|
||||
if shuffle:
|
||||
random.seed(random_seed)
|
||||
random.shuffle(index)
|
||||
index = torch.LongTensor(index)
|
||||
split_dic = {'valid':index[:val_num], 'train':index[val_num:]}
|
||||
subsets = {k:Subset(dataset, indices=v) for k, v in split_dic.items()}
|
||||
train_set, val_set = subsets['train'], subsets['valid']
|
||||
return train_set, val_set
|
||||
|
||||
@staticmethod
|
||||
def split_by_name(dataset, test_key_set, name2id_path=None):
|
||||
#self.name2id_path = '/'.join(self.dataset_path.split('/')[:-1])
|
||||
#self.dataset_name = self.dataset_path.split('/')[-1].split('.')[0]
|
||||
if name2id_path is None:
|
||||
name2id_path = '/'.join(dataset.dataset_path.split('/')[:-1])
|
||||
dataset_name = dataset.dataset_path.split('/')[-1].split('.')[0]
|
||||
name2id_file= name2id_path+'/'+dataset_name+'_name2id.pt'
|
||||
if os.path.exists(name2id_file):
|
||||
name2id = torch.load(name2id_file)
|
||||
else:
|
||||
name2id = {}
|
||||
for i in tqdm(range(len(dataset)), 'Indexing Dataset'):
|
||||
try:
|
||||
data = dataset[i]
|
||||
except AssertionError as e:
|
||||
print(i, e)
|
||||
continue
|
||||
name = (
|
||||
'/'.join(data.protein_filename.split('/')[-2:]),
|
||||
'/'.join(data.ligand_filename.split('/')[-2:])
|
||||
)
|
||||
name2id[name] = i
|
||||
torch.save(name2id, name2id_file)
|
||||
else:
|
||||
name2id = torch.load(name2id_file)
|
||||
|
||||
train_idx = []
|
||||
test_idx = []
|
||||
for k,v in tqdm(name2id.items(), 'Spliting'):
|
||||
if k in test_key_set:
|
||||
test_idx.append(v)
|
||||
else:
|
||||
train_idx.append(v)
|
||||
split_dict = {
|
||||
'valid':torch.LongTensor(test_idx), 'train':torch.LongTensor(train_idx)
|
||||
}
|
||||
subsets = {k:Subset(dataset, indices=v) for k, v in split_dict.items()}
|
||||
train_set, val_set = subsets['train'], subsets['valid']
|
||||
return train_set, val_set
|
||||
|
||||
#config = load_config('./configs/train.yml')
|
||||
#dataset = LoadDataset('../Pocket2Mol-learning/data/crossdocked_pocket10_processed.lmdb')
|
||||
#print(len(dataset))
|
||||
|
||||
|
||||
class CrossDocked2020(object):
|
||||
def __init__(self, raw_path, index_path, unexpected_sample=[],
|
||||
atomic_numbers=[6,7,8,9,15,16,17,35,53]):
|
||||
self.raw_path = raw_path
|
||||
self.file_dirname = os.path.dirname(raw_path)
|
||||
self.index_path = index_path
|
||||
self.unexpected_sample = unexpected_sample
|
||||
self.index = self.get_file(index_path, raw_path)
|
||||
self.atomic_numbers = set(atomic_numbers)
|
||||
|
||||
@staticmethod
|
||||
def get_file(index_dirname, index_path):
|
||||
with open(index_dirname, 'rb') as f:
|
||||
index = pickle.load(f)
|
||||
file_list = []
|
||||
for i in index:
|
||||
if i[0] is None:
|
||||
continue
|
||||
else:
|
||||
pdb = os.path.join(index_path, i[0])
|
||||
sdf = os.path.join(index_path, i[1])
|
||||
file_list.append([pdb,sdf])
|
||||
return file_list
|
||||
|
||||
def process(self, raw_file_info):
|
||||
try:
|
||||
pocket_file, ligand_file = raw_file_info
|
||||
lig = Ligand(ligand_file, removeHs=True, sanitize=True)
|
||||
for a in lig.mol.GetAtoms():
|
||||
if a.GetAtomicNum() not in self.atomic_numbers:
|
||||
return None
|
||||
else:
|
||||
ligand_dict = lig.to_dict()
|
||||
if self.only_backbone:
|
||||
pocket_dict = Protein(pocket_file).get_backbone_dict(removeHs=True)
|
||||
else:
|
||||
pocket_dict = Protein(pocket_file).get_atom_dict()
|
||||
#ligand_dict = parse_sdf_to_dict(ligand_fn)
|
||||
data = ComplexData.from_protein_ligand_dicts(
|
||||
protein_dict=torchify_dict(pocket_dict),
|
||||
ligand_dict=torchify_dict(ligand_dict),
|
||||
)
|
||||
data.protein_filename = '/'.join(pocket_file.split('/')[-2:])
|
||||
data.ligand_filename = '/'.join(ligand_file.split('/')[-2:])
|
||||
return data
|
||||
except:
|
||||
return None
|
||||
|
||||
def run(self, dataset_name='crossdocked_pocket10_processed.lmdb', lmdb_path=None,
|
||||
max_ligand_atom=50, only_backbone=False, n_process=16, interval=2000):
|
||||
#file_dirname = os.path.dirname(self.index_path)
|
||||
self.only_backbone = only_backbone
|
||||
if lmdb_path:
|
||||
lmdb_path = os.path.join(lmdb_path, dataset_name.split('/')[-1])
|
||||
else:
|
||||
lmdb_path = dataset_name #os.path.join(self.file_dirname,dataset_name.split('/')[-1])
|
||||
if os.path.exists(lmdb_path):
|
||||
raise FileExistsError(lmdb_path + ' has been existed !')
|
||||
db = lmdb.open(
|
||||
lmdb_path,
|
||||
map_size=200*(1024*1024*1024), # 200GB
|
||||
create=True,
|
||||
subdir=False,
|
||||
readonly=False, # Writable
|
||||
)
|
||||
data_ix_list = []
|
||||
exception_list = []
|
||||
data_ix = 0
|
||||
for idx in tqdm(range(0, len(self.index), interval)):
|
||||
if idx+interval >= len(self.index):
|
||||
raw_files = self.index[idx:]
|
||||
else:
|
||||
raw_files = self.index[idx:idx+interval]
|
||||
val_raw_files = []
|
||||
for items in raw_files:
|
||||
'''if items[0] is None:
|
||||
continue'''
|
||||
if 'ATOM' not in open(items[0]).read():
|
||||
continue
|
||||
elif items[1] in self.unexpected_sample:
|
||||
continue
|
||||
elif Chem.MolFromMolFile(items[1])!=None:
|
||||
val_raw_files.append(items)
|
||||
else:
|
||||
exception_list.append(items)
|
||||
torch.multiprocessing.set_sharing_strategy('file_system')
|
||||
pool = Pool(processes=n_process)
|
||||
data_list = pool.map(self.process, val_raw_files)
|
||||
with db.begin(write=True, buffers=True) as txn:
|
||||
for data in data_list:
|
||||
if data is None: continue
|
||||
if data.protein_pos.size(0) < 50: continue
|
||||
if len(data.ligand_nbh_list) > max_ligand_atom: continue
|
||||
key = str(data_ix).encode()
|
||||
txn.put(
|
||||
key = key,
|
||||
value = pickle.dumps(data)
|
||||
)
|
||||
data_ix_list.append(key)
|
||||
data_ix += 1
|
||||
db.close()
|
||||
data_ix_list = np.array(data_ix_list)
|
||||
np.save(lmdb_path.split('.')[0]+'_Keys', data_ix_list)
|
||||
with open(lmdb_path.split('.')[0]+'_invalid.list','w') as fw:
|
||||
fw.write(str(exception_list))
|
||||
'''
|
||||
unexpected_sample = [
|
||||
line.split()[-1] for line in open('/export/home/jyy/PocketFlow/data/unexcept_element_sample_new.csv').read().split('\n')
|
||||
]
|
||||
cs2020 = CrossDocked2020(
|
||||
'/export/home/jyy/Pocket2Mol-learning/data/crossdocked_pocket10',
|
||||
'/export/home/jyy/Pocket2Mol-learning/data/crossdocked_pocket10/index.pkl',
|
||||
unexpected_sample=unexpected_sample
|
||||
)
|
||||
cs2020.run(
|
||||
dataset_name='crossdocked_pocket10_processed_35Atoms_backbone.lmdb',
|
||||
max_ligand_atom=35,
|
||||
only_backbone=True,
|
||||
lmdb_path=None
|
||||
)
|
||||
'''
|
||||
|
||||
|
||||
class PDBBind2020(CrossDocked2020):
|
||||
def __init__(self, raw_file_list, unexpected_sample=[], atomic_numbers=[6,7,8,9,15,16,17,35,53]):
|
||||
self.index = self.get_file(raw_file_list)
|
||||
self.unexpected_sample = unexpected_sample
|
||||
self.file_dirname = os.path.split(os.path.split(self.index[0][0])[0])
|
||||
self.atomic_numbers = set(atomic_numbers)
|
||||
|
||||
@staticmethod
|
||||
def get_file(raw_file_list):
|
||||
file_list = []
|
||||
for i in raw_file_list:
|
||||
sdf = glob.glob(i + '/*.sdf')
|
||||
pdb = glob.glob(i + '/*pocket.pdb')
|
||||
if pdb and sdf:
|
||||
file_list.append([pdb[0], sdf[0]])
|
||||
return file_list
|
||||
|
||||
|
||||
'''
|
||||
refine_set_dir_list = glob.glob('/export/home/jyy/PDBBind2020/refined-set/*/')
|
||||
other_pl_dir_list = glob.glob('/export/home/jyy/PDBBind2020/v2020-other-PL/*/')
|
||||
|
||||
raw_file_list = refine_set_dir_list + other_pl_dir_list
|
||||
pdbbind_2020 = PDBBind2020(raw_file_list, atomic_numbers=[6,7,8,9,15,16,17,35,53])
|
||||
pdbbind_2020.run(dataset_name='PDBBind_2020.lmdb', max_ligand_atom=35,
|
||||
n_process=6, interval=2000, only_backbone=False, lmdb_path=None)
|
||||
'''
|
||||
@@ -1,316 +0,0 @@
|
||||
from rdkit import Chem
|
||||
|
||||
|
||||
|
||||
FUSED_QUA_RING_PATTERN = [
|
||||
Chem.MolFromSmarts(i) for i in[
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R](~&@[R]~&@1~&@4)~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]2~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]2~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]34~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@4',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@1)~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R@@H](~&@[R@H]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R@H]~&@4~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@2',
|
||||
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@4~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@13~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]2~&@[R]~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@1'
|
||||
]
|
||||
]
|
||||
|
||||
def has_fused4ring(mol):
|
||||
for pat in FUSED_QUA_RING_PATTERN:
|
||||
if mol.HasSubstructMatch(pat):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
PATTERNS_1 = [Chem.MolFromSmarts(i) for i in [
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@1~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
|
||||
'[R]1~&@[R]~&@[R]~&@12~&@[R]~&@[R]~&@2'
|
||||
]
|
||||
]
|
||||
|
||||
def judge_unexpected_ring(mol):
|
||||
for pat in PATTERNS_1:
|
||||
subs = mol.GetSubstructMatches(pat)
|
||||
if len(subs) > 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
PATTERNS = [Chem.MolFromSmarts(i) for i in [
|
||||
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1']
|
||||
]
|
||||
|
||||
def judge_fused_ring(mol):
|
||||
for pat in PATTERNS+FUSED_QUA_RING_PATTERN:
|
||||
if mol.HasSubstructMatch(pat):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def substructure(mol_lib):
|
||||
total_num = sum([len(i) for i in mol_lib])
|
||||
ring_size_statis = {
|
||||
'tri_ring':{'num':0},
|
||||
'qua_ring':{'num':0},
|
||||
'fif_ring':{'num':0},
|
||||
'hex_ring':{'num':0},
|
||||
'hep_ring':{'num':0},
|
||||
'oct_ring':{'num':0},
|
||||
'big_ring':{'num':0},
|
||||
'fused_ring':{'num':0},
|
||||
'unexpected_ring':{'num':0},
|
||||
'sssr':{}
|
||||
}
|
||||
for s in mol_lib:
|
||||
for mol in s:
|
||||
if mol is None: continue
|
||||
mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))
|
||||
sssr = Chem.GetSSSR(mol)
|
||||
if sssr in ring_size_statis['sssr']:
|
||||
ring_size_statis['sssr'][sssr]['num'] += 1
|
||||
else:
|
||||
ring_size_statis['sssr'][sssr] = {}
|
||||
ring_size_statis['sssr'][sssr]['num'] = 1
|
||||
ring = mol.GetRingInfo()
|
||||
has_ring_size_3 = False
|
||||
has_ring_size_4 = False
|
||||
has_ring_size_5 = False
|
||||
has_ring_size_6 = False
|
||||
has_ring_size_7 = False
|
||||
has_ring_size_8 = False
|
||||
has_big_ring = False
|
||||
has_fused_ring = False
|
||||
has_unexpected_ring = False
|
||||
for r in mol.GetRingInfo().AtomRings():
|
||||
if len(r) == 3 and has_ring_size_3 is False:
|
||||
has_ring_size_3 = True
|
||||
ring_size_statis['tri_ring']['num'] += 1
|
||||
if len(r) == 4 and has_ring_size_4 is False:
|
||||
has_ring_size_4 = True
|
||||
ring_size_statis['qua_ring']['num'] += 1
|
||||
if len(r) == 5 and has_ring_size_5 is False:
|
||||
has_ring_size_5 = True
|
||||
ring_size_statis['fif_ring']['num'] += 1
|
||||
if len(r) == 6 and has_ring_size_6 is False:
|
||||
has_ring_size_6 = True
|
||||
ring_size_statis['hex_ring']['num'] += 1
|
||||
if len(r) == 7 and has_ring_size_7 is False:
|
||||
has_ring_size_7 = True
|
||||
ring_size_statis['hep_ring']['num'] += 1
|
||||
if len(r) == 8 and has_ring_size_8 is False:
|
||||
has_ring_size_8 = True
|
||||
ring_size_statis['oct_ring']['num'] += 1
|
||||
if len(r) > 8 and has_big_ring is False:
|
||||
has_big_ring = True
|
||||
ring_size_statis['big_ring']['num'] += 1
|
||||
'''for i in range(mol.GetNumAtoms()):
|
||||
if ring.IsAtomInRingOfSize(i,3) and has_ring_size_3 is False:
|
||||
has_ring_size_3 = True
|
||||
ring_size_statis['tri_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,4) and has_ring_size_4 is False:
|
||||
has_ring_size_4 = True
|
||||
ring_size_statis['qua_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,5) and has_ring_size_5 is False:
|
||||
has_ring_size_5 = True
|
||||
ring_size_statis['fif_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,6) and has_ring_size_6 is False:
|
||||
has_ring_size_6 = True
|
||||
ring_size_statis['hex_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,7) and has_ring_size_7 is False:
|
||||
has_ring_size_7 = True
|
||||
ring_size_statis['hep_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,8) and has_ring_size_8 is False:
|
||||
has_ring_size_8 = True
|
||||
ring_size_statis['oct_ring']['num'] += 1'''
|
||||
if judge_fused_ring(mol) and has_fused_ring is False:
|
||||
has_fused_ring = True
|
||||
ring_size_statis['fused_ring']['num'] += 1
|
||||
if judge_unexpected_ring(mol) and has_unexpected_ring is False:
|
||||
ring_size_statis['unexpected_ring']['num'] += 1
|
||||
ring_size_statis['tri_ring']['rate'] = ring_size_statis['tri_ring']['num']/total_num
|
||||
ring_size_statis['qua_ring']['rate'] = ring_size_statis['qua_ring']['num']/total_num
|
||||
ring_size_statis['fif_ring']['rate'] = ring_size_statis['fif_ring']['num']/total_num
|
||||
ring_size_statis['hex_ring']['rate'] = ring_size_statis['hex_ring']['num']/total_num
|
||||
ring_size_statis['hep_ring']['rate'] = ring_size_statis['hep_ring']['num']/total_num
|
||||
ring_size_statis['oct_ring']['rate'] = ring_size_statis['oct_ring']['num']/total_num
|
||||
ring_size_statis['big_ring']['rate'] = ring_size_statis['big_ring']['num']/total_num
|
||||
ring_size_statis['fused_ring']['rate'] = ring_size_statis['fused_ring']['num']/total_num
|
||||
ring_size_statis['unexpected_ring']['rate'] = ring_size_statis['unexpected_ring']['num']/total_num
|
||||
for k in ring_size_statis['sssr']:
|
||||
ring_size_statis['sssr'][k]['rate'] = ring_size_statis['sssr'][k]['num']/total_num
|
||||
return ring_size_statis
|
||||
|
||||
|
||||
def smoothing(scalars, weight=0.8): # Weight between 0 and 1
|
||||
last = scalars[0] # First value in the plot (first timestep)
|
||||
smoothed = list()
|
||||
for point in scalars:
|
||||
smoothed_val = last * weight + (1 - weight) * point # Calculate smoothed value
|
||||
smoothed.append(smoothed_val) # Save it
|
||||
last = smoothed_val # Anchor the last smoothed value
|
||||
return smoothed
|
||||
|
||||
|
||||
|
||||
|
||||
"""PATTERNS = [Chem.MolFromSmarts(i) for i in [
|
||||
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
|
||||
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1']
|
||||
]
|
||||
|
||||
def judge_fused_ring(mol):
|
||||
for pat in PATTERNS:
|
||||
subs = mol.GetSubstructMatches(pat)
|
||||
if len(subs) > 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def substructure(mol_lib):
|
||||
total_num = sum([len(i) for i in mol_lib])
|
||||
ring_size_statis = {
|
||||
'tri_ring':{'num':0},
|
||||
'qua_ring':{'num':0},
|
||||
'fif_ring':{'num':0},
|
||||
'hex_ring':{'num':0},
|
||||
'hep_ring':{'num':0},
|
||||
'oct_ring':{'num':0},
|
||||
'fused_ring':{'num':0},
|
||||
'sssr':{}
|
||||
}
|
||||
for s in mol_lib:
|
||||
for mol in s:
|
||||
if mol is None: continue
|
||||
mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))
|
||||
sssr = Chem.GetSSSR(mol)
|
||||
if sssr in ring_size_statis['sssr']:
|
||||
ring_size_statis['sssr'][sssr]['num'] += 1
|
||||
else:
|
||||
ring_size_statis['sssr'][sssr] = {}
|
||||
ring_size_statis['sssr'][sssr]['num'] = 1
|
||||
ring = mol.GetRingInfo()
|
||||
has_ring_size_3 = False
|
||||
has_ring_size_4 = False
|
||||
has_ring_size_5 = False
|
||||
has_ring_size_6 = False
|
||||
has_ring_size_7 = False
|
||||
has_ring_size_8 = False
|
||||
has_fused_ring = False
|
||||
for i in range(mol.GetNumAtoms()):
|
||||
if ring.IsAtomInRingOfSize(i,3) and has_ring_size_3 is False:
|
||||
has_ring_size_3 = True
|
||||
ring_size_statis['tri_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,4) and has_ring_size_4 is False:
|
||||
has_ring_size_4 = True
|
||||
ring_size_statis['qua_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,5) and has_ring_size_5 is False:
|
||||
has_ring_size_5 = True
|
||||
ring_size_statis['fif_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,6) and has_ring_size_6 is False:
|
||||
has_ring_size_6 = True
|
||||
ring_size_statis['hex_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,7) and has_ring_size_7 is False:
|
||||
has_ring_size_7 = True
|
||||
ring_size_statis['hep_ring']['num'] += 1
|
||||
if ring.IsAtomInRingOfSize(i,8) and has_ring_size_8 is False:
|
||||
has_ring_size_8 = True
|
||||
ring_size_statis['oct_ring']['num'] += 1
|
||||
if judge_fused_ring(mol) and has_fused_ring is False:
|
||||
has_fused_ring = True
|
||||
ring_size_statis['fused_ring']['num'] += 1
|
||||
ring_size_statis['tri_ring']['rate'] = ring_size_statis['tri_ring']['num']/total_num
|
||||
ring_size_statis['qua_ring']['rate'] = ring_size_statis['qua_ring']['num']/total_num
|
||||
ring_size_statis['fif_ring']['rate'] = ring_size_statis['fif_ring']['num']/total_num
|
||||
ring_size_statis['hex_ring']['rate'] = ring_size_statis['hex_ring']['num']/total_num
|
||||
ring_size_statis['hep_ring']['rate'] = ring_size_statis['hep_ring']['num']/total_num
|
||||
ring_size_statis['oct_ring']['rate'] = ring_size_statis['oct_ring']['num']/total_num
|
||||
ring_size_statis['fused_ring']['rate'] = ring_size_statis['fused_ring']['num']/total_num
|
||||
for k in ring_size_statis['sssr']:
|
||||
ring_size_statis['sssr'][k]['rate'] = ring_size_statis['sssr'][k]['num']/total_num
|
||||
return ring_size_statis
|
||||
|
||||
|
||||
def smoothing(scalars, weight=0.8): # Weight between 0 and 1
|
||||
last = scalars[0] # First value in the plot (first timestep)
|
||||
smoothed = list()
|
||||
for point in scalars:
|
||||
smoothed_val = last * weight + (1 - weight) * point # Calculate smoothed value
|
||||
smoothed.append(smoothed_val) # Save it
|
||||
last = smoothed_val # Anchor the last smoothed value
|
||||
return smoothed"""
|
||||
@@ -1,439 +0,0 @@
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import QED, Descriptors, Crippen, Lipinski
|
||||
from multiprocessing import Pool
|
||||
from tqdm.auto import tqdm
|
||||
from . import sascorer
|
||||
|
||||
|
||||
# from https://doi.org/10.1016/j.bmc.2019.115192
|
||||
ALERT_STRUCTURES = [
|
||||
'C#CI', 'C[C&D2]=!@N[!O]', 'CC([C,H])=!@N[!O]', 'N#CC(=O)', 'C(=[O,S])[F,Cl,Br,I]', '[#6][CH1]=O',
|
||||
'COS(=O)(=O)C(F)(F)F', '[C;H1$(C([#6;!$(C=O)])),H0$(C([#6;!$(C=O)])[#6;!$(C=O)])]=[CH1]!@N([#6;!$(C(=O))])[#6;!$(C(=O))]',
|
||||
'[CX4][Cl,Br,I]', '*=C=*', 'c1(N)ccc(C(=O)NC3(=O))c(c3ccc2)c21', 'N!@C=C-C=[C!r]([OH])', 'NC#N',
|
||||
'[#6]C(=O)-!@OC(=[O,N])', 'O=*N=[N+]=[N-]', 'N=[N+]([O-])C', 'c1ccccc1[N!r]=[N!r]c2ccccc2',
|
||||
'[N;R0]=[N;R0]C#N', 'C(Cl)(Cl)(Cl)C([O,S])[$([NX4+]),$([NX3]);!$(*=*)&!$(*:*)]', 'N[C!r](=O)S',
|
||||
'[Cl]C([C&R0])=N', 'SC(=[!r])S', 'O1CCOCCOCCOCC1', 'O1CCOCCOCCOCCOCC1', 'O1CCOCCOCCOCCOCCOCC1',
|
||||
'N#CC[OH1]', 'c1c([N;!R](~[O;X1])~[O;X1])cc([N;!R](~[O;X1])~[O;X1])cc1',
|
||||
'c1c([N;!R](~[O;X1])~[O;X1])c([N;!R](~[O;X1])~[O;X1])ccc1',
|
||||
'c1c([N;!R](~[O;X1])~[O;X1])ccc([N;!R](~[O;X1])~[O;X1])c1', 'C=[N+]=[N-]', '[N+]#N',
|
||||
'[C!r]=[C&D2][C!r]=[C&D2][C&D3]=O', 'NC(=S)S', 'S1SC=CC1=S', 'S1C=CSC1=C', 'S[C;!$(C=*)]S',
|
||||
'c1cccc(NC(=NC(=[N,S,O])NC(=O)3)C3=N2)c12', 'O=C1NC(=O)c2nc3ccccc3nc2N1',
|
||||
'c1cc(O)cc(OC(=CC(=O)C=C3)C3=C2)c12',
|
||||
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1([$([OX1]),$([NH]);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]([$([OX1]),$(N);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1',
|
||||
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1([$([OX1]),$([NH]);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]([$([OX1]),$(N);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1',
|
||||
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)](O[$(C=O),$(C=N)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)](O[$(C=O),$(C=N)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1', 'C(=O)Onnn', 'c1[n+]([#6])ccn1([#6])',
|
||||
'[#6,#8,#16]-[CH1]=[NH1]', 'C=N([C,S])[C&v4]', 'N=[C!R]=O', 'N=[C!R]=S', 'C=[C!R]=O', 'P(=S)(S)S',
|
||||
'[#3,#11,#12,#13,#19,#20,#26,#27,#28,#29,#30]',
|
||||
'[#6;$([#6]~[#3,#11,#12,#13,#19,#20,#26,#27,#28,#29,#30])]', '[#16]1cc[#16][#6]1=[#16]',
|
||||
'[C&D1]=C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
|
||||
'[C&D1]#C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
|
||||
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*),$([C&X4])]-[C&D2]=!@C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
|
||||
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*),$([C&X4])]-[C&D2]#!@C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
|
||||
'[C&X4]-[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
|
||||
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]-[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-[C&X4]',
|
||||
'C1[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-A=A1',
|
||||
'A1[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-A1',
|
||||
'c1ccccc1C(=O)C=!@CC(=O)!@*', '[F,I,Cl,Br;!v1;!-]', 'N[F,I,Cl,Br]', '[N+!$(N=O)][O-X1]',
|
||||
'c1ccc(n[o,s]n2)c2c1[N;!R](~[O;X1])~[O;X1]', 'c1c([N;!R](~[O;X1])~[O;X1])cc(n[o,s]n2)c2c1',
|
||||
'[#7v5]', '[N!$([N+][O-])]=O', 'C(O)(O)[OH]', '[#8v3]', 'OO', 'c12cccc3c1c4c(cc3)cccc4cc2', 'C=P',
|
||||
'C=!@CC=!@C', 'c1cccc(cc(cccc2)c2c3)c13', 'c1cccc(c(cccc2)c2cc3)c13',
|
||||
'*[SX2][SX2][SX2]*', '[#16+1!$([#16]~[O-])]', 'c1ccc[o+]c1', 'O=S(=O)O[C,c]', 'COS(=O)O[C,c]',
|
||||
'S(=O)(=O)C#N', '[#16][F,Cl,Br,I]', '[SX2H0]!@[N]', 'C1NNC=NN1', '*[CH1](=!@S)', 'SC#N',
|
||||
'P(=S)(-[S;H1,H0$(S(P)C)])(-[O;H1,H0$(O(P)C)])(-N(C)C)', '*1[O,S]*1', '[#16v3,#16v5]',
|
||||
'C(=O)N(C(=O))OC(=O)', 'B(c1ccccc1)(c2ccccc2)c3ccccc3', 'P(c1ccccc1)(c2ccccc2)(c3ccccc3)',
|
||||
'[Si](c1ccccc1)(c2ccccc2)(c3ccccc3)', 'O=C([C!r]=[C!r]C)[!O&!N]', 'O=C([!O&!N])[C!r]=[C&D1!r]',
|
||||
'N=!@N', 'CC(=O)[C&X2]-!@Oc', 'C=[N!r][N!r]=C', 'S1(CCN4(C(CC41)=O))', 'S1(CC=CN5(C(CC51)=O))',
|
||||
'[S;X2]1[CH2][CH]2[CH]([CH]1)[NH]C([NH]2)=O', 'C(=N)-!@N-!@C(=N)', '*[S&D2!r][S&D2!r]*',
|
||||
'*~[N!H0]!@[N!H0]~*', '[C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&v4&D1]',
|
||||
'Nc1aaa(N!@=N)aa1', 'C[n+1]2ccccc2', '[N+1&X4;H0]', 'CC(=[S,O])-!@[S&D2]*', 'CC(=S)-!@O*',
|
||||
'[SX2H1]', '[3#1]', 'N1C(=S)SC(=C)C1=O', 'C1([#6](~[!#6]~[#6][!#6][#6]1=,:[!#6])~[!#1;!#6])=[C;!R;H,H2]',
|
||||
'c1ccc(c(c1)[OH])C=N[#7]', '[#6]C(=[!C;!R])C(=[!C;!R])[#6,$(S(=O)=O)]',
|
||||
'O=C2C(=!@N[#7])c1c(cccc1)N2', 'N([C;H2,H3])([C;H2,H3])c1oc([#6]=,:[#7][#7;H][#6]=,:[!#6])[cH][cH]1',
|
||||
'c1(C(=O)[OH])c([NH]N=C)cccc1', 'c1c(c(ccc1)C(cc)=O)[NH2,$([NH1](c)[!$(C=O)])]',
|
||||
'C(C#N)(C#N)C([$(C#N),$(C=N)])C#N', 'C(C#N)(C#N)C([NH2])=CC#N', 'C(C#N)(C#N)=N[NH]c1ccccc1',
|
||||
'[NH2]C1=C(C#N)[CH](cc)C(C#N)=C([NH2])S1', '[#7+](cc)=,:[#6][CH]=CN([CX4])[#6]',
|
||||
'C(C#N)(C#N)=Cc1ccccc1', 'C1(=C)C(N=CS1)=O', 'C1(C(C=C[!C]1)=C)=[!C]', 'C1(=C)C(=O)NNC1=O',
|
||||
'[#7]C=!@C2C(=O)c1c(cccc1)[!C]2', 'c1(ccccc1)[CH]=!@C3C(=O)c2c(cccc2)S3', 'c12ccccc1C(=O)C(=C)C2=O',
|
||||
'C=!@C([!#1])-@C(=!@[!C])-@C(=!@C)[!#1]', '[#6]C(=O)[CH]=C([NH][#6])C(=O)O[#6]',
|
||||
'[#7]~c1[$(nn),$(n[cH]),$(nc[#7;H,H2]),$(c([#7])n),$(c([#7])[cH]),$(c([#7])c[#7;H,H2])][n,cH,$(c[#7;H,H2])]c([#7H,#7H2,$(O[C;H2,H3])])nn1',
|
||||
'c12c([cH][cH][cH][cH]1)[cH]c(c([cH]2)[OH])C(=O)[NH]N=C',
|
||||
'[C;H2,H3]N([C;H2,H3])c1[cH][cH]c([CH]=NN[$(C(=O)[CH2]Scn),$(C(=O)[CH2]aan),$(C(=O)cc[OH]),$(cn),$([CH2][C;H,H2][OH])])[cH][cH]1',
|
||||
'[#6]N1[CH2][CH2]N([CH2][CH2]1)N=[CH]ca', 'C1C(=[!C;!R])C(=O)[!C]*=1', 'C1(C(=O)NC(=O)NC(=O)1)=N',
|
||||
'c12ccccc1C(=O)[CX4]C2=O', 'c1ccc3c2c(cccc12)[NH][CX4][NH]3', 'n2(c1acccc1)c([CX4])c[cH]c2[CX4]',
|
||||
'N1C(=S)S[CX4]C1=O', 'c1c(ccc(c1)[NH]S(=O)=O)[OH]',
|
||||
'c1([CH2,$(cc)])[cH,$(c[CH2,CH3]),$(cC=O)]sc([nH,$(n[CH,CH2,CH3]),$(n(cc)cc)]1)=[N;!R]',
|
||||
'c1([$([#7][#6](=O)cc),NH2])c(-!@C(=O)N[C;H2,H3])sc(n1[$([CH2][CH]=[CH2]),$(cc)])=S',
|
||||
'o1c(sc2cc(ccc12)[#7,#8])=[O,S]', 'S=c1cc[!#6]cc1', '[#6][#6](=S)[#6]',
|
||||
'[NH2]c1sc([!#1])c([!#1])c1C=O', 'c1cccc2c1cc3c(n2)nc4c(c3[#7])cccc4',
|
||||
'[NH2]c1c([NH]S(=O)=O)[cH]c([NH][C;H2,H3])c([F,Cl,Br,I])[cH]1',
|
||||
'c13[cH][cH][cH][cH]c1oc2[cH]c([NH][C;H2,H3])c(O[C;H2,H3])[cH]c23',
|
||||
'c1ccccc1N3C(=O)C([NH]c2ccc(O[C;H2,H3])cc2)=C([F,Cl,Br,I])C3=O',
|
||||
'[C;H2,H3]Oc1[cH][cH][cH][cH]c1[NH][CH](C=O)S', '[NH2]c1c([OH])cc(S(=O)(=O)[OH])cc1',
|
||||
'c2([NH2])c([OH])c(C=O)[cH]c1[cH][cH][cH][cH]c12', 'c13c(C(Nc2c1cccc2)([#6])[#6])ssc3=*',
|
||||
'c1c(ccc([NH2,$([NH][CX4]),$([N]([CX4])[CX4])])c1)[CX4]c2ccc(cc2)[NH2,$([NH][CX4]),$([N]([CX4])[CX4])]',
|
||||
'[C;H2,H3]N([C;H2,H3])c1[cH][cH]c([CH]=NN=C([#6])cc)[cH][cH]1',
|
||||
'[cH]1c([cH]nc2c1[cH][cH][cH][cH]2)[CH2]N3c4c([CH2][CH2]3)[cH][cH][cH][cH]4',
|
||||
'[cH]2c([NH]C(=S)[NH]c1ccccc1)[cH]c(N([C;H2,H3])[C;H2,H3])[cH][cH]2',
|
||||
'N3C=C(C=O)C(c1ccc(N([C;H2,H3])[C;H2,H3])cc1)[#6]2=,:[#6]3~[#7]~[#6](~[#16])~[#7]~[#6](~[#7])~2',
|
||||
'[C;H2,H3]N([C;H2,H3])c1[cH][cH]c(o1)[CH]=CC#N',
|
||||
'[NH2]c2c(N=C1[CH]=[#6]~[#6]~[#6]=C1)[cH][cH][cH][cH]2',
|
||||
'[NH2]c1[cH][cH]c([cH][cH]1)c2[cH]c(C=O)c([C;H2,H3])o2',
|
||||
'c1(O[C;H2,H3])[cH,$(c[C;H2,H3])][cH]c(O[C;H2,H3])c([CH2][$([NH]C(=O)[CH2][CH2][C;H2,H3]),$([CH]([C;H2,H3])[NH]C(=S)[N;H,H2])])[cH]1',
|
||||
'c2(c([NH2])n(c1c(cccc1)C(=O)[OH])nc2C=O)[$(C#N),$(C=S)]', 'c12ncccc1c([NH2])c(C(=O)~[OX1])s2',
|
||||
'C3(=O)C(=[CH][NH]c1c(C(=O)[OH])cccc1)N=C(c2ccccc2)O3', 'c1([OH])ccc([NH]C(=O)cc)c(c1)C(=O)[OH]',
|
||||
'c1(oc([#6])[cH][cH]1)C(=O)[NH]c2[cH][cH][cH][cH]c2C(=O)[OH]',
|
||||
'c1ccccc1C(=O)[NH]c2ccccc2C(=O)[NH][NH]c3nccs3', 'c1ccc2c(cc1)ccc2', 'c1(c(ccccc1)[N;H,H2])=N[#6]',
|
||||
'c1(ccc(cc1)c2cc(c3c(c(c2)[OH])coc3)=O)S[C;H2,H3]', 'c13ccccc1sc(=NN=c2cccccc2)n3[C;H2,H3]',
|
||||
'c23c([Br])cc1c(cco1)c2[cH]cc(=O)o3',
|
||||
'c12c([F,Cl,Br,I])cc([F,Cl,Br,I])cc1[cH]c(C(=O)[NH2])c(=[NH])o2',
|
||||
'c1(c(nc2c(c1C#N)c(c(cc2)C#N)[NH2])[NH2])C#N',
|
||||
'n3c([SX2]c1c([NH2])cccc1)c(C#N)c(c2ccccc2)c(C#N)c3[NH2]', 'C1(C#N)(C#N)[CH](C(=O)[#6])[CH]1',
|
||||
'C=CC(C#N)(C#N)C(C#N)=C[NH2]', 'ccC(=O)[NH]C(=O)C(C#N)=[CH][NH]cc', 'O=S(=O)C(C#N)=N[N;H,H2]',
|
||||
'c2(c1ccccc1nnc2)C(cc)C#N', 'C1(C([C;H2,H3])=C(C#N)[#6](~[#8])~[#7]~[#6]1~[#8])=[CH]cc',
|
||||
'O=c1c(cc(nn1)C=O)C#N', 'n2(c1ccccc1)c(=O)c(C#N)cc(C#N)n2',
|
||||
'[NH2]C1=C(C#N)[C;H,H2](cc)C([C;H2,H3])=C(C=C)O1',
|
||||
'[NH2,$([OH])]C2=C(C#N)[CH](cc)c1c(n([#6])nc1)O2', '[NH2]C1=C(C#N)[CH](cc)C(C#N)=C(cc)O1',
|
||||
'[NH2]C2=C(C#N)[CH](cc)c1c(ccs1)O2', '[C;H2,H3][SX2]C1=C(C#N)[CH](cc)C(C#N)C(=O)N1',
|
||||
'[NH2]C2=C(C#N)[CH](c1cccs1)C(C(=O)O[#6])=C([C;H2,H3])O2',
|
||||
'[SX2]1C=C(C#N)C([#6])(C=O)C([$(C=O),$(C#N)])=C1[NH2]', '[NH2]C1=C(C#N)[CH](cc)S[CX4]S1',
|
||||
'[NH,$([N][CX4])]1[#6]:,=[#6]([#6](=O)ccc)C([#6])C([$(C=O),$(C#N)])=C1[CH3]',
|
||||
'N2=Nc1n[!#6]nc1N=Ncc2', '[CH]2([OH])c1n[!#6]nc1[CH]([OH])C=C2',
|
||||
'c1ccccc1N([C;H2,H3])[CH]=[CH]C=!@[CH][CH]=CC=@Nc2ccccc2', 'C2(c1c(cc(c(n1)[NH2])C#N)C(C=2)=C)C#N',
|
||||
'C(C#N)(C#N)=C(S)S', '[OH]C(=O)c1c([OH])[cH]c([cH][cH]1)c2[cH][cH]c(o2)[CH]=C(C#N)c3nccn3',
|
||||
'n2([#6])[cH][cH][cH]c2[CH]=C(C#N)c1nccs1',
|
||||
'C3(=O)C(=[CH][$(c1ccccc1),$(c2ccc[!#6]2)])N=C(aaa)[S,$([N]aa)]3', 'N1=CC(C(N1)=S)=C',
|
||||
'c1(ccco1)[CH]=!@C3C(=O)c2c(cccc2)[!C]3', 'S=c1[nH]ccc2c1C(=O)OC2=[C;H,H2]',
|
||||
'[#6]1[O,S]C(C([#6]=,:1)=O)=CC=O', 'C2(=O)C(=[CH]c1cc([F,Cl,Br,I])ccc1O[C;H2,H3])N=C(S[C;H2,H3])S2',
|
||||
'[cH]2[cH][cH][cH]c1[CH2]C(=O)C(=C([C;H2,H3])[C;H2,H3])c12', 'C1C(OC(C=1)(O)[#6])O',
|
||||
'c12[cH]c(O[C;H2,H3])c(O[C;H2,H3])[cH]c1C([#6])=C([#6])S[CH2]2',
|
||||
'c2(O[C;H2,H3])c(O[C;H2,H3])[cH]c1C=C[CH]Sc1[cH]2', 'C1(=O)C(C(C#N)=[CH][#7])C([#7])C=C1',
|
||||
'c23ccc1ccccc1c2[C;H,H2][CX4]NC3=[CH]C(=O)N([C;H2,H3])[C;H2,H3]', 'O=CC1=C(SC(=[CH][#6])S1)C=O',
|
||||
'C1(C(=O)[CH2]C[CH2]C(=O)1)=C([N;H,H2])C=O', 'C#CC(=O)C#C',
|
||||
'aa[CH,$(CC#N)]=C1[#6]:,=[#6]C(=[O,$([N;!R])])[#6]:,=[#6]1',
|
||||
'S1C(=Ncc)[NH,NH2,$(N[CH2][CH2]O),$(Ncc)]C(=O)C1=[CH][$(ccc[Cl]),$(c[!#6])]',
|
||||
'S1C(=O)NC(=S)C1=[CH]cc', 'n2ccc([CH]=C1C(=O)NC(=[!C])N1)c2[C;H2,H3]',
|
||||
'[OH]C(=O)c1cccc(c1)cac[CH]=C2C(=[!C])NC(=[!C])[!C]2', 'n12c(nc(=O)c([C;H2,H3])n1)sc(=[CH]cc)c2=O',
|
||||
'N2([C;H2,H3])C(=S)[NH]C(=[CH]c1cc(Br)ccc1)C2=O',
|
||||
'[C;H2,H3]N2C(=[S,N])[!C]C(=C1[CH]=[CH]ccN1[C;H2,H3])C2=O',
|
||||
'c1ccccc1C(=O)[CH]=c3c(=O)[nH]c(=O)c(=[CH]c2ccccc2)[nH]3', 'O=C1cc[CH2]NC1=[C;H,H2]',
|
||||
'c1(oc([C;H2,H3])[cH][cH]1)[CH]([OH])C#C[CX4]', 'ccn2nc1[CH2][S;X2][CH2]c1c2[NH]C(=O)[CH]=[C;H,H2]',
|
||||
'[cH]1[cH]n([C;H2,H3])c3c1c2[cH][cH]n(c2c(c3O[C;H2,H3])O[C;H2,H3])[C;H2,H3]',
|
||||
'C2(=O)C(=C([C;H2,H3])[NH][CH2][CH2][C;H2,H3])N=C(c1ccccc1)O2',
|
||||
'n4(c1ccccc1)[cH][n+](c2ccccc2)c(=Nc3ccccc3)n4', 'n1nc(c2-c(c1C)ccc2)C',
|
||||
's1c2c(c([C;H2,H3])c1[C;H2,H3])c(ncn2)NN=Cc3ccco3', 'C2([#6])[CH2]C(=O)n1nc([NH2])c([NH2])c1N=2',
|
||||
'c2(c1n(c(ccn1)=O)nc2c3nccc3)C#N', 'n2nnnc1cccc-12',
|
||||
'c2([CH2][C;H2,H3])c([C;H2,H3])c1c(=[X1])n([C;H,H2][$(C(=O)O),$(cc)])[cH,$(cS[C;H2,H3])]nc1[a;X2]2',
|
||||
'c1ccc2c(c1)nc3c(n2)ccc4c3cccc4', 'n1c3c(c(=N)c2ccccc12)cccc3',
|
||||
'c23cc1ccccc1cc2n([C;H2,H3])c(=O)c(cc[NH][C;H2,H3])n3',
|
||||
'c12acccc1nc([CH]=C([OH])[#6])c([CH]=C([OH])[#6])n2',
|
||||
'c1ccc2c(c1)nc(c(n2)[CH2]C(=O)cc)[CH2]C(=O)cc', 'c1ccc2c(c1)nc(c(n2)c3ccccc3)c4c(cccc4)[OH]',
|
||||
'[C;H2,H3]Oc1[cH][cH][cH][cH]c1[NH]c3c2c(O[C;H2,H3])ccc(O[C;H2,H3])c2ncc3',
|
||||
'N=c1[nH]c([N;H,H2,H3])c([N;H,H2])nn1', '[NH](c1[cH][cH][cH][cH]c1[OH])c2nnc(=N)oc2[NH2]',
|
||||
's1[cH][cH][cH]c1C4[NH]N=C(c2ccncc2)c3c(cccc3)N=4',
|
||||
'C([F])([F])C(=O)[NH]c1[cH]n([CH2][CH2]O[CH2]cc)n[cH]1', 'c12cccca1c([!#1])a[n+](~cc)a2',
|
||||
'[#6]13~[#7](cc)~[#6]~[#6]~[#6]~[#6]~1~[#6]2~[#7]~[#6]~[#6]~[#6]~[#7+]~2~[#7]~3',
|
||||
'C1(C=O)(cc)[SX2]C=N[NH]1', 'S=c2[nH]nc(c1[cH][cH]c(O[C;H2,H3])[cH][cH]1)o2', 'N=C1SC(=N)N=C1',
|
||||
'N1([C;H2,H3])C(=S)N(cc)C(=Ncc)C1=Ncc', 'S1C(=N[NH])SC(=Ncc)C1=Ncc',
|
||||
'c1ccccc1C4=Nn3c2ccccc2[n+]c3S[CX4]4', 'n2c1ccccc1n([C;H2,H3])c2S[CH2]C(=O)[NH]N=[CH][CH]=[C;H,H2]',
|
||||
'c12ccccc1[CH2][CH2]N=C2[SX2][CH2]C(=O)c3ccccc3',
|
||||
'c1ccc3c(c1)Sc2[cH][cH][cH,$(cO),$(c[SX2]),$(c[CX4]),$(c[NH2,$([NH][CX4]),$([N]([CX4])[CX4])])]([cH]c2C(C3)[NH2,$([NH][CX4]),$([N]([CX4])[CX4])])',
|
||||
'[C;H2,H3][SX2]c2nc1OC=Nccc1nn2', '[C;H2,H3][SX2]c3nc(c1o[cH][cH][cH]1)c(c2o[cH][cH][cH]2)nn3',
|
||||
'[C;H,H2,H3]C3=NN(c1ccccc1)S2[!C]*=CC=23', 'N=c1ncns1', 'cc[NH]C(=O)c1nnsc1[NH]cc', 'n2sc1CNccc1c2=S',
|
||||
'C1(SC(=[CH]cc)N(cc)N=1)C=O', 'c1([OH])ccc([OH])c(c1)C(=!@C[#7])C=O',
|
||||
'c14[cH][cH]c(O[C;H2,H3])[cH]c1C(=N[NH]c2[cH][cH]c(C(=O)[OH])[cH][cH]2)c3[cH]c(O[C;H2,H3])[cH][cH]c34',
|
||||
'[OH]C(=O)c1ccc(cc1)NN=[CH]c2ac([cH][cH]2)c3ccccc3',
|
||||
'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)C(=O)[NH]N=[CH,$(C[C;H2,H3])]c3cc(***c2occc2)ccc3',
|
||||
'c1cccc2c1cc4c(c2C=N[NH]c3ccccc3)cccc4',
|
||||
'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]=N[NH]c2nccs2',
|
||||
'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]=N[NH]c2ccncc2',
|
||||
'c1ccccc1N(c2ccccc2)N=[CH]c3ac([cH][cH]3)c4cc(C(=O)[OH])ccc4',
|
||||
'[OH]C(=O)c1cc(ccc1)cacC=N[NH]C(=O)[CH2]O',
|
||||
'C(c1[cH]c[cH,$(c[CX4])][cH][cH]1)(c2[cH][cH][cH,$(c[Cl])][cH][cH]2)=[$(NO[CH2][CH2][CH2]N([C;H2,H3])[C;H2,H3]),$(NO[CH2][CH2]N([C;H2,H3])[C;H2,H3]),$(N[NH]C(=[NH])[NH2]),$([CH][#7])]',
|
||||
'c12[cH][cH][cH][cH]c1[!#6][cH,$(c[OH]),$(c[C;H2,H3])]c2[CH]=N[NH][$(c3nc[cH]s3),$(c[cH][cH]),$(cncncn),$(cnnnn)]',
|
||||
'c1(s[cH][cH][cH,$(c[C;H2,H3])]1)[CH]=N[NH]c2ccccc2',
|
||||
'[CH]4(n1[cH]n[cH][cH]1)c2[cH]c([Br])[cH][cH]c2[CH2][CH2]c3[cH][cH][cH][cH]c34',
|
||||
'n3c(c1ccccc1)c(c2ccccc2)n(N=!@C)c3[NH2]', 'cc[CH]=[CH][CH]=NN([CX4])[CX4]',
|
||||
'C1(C=Nc2c(N1)cccc2)=[CH]C=O', 'c1cC(=O)C=C1N=[CH]N([CX4])[CX4]', 'c1ccc2c(c1)C(C=N2)=[N;!R]',
|
||||
'cc[CH]=[CH][CH]=NN=C', 'N([C;H2,H3])([C;H2,H3])[CH]=NC([C;H2,H3])=NN([C;H2,H3])cc',
|
||||
'c12c([cH][cH][cH][cH]1)c(c([cH][cH]2)C(=Ncc)[C;H2,H3])[OH]',
|
||||
'[NH](c1ccccc1)N=C(C(=O)[C;H2,H3])[NH][NH,NH2,NH3,$(cc)]', '[N;!R]=C2C(=O)c1c(cccc1)S2',
|
||||
'cc[N;!R]=C2C(=[!C])c1c(cccc1)N2', 'C1(=[!C])C(N=CS1)=O', 'C=[N;!R]c1c([OH])cccc1',
|
||||
'[CX4]1C(=O)NNC1=O', 'O=CC=[CH][OH]', '[C;H2,H3]C([OH])=C(C(=O)[C;H2,H3])[CH]C#C',
|
||||
'cc[NH]N=C([C;H2,H3])[CH2]C([C;H2,H3])=N[NH]cc', 'c1cccc2c1C(c3c2nacc3)=O',
|
||||
'c1cccc2c1C(C3C2=N*=CC3)=O', 'c1(cc3c(c2ccccc12)c4c(C3=O)cccc4)[OH]',
|
||||
'c1c(cccc1)C(=O)[NH]N=C3c2c(cccc2)c4c3cccc4', 'c12c(O[CH2]N(ccO[C;H2,H3])[CH2]1)[cH]cc[cH]2',
|
||||
'c1([NH]C(=O)[CH2][CH2]cc)[cH][cH]c([C;H2,H3])c([NH]C(=O)[CH2][CH2]cc)[cH]1',
|
||||
'[N;H,H2](c1[cH][cH]c(O[CH3])c(O[C;H,H2,H3])[cH]1)C(=O)[NH][CH2][CH2][CH2]N([CH3])cc',
|
||||
'c1c3c(cc2c1cccc2)nc(n3)COC(=O)c4cc(cc(c4)[NH2])[NH2]',
|
||||
'n2(c1[cH][cH][cH][cH]c1C(=O)[NH][CH]([C;H2,H3])[CH2]Occ)[cH][cH][cH][cH]2',
|
||||
'C1(cc)[CH2][CH](C(=O)[#6])[CH](C(=O)[OH])[CH2]C(cc)=1', 'C(cc)(cc)(cc)SccC(=O)[OH]',
|
||||
'c1(C([CX4])([CX4])[NH]C(=O)N([CH2][C;H2,H3])[CH2][CH2][C;H,H2][CH2]cc)[cH][cH][cH]c(C(=[CH2])[C;H2,H3])[cH]1',
|
||||
'c1cc3c2c(c1)cccc2C(=CC3=O)O[C;H2,H3]', 'c13ccc(c2c1c(ccc2)C(C=C3C([F])([F])[F])=O)[#7]',
|
||||
'c1(ccc3c2c(cccc12)C(C=C3)=N)[#7]', 'C(c1ccc([OH])cc1)(c2ccc([OH])cc2)OS(=O)=O',
|
||||
'n12cccc2C=N([#6])CC1', 'n2([C;H2,H3])c1c(ccC(=O)1)cc2[C;H2,H3]',
|
||||
'c1(c2c(c(n1C(O)=O)[C;H2,H3])S[CH2]S2)[C;H2,H3]', 'n2(c1ccccc1)[cH][cH][cH]c2C=N[OH]',
|
||||
'c1ccc2c(c1)c4c3c(C2=O)cccc3no4', 'O=C2c1ccccc1c3c([OH])c(=O)nc4cccc2c34',
|
||||
'C1([#6]:,=[#6][#6]:,=[#6]C1=[!C])=[!C]', 'N2(c1ccccc1)C(=NC=O)S[CH2]C2=O',
|
||||
'N4(c1ccccc1)C(=O)S[CH]([NH]c3c2ccccc2ccc3)C4=O', 'N2(c1ccccc1)C(=O)S[CH2]C2=S',
|
||||
'N3(C(=O)c1ccccc1)C(=Nc2ccccc2)S[CH2]C3=O', 'O=C4CCC3C2C(=O)CC1CCCC1C2CCC3=C4',
|
||||
'C2Cc1ccccc1C(c3c2cccc3)=C[#6]', 'c1ccc3c(c1)[SX2,CX4]c2cc[cH,$(c[Cl]),$(c[CX4])]cc2C3=C[#6]',
|
||||
'c1ccc2c(c1)C(c3c(SC2)scc3)=C', 'c1ccc2c(c1)C(c3c2ccc(c3)[NH2])=[CH][#6]',
|
||||
'[CH3]c2nc([NH]S(c1[cH][cH]c([cH][cH]1)O[CH2][CH2][C;H2,H3])(=O)=O)[cH][cH][cH]2',
|
||||
'c1([cH][cH]c([cH][cH]1)[NH2])S(=O)(=O)[NH]c2[cH][cH][cH]nn2',
|
||||
'[CH3]C([CH3])([CH3])c1[cH]c(C([CH3])([CH3])[CH3])c(O[C;H,H2]N)c[cH]1',
|
||||
'a1aaa(aa1)[CH]=[CH]C([NH][NH]c2n(nnn2)[#6])=O', 'c1([F,Cl,Br,I,$([n+](c)c)])c(-!@C=N)sc(n1)=O',
|
||||
'c1(S[C;R])c([CH]([#6])[#6])sc([nH,$(n[C;H2,H3])]1)=O',
|
||||
's2c([NH]c1c([CH2,CH3,$(cc)])cccc1)[n+]([C;H2,H3])c([#6])[cH]2', 'c1nc(sc1)NNS(=O)=O',
|
||||
'c1c(cc(C(=O)[OH])cc1)[NH]c2nc([cH]s2)c3ccc([CH]([C;H,H2,H3])[C;H,H2,H3])cc3',
|
||||
'[C;H2,H3][NH]C=N[NH]c1nc(cc)[cH]s1', 'O=S(=O)(cc)[NH][NH]c1nc(cs1)cc',
|
||||
'n1c3c(cc2c1nc(s2)[#7])sc(n3)[#7]', 's2ccc(c1csc([NH2])n1)[cH]2',
|
||||
'c2([NH]cc[C;H2,H3])nc(c1ccncc1)[cH]s2',
|
||||
'[C;H2,H3]Oc1[cH][cH]c(O[C;H2,H3])[cH]c1[NH]c3scc(c2ccc(O[C;H2,H3])cc2)n3', '[#6][CH](=S)',
|
||||
'[cH]1coc([cH]1)C(=S)N2[CH2][CH2]*[CH2][CH2]2', '[CX4][NH]C(cc)=[CH]C(=S)[NH]c1ccccc1',
|
||||
'[CH](c1ccccc1)(c2ccccc2)C(=S)[N;H,H2]', 'c(N([C;H,H2,H3])[C;H,H2,H3])c[NH]C(=S)[C;H,H2,H3]',
|
||||
'c1cccnc1C(=S)[NH]c2ccccc2O[C;H2,H3]', 'acC(=S)[NH][NH]ca', '[C;H2,H3]SC(=S)[NH][CH2]cc',
|
||||
'C1[CX4]SC(NC=1)=S', 'O=c2sc1cccc(O[C;H2,H3])c1o2', '[NH]1C(=S)[CH](C#N)[CH](cc)[CH]=C1cc',
|
||||
'S=CC([C;H2,H3])=C([C;H2,H3])N([C;H2,H3])[C;H2,H3]', 'C1=CN(C(c2ccccc12)(C#N)C(=S)S)C=O',
|
||||
'c1c(sc(cc1)=S)[#7]', 'ccC(=[SX1])[SX2][C;H,H2][CH2,CH3,$(cc)]', 'n1c(=O)[cH]c([#6])sc1=S',
|
||||
'c1ccccc1N3C(=O)C(Sc2ccccc2)=[CH]C(=O)3', '[CX4][N+]([CX4][OH])=CS[C;H,H2,H3]', 'S=Cc2n1ccccc1cc2',
|
||||
'c2(nanc(c1o[cH][cH][cH]1)n2)S[CX4]', 'C(=O)(N1CCSCC1)c2c([cH][cH][cH][cH]2)S[C;H2,H3]',
|
||||
'c2c([NH]C(=S)[NH][CH2][CH2][CH2]N([C;H2,H3])c1ccccc1)cccc2',
|
||||
'c2c([NH]C(=S)[NH][CH2][CH2]N([C;H2,H3])c1ccccc1)cccc2',
|
||||
'c1(ccccc1)[NH]C(=S)N[NH]C(=O)c2a[!#6]cc2', 'c1(ccccc1)[NH]C(=S)N[NH]c2ccccc2',
|
||||
'C1[NH][NH]C(=S)N[NH]1', 'c1(ccccc1)[NH]C(=S)N[NH][#6](=,:[#7;R])[#7;R]',
|
||||
'c1(ccccc1)[NH]C(=S)[NH]N=Cc2ccnc2', 'c1(ccoc1[C;H,H2,H3])C=N[NH]C(=S)[N;H,H2]',
|
||||
'c2(=S)n1ccnnc1n[nH]2', '[CX4][SX2]C(=N-aaaa)[NH]N=C',
|
||||
'ccN([C;H2,H3])[CH2][CH2][CH2][NH]C(=S)[NH]c1c([F,Cl,Br,I])[cH]c([C;H2,H3])[cH][cH]1',
|
||||
'[cH]3c([NH]C(=S)[NH][C;H,H2]c1oc([C;H,H2,H3])[cH][cH]1)[cH]c2c(O[CH2]O2)[cH]3',
|
||||
'O=C-!@n2ccc1c2[nH]c(=S)[nH]1',
|
||||
'c12c([cH][cH][cH][cH]1)c([cH][cH][cH]2)C([C;H2,H3])=N[NH]C(=S)[NH]ccc',
|
||||
'c1(ccccc1)[#7;H][#6](=S)[#7][#7;H][#6;H]=,:[#6;H][#6]=O',
|
||||
'[C;H2,H3]N([C;H2,H3])[CH]=CC(=O)c1c([SX2])sc([$(C#N),$(C=O)])c1',
|
||||
's1[cH][cH]c(O[C;H2,H3])c1C(=O)[NH][N;H,H2]',
|
||||
'c1(s[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]C(=O)[NH]c2nccs2',
|
||||
'c1csc(c1[NH2])[CH]=[CH]c2cccs2', '[NH2]c3sc([NH]C(=O)c1ccccc1)c(C#N)c3c2aaaaa2'
|
||||
] # '([#7+1!r].[#7+1!r].[#7+1!r])'
|
||||
|
||||
'''l = []
|
||||
lines = open('/export/home/jyy/gbp_pocket_flow_with_edge/pocket_flow/utils/alert_struct.csv').read().split('\n')
|
||||
for line in lines[2:]:
|
||||
if ',"' in line:
|
||||
l.append(line.split(',"')[-1].strip('"'))
|
||||
else:
|
||||
l.append(line.split(',')[-1])'''
|
||||
|
||||
HAT1_REF_MOL = [
|
||||
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=NC=CO4)C=C3)=O)=C2)CCC1C5=CC=CC=C5',
|
||||
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(Br)C=C3)=O)=C2)CCC1C4=CC=CC=C4',
|
||||
'CN1C2=C(C=CC(S(=O)(NC3=C(C=CC=C4)C4=C(Br)C=N3)=O)=C2)CCC1C5=CC=CC=C5',
|
||||
'O=S(C1=CC2=C(C=C1)C=CC(C3=CC=CC=C3)=N2)(NC4=CC=C(Br)C=C4)=O',
|
||||
'O=S(C1=CC2=C(C(F)=C1)C=CC(C3=CC=CC=C3)=N2)(NC4=CC=C(Br)C=C4)=O',
|
||||
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=COC=N4)C=C3)=O)=C2)CCC1C5=CC=C(Cl)C=C5',
|
||||
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=CSC=N4)C=C3)=O)=C2)CCC1C5=CC=C(Cl)C=C5',
|
||||
'BrC1=C(O)C=CC(/C=C2CCC/C(C\2=O)=C\C3=CC=C(O)C(Br)=C3)=C1',
|
||||
'O=S(C1=CC2=C(NC(C3C2C=CC3)C4=C(C=CC=C4)F)C=C1)(NC5=CC=C(C(O)=O)C=C5)=O',
|
||||
'BrC1=C(O)C=CC(/C=C2CC(CC)C/C(C\2=O)=C\C3=CC=C(O)C(Br)=C3)=C1'
|
||||
'O=C1/C(CC/C1=C\C2=CC(Br)=C(O)C=C2)=C/C3=CC=C(O)C(Br)=C3',
|
||||
'O=C(/C=C/C1=CC(Br)=C(O)C=C1)/C=C/C2=CC=C(O)C(Br)=C2'
|
||||
]
|
||||
|
||||
def check_alert_struct(mol):
|
||||
Chem.GetSSSR(mol)
|
||||
for alert_smarts in ALERT_STRUCTURES:
|
||||
p = Chem.MolFromSmarts(alert_smarts)
|
||||
if mol.HasSubstructMatch(p):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def check_lipinski(mol):
|
||||
mw = Descriptors.MolWt(mol)
|
||||
logp = Crippen.MolLogP(mol)
|
||||
#tpsa = Descriptors.TPSA(mol) # tpsa<=140
|
||||
hbd = Lipinski.NumHDonors(mol)
|
||||
hba = Lipinski.NumHAcceptors(mol)
|
||||
#rb = Descriptors.NumRotatableBonds(mol)
|
||||
if sum([mw<=700, logp<=5, hba<=10, hbd<=5]) != 4:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
class Filter(object):
|
||||
def __init__(self, sa_threshhold=5.0, qed_threshhold=0.3, mw_threshhold=300):
|
||||
self.sa_threshhold = sa_threshhold
|
||||
self.qed_threshhold = qed_threshhold
|
||||
self.mw_threshhold = mw_threshhold
|
||||
|
||||
def _filter(self, mol):
|
||||
try:
|
||||
qed_check = QED.qed(mol) > self.qed_threshhold
|
||||
sa_check = sascorer.calculateScore(mol) < self.sa_threshhold
|
||||
mw_check = Descriptors.MolWt(mol) > self.mw_threshhold
|
||||
lip_check = check_lipinski(mol)
|
||||
if qed_check and sa_check and mw_check and lip_check:
|
||||
struct_check = check_alert_struct(mol)
|
||||
if struct_check:
|
||||
return (Chem.MolToSmiles(mol), mol)
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
except:
|
||||
return False
|
||||
|
||||
def run(self, mol_list, save_name='Filter_Result.sdf', mol_name=None):
|
||||
num_mol = 0
|
||||
smi_dict = {}
|
||||
#filter(lambda x:x!=False, filter_list)
|
||||
for ix, m in tqdm(enumerate(mol_list)):
|
||||
m_ = Chem.MolFromSmiles(Chem.MolToSmiles(m))
|
||||
out = self._filter(m_)
|
||||
if out != False and out[0] not in smi_dict:
|
||||
smi_dict.setdefault(out[0])
|
||||
with open(save_name, 'a') as sdf_writer:
|
||||
#mol = out[1]
|
||||
if mol_name:
|
||||
m.SetProp('_Name', mol_name)
|
||||
mol_block = Chem.MolToMolBlock(m)
|
||||
sdf_writer.write(mol_block + '\n$$$$\n')
|
||||
num_mol += 1
|
||||
|
||||
def run_file_list(self, sdf_list, save_name='Filter_Result.sdf'):
|
||||
num_mol = 0
|
||||
smi_dict = {}
|
||||
for sdf in sdf_list:
|
||||
supp = Chem.SDMolSupplier(sdf)
|
||||
for mol in supp:
|
||||
if mol is None: continue
|
||||
mol_ = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
|
||||
out = self._filter(mol_)
|
||||
if out != False and out[0] not in smi_dict:
|
||||
mol_name = 'No_{}-{}'.format(num_mol, sdf)
|
||||
smi_dict.setdefault(out[0])
|
||||
with open(save_name, 'a') as sdf_writer:
|
||||
#mol = out[1]
|
||||
mol.SetProp('_Name', mol_name)
|
||||
mol_block = Chem.MolToMolBlock(mol)
|
||||
sdf_writer.write(mol_block + '\n$$$$\n')
|
||||
num_mol += 1
|
||||
return num_mol
|
||||
|
||||
def np_run(self, mol_list, save_name='Filter_Result.sdf', n_process=10):
|
||||
smi_dict = {}
|
||||
pool = Pool(processes=n_process)
|
||||
filter_list = pool.map(self._filter, mol_list)
|
||||
#filter(lambda x:x!=False, filter_list)
|
||||
for ix, i in enumerate(filter_list):
|
||||
if i:
|
||||
if i[0] not in smi_dict:
|
||||
smi_dict.setdefault(i[0])
|
||||
with open(save_name, 'a') as sdf_writer:
|
||||
sdf_writer.write(i[1] + '\n$$$$\n')
|
||||
|
||||
|
||||
|
||||
from PIL import Image
|
||||
import matplotlib.pyplot as plt
|
||||
from .train import verify_dir_exists
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import AllChem, Descriptors, Draw
|
||||
from rdkit.Chem.Draw import rdMolDraw2D
|
||||
from rdkit.Chem.Draw.MolDrawing import DrawingOptions
|
||||
import os
|
||||
|
||||
|
||||
def draw_docked_mol_list(mol_list, molsPerRow=5, subImgSize=(300,300)):
|
||||
opts = DrawingOptions()
|
||||
opts.atomLabelFontSize = 30
|
||||
opts.bondLineWidth = 1.5
|
||||
opts.colorBonds = False
|
||||
|
||||
legends = []
|
||||
for m in mol_list:
|
||||
AllChem.Compute2DCoords(m)
|
||||
docking_score = 'Docking Score: {:.3f}'.format(float(m.GetProp('r_i_docking_score')))
|
||||
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(m))
|
||||
legend = mw + '\n' + docking_score
|
||||
legends.append(legend)
|
||||
|
||||
img = Draw.MolsToGridImage(
|
||||
mol_list,
|
||||
molsPerRow=molsPerRow,
|
||||
subImgSize=subImgSize,
|
||||
legends=legends
|
||||
)
|
||||
return img
|
||||
|
||||
|
||||
def DrawDockedMol(mol, # rdkit.Molecule的instance
|
||||
save=None,
|
||||
size=(300,300),
|
||||
bondLineWidth=5, # 化学键显示的宽度
|
||||
FontSize=3,
|
||||
fixedBondLength=50, # 化学键显示的长度
|
||||
legend=None,
|
||||
legendFontSize=5,
|
||||
elemColor=True): # 是否按元素给原子着色
|
||||
|
||||
if legend is None:
|
||||
try:
|
||||
docking_score = 'Docking Score: {:.3f}'.format(float(mol.GetProp('r_i_docking_score')))
|
||||
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(mol))
|
||||
legend = mw + '\n' + docking_score
|
||||
except:
|
||||
pass
|
||||
draw = rdMolDraw2D.MolDraw2DCairo(size[0], size[1])
|
||||
option = rdMolDraw2D.MolDrawOptions()
|
||||
option.bondLineWidth = bondLineWidth
|
||||
option.fixedBondLength = fixedBondLength
|
||||
option.setHighlightColour((0.95,0.7,0.95))
|
||||
option.baseFontSize = FontSize
|
||||
option.legendFontSize = legendFontSize
|
||||
if elemColor == False:
|
||||
option.updateAtomPalette({k: (0, 0, 0) for k in DrawingOptions.elemDict.keys()})
|
||||
draw.SetDrawOptions(option)
|
||||
AllChem.Compute2DCoords(mol)
|
||||
rdMolDraw2D.PrepareAndDrawMolecule(draw, mol, legend=legend)
|
||||
draw.FinishDrawing()
|
||||
if save:
|
||||
if '.png' in save:
|
||||
n = 0
|
||||
path = '/'.join(save.split('/')[0:-1])
|
||||
while os.path.exists(save):
|
||||
n += 1
|
||||
name = save.split('/')[-1].split('.')[0]
|
||||
name = name + '_' + str(n) + '.png'
|
||||
save = path + '/' + name
|
||||
else:
|
||||
draw.WriteDrawingText(save)
|
||||
else:
|
||||
return draw
|
||||
|
||||
|
||||
def CombineImages(img_file_list, col_num=7, save_dir='./', title='merged_image.png', img_size=None):
|
||||
num_img = len(img_file_list)
|
||||
num_row = num_img//col_num+1 if num_img%col_num != 0 else num_img//col_num
|
||||
if img_size is None:
|
||||
img_size = Image.open(img_file_list[0]).size
|
||||
toImage = Image.new('RGB', (img_size[1]*col_num, img_size[0]*num_row), color=(255,255,255))
|
||||
x_cusum = 0
|
||||
y_cusum = 0
|
||||
num_has_paste = 0
|
||||
for img_file in img_file_list:
|
||||
img = Image.open(img_file)
|
||||
toImage.paste(img, (x_cusum, y_cusum))
|
||||
num_has_paste += 1
|
||||
if num_has_paste%col_num==0:
|
||||
x_cusum = 0
|
||||
y_cusum += img_size[1]
|
||||
else:
|
||||
x_cusum += img_size[0]
|
||||
verify_dir_exists(save_dir)
|
||||
toImage.save(save_dir+'/'+title)
|
||||
plt.xticks([])
|
||||
plt.yticks([])
|
||||
plt.axis('off') # 取消所有边框,坐标轴
|
||||
plt.imshow(toImage)
|
||||
plt.clf() # 清除当前图形。如果不清除,当使用循环大量作图,机器会为plt分配越来越多的内存,速度会逐渐变慢。
|
||||
@@ -1,218 +0,0 @@
|
||||
from weakref import ref
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import AllChem
|
||||
from rdkit.Chem import QED, Descriptors, Draw
|
||||
from rdkit import DataStructs
|
||||
from . import sascorer
|
||||
import xlsxwriter
|
||||
from easydict import EasyDict
|
||||
from io import BytesIO
|
||||
import glob
|
||||
|
||||
|
||||
SIMILARITY_METRIC = ['DataStructs.TanimotoSimilarity',
|
||||
'DataStructs.DiceSimilarity',
|
||||
'DataStructs.CosineSimilarity',
|
||||
'DataStructs.SokalSimilarity',
|
||||
'DataStructs.RusselSimilarity',
|
||||
'DataStructs.KulczynskiSimilarity',
|
||||
'DataStructs.McConnaugheySimilarity']
|
||||
|
||||
|
||||
class ProcessSmilesFile(object):
|
||||
def __init__(self, sdf_file):# refer_mol=None):
|
||||
suppl = Chem.SDMolSupplier(sdf_file)
|
||||
self.smiles = {}
|
||||
#self.refer_mol = refer_mol
|
||||
for idx, mol in enumerate(suppl):
|
||||
if mol is None:
|
||||
continue
|
||||
smi = Chem.MolToSmiles(mol)
|
||||
self.smiles[smi] = {}
|
||||
self.smiles[smi]['mol'] = mol
|
||||
self.smiles[smi]['index'] = idx
|
||||
self.smiles[smi]['name'] = mol.GetProp('_Name')
|
||||
self.smiles[smi]['sdf'] = sdf_file
|
||||
|
||||
def _QedScore(self,):
|
||||
for smi in self.smiles:
|
||||
#mol = Chem.MolFromSmiles(smi)
|
||||
#print(self.smiles[smi])
|
||||
self.smiles[smi]['QED'] = QED.qed(self.smiles[smi]['mol'])
|
||||
|
||||
def _SAScore(self,):
|
||||
for smi in self.smiles:
|
||||
#mol = Chem.MolFromSmiles(smi)
|
||||
try:
|
||||
self.smiles[smi]['SA'] = sascorer.calculateScore(self.smiles[smi]['mol'])
|
||||
except:
|
||||
self.smiles[smi]['SA'] = 10.0
|
||||
|
||||
def _Lipinski(self,):
|
||||
for smi in self.smiles:
|
||||
m = self.smiles[smi]['mol']
|
||||
#wt = sum([a.GetMass() for a in m.GetAtoms()])
|
||||
#self.smiles[smi]['MolWt'] = wt
|
||||
self.smiles[smi]['MolWt'] = Descriptors.MolWt(m)
|
||||
self.smiles[smi]['LogP'] = Descriptors.MolLogP(m)
|
||||
self.smiles[smi]['TPSA'] = Descriptors.TPSA(m)
|
||||
self.smiles[smi]['HBD'] = Descriptors.NumHDonors(m)
|
||||
self.smiles[smi]['HBA'] = Descriptors.NumHAcceptors(m)
|
||||
self.smiles[smi]['RB'] = Descriptors.NumRotatableBonds(m)
|
||||
self.smiles[smi]['Lipinski'] = [self.smiles[smi]['MolWt'],
|
||||
self.smiles[smi]['LogP'],
|
||||
self.smiles[smi]['TPSA'],
|
||||
self.smiles[smi]['HBD'],
|
||||
self.smiles[smi]['HBA'],
|
||||
self.smiles[smi]['RB']]
|
||||
|
||||
def _Similarity(self, refer_mol, metric='DataStructs.TanimotoSimilarity'):
|
||||
metric = eval(metric)
|
||||
refer_fp = AllChem.GetMorganFingerprint(refer_mol, 2)
|
||||
for smi in self.smiles:
|
||||
fp = AllChem.GetMorganFingerprint(self.smiles[smi]['mol'],2)
|
||||
'''similarity = DataStructs.FingerprintSimilarity(
|
||||
fp, refer_fp, metric=metric
|
||||
)'''
|
||||
similarity = metric(fp, refer_fp)
|
||||
self.smiles[smi]['similarity'] = similarity
|
||||
|
||||
def Filter(self, molwt=200, lp_rule=False, refer_mol=None,
|
||||
metric='DataStructs.TanimotoSimilarity', similarity=0.5):
|
||||
self._QedScore()
|
||||
self._SAScore()
|
||||
self._Lipinski()
|
||||
if refer_mol:
|
||||
self._Similarity(refer_mol, metric=metric)
|
||||
#smiles = EasyDict(self.smiles)
|
||||
out = {}
|
||||
for s in self.smiles:
|
||||
smi = EasyDict(self.smiles[s])
|
||||
if smi.MolWt < molwt:
|
||||
continue
|
||||
if lp_rule:
|
||||
l = [smi.MolWt<500, smi.LogP<5, smi.RB<10,
|
||||
smi.TPSA<140, smi.HBA<10, smi.HBD<5]
|
||||
#lenth = len(set(l))
|
||||
if sum(l) != 6:
|
||||
continue
|
||||
if refer_mol:
|
||||
if smi.similarity < similarity:
|
||||
continue
|
||||
else:
|
||||
smi.similarity = '--'
|
||||
out[s] = smi
|
||||
return out
|
||||
|
||||
|
||||
def Mol2Excel(mol_items, work_book=None, name='result.xlsx', worksheet_name='worksheet'):
|
||||
#add_worksheet_name = path.split('/')[-2].split('_')[0].split('sample-')[1]
|
||||
if work_book is None:
|
||||
workbook = xlsxwriter.Workbook(name)
|
||||
else:
|
||||
workbook = work_book
|
||||
worksheet = workbook.add_worksheet(worksheet_name)
|
||||
header = ['Image', 'SMILES', 'QED Score', 'SA Score', 'Similarity',
|
||||
'MolWt', 'LogP', 'TPSA', 'HBD', 'HBA', 'RB', 'sdf file',
|
||||
'index']
|
||||
header_style = {'bold':1,'valign':'vcenter','align':'center','top':2,
|
||||
'left':2,'right':2,'bottom':2}
|
||||
HeaderStyle = workbook.add_format(header_style)
|
||||
|
||||
item_style = {'align':'center','valign': 'vcenter','top':2,'left':2,
|
||||
'right':2,'bottom':2,'text_wrap':1}
|
||||
ItemStyle = workbook.add_format(item_style)
|
||||
worksheet.set_column('A:A', 39)
|
||||
worksheet.set_column('B:B', 39)
|
||||
worksheet.set_column('C:C', 10)
|
||||
worksheet.set_column('D:D', 10)
|
||||
worksheet.set_column('E:E', 10)
|
||||
worksheet.set_column('F:F', 10)
|
||||
worksheet.set_column('G:G', 10)
|
||||
worksheet.set_column('H:H', 10)
|
||||
worksheet.set_column('I:I', 10)
|
||||
worksheet.set_column('J:J', 10)
|
||||
worksheet.set_column('K:K', 10)
|
||||
worksheet.set_column('L:L', 30)
|
||||
worksheet.set_column('M:M', 10)
|
||||
|
||||
for ix_, i in enumerate(header):
|
||||
worksheet.write(0, ix_, i, HeaderStyle)
|
||||
|
||||
for ix, items in enumerate(mol_items):
|
||||
smi = items[-2]
|
||||
info = EasyDict(items[-1])
|
||||
img_data = BytesIO()
|
||||
AllChem.Compute2DCoords(info.mol)
|
||||
img = Draw.MolToImage(info.mol)
|
||||
img.save(img_data, format='PNG')
|
||||
worksheet.set_row(ix+1, 185)
|
||||
worksheet.insert_image(ix+1, 0, 'f', {'x_scale': 0.9, 'y_scale': 0.8, 'image_data':img_data, 'positioning':1})
|
||||
worksheet.write(ix+1, 1, smi, ItemStyle)
|
||||
worksheet.write(ix+1, 2, info.QED, ItemStyle)
|
||||
worksheet.write(ix+1, 3, info.SA, ItemStyle)
|
||||
worksheet.write(ix+1, 4, info.similarity, ItemStyle)
|
||||
worksheet.write(ix+1, 5, info.MolWt, ItemStyle)
|
||||
worksheet.write(ix+1, 6, info.LogP, ItemStyle)
|
||||
worksheet.write(ix+1, 7, info.TPSA, ItemStyle)
|
||||
worksheet.write(ix+1, 8, info.HBD, ItemStyle)
|
||||
worksheet.write(ix+1, 9, info.HBA, ItemStyle)
|
||||
worksheet.write(ix+1, 10, info.RB, ItemStyle)
|
||||
worksheet.write(ix+1, 11, info.sdf, ItemStyle)
|
||||
worksheet.write(ix+1, 12, info.index, ItemStyle)
|
||||
if work_book is None:
|
||||
workbook.close()
|
||||
|
||||
def WriteExcel(result_dir_list, name='result.xlsx', molwt=200, lp_rule=False, sorted_idx=0, refer_mol=None):
|
||||
'''
|
||||
sorted_ix: 用于排序的属性的index, 0:SA; 1:QED; 2:SA*QED
|
||||
'''
|
||||
#创建一个工作簿并添加一张工作表,工作表是可以命名的
|
||||
#workbook = xlsxwriter.Workbook(name)
|
||||
for path in result_dir_list:
|
||||
mol_dict = ProcessSmilesFile(path).Filter(
|
||||
molwt=molwt, lp_rule=lp_rule, refer_mol=refer_mol,
|
||||
metric='DataStructs.TanimotoSimilarity', similarity=0.2
|
||||
)
|
||||
#print(mol_dict)
|
||||
#L = [(mol_dict[i]['SA'], mol_dict[i]['QED'], mol_dict[i]['SA']*mol_dict[i]['QED'], i, mol_dict[i]) for i in mol_dict]
|
||||
L = [(mol_dict[i]['SA'], mol_dict[i]['QED'], mol_dict[i]['similarity'], i, mol_dict[i]) for i in mol_dict]
|
||||
if sorted_idx==0:
|
||||
L = sorted(L, reverse=False, key=lambda x: x[sorted_idx])
|
||||
else:
|
||||
L = sorted(L, reverse=True, key=lambda x: x[sorted_idx])
|
||||
#print(L)
|
||||
#add_worksheet_name = path.split('/')[-2]
|
||||
#worksheet = workbook.add_worksheet(add_worksheet_name)
|
||||
Mol2Excel(L, name=name)#, worksheet_name=add_worksheet_name)
|
||||
#workbook.close()
|
||||
|
||||
|
||||
def WriteExcelAll(result_dir_list, name='result.xlsx', molwt=200, lp_rule=False,
|
||||
sorted_ix=0, refer_mol=None, similarity=0.2):
|
||||
all_mol = {}
|
||||
for path in result_dir_list:
|
||||
mol_dict = ProcessSmilesFile(path).Filter(
|
||||
molwt=molwt, lp_rule=lp_rule, refer_mol=refer_mol,
|
||||
metric='DataStructs.TanimotoSimilarity', similarity=similarity
|
||||
)
|
||||
for smi in mol_dict:
|
||||
if smi not in all_mol:
|
||||
all_mol[smi] = mol_dict[smi]
|
||||
|
||||
L = [(all_mol[i]['SA'], all_mol[i]['QED'], all_mol[i]['similarity'], i, all_mol[i]) for i in all_mol]
|
||||
if sorted_ix==0:
|
||||
L = sorted(L, reverse=False, key=lambda x: x[sorted_ix])
|
||||
else:
|
||||
L = sorted(L, reverse=True, key=lambda x: x[sorted_ix])
|
||||
Mol2Excel(L, name=name)
|
||||
|
||||
|
||||
|
||||
'''refmol_1 = Chem.MolFromSmiles('O=C(/C=C/C1=CC=CC=C1)/C=C/C2=CC=CC=C2')
|
||||
refmol_2 = Chem.MolFromSmiles('O=C(/C(CCC/1)=C/C2=CC=CC=C2)C1=C\C3=CC=CC=C3')
|
||||
refmol_3 = Chem.MolFromSmiles('O=S(C1=CC2=NC(C3=CC=CC=C3)=CC=C2C=C1)(NC4=CC=CC=C4)=O')
|
||||
path_list = [i for i in glob.glob('./outputs/HAT1_2022_08_25*/SMILES.txt') if 'sample-1' not in i]
|
||||
path_list += ['HAT1_2022_08_26__18_59_39/SMILES,txt']
|
||||
print(path_list)
|
||||
WriteExcelAll(path_list, molwt=100, name='HAT1_Wt_more_than_100.xlsx', sorted_ix=0, refer_mol=None)'''
|
||||
@@ -1,166 +0,0 @@
|
||||
import os
|
||||
import glob
|
||||
import numpy as np
|
||||
from multiprocessing import Pool
|
||||
from .ParseFile import Protein, parse_sdf_to_dict, Ligand
|
||||
from .residues_base import RESIDUES_TOPO
|
||||
|
||||
|
||||
def verify_dir_exists(dirname):
|
||||
if os.path.isdir(os.path.dirname(dirname)) == False:
|
||||
os.makedirs(os.path.dirname(dirname))
|
||||
|
||||
def ComputeDistMat(m1, m2):
|
||||
#m1_square = np.sum(m1*m1, axis=1, keepdims=True)
|
||||
m1_square = np.expand_dims(np.einsum('ij,ij->i', m1, m1), axis=1)
|
||||
if m1 is m2:
|
||||
m2_square = m1_square.T
|
||||
else:
|
||||
#m2_square = np.sum(m2*m2, axis=1, keepdims=True).T
|
||||
m2_square = np.expand_dims(np.einsum('ij,ij->i', m2, m2), axis=0)
|
||||
dist_mat = m1_square + m2_square - np.dot(m1, m2.T)*2
|
||||
# result maybe less than 0 due to floating point rounding errors.
|
||||
dist_mat = np.maximum(dist_mat, 0, dist_mat)
|
||||
if m1 is m2:
|
||||
# Ensure that distances between vectors and themselves are set to 0.0.
|
||||
# This may not be the case due to floating point rounding errors.
|
||||
dist_mat.flat[::dist_mat.shape[0] + 1] = 0.0
|
||||
dist_mat = np.sqrt(dist_mat)
|
||||
return dist_mat
|
||||
|
||||
class SplitPocket(object):
|
||||
|
||||
def __init__(self,
|
||||
main_path='/export/home/jyy/CrossDocked2020/',
|
||||
sample_path='/Refine_Positive_Samples/',
|
||||
new_sapmle_path='/Refine_Positive_Samples_Pocket/',
|
||||
type_file='/export/home/jyy/CrossDocked2020/Refine_Positive_Samples_New.types',
|
||||
dist_cutoff=10,
|
||||
get_surface_atom=True):
|
||||
|
||||
self.main_path = main_path
|
||||
self.sample_path = sample_path
|
||||
self.new_sapmle_path = new_sapmle_path
|
||||
self.dist_cutoff = dist_cutoff
|
||||
self.type_file = type_file
|
||||
self.types = [line.strip().split() for line in open(type_file).readlines()]
|
||||
self.exceptions = []
|
||||
self.get_surface_atom = get_surface_atom
|
||||
|
||||
@staticmethod
|
||||
def _split_pocket(protein, ligand, dist_cutoff):
|
||||
if protein.__normalized__ == False:
|
||||
protein.normalize_coord()
|
||||
rm = protein.rotate_matrix
|
||||
shift = protein.center_of_mass_shift
|
||||
ligand.normalize_pos(shift, rm)
|
||||
res = np.array(protein.get_residues)
|
||||
cm_res = np.array([r.center_of_mass for r in res])
|
||||
dist_mat = ComputeDistMat(ligand.normalized_coords, cm_res)
|
||||
bool_dist_mat = dist_mat < dist_cutoff
|
||||
pocket_res = res[bool_dist_mat.sum(axis=0)>0]
|
||||
pocket_block = '\n'.join(
|
||||
[i.to_heavy_string for i in pocket_res if len(i.get_heavy_atoms)==len(RESIDUES_TOPO[i.name])]
|
||||
)
|
||||
return pocket_block, ligand.mol_block()
|
||||
|
||||
@staticmethod
|
||||
def _split_pocket_with_surface_atoms(protein, ligand, dist_cutoff):
|
||||
if protein.__normalized__ == False:
|
||||
protein.normalize_coord()
|
||||
rm = protein.rotate_matrix
|
||||
shift = protein.center_of_mass_shift
|
||||
ligand.normalize_pos(shift, rm)
|
||||
|
||||
protein.compute_surface_atoms()
|
||||
|
||||
res = np.array(protein.get_residues)
|
||||
cm_res = np.array([r.center_of_mass for r in res])
|
||||
dist_mat = ComputeDistMat(ligand.normalized_coords, cm_res)
|
||||
bool_dist_mat = dist_mat < dist_cutoff
|
||||
pocket_res = res[bool_dist_mat.sum(axis=0)>0]
|
||||
pocket_block = []
|
||||
for r in pocket_res:
|
||||
for a in r.get_heavy_atoms:
|
||||
if a.is_surf is True:
|
||||
pocket_block.append(a.to_string + ' surf')
|
||||
else:
|
||||
pocket_block.append(a.to_string + ' inner')
|
||||
pocket_block = '\n'.join(pocket_block)
|
||||
return pocket_block, ligand.mol_block()
|
||||
|
||||
def _do_split(self, items):
|
||||
try:
|
||||
sub_path = items[4].split('/')[0]
|
||||
ligand_name = items[4].split('/')[1].split('.')[0]
|
||||
chain_id = items[3].split('.')[0].split('_')[-2]
|
||||
|
||||
protein = Protein(self.main_path+self.sample_path+items[3])
|
||||
chain = protein.get_chain(chain_id)
|
||||
ligand = Ligand(self.main_path+self.sample_path+items[4], sanitize=True)
|
||||
|
||||
if self.get_surface_atom:
|
||||
pocket_block, ligand_block = self._split_pocket_with_surface_atoms(chain, ligand, self.dist_cutoff)
|
||||
else:
|
||||
pocket_block, ligand_block = self._split_pocket(chain, ligand, self.dist_cutoff)
|
||||
|
||||
save_path = '{}/{}/{}/'.format(self.main_path, self.new_sapmle_path, sub_path)
|
||||
verify_dir_exists(save_path)
|
||||
pocket_file_name = '{}/{}_pocket{}.pdb'.format(save_path, ligand_name, self.dist_cutoff)
|
||||
|
||||
open(pocket_file_name, 'w').write(pocket_block)
|
||||
open(save_path+'/{}.mol'.format(ligand_name), 'w').write(ligand_block)
|
||||
except:
|
||||
protein_file = self.main_path+self.sample_path+items[3]
|
||||
ligand_file = self.main_path+self.sample_path+items[4]
|
||||
print('[Exception]', protein_file, ligand_file)
|
||||
|
||||
@staticmethod
|
||||
def split_pocket_from_site_map(site_map, protein_file, dist_cutoff):
|
||||
'''
|
||||
if target dosn't have ligand, we can use the site_map.pdb computed by schrodinger or other software
|
||||
to split potential pocket.
|
||||
'''
|
||||
site_coords = []
|
||||
with open(site_map) as fr:
|
||||
lines = fr.readlines()
|
||||
for line in lines:
|
||||
if line.startswith('HETATM'):
|
||||
line = line.strip()
|
||||
xyz = [float(line[30:38].strip()),
|
||||
float(line[38:46].strip()),
|
||||
float(line[46:54].strip())]
|
||||
site_coords.append(xyz)
|
||||
site_coords = np.array(site_coords)
|
||||
|
||||
protein = Protein(protein_file)
|
||||
res = np.array(protein.get_residues)
|
||||
cm_res = np.array([r.center_of_mass for r in res])
|
||||
dist_mat = ComputeDistMat(site_coords, cm_res)
|
||||
bool_dist_mat = dist_mat < dist_cutoff
|
||||
pocket_res = res[bool_dist_mat.sum(axis=0)>0]
|
||||
pocket_block = '\n'.join(
|
||||
[i.to_heavy_string for i in pocket_res if len(i.get_heavy_atoms)==len(RESIDUES_TOPO[i.name])]
|
||||
)
|
||||
return pocket_block
|
||||
|
||||
def __call__(self, np=10):
|
||||
pool = Pool(processes=np) # 创建进程池对象,进程数与multiprocessing.cpu_count()相同
|
||||
data_pool = pool.map(self._do_split, self.types) # 并行执行函数
|
||||
pool.close()
|
||||
pool.join()
|
||||
print('Done !')
|
||||
|
||||
'''
|
||||
from utils import SplitPocket
|
||||
split_pocket = SplitPocket(
|
||||
main_path='/export/home/jyy/CrossDocked2020/',
|
||||
sample_path = '/Refine_Positive_Samples/',
|
||||
new_sapmle_path = './crossdocked2020_pocket-surf/',
|
||||
type_file = '/export/home/jyy/CrossDocked2020/Refine_Positive_Samples_New.types',
|
||||
dist_cutoff = 10,
|
||||
get_surface_atom = True
|
||||
)
|
||||
split_pocket(np=10)
|
||||
'''
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
#
|
||||
# calculation of synthetic accessibility score as described in:
|
||||
#
|
||||
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
|
||||
# Peter Ertl and Ansgar Schuffenhauer
|
||||
# Journal of Cheminformatics 1:8 (2009)
|
||||
# http://www.jcheminf.com/content/1/1/8
|
||||
#
|
||||
# several small modifications to the original paper are included
|
||||
# particularly slightly different formula for marocyclic penalty
|
||||
# and taking into account also molecule symmetry (fingerprint density)
|
||||
#
|
||||
# for a set of 10k diverse molecules the agreement between the original method
|
||||
# as implemented in PipelinePilot and this implementation is r2 = 0.97
|
||||
#
|
||||
# peter ertl & greg landrum, september 2013
|
||||
#
|
||||
|
||||
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import rdMolDescriptors
|
||||
import pickle
|
||||
|
||||
import math
|
||||
from collections import defaultdict
|
||||
|
||||
import os.path as op
|
||||
|
||||
_fscores = None
|
||||
|
||||
|
||||
def readFragmentScores(name='fpscores'):
|
||||
import gzip
|
||||
global _fscores
|
||||
# generate the full path filename:
|
||||
if name == "fpscores":
|
||||
name = op.join(op.dirname(__file__), name)
|
||||
data = pickle.load(gzip.open('%s.pkl.gz' % name))
|
||||
outDict = {}
|
||||
for i in data:
|
||||
for j in range(1, len(i)):
|
||||
outDict[i[j]] = float(i[0])
|
||||
_fscores = outDict
|
||||
|
||||
|
||||
def numBridgeheadsAndSpiro(mol, ri=None):
|
||||
nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
|
||||
nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
|
||||
return nBridgehead, nSpiro
|
||||
|
||||
|
||||
def calculateScore(m):
|
||||
if _fscores is None:
|
||||
readFragmentScores()
|
||||
|
||||
# fragment score
|
||||
fp = rdMolDescriptors.GetMorganFingerprint(m,
|
||||
2) # <- 2 is the *radius* of the circular fingerprint
|
||||
fps = fp.GetNonzeroElements()
|
||||
score1 = 0.
|
||||
nf = 0
|
||||
for bitId, v in fps.items():
|
||||
nf += v
|
||||
sfp = bitId
|
||||
score1 += _fscores.get(sfp, -4) * v
|
||||
score1 /= nf
|
||||
|
||||
# features score
|
||||
nAtoms = m.GetNumAtoms()
|
||||
nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
|
||||
ri = m.GetRingInfo()
|
||||
nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
|
||||
nMacrocycles = 0
|
||||
for x in ri.AtomRings():
|
||||
if len(x) > 8:
|
||||
nMacrocycles += 1
|
||||
|
||||
sizePenalty = nAtoms**1.005 - nAtoms
|
||||
stereoPenalty = math.log10(nChiralCenters + 1)
|
||||
spiroPenalty = math.log10(nSpiro + 1)
|
||||
bridgePenalty = math.log10(nBridgeheads + 1)
|
||||
macrocyclePenalty = 0.
|
||||
# ---------------------------------------
|
||||
# This differs from the paper, which defines:
|
||||
# macrocyclePenalty = math.log10(nMacrocycles+1)
|
||||
# This form generates better results when 2 or more macrocycles are present
|
||||
if nMacrocycles > 0:
|
||||
macrocyclePenalty = math.log10(2)
|
||||
|
||||
score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
|
||||
|
||||
# correction for the fingerprint density
|
||||
# not in the original publication, added in version 1.1
|
||||
# to make highly symmetrical molecules easier to synthetise
|
||||
score3 = 0.
|
||||
if nAtoms > len(fps):
|
||||
score3 = math.log(float(nAtoms) / len(fps)) * .5
|
||||
|
||||
sascore = score1 + score2 + score3
|
||||
|
||||
# need to transform "raw" value into scale between 1 and 10
|
||||
min = -4.0
|
||||
max = 2.5
|
||||
sascore = 11. - (sascore - min + 1) / (max - min) * 9.
|
||||
# smooth the 10-end
|
||||
if sascore > 8.:
|
||||
sascore = 8. + math.log(sascore + 1. - 9.)
|
||||
if sascore > 10.:
|
||||
sascore = 10.0
|
||||
elif sascore < 1.:
|
||||
sascore = 1.0
|
||||
|
||||
return sascore
|
||||
|
||||
|
||||
def processMols(mols):
|
||||
print('smiles\tName\tsa_score')
|
||||
for i, m in enumerate(mols):
|
||||
if m is None:
|
||||
continue
|
||||
|
||||
s = calculateScore(m)
|
||||
|
||||
smiles = Chem.MolToSmiles(m)
|
||||
print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
import time
|
||||
|
||||
t1 = time.time()
|
||||
readFragmentScores("fpscores")
|
||||
t2 = time.time()
|
||||
|
||||
suppl = Chem.SmilesMolSupplier(sys.argv[1])
|
||||
t3 = time.time()
|
||||
processMols(suppl)
|
||||
t4 = time.time()
|
||||
|
||||
print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
|
||||
file=sys.stderr)
|
||||
|
||||
#
|
||||
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials provided
|
||||
# with the distribution.
|
||||
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
||||
# nor the names of its contributors may be used to endorse or promote
|
||||
# products derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
@@ -1,389 +0,0 @@
|
||||
import torch
|
||||
import torch_geometric as pyg
|
||||
from torch_geometric.loader import DataLoader, DataListLoader
|
||||
from torch.nn.utils import clip_grad_norm_
|
||||
#from torch.utils.data import Dataset, Subset, DataLoader
|
||||
from torch.cuda import amp
|
||||
import numpy as np
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from torch.utils import tensorboard
|
||||
|
||||
|
||||
def inf_iterator(iterable):
|
||||
iterator = iterable.__iter__()
|
||||
while True:
|
||||
try:
|
||||
yield iterator.__next__()
|
||||
except StopIteration:
|
||||
iterator = iterable.__iter__()
|
||||
|
||||
|
||||
def get_parameter_number(model):
|
||||
total_num = sum(p.numel() for p in model.parameters())
|
||||
trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
return {'Total': total_num, 'Trainable': trainable_num}
|
||||
|
||||
def timewait(time_gap):
|
||||
d = time_gap//(24*3600)
|
||||
d_h = time_gap%(24*3600)
|
||||
h = d_h//3600
|
||||
h_m = d_h%3600
|
||||
m = h_m//60
|
||||
s = h_m%60
|
||||
if d > 0:
|
||||
out = '{}d {}h {}m {}s'.format(int(d),int(h),int(m),round(s,2))
|
||||
elif h > 0:
|
||||
out = '{}h {}m {}s'.format(int(h),int(m),round(s,2))
|
||||
elif m > 0:
|
||||
out = '{}m {}s'.format(int(m),round(s,2))
|
||||
else:
|
||||
out = '{}s'.format(round(s,2))
|
||||
return out
|
||||
|
||||
def verify_dir_exists(dirname):
|
||||
if os.path.isdir(os.path.dirname(dirname)) == False:
|
||||
os.makedirs(os.path.dirname(dirname))
|
||||
|
||||
class Experiment(object):
|
||||
def __init__(self, model, train_set, optimizer, scheduler=None, device='cuda',
|
||||
valid_set=None, clip_grad=True, max_norm=5, norm_type=2,
|
||||
pos_noise_std=0.1, data_parallel=False, use_amp=False):
|
||||
if data_parallel:
|
||||
self.model = pyg.nn.DataParallel(model)
|
||||
else:
|
||||
self.model = model#.to(device)
|
||||
if 'config' in model.__dict__:
|
||||
self.model_config = model.config
|
||||
else:
|
||||
self.model_config = None
|
||||
self.train_set = train_set
|
||||
self.valid_set = valid_set
|
||||
self.optimizer = optimizer
|
||||
self.scheduler = scheduler
|
||||
self.num_train_data = len(train_set)
|
||||
self.data_parallel = data_parallel
|
||||
#self.train_batch_size = train_loader.batch_size
|
||||
if valid_set:
|
||||
self.num_valid_data = len(valid_set)
|
||||
else:
|
||||
self.num_valid_data = None
|
||||
#self.valid_batch_size = valid_loader.batch_size
|
||||
|
||||
self.clip_grad = clip_grad
|
||||
self.max_norm = max_norm
|
||||
self.norm_type = norm_type
|
||||
self.with_tb = False
|
||||
self.device = device
|
||||
self.pos_noise_std = pos_noise_std
|
||||
self.use_amp = use_amp
|
||||
if self.use_amp:
|
||||
self.grad_scaler = amp.GradScaler()
|
||||
|
||||
@staticmethod
|
||||
def get_log(out_dict, key_word, it, time_gap=None):
|
||||
log = []
|
||||
for key, value in out_dict.items():
|
||||
log.append(' {}:{:.5f} |'.format(key, value.item()))
|
||||
log.insert(0, '[{} {}]'.format(key_word, it))
|
||||
if time_gap:
|
||||
log.append(' Time: {}'.format(time_gap))
|
||||
return ''.join(log)
|
||||
|
||||
@staticmethod
|
||||
def write_summary(out_dict, writer, key_word, num_iter, scheduler=None, optimizer=None):
|
||||
for key, value in out_dict.items():
|
||||
writer.add_scalar('{}/{}'.format(key_word, key), value, num_iter)
|
||||
if scheduler is not None:
|
||||
writer.add_scalar('{}/lr'.format(key_word), optimizer.param_groups[0]['lr'], num_iter)
|
||||
writer.flush()
|
||||
|
||||
@staticmethod
|
||||
def get_num_iter(num_data, batch_size):
|
||||
if num_data % batch_size == 0:
|
||||
n_iter = int(num_data/batch_size)
|
||||
else:
|
||||
n_iter = int(num_data/batch_size) + 1
|
||||
return n_iter
|
||||
|
||||
@property
|
||||
def parameter_number(self):
|
||||
return get_parameter_number(self.model)
|
||||
|
||||
def _train_step(self, batch, it=0, print_log=False):
|
||||
start = time.time()
|
||||
self.model.train()
|
||||
self.optimizer.zero_grad()
|
||||
if self.use_amp:
|
||||
with amp.autocast():
|
||||
out_dict = self.model.get_loss(batch)
|
||||
self.grad_scaler.scale(out_dict['loss']).backward()
|
||||
else:
|
||||
out_dict = self.model.get_loss(batch)
|
||||
out_dict['loss'].backward()
|
||||
#out_dict = self.model(batch)
|
||||
|
||||
#with torch.autograd.detect_anomaly():
|
||||
|
||||
#print(out_dict)
|
||||
if self.clip_grad:
|
||||
# 梯度裁剪
|
||||
orig_grad_norm = clip_grad_norm_(
|
||||
self.model.parameters(),
|
||||
max_norm=self.max_norm,
|
||||
norm_type=self.norm_type,
|
||||
error_if_nonfinite=True) # error_if_nonfinite=True:如果梯度存在nan, inf将抛出error
|
||||
if self.use_amp:
|
||||
self.grad_scaler.step(self.optimizer)
|
||||
self.grad_scaler.update()
|
||||
else:
|
||||
self.optimizer.step()
|
||||
|
||||
if self.with_tb:
|
||||
if self.clip_grad:
|
||||
self.writer.add_scalar('train/step/grad', orig_grad_norm, it)
|
||||
self.write_summary(out_dict, self.writer, 'train/step', it)
|
||||
|
||||
end = time.time()
|
||||
time_gap = end - start
|
||||
log = self.get_log(out_dict, 'Step', it, time_gap='{:.3f}'.format(time_gap))
|
||||
with open(self.logdir+'training.log', 'a') as log_writer:
|
||||
log_writer.write(log + '\n')
|
||||
if print_log:
|
||||
print(log)
|
||||
return out_dict
|
||||
|
||||
def train_epoch(self, n_iter, n_epoch, print_log=False):
|
||||
start = time.time()
|
||||
log_dict = {}
|
||||
#train_loader = inf_iterator(self.train_loader)
|
||||
for i in range(n_iter):
|
||||
if self.data_parallel:
|
||||
batch = next(self.train_loader).cuda()#.to(self.device)
|
||||
else:
|
||||
batch = next(self.train_loader).to(self.device)
|
||||
## 计算噪音
|
||||
compose_noise = torch.randn_like(batch.compose_pos) * self.pos_noise_std
|
||||
batch.compose_pos = batch.compose_pos + compose_noise
|
||||
out_dict = self._train_step(batch, it=i+n_epoch*n_iter, print_log=print_log)
|
||||
for key, value in out_dict.items():
|
||||
if key not in log_dict:
|
||||
log_dict[key] = value if 'acc' in key else value * batch.num_graphs
|
||||
else:
|
||||
log_dict[key] += value if 'acc' in key else value * batch.num_graphs
|
||||
#print(log_dict)
|
||||
for key, value in log_dict.items():
|
||||
if 'acc' in key:
|
||||
log_dict[key] = value / n_iter
|
||||
else:
|
||||
log_dict[key] = value / self.num_train_data
|
||||
|
||||
if self.with_tb:
|
||||
self.write_summary(log_dict, self.writer, 'train/epoch', n_epoch,
|
||||
scheduler=self.scheduler, optimizer=self.optimizer)
|
||||
end = time.time()
|
||||
time_gap = timewait(end - start)
|
||||
log = self.get_log(log_dict, 'Epoch', n_epoch, time_gap=time_gap)
|
||||
with open(self.logdir+'training.log', 'a') as log_writer:
|
||||
log_writer.write(log + '\n')
|
||||
if print_log:
|
||||
print(log)
|
||||
|
||||
|
||||
def validate(self, n_iter, n_epoch, print_log=False, schedule_key='loss'):
|
||||
start = time.time()
|
||||
log_dict = {}
|
||||
with torch.no_grad():
|
||||
self.model.eval()
|
||||
for _ in range(n_iter):
|
||||
batch = next(self.valid_loader).to(self.device)
|
||||
out_dict = self.model.get_loss(batch)
|
||||
for key, value in out_dict.items():
|
||||
if key not in log_dict:
|
||||
log_dict[key] = value if 'acc' in key else value * batch.num_graphs
|
||||
else:
|
||||
log_dict[key] += value if 'acc' in key else value * batch.num_graphs
|
||||
|
||||
for key, value in log_dict.items():
|
||||
if 'acc' in key:
|
||||
log_dict[key] = value / n_iter
|
||||
else:
|
||||
log_dict[key] = value / self.num_valid_data
|
||||
if self.scheduler:
|
||||
self.scheduler.step(log_dict[schedule_key])
|
||||
if self.with_tb:
|
||||
self.write_summary(log_dict, self.writer, 'val/epoch', n_epoch,
|
||||
scheduler=self.scheduler, optimizer=self.optimizer)
|
||||
end = time.time()
|
||||
time_gap = timewait(end - start)
|
||||
log = self.get_log(log_dict, 'Validate', n_epoch, time_gap=time_gap)
|
||||
with open(self.logdir+'training.log', 'a') as log_writer:
|
||||
log_writer.write(log + '\n')
|
||||
if print_log:
|
||||
print(log)
|
||||
return log_dict['loss']
|
||||
|
||||
def fit(self, num_epoch, train_batch_size=4, valid_batch_size=16, print_log=True,
|
||||
with_tb=True, logdir='./training_log', early_stop=None, schedule_key='loss',
|
||||
follow_batch=[], exclude_keys=[], collate_fn=None, num_workers=0, pin_memory=False):
|
||||
|
||||
self.train_loader = inf_iterator(DataLoader(
|
||||
self.train_set,
|
||||
batch_size = train_batch_size, #config.train.batch_size,
|
||||
shuffle = True,
|
||||
num_workers = num_workers,
|
||||
pin_memory = pin_memory,
|
||||
follow_batch = follow_batch,
|
||||
exclude_keys = exclude_keys,
|
||||
collate_fn = collate_fn
|
||||
))
|
||||
self.train_batch_size = train_batch_size
|
||||
|
||||
if self.valid_set:
|
||||
self.valid_loader = inf_iterator(DataLoader(
|
||||
self.valid_set,
|
||||
batch_size = valid_batch_size, #config.train.batch_size,
|
||||
shuffle = False,
|
||||
num_workers = num_workers,
|
||||
pin_memory = pin_memory,
|
||||
follow_batch = follow_batch,
|
||||
exclude_keys = exclude_keys,
|
||||
collate_fn = collate_fn
|
||||
))
|
||||
self.valid_batch_size = valid_batch_size
|
||||
|
||||
self.n_iter_train = self.get_num_iter(self.num_train_data, self.train_batch_size)
|
||||
self.n_iter_valid = self.get_num_iter(self.num_valid_data, self.valid_batch_size)
|
||||
## config log
|
||||
date = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
|
||||
self.logdir = logdir+'/'+date+'/'
|
||||
verify_dir_exists(self.logdir)
|
||||
open(self.logdir+'model_config.dir','w').write(str(self.model_config))
|
||||
self.with_tb = with_tb
|
||||
if self.with_tb:
|
||||
self.writer = tensorboard.SummaryWriter(self.logdir)
|
||||
log_writer = open(self.logdir+'training.log', 'w')
|
||||
log_writer.write('\n######## {}; batch_size ########\n'.format(self.parameter_number, train_batch_size))
|
||||
log_writer.close()
|
||||
if print_log:
|
||||
print('\n######## {} ########\n'.format(self.parameter_number))
|
||||
## training/saving model
|
||||
loss_best = float('inf')
|
||||
early_stop_count = 0
|
||||
for epoch in range(num_epoch):
|
||||
self.train_epoch(self.n_iter_train, epoch, print_log=print_log)
|
||||
val_loss = self.validate(self.n_iter_valid, epoch, schedule_key=schedule_key, print_log=print_log)
|
||||
if val_loss < loss_best:
|
||||
torch.save(self.model.state_dict(), self.logdir+'/model.pth')
|
||||
loss_best = val_loss
|
||||
early_stop_count = 0
|
||||
else:
|
||||
early_stop_count += 1
|
||||
if isinstance(early_stop, int):
|
||||
if early_stop_count == early_stop:
|
||||
if print_log:
|
||||
print('Early stopping ......')
|
||||
with open(self.logdir+'training.log', 'a') as log_writer:
|
||||
log_writer.write('Early stopping ......\n')
|
||||
break
|
||||
|
||||
def fit_step(self, num_step, valid_per_step=5000, train_batch_size=4, valid_batch_size=16, print_log=True,
|
||||
with_tb=True, logdir='./training_log', schedule_key='loss', num_workers=0, pin_memory=False,
|
||||
follow_batch=[], exclude_keys=[], collate_fn=None, max_edge_num_in_batch=900000):
|
||||
#torch.multiprocessing.set_sharing_strategy('file_system')
|
||||
if self.data_parallel:
|
||||
self.train_loader = inf_iterator(DataListLoader(
|
||||
self.train_set,
|
||||
batch_size = train_batch_size, #config.train.batch_size,
|
||||
shuffle = True,
|
||||
num_workers = num_workers,
|
||||
pin_memory = pin_memory,
|
||||
#follow_batch = follow_batch,
|
||||
#exclude_keys = exclude_keys,
|
||||
collate_fn = collate_fn
|
||||
))
|
||||
self.train_batch_size = train_batch_size
|
||||
|
||||
if self.valid_set:
|
||||
self.valid_loader = inf_iterator(DataListLoader(
|
||||
self.valid_set,
|
||||
batch_size = valid_batch_size, #config.train.batch_size,
|
||||
shuffle = False,
|
||||
num_workers = num_workers,
|
||||
pin_memory = pin_memory,
|
||||
#follow_batch = follow_batch,
|
||||
#exclude_keys = exclude_keys,
|
||||
collate_fn = collate_fn
|
||||
))
|
||||
self.valid_batch_size = valid_batch_size
|
||||
else:
|
||||
self.train_loader = inf_iterator(DataLoader(
|
||||
self.train_set,
|
||||
batch_size = train_batch_size, #config.train.batch_size,
|
||||
shuffle = False,
|
||||
num_workers = num_workers,
|
||||
pin_memory = pin_memory,
|
||||
follow_batch = follow_batch,
|
||||
exclude_keys = exclude_keys,
|
||||
collate_fn = collate_fn
|
||||
))
|
||||
self.train_batch_size = train_batch_size
|
||||
|
||||
if self.valid_set:
|
||||
self.valid_loader = inf_iterator(DataLoader(
|
||||
self.valid_set,
|
||||
batch_size = valid_batch_size, #config.train.batch_size,
|
||||
shuffle = False,
|
||||
num_workers = num_workers,
|
||||
pin_memory = pin_memory,
|
||||
follow_batch = follow_batch,
|
||||
exclude_keys = exclude_keys,
|
||||
collate_fn = collate_fn
|
||||
))
|
||||
self.valid_batch_size = valid_batch_size
|
||||
|
||||
self.n_iter_train = self.get_num_iter(self.num_train_data, self.train_batch_size)
|
||||
if self.num_valid_data:
|
||||
self.n_iter_valid = self.get_num_iter(self.num_valid_data, self.valid_batch_size)
|
||||
## config log
|
||||
date = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
|
||||
self.logdir = logdir+'/'+date+'/'
|
||||
verify_dir_exists(self.logdir)
|
||||
open(self.logdir+'model_config.dir','w').write(str(self.model_config))
|
||||
self.with_tb = with_tb
|
||||
if self.with_tb:
|
||||
self.writer = tensorboard.SummaryWriter(self.logdir)
|
||||
log_writer = open(self.logdir+'training.log', 'w')
|
||||
log_writer.write('\n######## {}; batch_size ########\n'.format(self.parameter_number, train_batch_size))
|
||||
log_writer.close()
|
||||
if print_log:
|
||||
print('\n######## {} ########\n'.format(self.parameter_number))
|
||||
for step in range(1, num_step+1):
|
||||
#batch = next(self.train_loader)#.to(self.device)
|
||||
## 计算噪音
|
||||
if self.data_parallel:
|
||||
batch = next(self.train_loader)#.to(self.device)
|
||||
for d in batch:
|
||||
d.cpx_pos = d.cpx_pos + torch.randn_like(d.cpx_pos) * self.pos_noise_std
|
||||
else:
|
||||
batch = next(self.train_loader).to(self.device)
|
||||
#print(batch.cpx_edge_index.shape)
|
||||
cpx_noise = torch.randn_like(batch.cpx_pos) * self.pos_noise_std
|
||||
batch.cpx_pos = batch.cpx_pos + cpx_noise
|
||||
if batch.cpx_edge_index.size(1) > max_edge_num_in_batch:
|
||||
continue
|
||||
out_dict = self._train_step(batch, it=step, print_log=print_log)
|
||||
if (step % valid_per_step == 0 or step == num_step):
|
||||
if self.num_valid_data:
|
||||
val_loss = self.validate(self.n_iter_valid, step, schedule_key=schedule_key, print_log=print_log)
|
||||
#torch.save(self.model.state_dict(), self.logdir+'/model.pth')
|
||||
ckpt_path = self.logdir + '/ckpt/'
|
||||
verify_dir_exists(ckpt_path)
|
||||
torch.save({
|
||||
'config': self.model_config,
|
||||
'model': self.model.state_dict(),
|
||||
'optimizer': self.optimizer.state_dict(),
|
||||
'scheduler': self.scheduler.state_dict(),
|
||||
'iteration': step,
|
||||
}, ckpt_path+'/{}.pt'.format(step))
|
||||
@@ -1,340 +0,0 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch_geometric.nn.pool import knn_graph, radius_graph
|
||||
from torch_geometric.transforms import Compose
|
||||
from torch_geometric.utils.subgraph import subgraph
|
||||
from torch_geometric.nn import knn, radius
|
||||
from torch_geometric.utils.num_nodes import maybe_num_nodes
|
||||
from torch_scatter import scatter_add
|
||||
import numpy as np
|
||||
import random
|
||||
import copy
|
||||
|
||||
|
||||
def count_neighbors(edge_index, symmetry, valence=None, num_nodes=None):
|
||||
assert symmetry == True, 'Only support symmetrical edges.'
|
||||
|
||||
if num_nodes is None:
|
||||
num_nodes = maybe_num_nodes(edge_index) # 如果没有指定配体原子数,根据edge_index来推测
|
||||
|
||||
if valence is None:
|
||||
valence = torch.ones([edge_index.size(1)], device=edge_index.device)
|
||||
valence = valence.view(edge_index.size(1))
|
||||
return scatter_add(valence, index=edge_index[0], dim=0, dim_size=num_nodes).long()
|
||||
|
||||
|
||||
def change_features_of_neigh(ligand_feature_full, new_num_neigh, new_num_valence, ligand_atom_num_bonds, num_atom_type=7):
|
||||
idx_n_neigh = num_atom_type + 1
|
||||
idx_n_valence = idx_n_neigh + 1
|
||||
idx_n_bonds = idx_n_valence + 1
|
||||
ligand_feature_full[:, idx_n_neigh] = new_num_neigh.long()
|
||||
ligand_feature_full[:, idx_n_valence] = new_num_valence.long()
|
||||
ligand_feature_full[:, idx_n_bonds:idx_n_bonds+3] = ligand_atom_num_bonds.long()
|
||||
return ligand_feature_full
|
||||
|
||||
|
||||
def get_rfs_perm(nbh_list, ring_info):
|
||||
num_nodes = len(nbh_list)
|
||||
node0 = random.randint(0, num_nodes-1)
|
||||
queue,order,not_ring_queue,edge_index = [],[],[],[]
|
||||
if (ring_info[node0]>0).sum():
|
||||
queue.append(node0)
|
||||
else:
|
||||
not_ring_queue.append(node0)
|
||||
while queue or not_ring_queue:
|
||||
if queue:
|
||||
v = queue.pop()
|
||||
order.append(v)
|
||||
elif not_ring_queue:
|
||||
v = not_ring_queue.pop()
|
||||
order.append(v)
|
||||
adj_in_ring, adj_not_ring, edge_idx_step = [], [], []
|
||||
for nbh in nbh_list[v]:
|
||||
if (ring_info[nbh]>0).sum():
|
||||
adj_in_ring.append(nbh)
|
||||
else:
|
||||
adj_not_ring.append(nbh)
|
||||
if nbh in order:
|
||||
edge_idx_step.append((v,nbh))
|
||||
edge_idx_step.append((nbh,v))
|
||||
edge_index.append(edge_idx_step)
|
||||
if adj_not_ring:
|
||||
for w in adj_not_ring:
|
||||
if w not in order and w not in not_ring_queue:
|
||||
not_ring_queue.append(w)
|
||||
# Preferential access to atoms in the same ring of w
|
||||
same_ring_pool = []
|
||||
if adj_in_ring:
|
||||
for w in adj_in_ring:
|
||||
if (w not in order and w not in queue):
|
||||
if (ring_info[w] == ring_info[v]).sum()>0:
|
||||
same_ring_pool.append(w)
|
||||
else:
|
||||
queue.append(w)
|
||||
elif w not in order and w in queue:
|
||||
if (ring_info[w] == ring_info[v]).sum()>0:
|
||||
same_ring_pool.append(w)
|
||||
queue.remove(w)
|
||||
elif w not in order and (ring_info[w]>0).sum()>=1:
|
||||
queue.remove(w)
|
||||
queue.append(w)
|
||||
queue += same_ring_pool
|
||||
return torch.LongTensor(order), edge_index
|
||||
|
||||
|
||||
def get_bfs_perm(nbh_list):
|
||||
num_nodes = len(nbh_list)
|
||||
# 每个节点的邻居数量
|
||||
num_neighbors = torch.LongTensor([len(nbh_list[i]) for i in range(num_nodes)])
|
||||
#print(num_neighbors)
|
||||
# 随机选择一个原子作为起始点
|
||||
bfs_queue = [random.randint(0, num_nodes-1)]
|
||||
bfs_perm, edge_index = [], []
|
||||
num_remains = [num_neighbors.clone()]
|
||||
bfs_next_list = {}
|
||||
visited = {bfs_queue[0]}
|
||||
num_nbh_remain = num_neighbors.clone()
|
||||
while len(bfs_queue) > 0:
|
||||
current = bfs_queue.pop(0) # Remove and return item at index (default last)
|
||||
for nbh in nbh_list[current]:
|
||||
num_nbh_remain[nbh] -= 1
|
||||
bfs_perm.append(current)
|
||||
num_remains.append(num_nbh_remain.clone())
|
||||
next_candid, edge_idx_step = [], []
|
||||
for nxt in nbh_list[current]:
|
||||
if nxt in visited: continue
|
||||
next_candid.append(nxt)
|
||||
visited.add(nxt)
|
||||
for adj in nbh_list[nxt]:
|
||||
if adj in bfs_perm:
|
||||
edge_idx_step.append((adj,nxt))
|
||||
edge_idx_step.append((nxt,adj))
|
||||
edge_index.append(edge_idx_step)
|
||||
random.shuffle(next_candid)
|
||||
bfs_queue += next_candid
|
||||
bfs_next_list[current] = copy.copy(bfs_queue)
|
||||
return torch.LongTensor(bfs_perm), edge_index
|
||||
|
||||
|
||||
def mask_node(data, context_idx, masked_idx, num_atom_type=10, y_pos_std=0.05):
|
||||
data.context_idx = context_idx # for change bond index
|
||||
data.masked_idx = masked_idx
|
||||
# masked ligand atom element/feature/pos.
|
||||
data.ligand_masked_element = data.ligand_element[masked_idx]
|
||||
# data.ligand_masked_feature = data.ligand_atom_feature[masked_idx] # For Prediction. these features are chem properties
|
||||
data.ligand_masked_pos = data.ligand_pos[masked_idx]
|
||||
|
||||
# context ligand atom elment/full features/pos. Note: num_neigh and num_valence features should be changed
|
||||
data.ligand_context_element = data.ligand_element[context_idx]
|
||||
data.ligand_context_feature_full = data.ligand_atom_feature_full[context_idx] # For Input
|
||||
data.ligand_context_pos = data.ligand_pos[context_idx]
|
||||
|
||||
# new bond with ligand context atoms
|
||||
if data.ligand_bond_index.size(1) != 0:
|
||||
data.ligand_context_bond_index, data.ligand_context_bond_type = subgraph(
|
||||
context_idx,
|
||||
data.ligand_bond_index,
|
||||
edge_attr = data.ligand_bond_type,
|
||||
relabel_nodes = True,
|
||||
)
|
||||
else:
|
||||
data.ligand_context_bond_index = torch.empty([2, 0], dtype=torch.long)
|
||||
data.ligand_context_bond_type = torch.empty([0], dtype=torch.long)
|
||||
# re-calculate atom features that relate to bond
|
||||
data.ligand_context_num_neighbors = count_neighbors(
|
||||
data.ligand_context_bond_index,
|
||||
symmetry=True,
|
||||
num_nodes = context_idx.size(0),
|
||||
)
|
||||
data.ligand_context_valence = count_neighbors(
|
||||
data.ligand_context_bond_index,
|
||||
symmetry=True,
|
||||
valence=data.ligand_context_bond_type,
|
||||
num_nodes=context_idx.size(0)
|
||||
)
|
||||
data.ligand_context_num_bonds = torch.stack([
|
||||
count_neighbors(
|
||||
data.ligand_context_bond_index,
|
||||
symmetry=True,
|
||||
valence=data.ligand_context_bond_type == i,
|
||||
num_nodes=context_idx.size(0),
|
||||
) for i in [1, 2, 3]
|
||||
], dim = -1)
|
||||
# re-calculate ligand_context_featrure_full
|
||||
data.ligand_context_feature_full = change_features_of_neigh(
|
||||
data.ligand_context_feature_full,
|
||||
data.ligand_context_num_neighbors,
|
||||
data.ligand_context_valence,
|
||||
data.ligand_context_num_bonds,
|
||||
num_atom_type=num_atom_type
|
||||
)
|
||||
if data.ligand_masked_pos.size(0) == 0:
|
||||
data.y_pos = torch.empty([0,3], dtype=torch.float32)
|
||||
else:
|
||||
data.y_pos = data.ligand_masked_pos[0].view(-1,3)
|
||||
#data.y_pos = data.y_pos.repeat(5, 1)
|
||||
data.y_pos += torch.randn_like(data.y_pos) * y_pos_std
|
||||
data.ligand_frontier = (data.ligand_context_num_neighbors < data.ligand_num_neighbors[context_idx])
|
||||
'''new_step_atom_idx = data.masked_idx[0]
|
||||
candidate_focal_idx_in_context = torch.LongTensor(data.ligand_nbh_list[new_step_atom_idx.item()])
|
||||
focal_idx_in_context_mask = (data.context_idx.unsqueeze(1) == candidate_focal_idx_in_context).any(1)
|
||||
data.focal_idx_in_context = torch.nonzero(focal_idx_in_context_mask).view(-1)
|
||||
if data.focal_idx_in_context.size(0) == 0:
|
||||
data.focal_idx_in_context_ = data.focal_idx_in_context
|
||||
else:
|
||||
data.focal_idx_in_context_ = data.focal_idx_in_context[torch.multinomial(torch.ones_like(data.focal_idx_in_context).float(), 1)]
|
||||
data.focal_label = torch.zeros_like(data.context_idx)
|
||||
data.focal_label[data.focal_idx_in_context] = 1
|
||||
data.atom_label = data.ligand_element[data.masked_idx[0]].unsqueeze(0)
|
||||
# get edge label
|
||||
focal_idx_in_ligand = data.context_idx[focal_idx_in_context_mask]
|
||||
edge_type = []
|
||||
for i in focal_idx_in_ligand:
|
||||
edge_type_mask = torch.stack([data.ligand_bond_index[1]==i, data.ligand_bond_index[0]==new_step_atom_idx], dim=1).all(1)
|
||||
edge_type.append(torch.unique(data.ligand_bond_type[edge_type_mask]))
|
||||
data.edge_label = torch.zeros_like(data.context_idx)
|
||||
data.edge_label[data.focal_idx_in_context] = torch.LongTensor(edge_type)
|
||||
data.edge_query_index_0 = torch.zeros_like(data.edge_label)
|
||||
data.edge_query_index_1 = torch.arange(data.edge_label.size(0))'''
|
||||
return data
|
||||
|
||||
|
||||
def make_pos_label(data, num_real_pos=5, num_fake_pos=5, pos_real_std=0.05, pos_fake_std=2.0, k=16):
|
||||
ligand_context_pos = data.ligand_context_pos
|
||||
ligand_masked_pos = data.ligand_masked_pos
|
||||
protein_pos = data.protein_pos
|
||||
|
||||
if ligand_context_pos.size(0) == 0:
|
||||
# fake position
|
||||
fake_mode = protein_pos[data.candidate_focal_label_in_protein]
|
||||
p = np.ones(fake_mode.size(0), dtype=np.float32)/fake_mode.size(0)
|
||||
pos_fake_idx = np.random.choice(np.arange(fake_mode.size(0)), size=num_fake_pos, p=p)
|
||||
pos_fake = fake_mode[pos_fake_idx]
|
||||
pos_fake += torch.randn_like(pos_fake) * pos_fake_std / 2.
|
||||
# real position
|
||||
p = np.ones(ligand_masked_pos.size(0), dtype=np.float32)/ligand_masked_pos.size(0)
|
||||
pos_real_idx = np.random.choice(np.arange(ligand_masked_pos.size(0)), size=num_real_pos, p=p)
|
||||
pos_real = ligand_masked_pos[pos_real_idx]
|
||||
pos_real += torch.randn_like(pos_real) * pos_real_std
|
||||
else:
|
||||
# fake position
|
||||
fake_mode = ligand_context_pos[data.ligand_frontier]
|
||||
p = np.ones(fake_mode.size(0), dtype=np.float32)/fake_mode.size(0)
|
||||
pos_fake_idx = np.random.choice(np.arange(fake_mode.size(0)), size=num_fake_pos, p=p)
|
||||
pos_fake = fake_mode[pos_fake_idx]
|
||||
pos_fake += torch.randn_like(pos_fake) * pos_fake_std / 2.
|
||||
# real position
|
||||
p = np.ones(ligand_masked_pos.size(0), dtype=np.float32)/ligand_masked_pos.size(0)
|
||||
pos_real_idx = np.random.choice(np.arange(ligand_masked_pos.size(0)), size=num_real_pos-1, p=p)
|
||||
pos_real = ligand_masked_pos[pos_real_idx]
|
||||
pos_real += torch.randn_like(pos_real) * pos_real_std
|
||||
pos_real = torch.cat([pos_real, data.y_pos], dim=0)
|
||||
|
||||
data.pos_fake = pos_fake
|
||||
pos_fake_knn_edge_idx = knn(x=data.cpx_pos, y=pos_fake, k=k, num_workers=16)
|
||||
data.pos_fake_knn_edge_idx_0, data.pos_fake_knn_edge_idx_1 = pos_fake_knn_edge_idx
|
||||
|
||||
data.pos_real = pos_real
|
||||
pos_real_knn_edge_idx = knn(x=data.cpx_pos, y=pos_real, k=k, num_workers=16)
|
||||
data.pos_real_knn_edge_idx_0, data.pos_real_knn_edge_idx_1 = pos_real_knn_edge_idx
|
||||
return data
|
||||
|
||||
|
||||
def get_complex_graph(data, len_ligand_ctx, len_compose, num_workers=1, graph_type='knn', knn=16, radius=10.0):
|
||||
if graph_type == 'rad':
|
||||
data.cpx_edge_index = radius_graph(data.cpx_pos, radius, flow='target_to_source', num_workers=num_workers)
|
||||
elif graph_type == 'knn':
|
||||
data.cpx_edge_index = knn_graph(data.cpx_pos, knn, flow='target_to_source', num_workers=num_workers)
|
||||
# compose_knn_edge_index: 蛋白和配体原子组合在一起计算knn图的邻接矩阵
|
||||
id_cpx_edge = data.cpx_edge_index[0, :len_ligand_ctx*knn] * len_compose + data.cpx_edge_index[1, :len_ligand_ctx*knn]
|
||||
# data.cpx_edge_index[0, :len_ligand_ctx*knn] 将knn的edge中属于ligand_ctx的atom的index选出来
|
||||
id_ligand_ctx_edge = data.ligand_context_bond_index[0] * len_compose + data.ligand_context_bond_index[1]
|
||||
idx_edge = [torch.nonzero(id_cpx_edge == id_) for id_ in id_ligand_ctx_edge] # idx_edge是id_ligand_ctx_edge中的元素在id_cpx_edge中的索引
|
||||
# torch.nonzero(id_ligand_ctx_edge.unsqueeze(1) == id_cpx_edge.unsqueeze(0))[:,1]
|
||||
idx_edge = torch.tensor([a.squeeze() if len(a) > 0 else torch.tensor(-1) for a in idx_edge], dtype=torch.long)
|
||||
data.cpx_edge_type = torch.zeros(len(data.cpx_edge_index[0]), dtype=torch.long) # for encoder edge embedding
|
||||
data.cpx_edge_type[idx_edge[idx_edge>=0]] = data.ligand_context_bond_type[idx_edge>=0]
|
||||
data.cpx_edge_feature = torch.cat([
|
||||
torch.ones([len(data.cpx_edge_index[0]), 1], dtype=torch.long),
|
||||
torch.zeros([len(data.cpx_edge_index[0]), 3], dtype=torch.long),
|
||||
], dim=-1)
|
||||
data.cpx_edge_feature[idx_edge[idx_edge>=0]] = F.one_hot(data.ligand_context_bond_type[idx_edge>=0], num_classes=4) # 0 (1,2,3)-onehot
|
||||
return data
|
||||
|
||||
def get_knn_graph(pos, k=16, edge_feat=None, edge_feat_index=None, num_workers=8, graph_type='knn', radius=5.5):
|
||||
if graph_type == 'rad':
|
||||
cpx_edge_index = radius_graph(pos, radius, flow='target_to_source', num_workers=num_workers)
|
||||
if graph_type == 'knn':
|
||||
cpx_edge_index = knn_graph(pos, k, flow='target_to_source', num_workers=num_workers).long()
|
||||
if isinstance(edge_feat, torch.Tensor) and isinstance(edge_feat_index, torch.Tensor):
|
||||
adj_feat_mat = torch.zeros([pos.size(0), pos.size(0)], dtype=torch.long)
|
||||
adj_feat_mat[edge_feat_index[0],edge_feat_index[1]] = edge_feat
|
||||
cpx_edge_type = adj_feat_mat[cpx_edge_index[0],cpx_edge_index[1]]
|
||||
else:
|
||||
cpx_edge_type = None
|
||||
return cpx_edge_index, cpx_edge_type
|
||||
|
||||
def get_complex_graph_(data, knn=16, num_workers=8, graph_type='knn', radius=5.5):
|
||||
edge_feat = torch.cat([data.ligand_context_bond_type, data.protein_bond_type]).long()
|
||||
edge_feat_index = torch.cat(
|
||||
[data.ligand_context_bond_index, data.protein_bond_index+data.ligand_context_pos.size(0)], dim=1
|
||||
).long()
|
||||
knn_edge_index, knn_edge_type = get_knn_graph(
|
||||
data.cpx_pos, k=knn, edge_feat=edge_feat, edge_feat_index=edge_feat_index,
|
||||
graph_type=graph_type, num_workers=num_workers, radius=radius
|
||||
)
|
||||
data.cpx_edge_index = knn_edge_index
|
||||
data.cpx_edge_type = knn_edge_type
|
||||
data.cpx_edge_feature = F.one_hot(knn_edge_type, num_classes=4)
|
||||
return data
|
||||
|
||||
def sample_edge_with_radius(data, r=4.0):
|
||||
y_pos = data.y_pos
|
||||
ligand_context_pos = data.ligand_context_pos
|
||||
context_idx = data.context_idx
|
||||
masked_idx = data.masked_idx
|
||||
ligand_bond_index = data.ligand_bond_index
|
||||
ligand_bond_type = data.ligand_bond_type
|
||||
# select the atoms whose distance < r between pos_query as edge samples
|
||||
edge_index_radius = radius(ligand_context_pos, y_pos, r=r, num_workers=16)
|
||||
# get the labels of edge samples
|
||||
mask = [i in masked_idx[0] and j in context_idx[edge_index_radius[1]] for i,j in zip(*ligand_bond_index)]
|
||||
new_idx_1 = torch.nonzero((ligand_bond_index[:,mask][1].view(-1,1) == context_idx).any(0)).view(-1)
|
||||
real_bond_type_in_edge_index_radius = torch.nonzero((new_idx_1.view(-1,1) == edge_index_radius[1]).any(0)).view(-1)
|
||||
edge_label = torch.zeros(edge_index_radius.size(1), dtype=torch.long)
|
||||
edge_label[real_bond_type_in_edge_index_radius] = ligand_bond_type[mask]
|
||||
data.edge_query_index_0, data.edge_query_index_1 = edge_index_radius
|
||||
data.edge_label = edge_label
|
||||
return data
|
||||
|
||||
def get_tri_edges(edge_index_query, pos_query, idx_ligand, ligand_bond_index, ligand_bond_type):
|
||||
row, col = edge_index_query
|
||||
acc_num_edges = 0
|
||||
index_real_cps_edge_i_list, index_real_cps_edge_j_list = [], [] # index of real-ctx edge (for attention)
|
||||
for node in torch.arange(pos_query.size(0)):
|
||||
num_edges = (row == node).sum()
|
||||
index_edge_i = torch.arange(num_edges, dtype=torch.long, ) + acc_num_edges
|
||||
index_edge_i, index_edge_j = torch.meshgrid(index_edge_i, index_edge_i, indexing=None)
|
||||
index_edge_i, index_edge_j = index_edge_i.flatten(), index_edge_j.flatten()
|
||||
index_real_cps_edge_i_list.append(index_edge_i)
|
||||
index_real_cps_edge_j_list.append(index_edge_j)
|
||||
acc_num_edges += num_edges
|
||||
index_real_cps_edge_i = torch.cat(index_real_cps_edge_i_list, dim=0).to(pos_query.device) # add len(real_compose_edge_index) in the dataloader for batch
|
||||
index_real_cps_edge_j = torch.cat(index_real_cps_edge_j_list, dim=0).to(pos_query.device)
|
||||
|
||||
node_a_cps_tri_edge = col[index_real_cps_edge_i] # the node of tirangle edge for the edge attention (in the compose)
|
||||
node_b_cps_tri_edge = col[index_real_cps_edge_j]
|
||||
n_context = len(idx_ligand)
|
||||
adj_mat = (torch.zeros([n_context, n_context], dtype=torch.long) - torch.eye(n_context, dtype=torch.long))
|
||||
adj_mat = adj_mat.to(ligand_bond_index.device)
|
||||
adj_mat[ligand_bond_index[0], ligand_bond_index[1]] = ligand_bond_type
|
||||
tri_edge_type = adj_mat[node_a_cps_tri_edge, node_b_cps_tri_edge]
|
||||
tri_edge_feat = (tri_edge_type.view([-1, 1]) == torch.tensor([[-1, 0, 1, 2, 3]]).to(tri_edge_type.device)).long()
|
||||
|
||||
index_real_cps_edge_for_atten = torch.stack([
|
||||
index_real_cps_edge_i, index_real_cps_edge_j # plus len(real_compose_edge_index_0) for dataloader batch
|
||||
], dim=0)
|
||||
tri_edge_index = torch.stack([
|
||||
node_a_cps_tri_edge, node_b_cps_tri_edge # plus len(compose_pos) for dataloader batch
|
||||
], dim=0)
|
||||
return index_real_cps_edge_for_atten, tri_edge_index, tri_edge_feat
|
||||
Reference in New Issue
Block a user