Delete pocket_flow/utils/.ipynb_checkpoints directory

This commit is contained in:
Saoge123
2023-06-16 18:05:24 +08:00
committed by GitHub
parent 36617eb71a
commit 1ec5e0a77d
14 changed files with 0 additions and 4054 deletions

View File

@@ -1,763 +0,0 @@
import os
import copy
import numpy as np
from rdkit import Chem, RDConfig
from rdkit.Chem.rdchem import BondType
from rdkit.Chem import ChemicalFeatures
from .residues_base import RESIDUES_TOPO, RESIDUES_TOPO_WITH_H
try:
import pymol
except:
print('we can not compute the atoms on the surface of protein, '\
'because pymol can not be imported')
pass
#from .BaseFeatures import Mol2Graph, RESIDUE_GRAPH_WITHOUT_H, RESIDUE_GRAPH_WITH_H
#from .utils import coordinate_adjusting
#from .Data import LigandData
ATOM_TYPE_WITH_HYBIRD = [
'SP3_C', 'SP2_C', 'SP_C', 'SP3_N', 'SP2_N', 'SP_N', 'SP3_O', 'SP2_O', 'SP3_F', 'SP3_P',
'SP2_P', 'SP3D_P', 'SP3_S', 'SP2_S', 'SP3D_S', 'SP3D2_S', 'SP3_Cl', 'SP3_Br', 'SP3_I'
]
ATOM_MAP = [6, 6, 6, 7, 7, 7, 8, 8, 9, 15, 15, 15, 16, 16, 16, 16, 17, 35, 53]
PT = Chem.GetPeriodicTable()
BACKBONE_SYMBOL = {'N', 'CA', 'C', 'O'}
AMINO_ACID_TYPE = {
'CYS':0, 'GLY':1, 'ALA':2, 'THR':3, 'LYS':4, 'PRO':5, 'VAL':6, 'SER':7, 'ASN':8, 'LEU':9,
'GLN':10, 'MET':11, 'ASP':12, 'TRP':13, 'HIS':14, 'GLU':15, 'ARG':16, 'ILE':17, 'PHE':18,
'TYR':19
}
'''
def coordinate_adjusting(pos, wts):
center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
pos = pos - center_of_mass
I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
I = np.array([[I_xx, I_xy, I_xz],
[I_xy, I_yy, I_yz],
[I_xz, I_yz, I_zz]])
eig_v, eig_m = np.linalg.eigh(I) # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
# faster algorithm that takes advantage of the fact that the matrix is
# symmetric. If you know that your matrix is symmetric, use this function.
am = pos@eig_m # 用本真矢矩阵变换原子坐标,以‘摆正’分子
rotate_around = []
x, y, z = am[0]
if x < 0:
am[..., 0] = am[..., 0] * -1
rotate_around.append(0)
if y < 0:
am[..., 1] = am[..., 1] * -1
rotate_around.append(1)
if z < 0:
am[..., 2] = am[..., 2] * -1
rotate_around.append(2)
return am, eig_m, center_of_mass, rotate_around
'''
def coordinate_adjusting(pos, wts):
center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
pos = pos - center_of_mass
I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
I = np.array([[I_xx, I_xy, I_xz],
[I_xy, I_yy, I_yz],
[I_xz, I_yz, I_zz]])
eig_v, eig_m = np.linalg.eigh(I) # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
# faster algorithm that takes advantage of the fact that the matrix is
# symmetric. If you know that your matrix is symmetric, use this function.
am = pos@eig_m # 用本真矢矩阵变换原子坐标,以‘摆正’分子
return am, eig_m, center_of_mass
class Dict(dict):
__setattr__ = dict.__setitem__
__getattr__ = dict.__getitem__
class Atom(object):
def __init__(self, atom_info):
self.idx = int(atom_info[6:11])
self.name = atom_info[12:16].strip()
self.res_name = atom_info[17:20].strip() # atom_info[17:20].strip()
self.chain = atom_info[21:22].strip()
self.res_idx = int(atom_info[22:26])
self.coord = np.array([float(atom_info[30:38].strip()),
float(atom_info[38:46].strip()),
float(atom_info[46:54].strip())])
self.occupancy = float(atom_info[54:60])
self.temperature_factor = float(atom_info[60:66].strip())
self.seg_id = atom_info[72:76].strip()
self.element = atom_info[76:78].strip()
if self.element == 'SE':
self.element = 'S'
self.name = 'SD'
self.res_name = 'MET'
self.mass = PT.GetAtomicWeight(self.element)
if self.occupancy < 1.0:
self.is_disorder = True
else:
self.is_disorder = False
self.is_surf = atom_info.split()[-1] == 'surf'
@property
def to_string(self):
fmt = '{:6s}{:5d} {:^4s}{:1s}{:3s} {:1s}{:4d}{:1s} {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f} {:<2s}{:2s}' # https://cupnet.net/pdb-format/
out = fmt.format('ATOM', self.idx, self.name, '', self.res_name,
self.chain, self.res_idx,'', self.coord[0],
self.coord[1], self.coord[2], self.occupancy,
self.temperature_factor, self.seg_id, self.element)
return out
@property
def to_dict(self):
return {
'element': PT.GetAtomicNumber(self.element),
'pos': self.coord,
'is_backbone': self.name in BACKBONE_SYMBOL,
'atom_name': self.name,
'atom_to_aa_type': AMINO_ACID_TYPE[self.res_name]
}
def __repr__(self):
info = 'name={}, index={}, res={}, chain={}, is_disorder={}'.format(self.name,
self.idx,
self.res_name+str(self.res_idx),
self.chain,
self.is_disorder)
return '{}({})'.format(self.__class__.__name__, info)
class Residue(object):
def __init__(self, res_info):
self.res_info = res_info
#atoms_ = [Atom(i) for i in res_info]
self.atom_dict = {}
disorder = []
for i in res_info: # 排除disorder原子
atom = Atom(i)
if atom.name in self.atom_dict:
continue
else:
self.atom_dict[atom.name] = atom
disorder.append(atom.is_disorder)
if atom.res_name == 'MSE':
atom.res_name = 'MET' # 把MSE残基改成MET
if True in disorder:
self.is_disorder = True
else:
self.is_disorder = False
self.idx = self.atom_dict[atom.name].res_idx
self.chain = self.atom_dict[atom.name].chain
self.name = self.atom_dict[atom.name].res_name
self.is_perfect = True if len(self.get_heavy_atoms)==len(RESIDUES_TOPO[self.name]) else False
@property
def to_heavy_string(self):
return '\n'.join([a.to_string for a in self.get_heavy_atoms])
@property
def to_string(self):
return '\n'.join([a.to_string for a in self.get_atoms])
@property
def get_coords(self):
return np.array([a.coord for a in self.get_atoms])
@property
def get_atoms(self):
return list(self.atom_dict.values())
@property
def get_heavy_atoms(self):
return [a for a in self.atom_dict.values() if 'H' not in a.element]
@property
def get_heavy_coords(self):
return np.array([a.coord for a in self.get_heavy_atoms])
@property
def center_of_mass(self):
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
@property
def bond_graph(self):
i, j, bt = [], [], []
res_graph = RESIDUES_TOPO[self.name]
atom_names = [i.name for i in self.get_heavy_atoms]
for ix, name in enumerate(atom_names):
for adj in res_graph[name]:
if adj in atom_names:
idx_j = atom_names.index(adj)
i.append(ix)
j.append(idx_j)
bt.append(res_graph[name][adj])
edge_index = np.stack([i,j]).astype(dtype=np.int64)
bt = np.array(bt, dtype=np.int64)
return edge_index, bt
@property
def centroid(self):
return self.get_coords.mean(axis=0)
def __repr__(self):
info = 'name={}, index={}, chain={}, is_disorder={}, is_perfect={}'.format(
self.name, self.idx, self.chain, self.is_disorder, self.is_perfect
)
return '{}({})'.format(self.__class__.__name__, info)
class Chain(object):
def __init__(self, chain_info, ignore_incomplete_res=True, pdb_file=None):
self.pdb_file = pdb_file
self.res_dict = {}
'''if ignore_incomplete_res:
self.residues = {}
for i in chain_info:
res = Residue(chain_info[i])
if len(res.get_heavy_atoms) == len(RESIDUES_TOPO[res.name]):
self.residues[i] = res
else:
self.residues = {i:Residue(chain_info[i]) for i in chain_info}'''
self.residues = {i:Residue(chain_info[i]) for i in chain_info}
#print(self.residues)
self.chain = list(self.residues.values())[0].chain
self.__normalized__ = False
self.center_of_mass_shift = None
self.rotate_matrix = None
self.ignore_incomplete_res =ignore_incomplete_res
def normalize_coord(self):
wts = np.array([i.mass for i in self.get_atoms])
wts = np.expand_dims(wts,axis=1)
pos = self.get_coords
am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
for ix, a in enumerate(self.get_atoms):
a.coord = am[ix]
self.__normalized__ = True
@property
def get_incomplete_residues(self):
return [i for i in self.residues.values() if i.is_perfect==False]
@property
def to_heavy_string(self):
return '\n'.join([res.to_heavy_string for res in self.get_residues])
@property
def to_string(self):
return '\n'.join([res.to_string for res in self.get_residues])
@property
def get_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_atoms)
return atoms
@property
def get_residues(self):
if self.ignore_incomplete_res:
return [i for i in self.residues.values() if i.is_perfect]
else:
return list(self.residues.values())
@property
def get_heavy_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_heavy_atoms)
return atoms
@property
def get_coords(self):
return np.array([i.coord for i in self.get_atoms])
@property
def get_heavy_coords(self):
return np.array([i.coord for i in self.get_heavy_atoms])
@property
def center_of_mass(self):
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
@property
def centroid(self):
return self.get_coords.mean(axis=0)
def compute_surface_atoms(self):
path, filename = os.path.split(self.pdb_file)
chain_file_name = filename.split('.')[0] + '_' + self.chain + '.pdb'
chain_file = path+'/'+chain_file_name
with open(chain_file, 'w') as fw:
fw.write(self.to_heavy_string)
pymol.cmd.load(chain_file)
if path == '':
path = './'
sele = chain_file_name.split('.')[0]
pymol.cmd.remove("({}) and hydro".format(sele))
name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
save_name = path + '/' + sele + '-surface.pdb'
pymol.cmd.save(save_name,((name)))
surf_protein = Protein(save_name, ignore_incomplete_res=False)
surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
for res in self.get_residues:
res_idx = res.idx
if res_idx in surf_res_dict:
for a in surf_res_dict[res_idx].get_heavy_atoms:
res.atom_dict[a.name].is_surf = True
self.has_surf_atom = True
os.remove(save_name)
os.remove(chain_file)
def get_surf_mask(self):
if self.has_surf_atom is False:
self.compute_surface_atoms()
return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
def get_res_by_id(self, res_id):
return self.residues[res_id]
def __repr__(self):
tmp = 'Chain={}, NumResidues={}, NumAtoms={}, NumHeavyAtoms={}'
info = tmp.format(self.chain, len(self.residues), self.get_coords.shape[0], self.get_heavy_coords.shape[0])
return '{}({})'.format(self.__class__.__name__, info)
class Protein(object):
def __init__(self, pdb_file, ignore_incomplete_res=True):
self.ignore_incomplete_res = ignore_incomplete_res
self.name = pdb_file.split('/')[-1].split('.')[0]
self.pdb_file = pdb_file
self.has_surf_atom = False
#self.pdb = [line.strip() for line in open(pdb_file).readlines()]
#atoms = os.popen('grep ATOM {}'.format(pdb_file)).read().strip().split('\n')
with open(pdb_file) as fr:
lines = fr.readlines()
surf_item = lines[0].strip().split()[-1]
if surf_item in {'surf', 'inner'}:
self.has_surf_atom = True
chain_info = {}
for line in lines:
if line.startswith('ATOM'):
line = line.strip()
chain = line[21:22].strip()
res_idx = int(line[22:26].strip())
if chain not in chain_info:
chain_info[chain] = {}
chain_info[chain][res_idx] = [line]
elif res_idx not in chain_info[chain]:
chain_info[chain][res_idx] = [line]
else:
chain_info[chain][res_idx].append(line)
self.chains = {
c:Chain(chain_info[c], ignore_incomplete_res=ignore_incomplete_res, pdb_file=pdb_file) for c in chain_info
}
self.__normalized__ = False
self.center_of_mass_shift = None
self.rotate_matrix = None
def normalize_coord(self):
is_chain_normalized = [c for c in self.chains if self.chains[c].__normalized__==True]
if len(is_chain_normalized) > 0:
chain_normalized = ' '.join(is_chain_normalized)
info = 'Warning: The Chains ({}) have been normalized '\
'independently, this may lead to wrong coordinate '\
'transformation for whole Protein !!!'.format(chain_normalized)
print(info)
wts = np.array([i.mass for i in self.get_atoms])
wts = np.expand_dims(wts,axis=1)
pos = self.get_coords
am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
for ix, a in enumerate(self.get_atoms):
a.coord = am[ix]
self.__normalized__ = True
@property
def get_incomplete_residues(self):
res_list = []
for i in self.chains:
res_list += self.chains[i].get_incomplete_residues
return res_list
@property
def to_heavy_string(self):
return '\n'.join([res.to_heavy_string for res in self.get_residues])
@property
def to_string(self):
return '\n'.join([res.to_string for res in self.get_residues])
@property
def get_residues(self,):
res_list = []
for i in self.chains:
res_list += self.chains[i].get_residues
return res_list
@property
def get_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_atoms)
return atoms
@property
def get_heavy_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_heavy_atoms)
return atoms
@property
def get_coords(self):
return np.array([i.coord for i in self.get_atoms])
@property
def get_heavy_coords(self):
return np.array([i.coord for i in self.get_heavy_atoms])
@property
def center_of_mass(self):
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
@property
def bond_graph(self):
res_list = self.get_residues
bond_index = []
bond_type = []
N_term_list = []
C_term_list = []
cusum = 0
for ix, res in enumerate(res_list):
e_idx, e_type = res.bond_graph
bond_index.append(e_idx + cusum)
bond_type.append(e_type)
N_term_ix = [i.name for i in self.get_heavy_atoms].index('N') + cusum
C_term_ix = [i.name for i in self.get_heavy_atoms].index('C') + cusum
N_term_list.append(N_term_ix)
C_term_list.append(C_term_ix)
cusum += res.get_heavy_coords.shape[0]
if ix != 0:
if res.idx-res_list[ix-1].idx == 1 and res.chain == res_list[ix-1].chain:
bond_idx_between_res = np.array(
[[N_term_ix, C_term_list[ix-1]],[C_term_list[ix-1],N_term_ix]],
dtype=np.long
)
bond_index.append(bond_idx_between_res)
bond_type_between_res = np.array([1,1], dtype=np.long)
bond_type.append(bond_type_between_res)
bond_index = np.concatenate(bond_index, axis=1)
bond_type = np.concatenate(bond_type)
return bond_index, bond_type
@property
def centroid(self):
return self.get_coords.mean(axis=0)
def get_chain(self, chain_id):
return self.chains[chain_id]
def get_res_by_id(self, chain_id, res_id):
return self.chains[chain_id].get_res_by_id(res_id)
def get_atom_dict(self, removeHs=True, get_surf=False):
atom_dict = {
'element': [],
'pos': [],
'is_backbone': [],
'atom_name': [],
'atom_to_aa_type': []
}
for a in self.get_atoms:
if a.element == 'H' and removeHs:
continue
atom_dict['element'].append(a.to_dict['element'])
atom_dict['pos'].append(a.to_dict['pos'])
atom_dict['is_backbone'].append(a.to_dict['is_backbone'])
atom_dict['atom_name'].append(a.to_dict['atom_name'])
atom_dict['atom_to_aa_type'].append(a.to_dict['atom_to_aa_type'])
atom_dict['element'] = np.array(atom_dict['element'], dtype=np.long)
atom_dict['pos'] = np.array(atom_dict['pos'], dtype=np.float32)
atom_dict['is_backbone'] = np.array(atom_dict['is_backbone'], dtype=np.bool)
#atom_dict['atom_name'] = atom_dict['atom_name']
if get_surf:
atom_dict['surface_mask'] = np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)#self.get_surf_mask()
atom_dict['atom_to_aa_type'] = np.array(atom_dict['atom_to_aa_type'], dtype=np.long)
atom_dict['molecule_name'] = None
protein_bond_index, protein_bond_type = self.bond_graph
atom_dict['bond_index'] = protein_bond_index
atom_dict['bond_type'] = protein_bond_type
atom_dict['filename'] = self.pdb_file
return atom_dict
def get_backbone_dict(self, removeHs=True):
atom_dict = self.get_atom_dict(removeHs=removeHs)
backbone_dict = {}
backbone_dict['element'] = atom_dict['element'][atom_dict['is_backbone']]
backbone_dict['pos'] = atom_dict['pos'][atom_dict['is_backbone']]
backbone_dict['is_backbone'] = np.ones(atom_dict['is_backbone'].sum(), dtype=np.bool)
backbone_dict['atom_name'] = np.array(atom_dict['atom_name'])[atom_dict['is_backbone']].tolist()
backbone_dict['atom_to_aa_type'] = atom_dict['atom_to_aa_type'][atom_dict['is_backbone']]
backbone_dict['molecule_name'] = atom_dict['molecule_name']
atom_dict['bond_index'] = np.empty([0,2], dtype=np.long)
atom_dict['bond_type'] = np.empty(0, dtype=np.long)
atom_dict['filename'] = self.pdb_file
return backbone_dict
@property
def get_backbone(self):
atoms = []
for res in self.get_residues:
bkb = [a for a in res.get_atoms if a.name in BACKBONE_SYMBOL]
atoms += bkb
return atoms
def compute_surface_atoms(self):
"""
If the pdb_file is a pocket_file, the surface atoms is not correct!
"""
pymol.cmd.load(self.pdb_file)
path, filename = os.path.split(self.pdb_file)
if path == '':
path = './'
sele = filename.split('.')[0]
pymol.cmd.remove("({}) and hydro".format(sele))
name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
save_name = path + '/' + sele + '-surface.pdb'
pymol.cmd.save(save_name,((name)))
surf_protein = Protein(save_name, ignore_incomplete_res=False)
surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
for res in self.get_residues:
res_idx = res.idx
if res_idx in surf_res_dict:
for a in surf_res_dict[res_idx].get_heavy_atoms:
res.atom_dict[a.name].is_surf = True
self.has_surf_atom = True
os.remove(save_name)
def get_surf_mask(self):
if self.has_surf_atom is False:
self.compute_surface_atoms()
return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
@staticmethod
def empty_dict():
empty_pocket_dict = {}
empty_pocket_dict = Dict(empty_pocket_dict)
empty_pocket_dict.element = np.empty(0, dtype=np.int64)
empty_pocket_dict.pos = np.empty([0,3], dtype=np.float32)
empty_pocket_dict.is_backbone = np.empty(0, dtype=bool)
empty_pocket_dict.atom_name = []
empty_pocket_dict.atom_to_aa_type = np.empty(0, dtype=np.int64)
empty_pocket_dict.molecule_name = None
empty_pocket_dict.bond_index = np.empty([2,0], dtype=np.int64)
empty_pocket_dict.bond_type = np.empty(0, dtype=np.int64)
empty_pocket_dict.filename = None
return empty_pocket_dict
def __repr__(self):
num_res = 0
num_atom = 0
for i in self.chains:
res_list = list(self.chains[i].residues.values())
num_res += len(res_list)
for i in res_list:
num_atom += len(i.get_heavy_atoms)
num_incomp = len(self.get_incomplete_residues)
tmp = 'Name={}, NumChains={}, NumResidues={}, NumHeavyAtoms={}, NumIncompleteRes={}'
info = tmp.format(
self.name, len(self.chains), num_res, num_atom, num_incomp
)
return '{}({})'.format(self.__class__.__name__, info)
####################################################################################################################
ATOM_FAMILIES = ['Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe', 'NegIonizable', 'PosIonizable', 'ZnBinder']
ATOM_FAMILIES_ID = {s: i for i, s in enumerate(ATOM_FAMILIES)}
BOND_TYPES = {t: i for i, t in enumerate(BondType.names.values())}
BOND_NAMES = {i: t for i, t in enumerate(BondType.names.keys())}
def is_in_ring(mol):
d = {a:np.array([], dtype=np.int64) for a in range(len(mol.GetAtoms()))}
rings = Chem.GetSymmSSSR(mol)
for a in d:
for r_idx, ring in enumerate(rings):
if a in ring:
d[a] = np.append(d[a], r_idx+1)
else:
d[a] = np.append(d[a], -a)
return d
def parse_sdf_to_dict(mol_file):
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
Chem.Kekulize(rdmol)
ring_info = is_in_ring(rdmol)
conformer = rdmol.GetConformer()
feat_mat = np.zeros([rdmol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
for feat in factory.GetFeaturesForMol(rdmol):
feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
element, pos, atom_mass = [], [], []
for a in rdmol.GetAtoms():
element.append(a.GetAtomicNum())
pos.append(conformer.GetAtomPosition(a.GetIdx()))
atom_mass.append(a.GetMass())
element = np.array(element, dtype=np.int64)
pos = np.array(pos, dtype=np.float32)
atom_mass = np.array(atom_mass, np.float32)
center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
edge_index, edge_type = [], []
for b in rdmol.GetBonds():
row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
edge_index.extend([row, col])
edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
edge_index = np.array(edge_index)
edge_index_perm = edge_index[:,0].argsort()
edge_index = edge_index[edge_index_perm].T
edge_type = np.array(edge_type)[edge_index_perm]
return {'element': element,
'pos': pos,
'bond_index': edge_index,
'bond_type': edge_type,
'center_of_mass': center_of_mass,
'atom_feature': feat_mat,
'ring_info': ring_info,
'filename':mol_file
}
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
class Ligand(object):
def __init__(self, mol_file, removeHs=True, sanitize=True):
if isinstance(mol_file, Chem.rdchem.Mol):
mol = mol_file
self.name = None
self.lig_file = None
else:
mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=sanitize)
if mol is None:
mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=False)
mol.UpdatePropertyCache(strict=False)
Chem.SanitizeMol(
mol, ## 如果报错可以尝试Chem.rdmolops.SanitizeFlags.SANITIZE_FINDRADICALS
Chem.SanitizeFlags.SANITIZE_FINDRADICALS|\
Chem.SanitizeFlags.SANITIZE_KEKULIZE|\
Chem.SanitizeFlags.SANITIZE_SETAROMATICITY| \
Chem.SanitizeFlags.SANITIZE_SETCONJUGATION| \
Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION| \
Chem.SanitizeFlags.SANITIZE_SYMMRINGS,
catchErrors=True
)
self.name = mol_file.split('/')[-1].split('.')[0]
self.lig_file = mol_file
Chem.Kekulize(mol)
self.mol = mol
#self.self_loop = self_loop
self.num_atoms = len(self.mol.GetAtoms())
self.normalized_coords = None
def normalize_pos(self, shift_vector, rotate_matrix):
conformer = self.mol.GetConformer()
coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in self.mol.GetAtoms()])
coords = (coords - shift_vector)@rotate_matrix
for ix, pos in enumerate(coords):
conformer.SetAtomPosition(ix, pos)
self.normalized_coords = coords
def mol_block(self):
return Chem.MolToMolBlock(self.mol)
def to_dict(self):
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
#rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
ring_info = is_in_ring(self.mol)
conformer = self.mol.GetConformer()
feat_mat = np.zeros([self.mol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
for feat in factory.GetFeaturesForMol(self.mol):
feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
element, pos, atom_mass = [], [], []
for a in self.mol.GetAtoms():
element.append(a.GetAtomicNum())
pos.append(conformer.GetAtomPosition(a.GetIdx()))
atom_mass.append(a.GetMass())
element = np.array(element, dtype=np.int64)
pos = np.array(pos, dtype=np.float32)
atom_mass = np.array(atom_mass, np.float32)
center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
edge_index, edge_type = [], []
for b in self.mol.GetBonds():
row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
edge_index.extend([row, col])
edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
edge_index = np.array(edge_index)
edge_index_perm = edge_index[:,0].argsort()
edge_index = edge_index[edge_index_perm].T
edge_type = np.array(edge_type)[edge_index_perm]
return {'element': element,
'pos': pos,
'bond_index': edge_index,
'bond_type': edge_type,
'center_of_mass': center_of_mass,
'atom_feature': feat_mat,
'ring_info': ring_info,
'filename':self.lig_file
}
@staticmethod
def empty_dict():
empty_ligand_dict = {}
empty_ligand_dict = Dict(empty_ligand_dict)
empty_ligand_dict.element = np.empty(0, dtype=np.int64)
empty_ligand_dict.pos = np.empty([0,3], dtype=np.float32)
empty_ligand_dict.bond_index = np.empty([2,0], dtype=np.int64)
empty_ligand_dict.bond_type = np.empty(0, dtype=np.int64)
empty_ligand_dict.center_of_mass = np.empty([0,3], dtype=np.float32)
empty_ligand_dict.atom_feature = np.empty([0,8], dtype=np.float32)
empty_ligand_dict.ring_info = {}
empty_ligand_dict.filename = None
return empty_ligand_dict
def __repr__(self):
tmp = 'Name={}, NumAtoms={}'
info = tmp.format(self.name, self.num_atoms)
return '{}({})'.format(self.__class__.__name__, info)

View File

@@ -1,13 +0,0 @@
from .data import *
from .generate_utils import *
from .ParseFile import *
from .train import *
from .transform_utils import *
from .transform import *
from .load_dataset import *
from .residues_base import *
from .process_raw import *
from .metrics import *
from .mol_filter import *
from . import sascorer
from .draw import *

View File

@@ -1,133 +0,0 @@
from rdkit import Chem
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from tqdm.auto import tqdm
from multiprocessing import Pool
#from pocket_flow.utils import CrossDocked2020
from matplotlib.collections import PathCollection
from matplotlib.legend_handler import HandlerLine2D
BOND_SYMBOL = {'SINGLE':'-', 'DOUBLE':'=', 'TRIPLE':'#', 'AROMATIC':'@'}
def compute_bond_length(mol, atomatic=True):
#Chem.Kekulize(mol)
conformer = mol.GetConformer()
coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in mol.GetAtoms()])
bond_dict = {}
for b in mol.GetBonds():
bond_type = b.GetBondType().__str__()
if b.GetIsAromatic() and atomatic:
bond_type = 'AROMATIC'
bond_symbol = BOND_SYMBOL[bond_type]
a_ix_start = b.GetBeginAtomIdx()
a_ix_end = b.GetEndAtomIdx()
bond_lenth = np.linalg.norm(coords[a_ix_start]-coords[a_ix_end])
#string_bond_lenth = '{:.3f}'.format(bond_lenth)
symbol_start = b.GetBeginAtom().GetSymbol()
symbol_end = b.GetEndAtom().GetSymbol()
bond_key = bond_symbol.join(sorted([symbol_start, symbol_end]))
if bond_key not in bond_dict:
bond_dict[bond_key] = [bond_lenth]
else:
bond_dict[bond_key].append(bond_lenth)
return bond_dict
def bond_statistic(inputs, from_file=True, interval=5000, n_p=16, sanitize=True):
BOND_DICT = {}
for idx in tqdm(range(0, len(inputs), interval)):
if idx+interval >= len(inputs):
raw_files = inputs[idx:]
else:
raw_files = inputs[idx:idx+interval]
if from_file:
mol_list = []
for sdf in raw_files:
mol = Chem.MolFromMolFile(sdf[1], sanitize=True)
if mol:
mol_list.append(mol)
else:
mol_list = raw_files
pool = Pool(processes=n_p)
dict_list = pool.map(compute_bond_length, mol_list)
for d in dict_list:
for k in d:
if k in BOND_DICT:
BOND_DICT[k] += d[k]
else:
BOND_DICT[k] = d[k]
return BOND_DICT
'''
import pickle
BOND_DICT = bond_statistic(cs2020.index, from_file=True, interval=5000, n_p=16)
with open('BondLenthStatistic.pkl','wb') as fw:
pickle.dump(BOND_DICT, fw)
del BOND_DICT
with open('BondLenthStatistic.pkl', 'rb') as fr:
BOND_DICT = pickle.load(fr)
'''
class VizBondDensity(object):
def __init__(self, data_bond_dict, gen_bond_dict, linewidth=0.1, color_data="#BC5F6A", color_gen="#19B3B1",
transparency=0.2, legend_size=25, axis_label_size=30, tick_size=25, is_fill=True,
figsize=(14, 9)):
self.data_bond_dict = data_bond_dict
self.gen_bond_dict = gen_bond_dict
self.linewidth = linewidth
self.color_data = color_data
self.color_gen = color_gen
self.transparency = transparency
self.legend_size = legend_size
self.axis_label_size = axis_label_size
self.tick_size = tick_size
self.is_fill = is_fill
self.figsize = figsize
def draw(self, bond_type, save_path=None, dpi=300):
#bond_type = 'O=S'
density = gaussian_kde(np.array(self.data_bond_dict[bond_type]))
density.covariance_factor = lambda : .25
density._compute_covariance()
density_gen = gaussian_kde(np.array(self.gen_bond_dict[bond_type]))
density_gen.covariance_factor = lambda : .25
density_gen._compute_covariance()
# Create a vector of 200 values going from 0 to 8:
xs = np.linspace(1, 2, 200)
# Set the figure size
plt.figure(figsize=self.figsize)
#设置图片的右边框和上边框为不显示
ax=plt.gca() #gca:get current axis得到当前轴
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
# 开始画图
plt.plot(xs, density(xs), color=self.color_data, alpha=self.transparency, linewidth=self.linewidth)
if self.is_fill:
plt.fill_between(xs, density(xs), color=self.color_data, alpha=self.transparency, label='Dataset')
plt.plot(xs, density_gen(xs), color=self.color_gen, alpha=self.transparency, linewidth=self.linewidth)
if self.is_fill:
plt.fill_between(xs, density_gen(xs), color=self.color_gen, alpha=self.transparency, label='Generate')
plt.legend(prop={'size': self.legend_size})#framealpha=1, handler_map={plt.Line2D: HandlerLine2D(update_func=updateline)},
#frameon=True, fontsize=20)
plt.yticks(size=self.tick_size) # fontproperties = 'Times New Roman',
plt.xticks(size=self.tick_size) # fontproperties = 'Times New Roman',
plt.xlabel('Bond Length (Å)', fontdict={'size':self.axis_label_size}) # 'family': Arial
plt.ylabel('Density', fontdict={'size':self.axis_label_size})
if save_path:
plt.savefig(save_path + '/' + '{}.png'.format(bond_type), dpi=dpi)
#plt.show()
return plt

View File

@@ -1,183 +0,0 @@
import os
import subprocess
import random
import string
from easydict import EasyDict
from rdkit import Chem
from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
from .reconstruct import reconstruct_from_generated
def get_random_id(length=30):
letters = string.ascii_lowercase
return ''.join(random.choice(letters) for i in range(length))
def load_pdb(path):
with open(path, 'r') as f:
return f.read()
def parse_qvina_outputs(docked_sdf_path):
suppl = Chem.SDMolSupplier(docked_sdf_path)
results = []
for i, mol in enumerate(suppl):
if mol is None:
continue
line = mol.GetProp('REMARK').splitlines()[0].split()[2:]
results.append(EasyDict({
'rdmol': mol,
'mode_id': i,
'affinity': float(line[0]),
'rmsd_lb': float(line[1]),
'rmsd_ub': float(line[2]),
}))
return results
class BaseDockingTask(object):
def __init__(self, pdb_block, ligand_rdmol):
super().__init__()
self.pdb_block = pdb_block
self.ligand_rdmol = ligand_rdmol
def run(self):
raise NotImplementedError()
def get_results(self):
raise NotImplementedError()
class QVinaDockingTask(BaseDockingTask):
@classmethod
def from_generated_data(cls, data, protein_root='./data/crossdocked', **kwargs):
protein_fn = os.path.join(
os.path.dirname(data.ligand_filename),
os.path.basename(data.ligand_filename)[:10] + '.pdb'
)
protein_path = os.path.join(protein_root, protein_fn)
with open(protein_path, 'r') as f:
pdb_block = f.read()
ligand_rdmol = reconstruct_from_generated(data)
return cls(pdb_block, ligand_rdmol, **kwargs)
@classmethod
def from_original_data(cls, data, ligand_root='./data/crossdocked_pocket10', protein_root='./data/crossdocked', **kwargs):
protein_fn = os.path.join(
os.path.dirname(data.ligand_filename),
os.path.basename(data.ligand_filename)[:10] + '.pdb'
)
protein_path = os.path.join(protein_root, protein_fn)
with open(protein_path, 'r') as f:
pdb_block = f.read()
ligand_path = os.path.join(ligand_root, data.ligand_filename)
ligand_rdmol = next(iter(Chem.SDMolSupplier(ligand_path)))
return cls(pdb_block, ligand_rdmol, **kwargs)
def __init__(self, pdb_block, ligand_rdmol, conda_env='adt', tmp_dir='./tmp', use_uff=True, center=None):
super().__init__(pdb_block, ligand_rdmol)
self.conda_env = conda_env
self.tmp_dir = os.path.realpath(tmp_dir)
os.makedirs(tmp_dir, exist_ok=True)
self.task_id = get_random_id()
self.receptor_id = self.task_id + '_receptor'
self.ligand_id = self.task_id + '_ligand'
self.receptor_path = os.path.join(self.tmp_dir, self.receptor_id + '.pdb')
self.ligand_path = os.path.join(self.tmp_dir, self.ligand_id + '.sdf')
with open(self.receptor_path, 'w') as f:
f.write(pdb_block)
ligand_rdmol = Chem.AddHs(ligand_rdmol, addCoords=True)
if use_uff:
UFFOptimizeMolecule(ligand_rdmol)
sdf_writer = Chem.SDWriter(self.ligand_path)
sdf_writer.write(ligand_rdmol)
sdf_writer.close()
self.ligand_rdmol = ligand_rdmol
pos = ligand_rdmol.GetConformer(0).GetPositions()
if center is None:
self.center = (pos.max(0) + pos.min(0)) / 2
else:
self.center = center
self.proc = None
self.results = None
self.output = None
self.docked_sdf_path = None
def run(self, exhaustiveness=16):
commands = """
eval "$(conda shell.bash hook)"
conda activate {env}
cd {tmp}
# Prepare receptor (PDB->PDBQT)
prepare_receptor4.py -r {receptor_id}.pdb
# Prepare ligand
obabel {ligand_id}.sdf -O {ligand_id}.pdbqt
qvina2.1 \
--receptor {receptor_id}.pdbqt \
--ligand {ligand_id}.pdbqt \
--center_x {center_x:.4f} \
--center_y {center_y:.4f} \
--center_z {center_z:.4f} \
--size_x 20 --size_y 20 --size_z 20 \
--exhaustiveness {exhaust}
obabel {ligand_id}_out.pdbqt -O{ligand_id}_out.sdf -h
""".format(
receptor_id = self.receptor_id,
ligand_id = self.ligand_id,
env = self.conda_env,
tmp = self.tmp_dir,
exhaust = exhaustiveness,
center_x = self.center[0],
center_y = self.center[1],
center_z = self.center[2],
)
self.docked_sdf_path = os.path.join(self.tmp_dir, '%s_out.sdf' % self.ligand_id)
self.proc = subprocess.Popen(
'/bin/bash',
shell=False,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
self.proc.stdin.write(commands.encode('utf-8'))
self.proc.stdin.close()
# return commands
def run_sync(self):
self.run()
while self.get_results() is None:
pass
results = self.get_results()
print('Best affinity:', results[0]['affinity'])
return results
def get_results(self):
if self.proc is None: # Not started
return None
elif self.proc.poll() is None: # In progress
return None
else:
if self.output is None:
self.output = self.proc.stdout.readlines()
try:
self.results = parse_qvina_outputs(self.docked_sdf_path)
except:
print('[Error] Vina output error: %s' % self.docked_sdf_path)
return []
return self.results

View File

@@ -1,147 +0,0 @@
import os
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from .train import verify_dir_exists
def draw_docked_mol_list(mol_list, molsPerRow=5, subImgSize=(300,300)):
opts = DrawingOptions()
opts.atomLabelFontSize = 30
opts.bondLineWidth = 1.5
opts.colorBonds = False
legends = []
for m in mol_list:
AllChem.Compute2DCoords(m)
docking_score = 'Docking Score: {:.3f}'.format(float(m.GetProp('r_i_docking_score')))
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(m))
name = m.GetProp('_Name')
legend = name + '\n' + mw + '\n' + docking_score
legends.append(legend)
img = Draw.MolsToGridImage(
mol_list,
molsPerRow=molsPerRow,
subImgSize=subImgSize,
legends=legends,
returnPNG=False
)
return img
'''
def verify_dir_exists(dirname):
if os.path.isdir(os.path.dirname(dirname)) == False:
#if os.path.isdir(dirname) == False:
os.makedirs(os.path.dirname(dirname))'''
def HighlightAtomByWeights(mol, # rdkit.Molecule的instance
#weights, # 注意力系数注意weights的顺序应该与rdkit.Molecule原子idx的顺序一致
save=None,
size=(300,300),
colors=['#FFFFFF','#FF0000'], # 颜色变化,可以定义多个颜色
bondLineWidth=5, # 化学键显示的宽度
FontSize=3,
fixedBondLength=50, # 化学键显示的长度
legend=None,
legendFontSize=5,
elemColor=False,
withIsomeric=True): # 是否按元素给原子着色
draw = rdMolDraw2D.MolDraw2DCairo(size[0], size[1])
option = rdMolDraw2D.MolDrawOptions()
option.bondLineWidth = bondLineWidth
option.fixedBondLength = fixedBondLength
option.setHighlightColour((0.95,0.7,0.95))
option.baseFontSize = FontSize
option.legendFontSize = legendFontSize
if elemColor == False:
option.updateAtomPalette({k: (0, 0, 0) for k in DrawingOptions.elemDict.keys()})
draw.SetDrawOptions(option)
if withIsomeric:
AllChem.Compute2DCoords(mol)
else:
smi = Chem.MolToSmiles(mol, isomericSmiles=False)
mol = Chem.MolFromSmiles(smi)
rdMolDraw2D.PrepareAndDrawMolecule(draw, mol, legend=legend)
draw.FinishDrawing()
if save:
if '.png' in save:
n = 0
path = '/'.join(save.split('/')[0:-1])
while os.path.exists(save):
n += 1
name = save.split('/')[-1].split('.')[0]
name = name + '_' + str(n) + '.png'
save = path + '/' + name
else:
draw.WriteDrawingText(save)
else:
return draw
'''
supp = list(iter(Chem.SDMolSupplier('dockpose-idscore2500-first50.sdf', removeHs=True, sanitize=1)))
save_path = './img_docking_results/idscore_filtered/'
verify_dir_exists(save_path)
for ix,mol in enumerate(supp[:50]):
docking_score = 'Docking Score: {:.3f}'.format(float(mol.GetProp('r_i_docking_score')))
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(mol))
legend = mw + '\n' + docking_score
name = save_path + '/' + 'No-' + str(ix) + '-' + mol.GetProp('_Name') + '.png'
HighlightAtomByWeights(mol, save=name, bondLineWidth=2, FontSize=1, elemColor=True, legend=legend, legendFontSize=100)
'''
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
def CombineImages(img_file_list, col_num=7, save_dir='./', title='image', img_size=None):
num_img = len(img_file_list)
num_row = num_img//col_num+1 if num_img%col_num != 0 else num_img//col_num
if img_size is None:
img_size = Image.open(img_file_list[0]).size
#print((img_size[1]*col_num, img_size[0]*num_row))
toImage = Image.new('RGB', (img_size[1]*col_num, img_size[0]*num_row), color=(255,255,255))
#print(toImage.size)
'''for y in range(1, num_row+1):
for x in range(1, num_per_row):'''
x_cusum = 0
y_cusum = 0
num_has_paste = 0
for img_file in img_file_list:
img = Image.open(img_file)
#print((x_cusum, y_cusum))
toImage.paste(img, (x_cusum, y_cusum))
num_has_paste += 1
#print(num_has_paste)
if num_has_paste%col_num==0:
x_cusum = 0
y_cusum += img_size[1]
else:
x_cusum += img_size[0]
#toImage.save("merged.png")
plt.xticks([])
plt.yticks([])
plt.axis('off') # 取消所有边框,坐标轴
plt.imshow(toImage)
#plt.title(title, y=0.02) # title默认设置在图像上方通过设置y改变位置
#plt.show()
verify_dir_exists(save_dir)
toImage.save(save_dir+'/'+title+'.png')
#plt.savefig(fname=save_dir+'/'+title)
plt.clf() # 清除当前图形。如果不清除当使用循环大量作图机器会为plt分配越来越多的内存速度会逐渐变慢。
'''
import glob
l = sorted([(int(i.split('/')[-1].split('-')[1]), i) for i in glob.glob('img_docking_results/simlarity_filtered/*.png')], key=lambda x:x[0])
CombineImages([i[1] for i in l])
'''

View File

@@ -1,482 +0,0 @@
import torch
import torch.nn.functional as F
from rdkit import Chem, Geometry
from rdkit.Chem import Descriptors
import numpy as np
import copy
import itertools
max_valence_dict = {
6:torch.LongTensor([4]), 7:torch.LongTensor([3]), 8:torch.LongTensor([2]),
9:torch.LongTensor([1]), 15:torch.LongTensor([5]), 16:torch.LongTensor([6]),
17:torch.LongTensor([1]), 35:torch.LongTensor([1]), 53:torch.LongTensor([1]),
1:torch.LongTensor([1])
}
def add_ligand_atom_to_data(
old_data, pos, element, bond_index, bond_type, type_map=[1,6,7,8,9,15,16,17,35,53],
max_valence_dict=max_valence_dict):
data = old_data.clone()
if 'max_atom_valence' not in data.__dict__['_store']:
data.max_atom_valence = torch.empty(0, dtype=torch.long)
# add position of new atom to context
data.ligand_context_pos = torch.cat([
data.ligand_context_pos, pos.view(1, 3).to(data.ligand_context_pos)
], dim=0)
# add feature of new atom to context
data.ligand_context_feature_full = torch.cat([
data.ligand_context_feature_full,
torch.cat([
F.one_hot(element.view(1), len(type_map)).to(data.ligand_context_feature_full), # (1, num_elements)
torch.tensor([[1, 0, 0]]).to(data.ligand_context_feature_full), # is_mol_atom, num_neigh (placeholder), valence (placeholder)
torch.tensor([[0, 0, 0]]).to(data.ligand_context_feature_full) # num_of_bonds 1, 2, 3(placeholder)
], dim=1)
], dim=0)
data.context_idx = torch.arange(data.context_idx.size(0)+1)
#
idx_num_neigh = len(type_map) + 1
idx_valence = idx_num_neigh + 1
idx_num_of_bonds = idx_valence + 1
# add type of new atom to context
element = torch.LongTensor([type_map[element.item()]])
data.ligand_context_element = torch.cat([
data.ligand_context_element, element.view(1).to(data.ligand_context_element)
])
max_new_atom_valence = max_valence_dict[element.item()].to(data.max_atom_valence)
data.max_atom_valence = torch.cat([data.max_atom_valence, max_new_atom_valence])
# change the feature of new atom to context according to ligand context
if len(bond_type) != 0:
bond_index, bond_type = remove_triangle(
pos, data.ligand_context_pos, data.ligand_context_bond_index, data.ligand_context_bond_type,
bond_index, bond_type)
bond_index[0, :] = len(data.ligand_context_pos) - 1
'''bond_type = check_double_bond(
data.ligand_context_bond_index,
data.ligand_context_bond_type,
bond_index,
bond_type
)'''
bond_type = check_valence_is_2(
bond_index, bond_type,
data.ligand_context_element, data.ligand_context_valence
)
bond_vec = data.ligand_context_pos[bond_index[0]] - data.ligand_context_pos[bond_index[1]]
bond_lengths = torch.norm(bond_vec, dim=-1, p=2)
if (bond_lengths > 3).any():
print(bond_lengths)
bond_index_all = torch.cat([bond_index, torch.stack([bond_index[1, :], bond_index[0, :]], dim=0)], dim=1)
bond_type_all = torch.cat([bond_type, bond_type], dim=0)
data.ligand_context_bond_index = torch.cat([
data.ligand_context_bond_index, bond_index_all.to(data.ligand_context_bond_index)
], dim=1)
data.ligand_context_bond_type = torch.cat([
data.ligand_context_bond_type,
bond_type_all
])
# modify atom features related to bonds
# previous atom
data.ligand_context_feature_full[bond_index[1, :], idx_num_neigh] += 1 # num of neigh of previous nodes
data.ligand_context_feature_full[bond_index[1, :], idx_valence] += bond_type # valence of previous nodes
data.ligand_context_feature_full[bond_index[1, :], idx_num_of_bonds + bond_type - 1] += 1 # num of bonds of
# the new atom
data.ligand_context_feature_full[-1, idx_num_neigh] += len(bond_index[1]) # num of neigh of last node
data.ligand_context_feature_full[-1, idx_valence] += torch.sum(bond_type) # valence of last node
for bond in [1, 2, 3]:
data.ligand_context_feature_full[-1, idx_num_of_bonds + bond - 1] += (bond_type == bond).sum() # num of bonds of last node
data.ligand_context_valence = data.ligand_context_feature_full[:,idx_valence]
del old_data
return data
def data2mol(data, raise_error=True, sanitize=True):
element = data.ligand_context_element.clone().cpu().tolist()
bond_index = data.ligand_context_bond_index.clone().cpu().tolist()
bond_type = data.ligand_context_bond_type.clone().cpu().tolist()
pos = data.ligand_context_pos.clone().cpu().tolist()
n_atoms = len(pos)
rd_mol = Chem.RWMol()
rd_conf = Chem.Conformer(n_atoms)
# add atoms and coordinates
for i, atom in enumerate(element):
rd_atom = Chem.Atom(atom)
rd_mol.AddAtom(rd_atom)
rd_coords = Geometry.Point3D(*pos[i])
rd_conf.SetAtomPosition(i, rd_coords)
rd_mol.AddConformer(rd_conf)
# add atoms and coordinates
# add atoms and coordinates
for i, type_this in enumerate(bond_type):
node_i, node_j = bond_index[0][i], bond_index[1][i]
if node_i < node_j:
if type_this == 1:
rd_mol.AddBond(node_i, node_j, Chem.BondType.SINGLE)
elif type_this == 2:
rd_mol.AddBond(node_i, node_j, Chem.BondType.DOUBLE)
elif type_this == 3:
rd_mol.AddBond(node_i, node_j, Chem.BondType.TRIPLE)
elif type_this == 12:
rd_mol.AddBond(node_i, node_j, Chem.BondType.AROMATIC)
else:
raise Exception('unknown bond order {}'.format(type_this))
# modify
try:
rd_mol = modify_submol(rd_mol)
except:
if raise_error:
raise MolReconsError()
else:
print('MolReconsError')
# check valid
rd_mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(rd_mol))
if rd_mol_check is None:
if raise_error:
raise MolReconsError()
else:
print('MolReconsError')
rd_mol = rd_mol.GetMol()
if 12 in bond_type: # mol may directlu come from ture mols and contains aromatic bonds
Chem.Kekulize(rd_mol, clearAromaticFlags=True)
if sanitize:
Chem.SanitizeMol(rd_mol, Chem.SANITIZE_ALL^Chem.SANITIZE_KEKULIZE^Chem.SANITIZE_SETAROMATICITY)
#rd_mol = modify(rd_mol)
return rd_mol
def modify_submol(mol): # modify mols containing C=N(C)O
submol = Chem.MolFromSmiles('C=N(C)O', sanitize=False)
sub_fragments = mol.GetSubstructMatches(submol)
for fragment in sub_fragments:
atomic_nums = np.array([mol.GetAtomWithIdx(atom).GetAtomicNum() for atom in fragment])
idx_atom_N = fragment[np.where(atomic_nums == 7)[0][0]]
idx_atom_O = fragment[np.where(atomic_nums == 8)[0][0]]
mol.GetAtomWithIdx(idx_atom_N).SetFormalCharge(1) # set N to N+
mol.GetAtomWithIdx(idx_atom_O).SetFormalCharge(-1) # set O to O-
return mol
class MolReconsError(Exception):
pass
def add_context(data):
data.ligand_context_pos = data.ligand_pos
data.ligand_context_element = data.ligand_element
data.ligand_context_bond_index = data.ligand_bond_index
data.ligand_context_bond_type = data.ligand_bond_type
return data
def check_valency(mol):
try:
Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
return True
except ValueError:
return False
def check_double_bond(
ligand_context_bond_index, ligand_context_bond_type, bond_index_to_add, bond_type_to_add
):
bond_type_in_place = [
ligand_context_bond_type[ix==ligand_context_bond_index[0]]\
for ix in bond_index_to_add[1]
]
has_double = torch.BoolTensor([(bt==2).sum()>0 for bt in bond_type_in_place])
bond_type_to_add_has_double = bond_type_to_add >= 2
mask = torch.stack([has_double,bond_type_to_add_has_double]).all(dim=0)
bond_type_to_add = bond_type_to_add - mask.long()
return bond_type_to_add
def check_valence_is_2(
bond_index_to_add, bond_type_to_add,
ligand_context_element, ligand_context_valence
):
atom_type_to_add = ligand_context_element[-1]
new_atom_type_is_C = (atom_type_to_add == 6).view(-1)
if new_atom_type_is_C:
atom_valence_in_place = ligand_context_valence[bond_index_to_add[1]] >= 2
atom_type_in_place_is_C = ligand_context_element[bond_index_to_add[1]] == 6
bond_type_to_add_mask = bond_type_to_add >= 2
mask = torch.stack([atom_valence_in_place, atom_type_in_place_is_C, bond_type_to_add_mask]).all(dim=0)
bond_type_to_add = bond_type_to_add - mask.long()
return bond_type_to_add
def remove_triangle(pos_to_add, ligand_context_pos, ligand_context_bond_index, ligand_context_bond_type,
bond_index_to_add, bond_type_to_add):
new_j = bond_index_to_add[1]
atom_in_place_adjs = [
ligand_context_bond_index[1][ligand_context_bond_index[0]==i] for i in new_j
]
'''atom_in_place_bond_type = [
ligand_context_bond_type[1][ligand_context_bond_index[0]==i] for i in new_j
]'''
L = []
for j in new_j:
l = []
for i in atom_in_place_adjs:
if j in i:
l.append(True)
else:
l.append(False)
L.append(l)
adj_mask = torch.LongTensor(L).any(-1)
if adj_mask.sum() > 0:
dist = torch.norm(pos_to_add.view(-1,3) - ligand_context_pos[new_j], dim=-1, p=2)
dist_mask_idx = torch.nonzero(adj_mask).view(-1)
max_dist_idx_to_remove = dist_mask_idx[dist[dist_mask_idx].argmax()]
mask = (torch.arange(len(bond_type_to_add)) == max_dist_idx_to_remove) == False
bond_index_to_add = bond_index_to_add[:, mask]
bond_type_to_add = bond_type_to_add[mask]
return bond_index_to_add, bond_type_to_add
PATTERNS = [
#Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
Chem.MolFromSmarts('[N]1~&@[N]~&@[C]~&@[C]~&@[C]~&@[C]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
Chem.MolFromSmarts('[N]1~&@[C]~&@[N]~&@[C]~&@[C]~&@[C]~&@1'),
Chem.MolFromSmarts('[N]1~&@[C]~&@[C]~&@[N]~&@[C]~&@[C]~&@1'),
Chem.MolFromSmarts('[N]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]~&@1'),
Chem.MolFromSmarts('[C]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]~&@1')
]
PATTERNS_1 = [
[Chem.MolFromSmarts('[#6,7,8]-[#7]1~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1'),
Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')],
[Chem.MolFromSmarts('[#6,7,8]-[#6]1(-[#6,7,8])~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1'),
Chem.MolFromSmarts('[C,N,O]-[C]1(-[C,N,O])~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')]
#Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C]~&@[C]~&@[N]~&@[C]~&@[C]-1'),
#Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C]~&@[C]~&@[C]~&@[C]~&@[C]-1'),
]
MAX_VALENCE = {'C':4, 'N':3}
def modify(mol, max_double_in_6ring=0):
#atoms = mol.GetAtoms()
mol_copy = copy.deepcopy(mol)
mw = Chem.RWMol(mol)
p1 = Chem.MolFromSmarts('[#6,7]=[#6]1-[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]-1')
p1_ = Chem.MolFromSmarts('[C,N]=[C]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
subs = set(list(mw.GetSubstructMatches(p1)) + list(mw.GetSubstructMatches(p1_)))
subs_set_1 = [set(s) for s in subs]
for sub in subs:
comb = itertools.combinations(sub, 2)
#b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
change_double = False
r_b_double = 0
b_list = []
for ix,c in enumerate(comb):
b = mw.GetBondBetweenAtoms(*c)
if ix == 0:
bt = b.GetBondType().__str__()
is_r = b.IsInRingSize(6)
b_list.append((c, bt, is_r))
continue
if b is not None:
bt = b.GetBondType().__str__()
is_r = b.IsInRing()
b_list.append((c, bt, is_r))
if is_r is True and bt == 'DOUBLE':
r_b_double += 1
if r_b_double > max_double_in_6ring:
change_double = True
if change_double:
for ix,b in enumerate(b_list):
if ix == 0:
if b[-1] is False:
mw.RemoveBond(*b[0])
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
else:
continue
if b[1] == 'DOUBLE' and b[-1] is False:
mw.RemoveBond(*b[0])
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
break
#p2 = Chem.MolFromSmarts('[C,N,O]-[N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
for p2 in PATTERNS_1:
Chem.GetSSSR(mw)
subs2 = set(list(mw.GetSubstructMatches(p2[0])) + list(mw.GetSubstructMatches(p2[1])))
for sub in subs2:
comb = itertools.combinations(sub, 2)
b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
for b in b_list:
if b[-1].GetBondType().__str__() == 'DOUBLE':
mw.RemoveBond(*b[0])
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
Chem.GetSSSR(mw)
p3 = Chem.MolFromSmarts('[#8]=[#6]1-[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]-1')
p3_ = Chem.MolFromSmarts('[O]=[C]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
subs = set(list(mw.GetSubstructMatches(p3)) + list(mw.GetSubstructMatches(p3_)))
subs_set_2 = [set(s) for s in subs]
for sub in subs:
comb = itertools.combinations(sub, 2)
b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
for b in b_list:
if b[-1].GetBondType().__str__() == 'DOUBLE' and b[-1].IsInRing() is True:
mw.RemoveBond(*b[0])
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
p = Chem.MolFromSmarts('[#6,7]1~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@[#6,7]~&@1')
p_ = Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1')
Chem.GetSSSR(mw)
subs = set(list(mw.GetSubstructMatches(p)) + list(mw.GetSubstructMatches(p_)))
subs_set_3 = [set(s) for s in subs]
for sub in subs:
pass_sub = False
if subs_set_2:
for s in subs_set_2:
if len(s-set(sub)) == 1:
pass_sub = True
break
if pass_sub:
continue
bond_list = [(i,sub[0]) if ix+1==len(sub) else (i, sub[ix+1]) for ix,i in enumerate(sub)]
if len(bond_list) == 0:
continue
atoms = [mw.GetAtomWithIdx(i) for i in sub]
for a in atoms:
if a.GetExplicitValence()==MAX_VALENCE[a.GetSymbol()] and a.GetHybridization().__str__()=='SP3':
break
else:
bond_type = [mw.GetBondBetweenAtoms(*b).GetBondType().__str__() for b in bond_list]
if bond_type.count('DOUBLE') > max_double_in_6ring:
for b in bond_list:
mw.RemoveBond(*b)
mw.AddBond(*b, Chem.rdchem.BondType.AROMATIC)
# get new mol from modified mol
conf = mw.GetConformer()
rd_mol = Chem.RWMol()
rd_conf = Chem.Conformer(mw.GetNumAtoms())
for i, atom in enumerate(mw.GetAtoms()):
rd_atom = Chem.Atom(atom.GetAtomicNum())
rd_mol.AddAtom(rd_atom)
rd_coords = conf.GetAtomPosition(i)
rd_conf.SetAtomPosition(i, rd_coords)
rd_mol.AddConformer(rd_conf)
for i, bond in enumerate(mw.GetBonds()):
bt = bond.GetBondType()
node_i = bond.GetBeginAtomIdx()
node_j = bond.GetEndAtomIdx()
rd_mol.AddBond(node_i, node_j, bt)
out_mol = rd_mol.GetMol()
# check validility of the new mol
mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(out_mol))
if mol_check:
try:
Chem.Kekulize(out_mol)
del mol_copy
return out_mol
except:
del mol
return mol_copy
else:
del mol
return mol_copy
'''
PATTERNS = [
Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1'), # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
#Chem.MolFromSmarts('[C,N]1~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@1') # '[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1'
]
def modify(mol):
#atoms = mol.GetAtoms()
mol_copy = copy.deepcopy(mol)
mw = Chem.RWMol(mol)
p1 = Chem.MolFromSmarts('[#6]=[#6]1-[C,N]~&@[C,N]~&@[C,N]~&@[C,N]~&@[C,N]-1')
subs = mw.GetSubstructMatches(p1)
for sub in subs:
comb = itertools.combinations(sub, 2)
b_list = [(c, mw.GetBondBetweenAtoms(*c)) for c in comb if mw.GetBondBetweenAtoms(*c) is not None]
for b in b_list:
if b[-1].GetBondType().__str__() == 'DOUBLE' and b[-1].IsInRing() is False:
mw.RemoveBond(*b[0])
mw.AddBond(*b[0], Chem.rdchem.BondType.SINGLE)
Chem.GetSSSR(mw)
for p in PATTERNS:
subs = list(mw.GetSubstructMatches(p))
for sub in subs:
bond_list = [(i,sub[0]) if ix+1==len(sub) else (i, sub[ix+1]) for ix,i in enumerate(sub)]
if len(bond_list) == 0:
continue
bond_type = [mw.GetBondBetweenAtoms(*b).GetBondType().__str__() for b in bond_list]
if bond_type.count('DOUBLE') > 0:
for b in bond_list:
mw.RemoveBond(*b)
mw.AddBond(*b, Chem.rdchem.BondType.AROMATIC)
# get new mol from modified mol
conf = mw.GetConformer()
rd_mol = Chem.RWMol()
rd_conf = Chem.Conformer(mw.GetNumAtoms())
for i, atom in enumerate(mw.GetAtoms()):
rd_atom = Chem.Atom(atom.GetAtomicNum())
rd_mol.AddAtom(rd_atom)
rd_coords = conf.GetAtomPosition(i)
rd_conf.SetAtomPosition(i, rd_coords)
rd_mol.AddConformer(rd_conf)
for i, bond in enumerate(mw.GetBonds()):
bt = bond.GetBondType()
node_i = bond.GetBeginAtomIdx()
node_j = bond.GetEndAtomIdx()
rd_mol.AddBond(node_i, node_j, bt)
out_mol = rd_mol.GetMol()
# check validility of the new mol
mol_check = Chem.MolFromSmiles(Chem.MolToSmiles(out_mol))
if mol_check:
try:
Chem.Kekulize(out_mol)
return out_mol
except:
return mol_copy
else:
return mol_copy
'''
def save_sdf(mol_list, save_name='mol_gen.sdf'):
writer = Chem.SDWriter(save_name)
writer.SetProps(['LOGP', 'MW'])
for i, mol in enumerate(mol_list):
mw = Descriptors.ExactMolWt(mol)
logp = Descriptors.MolLogP(mol)
mol.SetProp('MW', '%.2f' %(mw))
mol.SetProp('LOGP', '%.2f' %(logp))
mol.SetProp('_Name', 'No_%s' %(i))
Chem.Kekulize(mol)
writer.write(mol)
writer.close()
def check_alert_structure(mol, alert_smarts):
Chem.GetSSSR(mol)
pattern = Chem.MolFromSmarts(alert_smarts)
subs = mol.GetSubstructMatches(pattern)
if len(subs) == 0:
return False
else:
return True
def check_alert_structures(mol, alert_smarts_list):
Chem.GetSSSR(mol)
patterns = [Chem.MolFromSmarts(sma) for sma in alert_smarts_list]
for p in patterns:
subs = mol.GetSubstructMatches(p)
if len(subs) != 0:
return True
else:
return False

View File

@@ -1,292 +0,0 @@
import os
import pickle
import glob
import lmdb
import random
from rdkit import Chem
from multiprocessing import Pool
import numpy as np
import torch
from torch.utils.data import Dataset, Subset
#from torch_geometric.loader import DataLoader
from tqdm.auto import tqdm
from .data import ComplexData, torchify_dict
from .ParseFile import Protein, Ligand, parse_sdf_to_dict
class LoadDataset(Dataset):
def __init__(self, dataset_path, transform=None, map_size=10*(1024*1024*1024)):
super().__init__()
self.dataset_path = dataset_path
self.transform = transform
self.map_size = map_size
self.db = None
self.keys = None
def _connect_db(self):
self.db = lmdb.open(
self.dataset_path,
map_size=self.map_size, # 10GB
create=False,
subdir=False,
readonly=True,
lock=False,
readahead=False,
meminit=False,
)
with self.db.begin() as txn:
self.keys = list(txn.cursor().iternext(values=False))
def _close_db(self):
self.db.close()
self.db = None
self.keys = None
def __len__(self):
if self.db is None:
self._connect_db()
return len(self.keys)
def __getitem__(self, idx):
if self.db is None:
self._connect_db()
key = self.keys[idx]
data = pickle.loads(self.db.begin().get(key))
#data.id = idx
#assert data.protein_pos.size(0) > 0
if self.transform is not None:
data = self.transform(data)
return data
def __connect_db_edit__(self):
self.db = lmdb.open(
self.dataset_path,
map_size=self.map_size, # 10GB
create=False,
subdir=False,
readonly=False,
lock=False,
readahead=False,
meminit=False,
)
with self.db.begin() as txn:
self.keys = list(txn.cursor().iternext(values=False))
def remove(self, idx):
if self.db is None:
self.__connect_db_edit__()
txn = self.db.begin(write=True)
txn.delete(self.keys[idx])
txn.commit()
self._close_db()
@staticmethod
def split(dataset, val_num=None, shuffle=True, random_seed=0):
index = list(range(len(dataset)))
if shuffle:
random.seed(random_seed)
random.shuffle(index)
index = torch.LongTensor(index)
split_dic = {'valid':index[:val_num], 'train':index[val_num:]}
subsets = {k:Subset(dataset, indices=v) for k, v in split_dic.items()}
train_set, val_set = subsets['train'], subsets['valid']
return train_set, val_set
@staticmethod
def split_by_name(dataset, test_key_set, name2id_path=None):
#self.name2id_path = '/'.join(self.dataset_path.split('/')[:-1])
#self.dataset_name = self.dataset_path.split('/')[-1].split('.')[0]
if name2id_path is None:
name2id_path = '/'.join(dataset.dataset_path.split('/')[:-1])
dataset_name = dataset.dataset_path.split('/')[-1].split('.')[0]
name2id_file= name2id_path+'/'+dataset_name+'_name2id.pt'
if os.path.exists(name2id_file):
name2id = torch.load(name2id_file)
else:
name2id = {}
for i in tqdm(range(len(dataset)), 'Indexing Dataset'):
try:
data = dataset[i]
except AssertionError as e:
print(i, e)
continue
name = (
'/'.join(data.protein_filename.split('/')[-2:]),
'/'.join(data.ligand_filename.split('/')[-2:])
)
name2id[name] = i
torch.save(name2id, name2id_file)
else:
name2id = torch.load(name2id_file)
train_idx = []
test_idx = []
for k,v in tqdm(name2id.items(), 'Spliting'):
if k in test_key_set:
test_idx.append(v)
else:
train_idx.append(v)
split_dict = {
'valid':torch.LongTensor(test_idx), 'train':torch.LongTensor(train_idx)
}
subsets = {k:Subset(dataset, indices=v) for k, v in split_dict.items()}
train_set, val_set = subsets['train'], subsets['valid']
return train_set, val_set
#config = load_config('./configs/train.yml')
#dataset = LoadDataset('../Pocket2Mol-learning/data/crossdocked_pocket10_processed.lmdb')
#print(len(dataset))
class CrossDocked2020(object):
def __init__(self, raw_path, index_path, unexpected_sample=[],
atomic_numbers=[6,7,8,9,15,16,17,35,53]):
self.raw_path = raw_path
self.file_dirname = os.path.dirname(raw_path)
self.index_path = index_path
self.unexpected_sample = unexpected_sample
self.index = self.get_file(index_path, raw_path)
self.atomic_numbers = set(atomic_numbers)
@staticmethod
def get_file(index_dirname, index_path):
with open(index_dirname, 'rb') as f:
index = pickle.load(f)
file_list = []
for i in index:
if i[0] is None:
continue
else:
pdb = os.path.join(index_path, i[0])
sdf = os.path.join(index_path, i[1])
file_list.append([pdb,sdf])
return file_list
def process(self, raw_file_info):
try:
pocket_file, ligand_file = raw_file_info
lig = Ligand(ligand_file, removeHs=True, sanitize=True)
for a in lig.mol.GetAtoms():
if a.GetAtomicNum() not in self.atomic_numbers:
return None
else:
ligand_dict = lig.to_dict()
if self.only_backbone:
pocket_dict = Protein(pocket_file).get_backbone_dict(removeHs=True)
else:
pocket_dict = Protein(pocket_file).get_atom_dict()
#ligand_dict = parse_sdf_to_dict(ligand_fn)
data = ComplexData.from_protein_ligand_dicts(
protein_dict=torchify_dict(pocket_dict),
ligand_dict=torchify_dict(ligand_dict),
)
data.protein_filename = '/'.join(pocket_file.split('/')[-2:])
data.ligand_filename = '/'.join(ligand_file.split('/')[-2:])
return data
except:
return None
def run(self, dataset_name='crossdocked_pocket10_processed.lmdb', lmdb_path=None,
max_ligand_atom=50, only_backbone=False, n_process=16, interval=2000):
#file_dirname = os.path.dirname(self.index_path)
self.only_backbone = only_backbone
if lmdb_path:
lmdb_path = os.path.join(lmdb_path, dataset_name.split('/')[-1])
else:
lmdb_path = dataset_name #os.path.join(self.file_dirname,dataset_name.split('/')[-1])
if os.path.exists(lmdb_path):
raise FileExistsError(lmdb_path + ' has been existed !')
db = lmdb.open(
lmdb_path,
map_size=200*(1024*1024*1024), # 200GB
create=True,
subdir=False,
readonly=False, # Writable
)
data_ix_list = []
exception_list = []
data_ix = 0
for idx in tqdm(range(0, len(self.index), interval)):
if idx+interval >= len(self.index):
raw_files = self.index[idx:]
else:
raw_files = self.index[idx:idx+interval]
val_raw_files = []
for items in raw_files:
'''if items[0] is None:
continue'''
if 'ATOM' not in open(items[0]).read():
continue
elif items[1] in self.unexpected_sample:
continue
elif Chem.MolFromMolFile(items[1])!=None:
val_raw_files.append(items)
else:
exception_list.append(items)
torch.multiprocessing.set_sharing_strategy('file_system')
pool = Pool(processes=n_process)
data_list = pool.map(self.process, val_raw_files)
with db.begin(write=True, buffers=True) as txn:
for data in data_list:
if data is None: continue
if data.protein_pos.size(0) < 50: continue
if len(data.ligand_nbh_list) > max_ligand_atom: continue
key = str(data_ix).encode()
txn.put(
key = key,
value = pickle.dumps(data)
)
data_ix_list.append(key)
data_ix += 1
db.close()
data_ix_list = np.array(data_ix_list)
np.save(lmdb_path.split('.')[0]+'_Keys', data_ix_list)
with open(lmdb_path.split('.')[0]+'_invalid.list','w') as fw:
fw.write(str(exception_list))
'''
unexpected_sample = [
line.split()[-1] for line in open('/export/home/jyy/PocketFlow/data/unexcept_element_sample_new.csv').read().split('\n')
]
cs2020 = CrossDocked2020(
'/export/home/jyy/Pocket2Mol-learning/data/crossdocked_pocket10',
'/export/home/jyy/Pocket2Mol-learning/data/crossdocked_pocket10/index.pkl',
unexpected_sample=unexpected_sample
)
cs2020.run(
dataset_name='crossdocked_pocket10_processed_35Atoms_backbone.lmdb',
max_ligand_atom=35,
only_backbone=True,
lmdb_path=None
)
'''
class PDBBind2020(CrossDocked2020):
def __init__(self, raw_file_list, unexpected_sample=[], atomic_numbers=[6,7,8,9,15,16,17,35,53]):
self.index = self.get_file(raw_file_list)
self.unexpected_sample = unexpected_sample
self.file_dirname = os.path.split(os.path.split(self.index[0][0])[0])
self.atomic_numbers = set(atomic_numbers)
@staticmethod
def get_file(raw_file_list):
file_list = []
for i in raw_file_list:
sdf = glob.glob(i + '/*.sdf')
pdb = glob.glob(i + '/*pocket.pdb')
if pdb and sdf:
file_list.append([pdb[0], sdf[0]])
return file_list
'''
refine_set_dir_list = glob.glob('/export/home/jyy/PDBBind2020/refined-set/*/')
other_pl_dir_list = glob.glob('/export/home/jyy/PDBBind2020/v2020-other-PL/*/')
raw_file_list = refine_set_dir_list + other_pl_dir_list
pdbbind_2020 = PDBBind2020(raw_file_list, atomic_numbers=[6,7,8,9,15,16,17,35,53])
pdbbind_2020.run(dataset_name='PDBBind_2020.lmdb', max_ligand_atom=35,
n_process=6, interval=2000, only_backbone=False, lmdb_path=None)
'''

View File

@@ -1,316 +0,0 @@
from rdkit import Chem
FUSED_QUA_RING_PATTERN = [
Chem.MolFromSmarts(i) for i in[
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R](~&@[R]~&@1~&@4)~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R]~&@3~&@[R](~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]2~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]2~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]34~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@4',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@1)~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@4~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R@@H](~&@[R@H]4~&@[R]~&@[R]~&@[R]~&@[R]~&@[R@H]~&@4~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]1~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@2',
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]~&@1)~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@[R]4~&@[R]~&@[R]~&@[R]~&@[R]~&@4~&@[R]~&@3)~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@[R]~&@4)~&@[R]~&@3~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R]4~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@4~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@13~&@[R](~&@[R]~&@[R]~&@[R]~&@3)~&@[R]~&@[R]1~&@[R]~&@2~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R]~&@2~&@[R]2~&@[R]~&@[R]~&@[R]~&@[R]~&@2~&@[R]~&@[R]~&@1'
]
]
def has_fused4ring(mol):
for pat in FUSED_QUA_RING_PATTERN:
if mol.HasSubstructMatch(pat):
return True
else:
return False
PATTERNS_1 = [Chem.MolFromSmarts(i) for i in [
'[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@1~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@2',
'[R]1~&@[R]~&@[R]~&@12~&@[R]~&@[R]~&@2'
]
]
def judge_unexpected_ring(mol):
for pat in PATTERNS_1:
subs = mol.GetSubstructMatches(pat)
if len(subs) > 0:
return True
else:
return False
PATTERNS = [Chem.MolFromSmarts(i) for i in [
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1']
]
def judge_fused_ring(mol):
for pat in PATTERNS+FUSED_QUA_RING_PATTERN:
if mol.HasSubstructMatch(pat):
return True
else:
return False
def substructure(mol_lib):
total_num = sum([len(i) for i in mol_lib])
ring_size_statis = {
'tri_ring':{'num':0},
'qua_ring':{'num':0},
'fif_ring':{'num':0},
'hex_ring':{'num':0},
'hep_ring':{'num':0},
'oct_ring':{'num':0},
'big_ring':{'num':0},
'fused_ring':{'num':0},
'unexpected_ring':{'num':0},
'sssr':{}
}
for s in mol_lib:
for mol in s:
if mol is None: continue
mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))
sssr = Chem.GetSSSR(mol)
if sssr in ring_size_statis['sssr']:
ring_size_statis['sssr'][sssr]['num'] += 1
else:
ring_size_statis['sssr'][sssr] = {}
ring_size_statis['sssr'][sssr]['num'] = 1
ring = mol.GetRingInfo()
has_ring_size_3 = False
has_ring_size_4 = False
has_ring_size_5 = False
has_ring_size_6 = False
has_ring_size_7 = False
has_ring_size_8 = False
has_big_ring = False
has_fused_ring = False
has_unexpected_ring = False
for r in mol.GetRingInfo().AtomRings():
if len(r) == 3 and has_ring_size_3 is False:
has_ring_size_3 = True
ring_size_statis['tri_ring']['num'] += 1
if len(r) == 4 and has_ring_size_4 is False:
has_ring_size_4 = True
ring_size_statis['qua_ring']['num'] += 1
if len(r) == 5 and has_ring_size_5 is False:
has_ring_size_5 = True
ring_size_statis['fif_ring']['num'] += 1
if len(r) == 6 and has_ring_size_6 is False:
has_ring_size_6 = True
ring_size_statis['hex_ring']['num'] += 1
if len(r) == 7 and has_ring_size_7 is False:
has_ring_size_7 = True
ring_size_statis['hep_ring']['num'] += 1
if len(r) == 8 and has_ring_size_8 is False:
has_ring_size_8 = True
ring_size_statis['oct_ring']['num'] += 1
if len(r) > 8 and has_big_ring is False:
has_big_ring = True
ring_size_statis['big_ring']['num'] += 1
'''for i in range(mol.GetNumAtoms()):
if ring.IsAtomInRingOfSize(i,3) and has_ring_size_3 is False:
has_ring_size_3 = True
ring_size_statis['tri_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,4) and has_ring_size_4 is False:
has_ring_size_4 = True
ring_size_statis['qua_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,5) and has_ring_size_5 is False:
has_ring_size_5 = True
ring_size_statis['fif_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,6) and has_ring_size_6 is False:
has_ring_size_6 = True
ring_size_statis['hex_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,7) and has_ring_size_7 is False:
has_ring_size_7 = True
ring_size_statis['hep_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,8) and has_ring_size_8 is False:
has_ring_size_8 = True
ring_size_statis['oct_ring']['num'] += 1'''
if judge_fused_ring(mol) and has_fused_ring is False:
has_fused_ring = True
ring_size_statis['fused_ring']['num'] += 1
if judge_unexpected_ring(mol) and has_unexpected_ring is False:
ring_size_statis['unexpected_ring']['num'] += 1
ring_size_statis['tri_ring']['rate'] = ring_size_statis['tri_ring']['num']/total_num
ring_size_statis['qua_ring']['rate'] = ring_size_statis['qua_ring']['num']/total_num
ring_size_statis['fif_ring']['rate'] = ring_size_statis['fif_ring']['num']/total_num
ring_size_statis['hex_ring']['rate'] = ring_size_statis['hex_ring']['num']/total_num
ring_size_statis['hep_ring']['rate'] = ring_size_statis['hep_ring']['num']/total_num
ring_size_statis['oct_ring']['rate'] = ring_size_statis['oct_ring']['num']/total_num
ring_size_statis['big_ring']['rate'] = ring_size_statis['big_ring']['num']/total_num
ring_size_statis['fused_ring']['rate'] = ring_size_statis['fused_ring']['num']/total_num
ring_size_statis['unexpected_ring']['rate'] = ring_size_statis['unexpected_ring']['num']/total_num
for k in ring_size_statis['sssr']:
ring_size_statis['sssr'][k]['rate'] = ring_size_statis['sssr'][k]['num']/total_num
return ring_size_statis
def smoothing(scalars, weight=0.8): # Weight between 0 and 1
last = scalars[0] # First value in the plot (first timestep)
smoothed = list()
for point in scalars:
smoothed_val = last * weight + (1 - weight) * point # Calculate smoothed value
smoothed.append(smoothed_val) # Save it
last = smoothed_val # Anchor the last smoothed value
return smoothed
"""PATTERNS = [Chem.MolFromSmarts(i) for i in [
'[R]12~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]3~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@3~&@[R]~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@2',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]3~&@[R]~&@1~&@[R](~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1',
'[R]12~&@[R]~&@[R]~&@[R]3~&@[R](~&@[R]~&@1~&@[R]~&@[R]~&@[R]~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@3',
'[R]12~&@[R]~&@[R]~&@[R]~&@[R]~&@[R]~&@1~&@[R]~&@[R]1~&@[R](~&@[R]~&@2)~&@[R]~&@[R]~&@[R]~&@[R]~&@1']
]
def judge_fused_ring(mol):
for pat in PATTERNS:
subs = mol.GetSubstructMatches(pat)
if len(subs) > 0:
return True
else:
return False
def substructure(mol_lib):
total_num = sum([len(i) for i in mol_lib])
ring_size_statis = {
'tri_ring':{'num':0},
'qua_ring':{'num':0},
'fif_ring':{'num':0},
'hex_ring':{'num':0},
'hep_ring':{'num':0},
'oct_ring':{'num':0},
'fused_ring':{'num':0},
'sssr':{}
}
for s in mol_lib:
for mol in s:
if mol is None: continue
mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, isomericSmiles=False))
sssr = Chem.GetSSSR(mol)
if sssr in ring_size_statis['sssr']:
ring_size_statis['sssr'][sssr]['num'] += 1
else:
ring_size_statis['sssr'][sssr] = {}
ring_size_statis['sssr'][sssr]['num'] = 1
ring = mol.GetRingInfo()
has_ring_size_3 = False
has_ring_size_4 = False
has_ring_size_5 = False
has_ring_size_6 = False
has_ring_size_7 = False
has_ring_size_8 = False
has_fused_ring = False
for i in range(mol.GetNumAtoms()):
if ring.IsAtomInRingOfSize(i,3) and has_ring_size_3 is False:
has_ring_size_3 = True
ring_size_statis['tri_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,4) and has_ring_size_4 is False:
has_ring_size_4 = True
ring_size_statis['qua_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,5) and has_ring_size_5 is False:
has_ring_size_5 = True
ring_size_statis['fif_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,6) and has_ring_size_6 is False:
has_ring_size_6 = True
ring_size_statis['hex_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,7) and has_ring_size_7 is False:
has_ring_size_7 = True
ring_size_statis['hep_ring']['num'] += 1
if ring.IsAtomInRingOfSize(i,8) and has_ring_size_8 is False:
has_ring_size_8 = True
ring_size_statis['oct_ring']['num'] += 1
if judge_fused_ring(mol) and has_fused_ring is False:
has_fused_ring = True
ring_size_statis['fused_ring']['num'] += 1
ring_size_statis['tri_ring']['rate'] = ring_size_statis['tri_ring']['num']/total_num
ring_size_statis['qua_ring']['rate'] = ring_size_statis['qua_ring']['num']/total_num
ring_size_statis['fif_ring']['rate'] = ring_size_statis['fif_ring']['num']/total_num
ring_size_statis['hex_ring']['rate'] = ring_size_statis['hex_ring']['num']/total_num
ring_size_statis['hep_ring']['rate'] = ring_size_statis['hep_ring']['num']/total_num
ring_size_statis['oct_ring']['rate'] = ring_size_statis['oct_ring']['num']/total_num
ring_size_statis['fused_ring']['rate'] = ring_size_statis['fused_ring']['num']/total_num
for k in ring_size_statis['sssr']:
ring_size_statis['sssr'][k]['rate'] = ring_size_statis['sssr'][k]['num']/total_num
return ring_size_statis
def smoothing(scalars, weight=0.8): # Weight between 0 and 1
last = scalars[0] # First value in the plot (first timestep)
smoothed = list()
for point in scalars:
smoothed_val = last * weight + (1 - weight) * point # Calculate smoothed value
smoothed.append(smoothed_val) # Save it
last = smoothed_val # Anchor the last smoothed value
return smoothed"""

View File

@@ -1,439 +0,0 @@
from rdkit import Chem
from rdkit.Chem import QED, Descriptors, Crippen, Lipinski
from multiprocessing import Pool
from tqdm.auto import tqdm
from . import sascorer
# from https://doi.org/10.1016/j.bmc.2019.115192
ALERT_STRUCTURES = [
'C#CI', 'C[C&D2]=!@N[!O]', 'CC([C,H])=!@N[!O]', 'N#CC(=O)', 'C(=[O,S])[F,Cl,Br,I]', '[#6][CH1]=O',
'COS(=O)(=O)C(F)(F)F', '[C;H1$(C([#6;!$(C=O)])),H0$(C([#6;!$(C=O)])[#6;!$(C=O)])]=[CH1]!@N([#6;!$(C(=O))])[#6;!$(C(=O))]',
'[CX4][Cl,Br,I]', '*=C=*', 'c1(N)ccc(C(=O)NC3(=O))c(c3ccc2)c21', 'N!@C=C-C=[C!r]([OH])', 'NC#N',
'[#6]C(=O)-!@OC(=[O,N])', 'O=*N=[N+]=[N-]', 'N=[N+]([O-])C', 'c1ccccc1[N!r]=[N!r]c2ccccc2',
'[N;R0]=[N;R0]C#N', 'C(Cl)(Cl)(Cl)C([O,S])[$([NX4+]),$([NX3]);!$(*=*)&!$(*:*)]', 'N[C!r](=O)S',
'[Cl]C([C&R0])=N', 'SC(=[!r])S', 'O1CCOCCOCCOCC1', 'O1CCOCCOCCOCCOCC1', 'O1CCOCCOCCOCCOCCOCC1',
'N#CC[OH1]', 'c1c([N;!R](~[O;X1])~[O;X1])cc([N;!R](~[O;X1])~[O;X1])cc1',
'c1c([N;!R](~[O;X1])~[O;X1])c([N;!R](~[O;X1])~[O;X1])ccc1',
'c1c([N;!R](~[O;X1])~[O;X1])ccc([N;!R](~[O;X1])~[O;X1])c1', 'C=[N+]=[N-]', '[N+]#N',
'[C!r]=[C&D2][C!r]=[C&D2][C&D3]=O', 'NC(=S)S', 'S1SC=CC1=S', 'S1C=CSC1=C', 'S[C;!$(C=*)]S',
'c1cccc(NC(=NC(=[N,S,O])NC(=O)3)C3=N2)c12', 'O=C1NC(=O)c2nc3ccccc3nc2N1',
'c1cc(O)cc(OC(=CC(=O)C=C3)C3=C2)c12',
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1([$([OX1]),$([NH]);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]([$([OX1]),$(N);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1',
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1([$([OX1]),$([NH]);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]([$([OX1]),$(N);!$([*X3])&!$(*-C=O)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1',
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)](O[$(C=O),$(C=N)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)][$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)](O[$(C=O),$(C=N)])[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]1', 'C(=O)Onnn', 'c1[n+]([#6])ccn1([#6])',
'[#6,#8,#16]-[CH1]=[NH1]', 'C=N([C,S])[C&v4]', 'N=[C!R]=O', 'N=[C!R]=S', 'C=[C!R]=O', 'P(=S)(S)S',
'[#3,#11,#12,#13,#19,#20,#26,#27,#28,#29,#30]',
'[#6;$([#6]~[#3,#11,#12,#13,#19,#20,#26,#27,#28,#29,#30])]', '[#16]1cc[#16][#6]1=[#16]',
'[C&D1]=C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
'[C&D1]#C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*),$([C&X4])]-[C&D2]=!@C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*),$([C&X4])]-[C&D2]#!@C[$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
'[C&X4]-[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]',
'[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)]-[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-[C&X4]',
'C1[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-A=A1',
'A1[C&D2!r]=[C!r][$([C&D2](C)(=O)),$(C(C)(=O)[!#8]),$(C(C)(=O)[O&D2]),$(C(C)(#N)),$(N(C)(~O)(~O)),$([S&D4](C)(~O)(~O))]-A1',
'c1ccccc1C(=O)C=!@CC(=O)!@*', '[F,I,Cl,Br;!v1;!-]', 'N[F,I,Cl,Br]', '[N+!$(N=O)][O-X1]',
'c1ccc(n[o,s]n2)c2c1[N;!R](~[O;X1])~[O;X1]', 'c1c([N;!R](~[O;X1])~[O;X1])cc(n[o,s]n2)c2c1',
'[#7v5]', '[N!$([N+][O-])]=O', 'C(O)(O)[OH]', '[#8v3]', 'OO', 'c12cccc3c1c4c(cc3)cccc4cc2', 'C=P',
'C=!@CC=!@C', 'c1cccc(cc(cccc2)c2c3)c13', 'c1cccc(c(cccc2)c2cc3)c13',
'*[SX2][SX2][SX2]*', '[#16+1!$([#16]~[O-])]', 'c1ccc[o+]c1', 'O=S(=O)O[C,c]', 'COS(=O)O[C,c]',
'S(=O)(=O)C#N', '[#16][F,Cl,Br,I]', '[SX2H0]!@[N]', 'C1NNC=NN1', '*[CH1](=!@S)', 'SC#N',
'P(=S)(-[S;H1,H0$(S(P)C)])(-[O;H1,H0$(O(P)C)])(-N(C)C)', '*1[O,S]*1', '[#16v3,#16v5]',
'C(=O)N(C(=O))OC(=O)', 'B(c1ccccc1)(c2ccccc2)c3ccccc3', 'P(c1ccccc1)(c2ccccc2)(c3ccccc3)',
'[Si](c1ccccc1)(c2ccccc2)(c3ccccc3)', 'O=C([C!r]=[C!r]C)[!O&!N]', 'O=C([!O&!N])[C!r]=[C&D1!r]',
'N=!@N', 'CC(=O)[C&X2]-!@Oc', 'C=[N!r][N!r]=C', 'S1(CCN4(C(CC41)=O))', 'S1(CC=CN5(C(CC51)=O))',
'[S;X2]1[CH2][CH]2[CH]([CH]1)[NH]C([NH]2)=O', 'C(=N)-!@N-!@C(=N)', '*[S&D2!r][S&D2!r]*',
'*~[N!H0]!@[N!H0]~*', '[C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&D2][C&v4&D1]',
'Nc1aaa(N!@=N)aa1', 'C[n+1]2ccccc2', '[N+1&X4;H0]', 'CC(=[S,O])-!@[S&D2]*', 'CC(=S)-!@O*',
'[SX2H1]', '[3#1]', 'N1C(=S)SC(=C)C1=O', 'C1([#6](~[!#6]~[#6][!#6][#6]1=,:[!#6])~[!#1;!#6])=[C;!R;H,H2]',
'c1ccc(c(c1)[OH])C=N[#7]', '[#6]C(=[!C;!R])C(=[!C;!R])[#6,$(S(=O)=O)]',
'O=C2C(=!@N[#7])c1c(cccc1)N2', 'N([C;H2,H3])([C;H2,H3])c1oc([#6]=,:[#7][#7;H][#6]=,:[!#6])[cH][cH]1',
'c1(C(=O)[OH])c([NH]N=C)cccc1', 'c1c(c(ccc1)C(cc)=O)[NH2,$([NH1](c)[!$(C=O)])]',
'C(C#N)(C#N)C([$(C#N),$(C=N)])C#N', 'C(C#N)(C#N)C([NH2])=CC#N', 'C(C#N)(C#N)=N[NH]c1ccccc1',
'[NH2]C1=C(C#N)[CH](cc)C(C#N)=C([NH2])S1', '[#7+](cc)=,:[#6][CH]=CN([CX4])[#6]',
'C(C#N)(C#N)=Cc1ccccc1', 'C1(=C)C(N=CS1)=O', 'C1(C(C=C[!C]1)=C)=[!C]', 'C1(=C)C(=O)NNC1=O',
'[#7]C=!@C2C(=O)c1c(cccc1)[!C]2', 'c1(ccccc1)[CH]=!@C3C(=O)c2c(cccc2)S3', 'c12ccccc1C(=O)C(=C)C2=O',
'C=!@C([!#1])-@C(=!@[!C])-@C(=!@C)[!#1]', '[#6]C(=O)[CH]=C([NH][#6])C(=O)O[#6]',
'[#7]~c1[$(nn),$(n[cH]),$(nc[#7;H,H2]),$(c([#7])n),$(c([#7])[cH]),$(c([#7])c[#7;H,H2])][n,cH,$(c[#7;H,H2])]c([#7H,#7H2,$(O[C;H2,H3])])nn1',
'c12c([cH][cH][cH][cH]1)[cH]c(c([cH]2)[OH])C(=O)[NH]N=C',
'[C;H2,H3]N([C;H2,H3])c1[cH][cH]c([CH]=NN[$(C(=O)[CH2]Scn),$(C(=O)[CH2]aan),$(C(=O)cc[OH]),$(cn),$([CH2][C;H,H2][OH])])[cH][cH]1',
'[#6]N1[CH2][CH2]N([CH2][CH2]1)N=[CH]ca', 'C1C(=[!C;!R])C(=O)[!C]*=1', 'C1(C(=O)NC(=O)NC(=O)1)=N',
'c12ccccc1C(=O)[CX4]C2=O', 'c1ccc3c2c(cccc12)[NH][CX4][NH]3', 'n2(c1acccc1)c([CX4])c[cH]c2[CX4]',
'N1C(=S)S[CX4]C1=O', 'c1c(ccc(c1)[NH]S(=O)=O)[OH]',
'c1([CH2,$(cc)])[cH,$(c[CH2,CH3]),$(cC=O)]sc([nH,$(n[CH,CH2,CH3]),$(n(cc)cc)]1)=[N;!R]',
'c1([$([#7][#6](=O)cc),NH2])c(-!@C(=O)N[C;H2,H3])sc(n1[$([CH2][CH]=[CH2]),$(cc)])=S',
'o1c(sc2cc(ccc12)[#7,#8])=[O,S]', 'S=c1cc[!#6]cc1', '[#6][#6](=S)[#6]',
'[NH2]c1sc([!#1])c([!#1])c1C=O', 'c1cccc2c1cc3c(n2)nc4c(c3[#7])cccc4',
'[NH2]c1c([NH]S(=O)=O)[cH]c([NH][C;H2,H3])c([F,Cl,Br,I])[cH]1',
'c13[cH][cH][cH][cH]c1oc2[cH]c([NH][C;H2,H3])c(O[C;H2,H3])[cH]c23',
'c1ccccc1N3C(=O)C([NH]c2ccc(O[C;H2,H3])cc2)=C([F,Cl,Br,I])C3=O',
'[C;H2,H3]Oc1[cH][cH][cH][cH]c1[NH][CH](C=O)S', '[NH2]c1c([OH])cc(S(=O)(=O)[OH])cc1',
'c2([NH2])c([OH])c(C=O)[cH]c1[cH][cH][cH][cH]c12', 'c13c(C(Nc2c1cccc2)([#6])[#6])ssc3=*',
'c1c(ccc([NH2,$([NH][CX4]),$([N]([CX4])[CX4])])c1)[CX4]c2ccc(cc2)[NH2,$([NH][CX4]),$([N]([CX4])[CX4])]',
'[C;H2,H3]N([C;H2,H3])c1[cH][cH]c([CH]=NN=C([#6])cc)[cH][cH]1',
'[cH]1c([cH]nc2c1[cH][cH][cH][cH]2)[CH2]N3c4c([CH2][CH2]3)[cH][cH][cH][cH]4',
'[cH]2c([NH]C(=S)[NH]c1ccccc1)[cH]c(N([C;H2,H3])[C;H2,H3])[cH][cH]2',
'N3C=C(C=O)C(c1ccc(N([C;H2,H3])[C;H2,H3])cc1)[#6]2=,:[#6]3~[#7]~[#6](~[#16])~[#7]~[#6](~[#7])~2',
'[C;H2,H3]N([C;H2,H3])c1[cH][cH]c(o1)[CH]=CC#N',
'[NH2]c2c(N=C1[CH]=[#6]~[#6]~[#6]=C1)[cH][cH][cH][cH]2',
'[NH2]c1[cH][cH]c([cH][cH]1)c2[cH]c(C=O)c([C;H2,H3])o2',
'c1(O[C;H2,H3])[cH,$(c[C;H2,H3])][cH]c(O[C;H2,H3])c([CH2][$([NH]C(=O)[CH2][CH2][C;H2,H3]),$([CH]([C;H2,H3])[NH]C(=S)[N;H,H2])])[cH]1',
'c2(c([NH2])n(c1c(cccc1)C(=O)[OH])nc2C=O)[$(C#N),$(C=S)]', 'c12ncccc1c([NH2])c(C(=O)~[OX1])s2',
'C3(=O)C(=[CH][NH]c1c(C(=O)[OH])cccc1)N=C(c2ccccc2)O3', 'c1([OH])ccc([NH]C(=O)cc)c(c1)C(=O)[OH]',
'c1(oc([#6])[cH][cH]1)C(=O)[NH]c2[cH][cH][cH][cH]c2C(=O)[OH]',
'c1ccccc1C(=O)[NH]c2ccccc2C(=O)[NH][NH]c3nccs3', 'c1ccc2c(cc1)ccc2', 'c1(c(ccccc1)[N;H,H2])=N[#6]',
'c1(ccc(cc1)c2cc(c3c(c(c2)[OH])coc3)=O)S[C;H2,H3]', 'c13ccccc1sc(=NN=c2cccccc2)n3[C;H2,H3]',
'c23c([Br])cc1c(cco1)c2[cH]cc(=O)o3',
'c12c([F,Cl,Br,I])cc([F,Cl,Br,I])cc1[cH]c(C(=O)[NH2])c(=[NH])o2',
'c1(c(nc2c(c1C#N)c(c(cc2)C#N)[NH2])[NH2])C#N',
'n3c([SX2]c1c([NH2])cccc1)c(C#N)c(c2ccccc2)c(C#N)c3[NH2]', 'C1(C#N)(C#N)[CH](C(=O)[#6])[CH]1',
'C=CC(C#N)(C#N)C(C#N)=C[NH2]', 'ccC(=O)[NH]C(=O)C(C#N)=[CH][NH]cc', 'O=S(=O)C(C#N)=N[N;H,H2]',
'c2(c1ccccc1nnc2)C(cc)C#N', 'C1(C([C;H2,H3])=C(C#N)[#6](~[#8])~[#7]~[#6]1~[#8])=[CH]cc',
'O=c1c(cc(nn1)C=O)C#N', 'n2(c1ccccc1)c(=O)c(C#N)cc(C#N)n2',
'[NH2]C1=C(C#N)[C;H,H2](cc)C([C;H2,H3])=C(C=C)O1',
'[NH2,$([OH])]C2=C(C#N)[CH](cc)c1c(n([#6])nc1)O2', '[NH2]C1=C(C#N)[CH](cc)C(C#N)=C(cc)O1',
'[NH2]C2=C(C#N)[CH](cc)c1c(ccs1)O2', '[C;H2,H3][SX2]C1=C(C#N)[CH](cc)C(C#N)C(=O)N1',
'[NH2]C2=C(C#N)[CH](c1cccs1)C(C(=O)O[#6])=C([C;H2,H3])O2',
'[SX2]1C=C(C#N)C([#6])(C=O)C([$(C=O),$(C#N)])=C1[NH2]', '[NH2]C1=C(C#N)[CH](cc)S[CX4]S1',
'[NH,$([N][CX4])]1[#6]:,=[#6]([#6](=O)ccc)C([#6])C([$(C=O),$(C#N)])=C1[CH3]',
'N2=Nc1n[!#6]nc1N=Ncc2', '[CH]2([OH])c1n[!#6]nc1[CH]([OH])C=C2',
'c1ccccc1N([C;H2,H3])[CH]=[CH]C=!@[CH][CH]=CC=@Nc2ccccc2', 'C2(c1c(cc(c(n1)[NH2])C#N)C(C=2)=C)C#N',
'C(C#N)(C#N)=C(S)S', '[OH]C(=O)c1c([OH])[cH]c([cH][cH]1)c2[cH][cH]c(o2)[CH]=C(C#N)c3nccn3',
'n2([#6])[cH][cH][cH]c2[CH]=C(C#N)c1nccs1',
'C3(=O)C(=[CH][$(c1ccccc1),$(c2ccc[!#6]2)])N=C(aaa)[S,$([N]aa)]3', 'N1=CC(C(N1)=S)=C',
'c1(ccco1)[CH]=!@C3C(=O)c2c(cccc2)[!C]3', 'S=c1[nH]ccc2c1C(=O)OC2=[C;H,H2]',
'[#6]1[O,S]C(C([#6]=,:1)=O)=CC=O', 'C2(=O)C(=[CH]c1cc([F,Cl,Br,I])ccc1O[C;H2,H3])N=C(S[C;H2,H3])S2',
'[cH]2[cH][cH][cH]c1[CH2]C(=O)C(=C([C;H2,H3])[C;H2,H3])c12', 'C1C(OC(C=1)(O)[#6])O',
'c12[cH]c(O[C;H2,H3])c(O[C;H2,H3])[cH]c1C([#6])=C([#6])S[CH2]2',
'c2(O[C;H2,H3])c(O[C;H2,H3])[cH]c1C=C[CH]Sc1[cH]2', 'C1(=O)C(C(C#N)=[CH][#7])C([#7])C=C1',
'c23ccc1ccccc1c2[C;H,H2][CX4]NC3=[CH]C(=O)N([C;H2,H3])[C;H2,H3]', 'O=CC1=C(SC(=[CH][#6])S1)C=O',
'C1(C(=O)[CH2]C[CH2]C(=O)1)=C([N;H,H2])C=O', 'C#CC(=O)C#C',
'aa[CH,$(CC#N)]=C1[#6]:,=[#6]C(=[O,$([N;!R])])[#6]:,=[#6]1',
'S1C(=Ncc)[NH,NH2,$(N[CH2][CH2]O),$(Ncc)]C(=O)C1=[CH][$(ccc[Cl]),$(c[!#6])]',
'S1C(=O)NC(=S)C1=[CH]cc', 'n2ccc([CH]=C1C(=O)NC(=[!C])N1)c2[C;H2,H3]',
'[OH]C(=O)c1cccc(c1)cac[CH]=C2C(=[!C])NC(=[!C])[!C]2', 'n12c(nc(=O)c([C;H2,H3])n1)sc(=[CH]cc)c2=O',
'N2([C;H2,H3])C(=S)[NH]C(=[CH]c1cc(Br)ccc1)C2=O',
'[C;H2,H3]N2C(=[S,N])[!C]C(=C1[CH]=[CH]ccN1[C;H2,H3])C2=O',
'c1ccccc1C(=O)[CH]=c3c(=O)[nH]c(=O)c(=[CH]c2ccccc2)[nH]3', 'O=C1cc[CH2]NC1=[C;H,H2]',
'c1(oc([C;H2,H3])[cH][cH]1)[CH]([OH])C#C[CX4]', 'ccn2nc1[CH2][S;X2][CH2]c1c2[NH]C(=O)[CH]=[C;H,H2]',
'[cH]1[cH]n([C;H2,H3])c3c1c2[cH][cH]n(c2c(c3O[C;H2,H3])O[C;H2,H3])[C;H2,H3]',
'C2(=O)C(=C([C;H2,H3])[NH][CH2][CH2][C;H2,H3])N=C(c1ccccc1)O2',
'n4(c1ccccc1)[cH][n+](c2ccccc2)c(=Nc3ccccc3)n4', 'n1nc(c2-c(c1C)ccc2)C',
's1c2c(c([C;H2,H3])c1[C;H2,H3])c(ncn2)NN=Cc3ccco3', 'C2([#6])[CH2]C(=O)n1nc([NH2])c([NH2])c1N=2',
'c2(c1n(c(ccn1)=O)nc2c3nccc3)C#N', 'n2nnnc1cccc-12',
'c2([CH2][C;H2,H3])c([C;H2,H3])c1c(=[X1])n([C;H,H2][$(C(=O)O),$(cc)])[cH,$(cS[C;H2,H3])]nc1[a;X2]2',
'c1ccc2c(c1)nc3c(n2)ccc4c3cccc4', 'n1c3c(c(=N)c2ccccc12)cccc3',
'c23cc1ccccc1cc2n([C;H2,H3])c(=O)c(cc[NH][C;H2,H3])n3',
'c12acccc1nc([CH]=C([OH])[#6])c([CH]=C([OH])[#6])n2',
'c1ccc2c(c1)nc(c(n2)[CH2]C(=O)cc)[CH2]C(=O)cc', 'c1ccc2c(c1)nc(c(n2)c3ccccc3)c4c(cccc4)[OH]',
'[C;H2,H3]Oc1[cH][cH][cH][cH]c1[NH]c3c2c(O[C;H2,H3])ccc(O[C;H2,H3])c2ncc3',
'N=c1[nH]c([N;H,H2,H3])c([N;H,H2])nn1', '[NH](c1[cH][cH][cH][cH]c1[OH])c2nnc(=N)oc2[NH2]',
's1[cH][cH][cH]c1C4[NH]N=C(c2ccncc2)c3c(cccc3)N=4',
'C([F])([F])C(=O)[NH]c1[cH]n([CH2][CH2]O[CH2]cc)n[cH]1', 'c12cccca1c([!#1])a[n+](~cc)a2',
'[#6]13~[#7](cc)~[#6]~[#6]~[#6]~[#6]~1~[#6]2~[#7]~[#6]~[#6]~[#6]~[#7+]~2~[#7]~3',
'C1(C=O)(cc)[SX2]C=N[NH]1', 'S=c2[nH]nc(c1[cH][cH]c(O[C;H2,H3])[cH][cH]1)o2', 'N=C1SC(=N)N=C1',
'N1([C;H2,H3])C(=S)N(cc)C(=Ncc)C1=Ncc', 'S1C(=N[NH])SC(=Ncc)C1=Ncc',
'c1ccccc1C4=Nn3c2ccccc2[n+]c3S[CX4]4', 'n2c1ccccc1n([C;H2,H3])c2S[CH2]C(=O)[NH]N=[CH][CH]=[C;H,H2]',
'c12ccccc1[CH2][CH2]N=C2[SX2][CH2]C(=O)c3ccccc3',
'c1ccc3c(c1)Sc2[cH][cH][cH,$(cO),$(c[SX2]),$(c[CX4]),$(c[NH2,$([NH][CX4]),$([N]([CX4])[CX4])])]([cH]c2C(C3)[NH2,$([NH][CX4]),$([N]([CX4])[CX4])])',
'[C;H2,H3][SX2]c2nc1OC=Nccc1nn2', '[C;H2,H3][SX2]c3nc(c1o[cH][cH][cH]1)c(c2o[cH][cH][cH]2)nn3',
'[C;H,H2,H3]C3=NN(c1ccccc1)S2[!C]*=CC=23', 'N=c1ncns1', 'cc[NH]C(=O)c1nnsc1[NH]cc', 'n2sc1CNccc1c2=S',
'C1(SC(=[CH]cc)N(cc)N=1)C=O', 'c1([OH])ccc([OH])c(c1)C(=!@C[#7])C=O',
'c14[cH][cH]c(O[C;H2,H3])[cH]c1C(=N[NH]c2[cH][cH]c(C(=O)[OH])[cH][cH]2)c3[cH]c(O[C;H2,H3])[cH][cH]c34',
'[OH]C(=O)c1ccc(cc1)NN=[CH]c2ac([cH][cH]2)c3ccccc3',
'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)C(=O)[NH]N=[CH,$(C[C;H2,H3])]c3cc(***c2occc2)ccc3',
'c1cccc2c1cc4c(c2C=N[NH]c3ccccc3)cccc4',
'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]=N[NH]c2nccs2',
'c1(o[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]=N[NH]c2ccncc2',
'c1ccccc1N(c2ccccc2)N=[CH]c3ac([cH][cH]3)c4cc(C(=O)[OH])ccc4',
'[OH]C(=O)c1cc(ccc1)cacC=N[NH]C(=O)[CH2]O',
'C(c1[cH]c[cH,$(c[CX4])][cH][cH]1)(c2[cH][cH][cH,$(c[Cl])][cH][cH]2)=[$(NO[CH2][CH2][CH2]N([C;H2,H3])[C;H2,H3]),$(NO[CH2][CH2]N([C;H2,H3])[C;H2,H3]),$(N[NH]C(=[NH])[NH2]),$([CH][#7])]',
'c12[cH][cH][cH][cH]c1[!#6][cH,$(c[OH]),$(c[C;H2,H3])]c2[CH]=N[NH][$(c3nc[cH]s3),$(c[cH][cH]),$(cncncn),$(cnnnn)]',
'c1(s[cH][cH][cH,$(c[C;H2,H3])]1)[CH]=N[NH]c2ccccc2',
'[CH]4(n1[cH]n[cH][cH]1)c2[cH]c([Br])[cH][cH]c2[CH2][CH2]c3[cH][cH][cH][cH]c34',
'n3c(c1ccccc1)c(c2ccccc2)n(N=!@C)c3[NH2]', 'cc[CH]=[CH][CH]=NN([CX4])[CX4]',
'C1(C=Nc2c(N1)cccc2)=[CH]C=O', 'c1cC(=O)C=C1N=[CH]N([CX4])[CX4]', 'c1ccc2c(c1)C(C=N2)=[N;!R]',
'cc[CH]=[CH][CH]=NN=C', 'N([C;H2,H3])([C;H2,H3])[CH]=NC([C;H2,H3])=NN([C;H2,H3])cc',
'c12c([cH][cH][cH][cH]1)c(c([cH][cH]2)C(=Ncc)[C;H2,H3])[OH]',
'[NH](c1ccccc1)N=C(C(=O)[C;H2,H3])[NH][NH,NH2,NH3,$(cc)]', '[N;!R]=C2C(=O)c1c(cccc1)S2',
'cc[N;!R]=C2C(=[!C])c1c(cccc1)N2', 'C1(=[!C])C(N=CS1)=O', 'C=[N;!R]c1c([OH])cccc1',
'[CX4]1C(=O)NNC1=O', 'O=CC=[CH][OH]', '[C;H2,H3]C([OH])=C(C(=O)[C;H2,H3])[CH]C#C',
'cc[NH]N=C([C;H2,H3])[CH2]C([C;H2,H3])=N[NH]cc', 'c1cccc2c1C(c3c2nacc3)=O',
'c1cccc2c1C(C3C2=N*=CC3)=O', 'c1(cc3c(c2ccccc12)c4c(C3=O)cccc4)[OH]',
'c1c(cccc1)C(=O)[NH]N=C3c2c(cccc2)c4c3cccc4', 'c12c(O[CH2]N(ccO[C;H2,H3])[CH2]1)[cH]cc[cH]2',
'c1([NH]C(=O)[CH2][CH2]cc)[cH][cH]c([C;H2,H3])c([NH]C(=O)[CH2][CH2]cc)[cH]1',
'[N;H,H2](c1[cH][cH]c(O[CH3])c(O[C;H,H2,H3])[cH]1)C(=O)[NH][CH2][CH2][CH2]N([CH3])cc',
'c1c3c(cc2c1cccc2)nc(n3)COC(=O)c4cc(cc(c4)[NH2])[NH2]',
'n2(c1[cH][cH][cH][cH]c1C(=O)[NH][CH]([C;H2,H3])[CH2]Occ)[cH][cH][cH][cH]2',
'C1(cc)[CH2][CH](C(=O)[#6])[CH](C(=O)[OH])[CH2]C(cc)=1', 'C(cc)(cc)(cc)SccC(=O)[OH]',
'c1(C([CX4])([CX4])[NH]C(=O)N([CH2][C;H2,H3])[CH2][CH2][C;H,H2][CH2]cc)[cH][cH][cH]c(C(=[CH2])[C;H2,H3])[cH]1',
'c1cc3c2c(c1)cccc2C(=CC3=O)O[C;H2,H3]', 'c13ccc(c2c1c(ccc2)C(C=C3C([F])([F])[F])=O)[#7]',
'c1(ccc3c2c(cccc12)C(C=C3)=N)[#7]', 'C(c1ccc([OH])cc1)(c2ccc([OH])cc2)OS(=O)=O',
'n12cccc2C=N([#6])CC1', 'n2([C;H2,H3])c1c(ccC(=O)1)cc2[C;H2,H3]',
'c1(c2c(c(n1C(O)=O)[C;H2,H3])S[CH2]S2)[C;H2,H3]', 'n2(c1ccccc1)[cH][cH][cH]c2C=N[OH]',
'c1ccc2c(c1)c4c3c(C2=O)cccc3no4', 'O=C2c1ccccc1c3c([OH])c(=O)nc4cccc2c34',
'C1([#6]:,=[#6][#6]:,=[#6]C1=[!C])=[!C]', 'N2(c1ccccc1)C(=NC=O)S[CH2]C2=O',
'N4(c1ccccc1)C(=O)S[CH]([NH]c3c2ccccc2ccc3)C4=O', 'N2(c1ccccc1)C(=O)S[CH2]C2=S',
'N3(C(=O)c1ccccc1)C(=Nc2ccccc2)S[CH2]C3=O', 'O=C4CCC3C2C(=O)CC1CCCC1C2CCC3=C4',
'C2Cc1ccccc1C(c3c2cccc3)=C[#6]', 'c1ccc3c(c1)[SX2,CX4]c2cc[cH,$(c[Cl]),$(c[CX4])]cc2C3=C[#6]',
'c1ccc2c(c1)C(c3c(SC2)scc3)=C', 'c1ccc2c(c1)C(c3c2ccc(c3)[NH2])=[CH][#6]',
'[CH3]c2nc([NH]S(c1[cH][cH]c([cH][cH]1)O[CH2][CH2][C;H2,H3])(=O)=O)[cH][cH][cH]2',
'c1([cH][cH]c([cH][cH]1)[NH2])S(=O)(=O)[NH]c2[cH][cH][cH]nn2',
'[CH3]C([CH3])([CH3])c1[cH]c(C([CH3])([CH3])[CH3])c(O[C;H,H2]N)c[cH]1',
'a1aaa(aa1)[CH]=[CH]C([NH][NH]c2n(nnn2)[#6])=O', 'c1([F,Cl,Br,I,$([n+](c)c)])c(-!@C=N)sc(n1)=O',
'c1(S[C;R])c([CH]([#6])[#6])sc([nH,$(n[C;H2,H3])]1)=O',
's2c([NH]c1c([CH2,CH3,$(cc)])cccc1)[n+]([C;H2,H3])c([#6])[cH]2', 'c1nc(sc1)NNS(=O)=O',
'c1c(cc(C(=O)[OH])cc1)[NH]c2nc([cH]s2)c3ccc([CH]([C;H,H2,H3])[C;H,H2,H3])cc3',
'[C;H2,H3][NH]C=N[NH]c1nc(cc)[cH]s1', 'O=S(=O)(cc)[NH][NH]c1nc(cs1)cc',
'n1c3c(cc2c1nc(s2)[#7])sc(n3)[#7]', 's2ccc(c1csc([NH2])n1)[cH]2',
'c2([NH]cc[C;H2,H3])nc(c1ccncc1)[cH]s2',
'[C;H2,H3]Oc1[cH][cH]c(O[C;H2,H3])[cH]c1[NH]c3scc(c2ccc(O[C;H2,H3])cc2)n3', '[#6][CH](=S)',
'[cH]1coc([cH]1)C(=S)N2[CH2][CH2]*[CH2][CH2]2', '[CX4][NH]C(cc)=[CH]C(=S)[NH]c1ccccc1',
'[CH](c1ccccc1)(c2ccccc2)C(=S)[N;H,H2]', 'c(N([C;H,H2,H3])[C;H,H2,H3])c[NH]C(=S)[C;H,H2,H3]',
'c1cccnc1C(=S)[NH]c2ccccc2O[C;H2,H3]', 'acC(=S)[NH][NH]ca', '[C;H2,H3]SC(=S)[NH][CH2]cc',
'C1[CX4]SC(NC=1)=S', 'O=c2sc1cccc(O[C;H2,H3])c1o2', '[NH]1C(=S)[CH](C#N)[CH](cc)[CH]=C1cc',
'S=CC([C;H2,H3])=C([C;H2,H3])N([C;H2,H3])[C;H2,H3]', 'C1=CN(C(c2ccccc12)(C#N)C(=S)S)C=O',
'c1c(sc(cc1)=S)[#7]', 'ccC(=[SX1])[SX2][C;H,H2][CH2,CH3,$(cc)]', 'n1c(=O)[cH]c([#6])sc1=S',
'c1ccccc1N3C(=O)C(Sc2ccccc2)=[CH]C(=O)3', '[CX4][N+]([CX4][OH])=CS[C;H,H2,H3]', 'S=Cc2n1ccccc1cc2',
'c2(nanc(c1o[cH][cH][cH]1)n2)S[CX4]', 'C(=O)(N1CCSCC1)c2c([cH][cH][cH][cH]2)S[C;H2,H3]',
'c2c([NH]C(=S)[NH][CH2][CH2][CH2]N([C;H2,H3])c1ccccc1)cccc2',
'c2c([NH]C(=S)[NH][CH2][CH2]N([C;H2,H3])c1ccccc1)cccc2',
'c1(ccccc1)[NH]C(=S)N[NH]C(=O)c2a[!#6]cc2', 'c1(ccccc1)[NH]C(=S)N[NH]c2ccccc2',
'C1[NH][NH]C(=S)N[NH]1', 'c1(ccccc1)[NH]C(=S)N[NH][#6](=,:[#7;R])[#7;R]',
'c1(ccccc1)[NH]C(=S)[NH]N=Cc2ccnc2', 'c1(ccoc1[C;H,H2,H3])C=N[NH]C(=S)[N;H,H2]',
'c2(=S)n1ccnnc1n[nH]2', '[CX4][SX2]C(=N-aaaa)[NH]N=C',
'ccN([C;H2,H3])[CH2][CH2][CH2][NH]C(=S)[NH]c1c([F,Cl,Br,I])[cH]c([C;H2,H3])[cH][cH]1',
'[cH]3c([NH]C(=S)[NH][C;H,H2]c1oc([C;H,H2,H3])[cH][cH]1)[cH]c2c(O[CH2]O2)[cH]3',
'O=C-!@n2ccc1c2[nH]c(=S)[nH]1',
'c12c([cH][cH][cH][cH]1)c([cH][cH][cH]2)C([C;H2,H3])=N[NH]C(=S)[NH]ccc',
'c1(ccccc1)[#7;H][#6](=S)[#7][#7;H][#6;H]=,:[#6;H][#6]=O',
'[C;H2,H3]N([C;H2,H3])[CH]=CC(=O)c1c([SX2])sc([$(C#N),$(C=O)])c1',
's1[cH][cH]c(O[C;H2,H3])c1C(=O)[NH][N;H,H2]',
'c1(s[cH,$(c[C;H2,H3])][cH][cH]1)[CH,$(C[C;H2,H3])]C(=O)[NH]c2nccs2',
'c1csc(c1[NH2])[CH]=[CH]c2cccs2', '[NH2]c3sc([NH]C(=O)c1ccccc1)c(C#N)c3c2aaaaa2'
] # '([#7+1!r].[#7+1!r].[#7+1!r])'
'''l = []
lines = open('/export/home/jyy/gbp_pocket_flow_with_edge/pocket_flow/utils/alert_struct.csv').read().split('\n')
for line in lines[2:]:
if ',"' in line:
l.append(line.split(',"')[-1].strip('"'))
else:
l.append(line.split(',')[-1])'''
HAT1_REF_MOL = [
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=NC=CO4)C=C3)=O)=C2)CCC1C5=CC=CC=C5',
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(Br)C=C3)=O)=C2)CCC1C4=CC=CC=C4',
'CN1C2=C(C=CC(S(=O)(NC3=C(C=CC=C4)C4=C(Br)C=N3)=O)=C2)CCC1C5=CC=CC=C5',
'O=S(C1=CC2=C(C=C1)C=CC(C3=CC=CC=C3)=N2)(NC4=CC=C(Br)C=C4)=O',
'O=S(C1=CC2=C(C(F)=C1)C=CC(C3=CC=CC=C3)=N2)(NC4=CC=C(Br)C=C4)=O',
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=COC=N4)C=C3)=O)=C2)CCC1C5=CC=C(Cl)C=C5',
'CN1C2=C(C=CC(S(=O)(NC3=CC=C(C4=CSC=N4)C=C3)=O)=C2)CCC1C5=CC=C(Cl)C=C5',
'BrC1=C(O)C=CC(/C=C2CCC/C(C\2=O)=C\C3=CC=C(O)C(Br)=C3)=C1',
'O=S(C1=CC2=C(NC(C3C2C=CC3)C4=C(C=CC=C4)F)C=C1)(NC5=CC=C(C(O)=O)C=C5)=O',
'BrC1=C(O)C=CC(/C=C2CC(CC)C/C(C\2=O)=C\C3=CC=C(O)C(Br)=C3)=C1'
'O=C1/C(CC/C1=C\C2=CC(Br)=C(O)C=C2)=C/C3=CC=C(O)C(Br)=C3',
'O=C(/C=C/C1=CC(Br)=C(O)C=C1)/C=C/C2=CC=C(O)C(Br)=C2'
]
def check_alert_struct(mol):
Chem.GetSSSR(mol)
for alert_smarts in ALERT_STRUCTURES:
p = Chem.MolFromSmarts(alert_smarts)
if mol.HasSubstructMatch(p):
return False
else:
return True
def check_lipinski(mol):
mw = Descriptors.MolWt(mol)
logp = Crippen.MolLogP(mol)
#tpsa = Descriptors.TPSA(mol) # tpsa<=140
hbd = Lipinski.NumHDonors(mol)
hba = Lipinski.NumHAcceptors(mol)
#rb = Descriptors.NumRotatableBonds(mol)
if sum([mw<=700, logp<=5, hba<=10, hbd<=5]) != 4:
return False
else:
return True
class Filter(object):
def __init__(self, sa_threshhold=5.0, qed_threshhold=0.3, mw_threshhold=300):
self.sa_threshhold = sa_threshhold
self.qed_threshhold = qed_threshhold
self.mw_threshhold = mw_threshhold
def _filter(self, mol):
try:
qed_check = QED.qed(mol) > self.qed_threshhold
sa_check = sascorer.calculateScore(mol) < self.sa_threshhold
mw_check = Descriptors.MolWt(mol) > self.mw_threshhold
lip_check = check_lipinski(mol)
if qed_check and sa_check and mw_check and lip_check:
struct_check = check_alert_struct(mol)
if struct_check:
return (Chem.MolToSmiles(mol), mol)
else:
return False
else:
return False
except:
return False
def run(self, mol_list, save_name='Filter_Result.sdf', mol_name=None):
num_mol = 0
smi_dict = {}
#filter(lambda x:x!=False, filter_list)
for ix, m in tqdm(enumerate(mol_list)):
m_ = Chem.MolFromSmiles(Chem.MolToSmiles(m))
out = self._filter(m_)
if out != False and out[0] not in smi_dict:
smi_dict.setdefault(out[0])
with open(save_name, 'a') as sdf_writer:
#mol = out[1]
if mol_name:
m.SetProp('_Name', mol_name)
mol_block = Chem.MolToMolBlock(m)
sdf_writer.write(mol_block + '\n$$$$\n')
num_mol += 1
def run_file_list(self, sdf_list, save_name='Filter_Result.sdf'):
num_mol = 0
smi_dict = {}
for sdf in sdf_list:
supp = Chem.SDMolSupplier(sdf)
for mol in supp:
if mol is None: continue
mol_ = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
out = self._filter(mol_)
if out != False and out[0] not in smi_dict:
mol_name = 'No_{}-{}'.format(num_mol, sdf)
smi_dict.setdefault(out[0])
with open(save_name, 'a') as sdf_writer:
#mol = out[1]
mol.SetProp('_Name', mol_name)
mol_block = Chem.MolToMolBlock(mol)
sdf_writer.write(mol_block + '\n$$$$\n')
num_mol += 1
return num_mol
def np_run(self, mol_list, save_name='Filter_Result.sdf', n_process=10):
smi_dict = {}
pool = Pool(processes=n_process)
filter_list = pool.map(self._filter, mol_list)
#filter(lambda x:x!=False, filter_list)
for ix, i in enumerate(filter_list):
if i:
if i[0] not in smi_dict:
smi_dict.setdefault(i[0])
with open(save_name, 'a') as sdf_writer:
sdf_writer.write(i[1] + '\n$$$$\n')
from PIL import Image
import matplotlib.pyplot as plt
from .train import verify_dir_exists
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Draw
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import DrawingOptions
import os
def draw_docked_mol_list(mol_list, molsPerRow=5, subImgSize=(300,300)):
opts = DrawingOptions()
opts.atomLabelFontSize = 30
opts.bondLineWidth = 1.5
opts.colorBonds = False
legends = []
for m in mol_list:
AllChem.Compute2DCoords(m)
docking_score = 'Docking Score: {:.3f}'.format(float(m.GetProp('r_i_docking_score')))
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(m))
legend = mw + '\n' + docking_score
legends.append(legend)
img = Draw.MolsToGridImage(
mol_list,
molsPerRow=molsPerRow,
subImgSize=subImgSize,
legends=legends
)
return img
def DrawDockedMol(mol, # rdkit.Molecule的instance
save=None,
size=(300,300),
bondLineWidth=5, # 化学键显示的宽度
FontSize=3,
fixedBondLength=50, # 化学键显示的长度
legend=None,
legendFontSize=5,
elemColor=True): # 是否按元素给原子着色
if legend is None:
try:
docking_score = 'Docking Score: {:.3f}'.format(float(mol.GetProp('r_i_docking_score')))
mw = 'MolWt: {:.3f}'.format(Descriptors.MolWt(mol))
legend = mw + '\n' + docking_score
except:
pass
draw = rdMolDraw2D.MolDraw2DCairo(size[0], size[1])
option = rdMolDraw2D.MolDrawOptions()
option.bondLineWidth = bondLineWidth
option.fixedBondLength = fixedBondLength
option.setHighlightColour((0.95,0.7,0.95))
option.baseFontSize = FontSize
option.legendFontSize = legendFontSize
if elemColor == False:
option.updateAtomPalette({k: (0, 0, 0) for k in DrawingOptions.elemDict.keys()})
draw.SetDrawOptions(option)
AllChem.Compute2DCoords(mol)
rdMolDraw2D.PrepareAndDrawMolecule(draw, mol, legend=legend)
draw.FinishDrawing()
if save:
if '.png' in save:
n = 0
path = '/'.join(save.split('/')[0:-1])
while os.path.exists(save):
n += 1
name = save.split('/')[-1].split('.')[0]
name = name + '_' + str(n) + '.png'
save = path + '/' + name
else:
draw.WriteDrawingText(save)
else:
return draw
def CombineImages(img_file_list, col_num=7, save_dir='./', title='merged_image.png', img_size=None):
num_img = len(img_file_list)
num_row = num_img//col_num+1 if num_img%col_num != 0 else num_img//col_num
if img_size is None:
img_size = Image.open(img_file_list[0]).size
toImage = Image.new('RGB', (img_size[1]*col_num, img_size[0]*num_row), color=(255,255,255))
x_cusum = 0
y_cusum = 0
num_has_paste = 0
for img_file in img_file_list:
img = Image.open(img_file)
toImage.paste(img, (x_cusum, y_cusum))
num_has_paste += 1
if num_has_paste%col_num==0:
x_cusum = 0
y_cusum += img_size[1]
else:
x_cusum += img_size[0]
verify_dir_exists(save_dir)
toImage.save(save_dir+'/'+title)
plt.xticks([])
plt.yticks([])
plt.axis('off') # 取消所有边框,坐标轴
plt.imshow(toImage)
plt.clf() # 清除当前图形。如果不清除当使用循环大量作图机器会为plt分配越来越多的内存速度会逐渐变慢。

View File

@@ -1,218 +0,0 @@
from weakref import ref
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import QED, Descriptors, Draw
from rdkit import DataStructs
from . import sascorer
import xlsxwriter
from easydict import EasyDict
from io import BytesIO
import glob
SIMILARITY_METRIC = ['DataStructs.TanimotoSimilarity',
'DataStructs.DiceSimilarity',
'DataStructs.CosineSimilarity',
'DataStructs.SokalSimilarity',
'DataStructs.RusselSimilarity',
'DataStructs.KulczynskiSimilarity',
'DataStructs.McConnaugheySimilarity']
class ProcessSmilesFile(object):
def __init__(self, sdf_file):# refer_mol=None):
suppl = Chem.SDMolSupplier(sdf_file)
self.smiles = {}
#self.refer_mol = refer_mol
for idx, mol in enumerate(suppl):
if mol is None:
continue
smi = Chem.MolToSmiles(mol)
self.smiles[smi] = {}
self.smiles[smi]['mol'] = mol
self.smiles[smi]['index'] = idx
self.smiles[smi]['name'] = mol.GetProp('_Name')
self.smiles[smi]['sdf'] = sdf_file
def _QedScore(self,):
for smi in self.smiles:
#mol = Chem.MolFromSmiles(smi)
#print(self.smiles[smi])
self.smiles[smi]['QED'] = QED.qed(self.smiles[smi]['mol'])
def _SAScore(self,):
for smi in self.smiles:
#mol = Chem.MolFromSmiles(smi)
try:
self.smiles[smi]['SA'] = sascorer.calculateScore(self.smiles[smi]['mol'])
except:
self.smiles[smi]['SA'] = 10.0
def _Lipinski(self,):
for smi in self.smiles:
m = self.smiles[smi]['mol']
#wt = sum([a.GetMass() for a in m.GetAtoms()])
#self.smiles[smi]['MolWt'] = wt
self.smiles[smi]['MolWt'] = Descriptors.MolWt(m)
self.smiles[smi]['LogP'] = Descriptors.MolLogP(m)
self.smiles[smi]['TPSA'] = Descriptors.TPSA(m)
self.smiles[smi]['HBD'] = Descriptors.NumHDonors(m)
self.smiles[smi]['HBA'] = Descriptors.NumHAcceptors(m)
self.smiles[smi]['RB'] = Descriptors.NumRotatableBonds(m)
self.smiles[smi]['Lipinski'] = [self.smiles[smi]['MolWt'],
self.smiles[smi]['LogP'],
self.smiles[smi]['TPSA'],
self.smiles[smi]['HBD'],
self.smiles[smi]['HBA'],
self.smiles[smi]['RB']]
def _Similarity(self, refer_mol, metric='DataStructs.TanimotoSimilarity'):
metric = eval(metric)
refer_fp = AllChem.GetMorganFingerprint(refer_mol, 2)
for smi in self.smiles:
fp = AllChem.GetMorganFingerprint(self.smiles[smi]['mol'],2)
'''similarity = DataStructs.FingerprintSimilarity(
fp, refer_fp, metric=metric
)'''
similarity = metric(fp, refer_fp)
self.smiles[smi]['similarity'] = similarity
def Filter(self, molwt=200, lp_rule=False, refer_mol=None,
metric='DataStructs.TanimotoSimilarity', similarity=0.5):
self._QedScore()
self._SAScore()
self._Lipinski()
if refer_mol:
self._Similarity(refer_mol, metric=metric)
#smiles = EasyDict(self.smiles)
out = {}
for s in self.smiles:
smi = EasyDict(self.smiles[s])
if smi.MolWt < molwt:
continue
if lp_rule:
l = [smi.MolWt<500, smi.LogP<5, smi.RB<10,
smi.TPSA<140, smi.HBA<10, smi.HBD<5]
#lenth = len(set(l))
if sum(l) != 6:
continue
if refer_mol:
if smi.similarity < similarity:
continue
else:
smi.similarity = '--'
out[s] = smi
return out
def Mol2Excel(mol_items, work_book=None, name='result.xlsx', worksheet_name='worksheet'):
#add_worksheet_name = path.split('/')[-2].split('_')[0].split('sample-')[1]
if work_book is None:
workbook = xlsxwriter.Workbook(name)
else:
workbook = work_book
worksheet = workbook.add_worksheet(worksheet_name)
header = ['Image', 'SMILES', 'QED Score', 'SA Score', 'Similarity',
'MolWt', 'LogP', 'TPSA', 'HBD', 'HBA', 'RB', 'sdf file',
'index']
header_style = {'bold':1,'valign':'vcenter','align':'center','top':2,
'left':2,'right':2,'bottom':2}
HeaderStyle = workbook.add_format(header_style)
item_style = {'align':'center','valign': 'vcenter','top':2,'left':2,
'right':2,'bottom':2,'text_wrap':1}
ItemStyle = workbook.add_format(item_style)
worksheet.set_column('A:A', 39)
worksheet.set_column('B:B', 39)
worksheet.set_column('C:C', 10)
worksheet.set_column('D:D', 10)
worksheet.set_column('E:E', 10)
worksheet.set_column('F:F', 10)
worksheet.set_column('G:G', 10)
worksheet.set_column('H:H', 10)
worksheet.set_column('I:I', 10)
worksheet.set_column('J:J', 10)
worksheet.set_column('K:K', 10)
worksheet.set_column('L:L', 30)
worksheet.set_column('M:M', 10)
for ix_, i in enumerate(header):
worksheet.write(0, ix_, i, HeaderStyle)
for ix, items in enumerate(mol_items):
smi = items[-2]
info = EasyDict(items[-1])
img_data = BytesIO()
AllChem.Compute2DCoords(info.mol)
img = Draw.MolToImage(info.mol)
img.save(img_data, format='PNG')
worksheet.set_row(ix+1, 185)
worksheet.insert_image(ix+1, 0, 'f', {'x_scale': 0.9, 'y_scale': 0.8, 'image_data':img_data, 'positioning':1})
worksheet.write(ix+1, 1, smi, ItemStyle)
worksheet.write(ix+1, 2, info.QED, ItemStyle)
worksheet.write(ix+1, 3, info.SA, ItemStyle)
worksheet.write(ix+1, 4, info.similarity, ItemStyle)
worksheet.write(ix+1, 5, info.MolWt, ItemStyle)
worksheet.write(ix+1, 6, info.LogP, ItemStyle)
worksheet.write(ix+1, 7, info.TPSA, ItemStyle)
worksheet.write(ix+1, 8, info.HBD, ItemStyle)
worksheet.write(ix+1, 9, info.HBA, ItemStyle)
worksheet.write(ix+1, 10, info.RB, ItemStyle)
worksheet.write(ix+1, 11, info.sdf, ItemStyle)
worksheet.write(ix+1, 12, info.index, ItemStyle)
if work_book is None:
workbook.close()
def WriteExcel(result_dir_list, name='result.xlsx', molwt=200, lp_rule=False, sorted_idx=0, refer_mol=None):
'''
sorted_ix: 用于排序的属性的index, 0:SA; 1:QED; 2:SA*QED
'''
#创建一个工作簿并添加一张工作表,工作表是可以命名的
#workbook = xlsxwriter.Workbook(name)
for path in result_dir_list:
mol_dict = ProcessSmilesFile(path).Filter(
molwt=molwt, lp_rule=lp_rule, refer_mol=refer_mol,
metric='DataStructs.TanimotoSimilarity', similarity=0.2
)
#print(mol_dict)
#L = [(mol_dict[i]['SA'], mol_dict[i]['QED'], mol_dict[i]['SA']*mol_dict[i]['QED'], i, mol_dict[i]) for i in mol_dict]
L = [(mol_dict[i]['SA'], mol_dict[i]['QED'], mol_dict[i]['similarity'], i, mol_dict[i]) for i in mol_dict]
if sorted_idx==0:
L = sorted(L, reverse=False, key=lambda x: x[sorted_idx])
else:
L = sorted(L, reverse=True, key=lambda x: x[sorted_idx])
#print(L)
#add_worksheet_name = path.split('/')[-2]
#worksheet = workbook.add_worksheet(add_worksheet_name)
Mol2Excel(L, name=name)#, worksheet_name=add_worksheet_name)
#workbook.close()
def WriteExcelAll(result_dir_list, name='result.xlsx', molwt=200, lp_rule=False,
sorted_ix=0, refer_mol=None, similarity=0.2):
all_mol = {}
for path in result_dir_list:
mol_dict = ProcessSmilesFile(path).Filter(
molwt=molwt, lp_rule=lp_rule, refer_mol=refer_mol,
metric='DataStructs.TanimotoSimilarity', similarity=similarity
)
for smi in mol_dict:
if smi not in all_mol:
all_mol[smi] = mol_dict[smi]
L = [(all_mol[i]['SA'], all_mol[i]['QED'], all_mol[i]['similarity'], i, all_mol[i]) for i in all_mol]
if sorted_ix==0:
L = sorted(L, reverse=False, key=lambda x: x[sorted_ix])
else:
L = sorted(L, reverse=True, key=lambda x: x[sorted_ix])
Mol2Excel(L, name=name)
'''refmol_1 = Chem.MolFromSmiles('O=C(/C=C/C1=CC=CC=C1)/C=C/C2=CC=CC=C2')
refmol_2 = Chem.MolFromSmiles('O=C(/C(CCC/1)=C/C2=CC=CC=C2)C1=C\C3=CC=CC=C3')
refmol_3 = Chem.MolFromSmiles('O=S(C1=CC2=NC(C3=CC=CC=C3)=CC=C2C=C1)(NC4=CC=CC=C4)=O')
path_list = [i for i in glob.glob('./outputs/HAT1_2022_08_25*/SMILES.txt') if 'sample-1' not in i]
path_list += ['HAT1_2022_08_26__18_59_39/SMILES,txt']
print(path_list)
WriteExcelAll(path_list, molwt=100, name='HAT1_Wt_more_than_100.xlsx', sorted_ix=0, refer_mol=None)'''

View File

@@ -1,166 +0,0 @@
import os
import glob
import numpy as np
from multiprocessing import Pool
from .ParseFile import Protein, parse_sdf_to_dict, Ligand
from .residues_base import RESIDUES_TOPO
def verify_dir_exists(dirname):
if os.path.isdir(os.path.dirname(dirname)) == False:
os.makedirs(os.path.dirname(dirname))
def ComputeDistMat(m1, m2):
#m1_square = np.sum(m1*m1, axis=1, keepdims=True)
m1_square = np.expand_dims(np.einsum('ij,ij->i', m1, m1), axis=1)
if m1 is m2:
m2_square = m1_square.T
else:
#m2_square = np.sum(m2*m2, axis=1, keepdims=True).T
m2_square = np.expand_dims(np.einsum('ij,ij->i', m2, m2), axis=0)
dist_mat = m1_square + m2_square - np.dot(m1, m2.T)*2
# result maybe less than 0 due to floating point rounding errors.
dist_mat = np.maximum(dist_mat, 0, dist_mat)
if m1 is m2:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
dist_mat.flat[::dist_mat.shape[0] + 1] = 0.0
dist_mat = np.sqrt(dist_mat)
return dist_mat
class SplitPocket(object):
def __init__(self,
main_path='/export/home/jyy/CrossDocked2020/',
sample_path='/Refine_Positive_Samples/',
new_sapmle_path='/Refine_Positive_Samples_Pocket/',
type_file='/export/home/jyy/CrossDocked2020/Refine_Positive_Samples_New.types',
dist_cutoff=10,
get_surface_atom=True):
self.main_path = main_path
self.sample_path = sample_path
self.new_sapmle_path = new_sapmle_path
self.dist_cutoff = dist_cutoff
self.type_file = type_file
self.types = [line.strip().split() for line in open(type_file).readlines()]
self.exceptions = []
self.get_surface_atom = get_surface_atom
@staticmethod
def _split_pocket(protein, ligand, dist_cutoff):
if protein.__normalized__ == False:
protein.normalize_coord()
rm = protein.rotate_matrix
shift = protein.center_of_mass_shift
ligand.normalize_pos(shift, rm)
res = np.array(protein.get_residues)
cm_res = np.array([r.center_of_mass for r in res])
dist_mat = ComputeDistMat(ligand.normalized_coords, cm_res)
bool_dist_mat = dist_mat < dist_cutoff
pocket_res = res[bool_dist_mat.sum(axis=0)>0]
pocket_block = '\n'.join(
[i.to_heavy_string for i in pocket_res if len(i.get_heavy_atoms)==len(RESIDUES_TOPO[i.name])]
)
return pocket_block, ligand.mol_block()
@staticmethod
def _split_pocket_with_surface_atoms(protein, ligand, dist_cutoff):
if protein.__normalized__ == False:
protein.normalize_coord()
rm = protein.rotate_matrix
shift = protein.center_of_mass_shift
ligand.normalize_pos(shift, rm)
protein.compute_surface_atoms()
res = np.array(protein.get_residues)
cm_res = np.array([r.center_of_mass for r in res])
dist_mat = ComputeDistMat(ligand.normalized_coords, cm_res)
bool_dist_mat = dist_mat < dist_cutoff
pocket_res = res[bool_dist_mat.sum(axis=0)>0]
pocket_block = []
for r in pocket_res:
for a in r.get_heavy_atoms:
if a.is_surf is True:
pocket_block.append(a.to_string + ' surf')
else:
pocket_block.append(a.to_string + ' inner')
pocket_block = '\n'.join(pocket_block)
return pocket_block, ligand.mol_block()
def _do_split(self, items):
try:
sub_path = items[4].split('/')[0]
ligand_name = items[4].split('/')[1].split('.')[0]
chain_id = items[3].split('.')[0].split('_')[-2]
protein = Protein(self.main_path+self.sample_path+items[3])
chain = protein.get_chain(chain_id)
ligand = Ligand(self.main_path+self.sample_path+items[4], sanitize=True)
if self.get_surface_atom:
pocket_block, ligand_block = self._split_pocket_with_surface_atoms(chain, ligand, self.dist_cutoff)
else:
pocket_block, ligand_block = self._split_pocket(chain, ligand, self.dist_cutoff)
save_path = '{}/{}/{}/'.format(self.main_path, self.new_sapmle_path, sub_path)
verify_dir_exists(save_path)
pocket_file_name = '{}/{}_pocket{}.pdb'.format(save_path, ligand_name, self.dist_cutoff)
open(pocket_file_name, 'w').write(pocket_block)
open(save_path+'/{}.mol'.format(ligand_name), 'w').write(ligand_block)
except:
protein_file = self.main_path+self.sample_path+items[3]
ligand_file = self.main_path+self.sample_path+items[4]
print('[Exception]', protein_file, ligand_file)
@staticmethod
def split_pocket_from_site_map(site_map, protein_file, dist_cutoff):
'''
if target dosn't have ligand, we can use the site_map.pdb computed by schrodinger or other software
to split potential pocket.
'''
site_coords = []
with open(site_map) as fr:
lines = fr.readlines()
for line in lines:
if line.startswith('HETATM'):
line = line.strip()
xyz = [float(line[30:38].strip()),
float(line[38:46].strip()),
float(line[46:54].strip())]
site_coords.append(xyz)
site_coords = np.array(site_coords)
protein = Protein(protein_file)
res = np.array(protein.get_residues)
cm_res = np.array([r.center_of_mass for r in res])
dist_mat = ComputeDistMat(site_coords, cm_res)
bool_dist_mat = dist_mat < dist_cutoff
pocket_res = res[bool_dist_mat.sum(axis=0)>0]
pocket_block = '\n'.join(
[i.to_heavy_string for i in pocket_res if len(i.get_heavy_atoms)==len(RESIDUES_TOPO[i.name])]
)
return pocket_block
def __call__(self, np=10):
pool = Pool(processes=np) # 创建进程池对象进程数与multiprocessing.cpu_count()相同
data_pool = pool.map(self._do_split, self.types) # 并行执行函数
pool.close()
pool.join()
print('Done !')
'''
from utils import SplitPocket
split_pocket = SplitPocket(
main_path='/export/home/jyy/CrossDocked2020/',
sample_path = '/Refine_Positive_Samples/',
new_sapmle_path = './crossdocked2020_pocket-surf/',
type_file = '/export/home/jyy/CrossDocked2020/Refine_Positive_Samples_New.types',
dist_cutoff = 10,
get_surface_atom = True
)
split_pocket(np=10)
'''

View File

@@ -1,173 +0,0 @@
#
# calculation of synthetic accessibility score as described in:
#
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
# Peter Ertl and Ansgar Schuffenhauer
# Journal of Cheminformatics 1:8 (2009)
# http://www.jcheminf.com/content/1/1/8
#
# several small modifications to the original paper are included
# particularly slightly different formula for marocyclic penalty
# and taking into account also molecule symmetry (fingerprint density)
#
# for a set of 10k diverse molecules the agreement between the original method
# as implemented in PipelinePilot and this implementation is r2 = 0.97
#
# peter ertl & greg landrum, september 2013
#
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pickle
import math
from collections import defaultdict
import os.path as op
_fscores = None
def readFragmentScores(name='fpscores'):
import gzip
global _fscores
# generate the full path filename:
if name == "fpscores":
name = op.join(op.dirname(__file__), name)
data = pickle.load(gzip.open('%s.pkl.gz' % name))
outDict = {}
for i in data:
for j in range(1, len(i)):
outDict[i[j]] = float(i[0])
_fscores = outDict
def numBridgeheadsAndSpiro(mol, ri=None):
nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
return nBridgehead, nSpiro
def calculateScore(m):
if _fscores is None:
readFragmentScores()
# fragment score
fp = rdMolDescriptors.GetMorganFingerprint(m,
2) # <- 2 is the *radius* of the circular fingerprint
fps = fp.GetNonzeroElements()
score1 = 0.
nf = 0
for bitId, v in fps.items():
nf += v
sfp = bitId
score1 += _fscores.get(sfp, -4) * v
score1 /= nf
# features score
nAtoms = m.GetNumAtoms()
nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
ri = m.GetRingInfo()
nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
nMacrocycles = 0
for x in ri.AtomRings():
if len(x) > 8:
nMacrocycles += 1
sizePenalty = nAtoms**1.005 - nAtoms
stereoPenalty = math.log10(nChiralCenters + 1)
spiroPenalty = math.log10(nSpiro + 1)
bridgePenalty = math.log10(nBridgeheads + 1)
macrocyclePenalty = 0.
# ---------------------------------------
# This differs from the paper, which defines:
# macrocyclePenalty = math.log10(nMacrocycles+1)
# This form generates better results when 2 or more macrocycles are present
if nMacrocycles > 0:
macrocyclePenalty = math.log10(2)
score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
# correction for the fingerprint density
# not in the original publication, added in version 1.1
# to make highly symmetrical molecules easier to synthetise
score3 = 0.
if nAtoms > len(fps):
score3 = math.log(float(nAtoms) / len(fps)) * .5
sascore = score1 + score2 + score3
# need to transform "raw" value into scale between 1 and 10
min = -4.0
max = 2.5
sascore = 11. - (sascore - min + 1) / (max - min) * 9.
# smooth the 10-end
if sascore > 8.:
sascore = 8. + math.log(sascore + 1. - 9.)
if sascore > 10.:
sascore = 10.0
elif sascore < 1.:
sascore = 1.0
return sascore
def processMols(mols):
print('smiles\tName\tsa_score')
for i, m in enumerate(mols):
if m is None:
continue
s = calculateScore(m)
smiles = Chem.MolToSmiles(m)
print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
if __name__ == '__main__':
import sys
import time
t1 = time.time()
readFragmentScores("fpscores")
t2 = time.time()
suppl = Chem.SmilesMolSupplier(sys.argv[1])
t3 = time.time()
processMols(suppl)
t4 = time.time()
print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
file=sys.stderr)
#
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

View File

@@ -1,389 +0,0 @@
import torch
import torch_geometric as pyg
from torch_geometric.loader import DataLoader, DataListLoader
from torch.nn.utils import clip_grad_norm_
#from torch.utils.data import Dataset, Subset, DataLoader
from torch.cuda import amp
import numpy as np
import os
import time
from collections.abc import Iterable
from torch.utils import tensorboard
def inf_iterator(iterable):
iterator = iterable.__iter__()
while True:
try:
yield iterator.__next__()
except StopIteration:
iterator = iterable.__iter__()
def get_parameter_number(model):
total_num = sum(p.numel() for p in model.parameters())
trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
return {'Total': total_num, 'Trainable': trainable_num}
def timewait(time_gap):
d = time_gap//(24*3600)
d_h = time_gap%(24*3600)
h = d_h//3600
h_m = d_h%3600
m = h_m//60
s = h_m%60
if d > 0:
out = '{}d {}h {}m {}s'.format(int(d),int(h),int(m),round(s,2))
elif h > 0:
out = '{}h {}m {}s'.format(int(h),int(m),round(s,2))
elif m > 0:
out = '{}m {}s'.format(int(m),round(s,2))
else:
out = '{}s'.format(round(s,2))
return out
def verify_dir_exists(dirname):
if os.path.isdir(os.path.dirname(dirname)) == False:
os.makedirs(os.path.dirname(dirname))
class Experiment(object):
def __init__(self, model, train_set, optimizer, scheduler=None, device='cuda',
valid_set=None, clip_grad=True, max_norm=5, norm_type=2,
pos_noise_std=0.1, data_parallel=False, use_amp=False):
if data_parallel:
self.model = pyg.nn.DataParallel(model)
else:
self.model = model#.to(device)
if 'config' in model.__dict__:
self.model_config = model.config
else:
self.model_config = None
self.train_set = train_set
self.valid_set = valid_set
self.optimizer = optimizer
self.scheduler = scheduler
self.num_train_data = len(train_set)
self.data_parallel = data_parallel
#self.train_batch_size = train_loader.batch_size
if valid_set:
self.num_valid_data = len(valid_set)
else:
self.num_valid_data = None
#self.valid_batch_size = valid_loader.batch_size
self.clip_grad = clip_grad
self.max_norm = max_norm
self.norm_type = norm_type
self.with_tb = False
self.device = device
self.pos_noise_std = pos_noise_std
self.use_amp = use_amp
if self.use_amp:
self.grad_scaler = amp.GradScaler()
@staticmethod
def get_log(out_dict, key_word, it, time_gap=None):
log = []
for key, value in out_dict.items():
log.append(' {}:{:.5f} |'.format(key, value.item()))
log.insert(0, '[{} {}]'.format(key_word, it))
if time_gap:
log.append(' Time: {}'.format(time_gap))
return ''.join(log)
@staticmethod
def write_summary(out_dict, writer, key_word, num_iter, scheduler=None, optimizer=None):
for key, value in out_dict.items():
writer.add_scalar('{}/{}'.format(key_word, key), value, num_iter)
if scheduler is not None:
writer.add_scalar('{}/lr'.format(key_word), optimizer.param_groups[0]['lr'], num_iter)
writer.flush()
@staticmethod
def get_num_iter(num_data, batch_size):
if num_data % batch_size == 0:
n_iter = int(num_data/batch_size)
else:
n_iter = int(num_data/batch_size) + 1
return n_iter
@property
def parameter_number(self):
return get_parameter_number(self.model)
def _train_step(self, batch, it=0, print_log=False):
start = time.time()
self.model.train()
self.optimizer.zero_grad()
if self.use_amp:
with amp.autocast():
out_dict = self.model.get_loss(batch)
self.grad_scaler.scale(out_dict['loss']).backward()
else:
out_dict = self.model.get_loss(batch)
out_dict['loss'].backward()
#out_dict = self.model(batch)
#with torch.autograd.detect_anomaly():
#print(out_dict)
if self.clip_grad:
# 梯度裁剪
orig_grad_norm = clip_grad_norm_(
self.model.parameters(),
max_norm=self.max_norm,
norm_type=self.norm_type,
error_if_nonfinite=True) # error_if_nonfinite=True如果梯度存在nan, inf将抛出error
if self.use_amp:
self.grad_scaler.step(self.optimizer)
self.grad_scaler.update()
else:
self.optimizer.step()
if self.with_tb:
if self.clip_grad:
self.writer.add_scalar('train/step/grad', orig_grad_norm, it)
self.write_summary(out_dict, self.writer, 'train/step', it)
end = time.time()
time_gap = end - start
log = self.get_log(out_dict, 'Step', it, time_gap='{:.3f}'.format(time_gap))
with open(self.logdir+'training.log', 'a') as log_writer:
log_writer.write(log + '\n')
if print_log:
print(log)
return out_dict
def train_epoch(self, n_iter, n_epoch, print_log=False):
start = time.time()
log_dict = {}
#train_loader = inf_iterator(self.train_loader)
for i in range(n_iter):
if self.data_parallel:
batch = next(self.train_loader).cuda()#.to(self.device)
else:
batch = next(self.train_loader).to(self.device)
## 计算噪音
compose_noise = torch.randn_like(batch.compose_pos) * self.pos_noise_std
batch.compose_pos = batch.compose_pos + compose_noise
out_dict = self._train_step(batch, it=i+n_epoch*n_iter, print_log=print_log)
for key, value in out_dict.items():
if key not in log_dict:
log_dict[key] = value if 'acc' in key else value * batch.num_graphs
else:
log_dict[key] += value if 'acc' in key else value * batch.num_graphs
#print(log_dict)
for key, value in log_dict.items():
if 'acc' in key:
log_dict[key] = value / n_iter
else:
log_dict[key] = value / self.num_train_data
if self.with_tb:
self.write_summary(log_dict, self.writer, 'train/epoch', n_epoch,
scheduler=self.scheduler, optimizer=self.optimizer)
end = time.time()
time_gap = timewait(end - start)
log = self.get_log(log_dict, 'Epoch', n_epoch, time_gap=time_gap)
with open(self.logdir+'training.log', 'a') as log_writer:
log_writer.write(log + '\n')
if print_log:
print(log)
def validate(self, n_iter, n_epoch, print_log=False, schedule_key='loss'):
start = time.time()
log_dict = {}
with torch.no_grad():
self.model.eval()
for _ in range(n_iter):
batch = next(self.valid_loader).to(self.device)
out_dict = self.model.get_loss(batch)
for key, value in out_dict.items():
if key not in log_dict:
log_dict[key] = value if 'acc' in key else value * batch.num_graphs
else:
log_dict[key] += value if 'acc' in key else value * batch.num_graphs
for key, value in log_dict.items():
if 'acc' in key:
log_dict[key] = value / n_iter
else:
log_dict[key] = value / self.num_valid_data
if self.scheduler:
self.scheduler.step(log_dict[schedule_key])
if self.with_tb:
self.write_summary(log_dict, self.writer, 'val/epoch', n_epoch,
scheduler=self.scheduler, optimizer=self.optimizer)
end = time.time()
time_gap = timewait(end - start)
log = self.get_log(log_dict, 'Validate', n_epoch, time_gap=time_gap)
with open(self.logdir+'training.log', 'a') as log_writer:
log_writer.write(log + '\n')
if print_log:
print(log)
return log_dict['loss']
def fit(self, num_epoch, train_batch_size=4, valid_batch_size=16, print_log=True,
with_tb=True, logdir='./training_log', early_stop=None, schedule_key='loss',
follow_batch=[], exclude_keys=[], collate_fn=None, num_workers=0, pin_memory=False):
self.train_loader = inf_iterator(DataLoader(
self.train_set,
batch_size = train_batch_size, #config.train.batch_size,
shuffle = True,
num_workers = num_workers,
pin_memory = pin_memory,
follow_batch = follow_batch,
exclude_keys = exclude_keys,
collate_fn = collate_fn
))
self.train_batch_size = train_batch_size
if self.valid_set:
self.valid_loader = inf_iterator(DataLoader(
self.valid_set,
batch_size = valid_batch_size, #config.train.batch_size,
shuffle = False,
num_workers = num_workers,
pin_memory = pin_memory,
follow_batch = follow_batch,
exclude_keys = exclude_keys,
collate_fn = collate_fn
))
self.valid_batch_size = valid_batch_size
self.n_iter_train = self.get_num_iter(self.num_train_data, self.train_batch_size)
self.n_iter_valid = self.get_num_iter(self.num_valid_data, self.valid_batch_size)
## config log
date = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
self.logdir = logdir+'/'+date+'/'
verify_dir_exists(self.logdir)
open(self.logdir+'model_config.dir','w').write(str(self.model_config))
self.with_tb = with_tb
if self.with_tb:
self.writer = tensorboard.SummaryWriter(self.logdir)
log_writer = open(self.logdir+'training.log', 'w')
log_writer.write('\n######## {}; batch_size ########\n'.format(self.parameter_number, train_batch_size))
log_writer.close()
if print_log:
print('\n######## {} ########\n'.format(self.parameter_number))
## training/saving model
loss_best = float('inf')
early_stop_count = 0
for epoch in range(num_epoch):
self.train_epoch(self.n_iter_train, epoch, print_log=print_log)
val_loss = self.validate(self.n_iter_valid, epoch, schedule_key=schedule_key, print_log=print_log)
if val_loss < loss_best:
torch.save(self.model.state_dict(), self.logdir+'/model.pth')
loss_best = val_loss
early_stop_count = 0
else:
early_stop_count += 1
if isinstance(early_stop, int):
if early_stop_count == early_stop:
if print_log:
print('Early stopping ......')
with open(self.logdir+'training.log', 'a') as log_writer:
log_writer.write('Early stopping ......\n')
break
def fit_step(self, num_step, valid_per_step=5000, train_batch_size=4, valid_batch_size=16, print_log=True,
with_tb=True, logdir='./training_log', schedule_key='loss', num_workers=0, pin_memory=False,
follow_batch=[], exclude_keys=[], collate_fn=None, max_edge_num_in_batch=900000):
#torch.multiprocessing.set_sharing_strategy('file_system')
if self.data_parallel:
self.train_loader = inf_iterator(DataListLoader(
self.train_set,
batch_size = train_batch_size, #config.train.batch_size,
shuffle = True,
num_workers = num_workers,
pin_memory = pin_memory,
#follow_batch = follow_batch,
#exclude_keys = exclude_keys,
collate_fn = collate_fn
))
self.train_batch_size = train_batch_size
if self.valid_set:
self.valid_loader = inf_iterator(DataListLoader(
self.valid_set,
batch_size = valid_batch_size, #config.train.batch_size,
shuffle = False,
num_workers = num_workers,
pin_memory = pin_memory,
#follow_batch = follow_batch,
#exclude_keys = exclude_keys,
collate_fn = collate_fn
))
self.valid_batch_size = valid_batch_size
else:
self.train_loader = inf_iterator(DataLoader(
self.train_set,
batch_size = train_batch_size, #config.train.batch_size,
shuffle = False,
num_workers = num_workers,
pin_memory = pin_memory,
follow_batch = follow_batch,
exclude_keys = exclude_keys,
collate_fn = collate_fn
))
self.train_batch_size = train_batch_size
if self.valid_set:
self.valid_loader = inf_iterator(DataLoader(
self.valid_set,
batch_size = valid_batch_size, #config.train.batch_size,
shuffle = False,
num_workers = num_workers,
pin_memory = pin_memory,
follow_batch = follow_batch,
exclude_keys = exclude_keys,
collate_fn = collate_fn
))
self.valid_batch_size = valid_batch_size
self.n_iter_train = self.get_num_iter(self.num_train_data, self.train_batch_size)
if self.num_valid_data:
self.n_iter_valid = self.get_num_iter(self.num_valid_data, self.valid_batch_size)
## config log
date = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
self.logdir = logdir+'/'+date+'/'
verify_dir_exists(self.logdir)
open(self.logdir+'model_config.dir','w').write(str(self.model_config))
self.with_tb = with_tb
if self.with_tb:
self.writer = tensorboard.SummaryWriter(self.logdir)
log_writer = open(self.logdir+'training.log', 'w')
log_writer.write('\n######## {}; batch_size ########\n'.format(self.parameter_number, train_batch_size))
log_writer.close()
if print_log:
print('\n######## {} ########\n'.format(self.parameter_number))
for step in range(1, num_step+1):
#batch = next(self.train_loader)#.to(self.device)
## 计算噪音
if self.data_parallel:
batch = next(self.train_loader)#.to(self.device)
for d in batch:
d.cpx_pos = d.cpx_pos + torch.randn_like(d.cpx_pos) * self.pos_noise_std
else:
batch = next(self.train_loader).to(self.device)
#print(batch.cpx_edge_index.shape)
cpx_noise = torch.randn_like(batch.cpx_pos) * self.pos_noise_std
batch.cpx_pos = batch.cpx_pos + cpx_noise
if batch.cpx_edge_index.size(1) > max_edge_num_in_batch:
continue
out_dict = self._train_step(batch, it=step, print_log=print_log)
if (step % valid_per_step == 0 or step == num_step):
if self.num_valid_data:
val_loss = self.validate(self.n_iter_valid, step, schedule_key=schedule_key, print_log=print_log)
#torch.save(self.model.state_dict(), self.logdir+'/model.pth')
ckpt_path = self.logdir + '/ckpt/'
verify_dir_exists(ckpt_path)
torch.save({
'config': self.model_config,
'model': self.model.state_dict(),
'optimizer': self.optimizer.state_dict(),
'scheduler': self.scheduler.state_dict(),
'iteration': step,
}, ckpt_path+'/{}.pt'.format(step))

View File

@@ -1,340 +0,0 @@
import torch
import torch.nn.functional as F
from torch_geometric.nn.pool import knn_graph, radius_graph
from torch_geometric.transforms import Compose
from torch_geometric.utils.subgraph import subgraph
from torch_geometric.nn import knn, radius
from torch_geometric.utils.num_nodes import maybe_num_nodes
from torch_scatter import scatter_add
import numpy as np
import random
import copy
def count_neighbors(edge_index, symmetry, valence=None, num_nodes=None):
assert symmetry == True, 'Only support symmetrical edges.'
if num_nodes is None:
num_nodes = maybe_num_nodes(edge_index) # 如果没有指定配体原子数根据edge_index来推测
if valence is None:
valence = torch.ones([edge_index.size(1)], device=edge_index.device)
valence = valence.view(edge_index.size(1))
return scatter_add(valence, index=edge_index[0], dim=0, dim_size=num_nodes).long()
def change_features_of_neigh(ligand_feature_full, new_num_neigh, new_num_valence, ligand_atom_num_bonds, num_atom_type=7):
idx_n_neigh = num_atom_type + 1
idx_n_valence = idx_n_neigh + 1
idx_n_bonds = idx_n_valence + 1
ligand_feature_full[:, idx_n_neigh] = new_num_neigh.long()
ligand_feature_full[:, idx_n_valence] = new_num_valence.long()
ligand_feature_full[:, idx_n_bonds:idx_n_bonds+3] = ligand_atom_num_bonds.long()
return ligand_feature_full
def get_rfs_perm(nbh_list, ring_info):
num_nodes = len(nbh_list)
node0 = random.randint(0, num_nodes-1)
queue,order,not_ring_queue,edge_index = [],[],[],[]
if (ring_info[node0]>0).sum():
queue.append(node0)
else:
not_ring_queue.append(node0)
while queue or not_ring_queue:
if queue:
v = queue.pop()
order.append(v)
elif not_ring_queue:
v = not_ring_queue.pop()
order.append(v)
adj_in_ring, adj_not_ring, edge_idx_step = [], [], []
for nbh in nbh_list[v]:
if (ring_info[nbh]>0).sum():
adj_in_ring.append(nbh)
else:
adj_not_ring.append(nbh)
if nbh in order:
edge_idx_step.append((v,nbh))
edge_idx_step.append((nbh,v))
edge_index.append(edge_idx_step)
if adj_not_ring:
for w in adj_not_ring:
if w not in order and w not in not_ring_queue:
not_ring_queue.append(w)
# Preferential access to atoms in the same ring of w
same_ring_pool = []
if adj_in_ring:
for w in adj_in_ring:
if (w not in order and w not in queue):
if (ring_info[w] == ring_info[v]).sum()>0:
same_ring_pool.append(w)
else:
queue.append(w)
elif w not in order and w in queue:
if (ring_info[w] == ring_info[v]).sum()>0:
same_ring_pool.append(w)
queue.remove(w)
elif w not in order and (ring_info[w]>0).sum()>=1:
queue.remove(w)
queue.append(w)
queue += same_ring_pool
return torch.LongTensor(order), edge_index
def get_bfs_perm(nbh_list):
num_nodes = len(nbh_list)
# 每个节点的邻居数量
num_neighbors = torch.LongTensor([len(nbh_list[i]) for i in range(num_nodes)])
#print(num_neighbors)
# 随机选择一个原子作为起始点
bfs_queue = [random.randint(0, num_nodes-1)]
bfs_perm, edge_index = [], []
num_remains = [num_neighbors.clone()]
bfs_next_list = {}
visited = {bfs_queue[0]}
num_nbh_remain = num_neighbors.clone()
while len(bfs_queue) > 0:
current = bfs_queue.pop(0) # Remove and return item at index (default last)
for nbh in nbh_list[current]:
num_nbh_remain[nbh] -= 1
bfs_perm.append(current)
num_remains.append(num_nbh_remain.clone())
next_candid, edge_idx_step = [], []
for nxt in nbh_list[current]:
if nxt in visited: continue
next_candid.append(nxt)
visited.add(nxt)
for adj in nbh_list[nxt]:
if adj in bfs_perm:
edge_idx_step.append((adj,nxt))
edge_idx_step.append((nxt,adj))
edge_index.append(edge_idx_step)
random.shuffle(next_candid)
bfs_queue += next_candid
bfs_next_list[current] = copy.copy(bfs_queue)
return torch.LongTensor(bfs_perm), edge_index
def mask_node(data, context_idx, masked_idx, num_atom_type=10, y_pos_std=0.05):
data.context_idx = context_idx # for change bond index
data.masked_idx = masked_idx
# masked ligand atom element/feature/pos.
data.ligand_masked_element = data.ligand_element[masked_idx]
# data.ligand_masked_feature = data.ligand_atom_feature[masked_idx] # For Prediction. these features are chem properties
data.ligand_masked_pos = data.ligand_pos[masked_idx]
# context ligand atom elment/full features/pos. Note: num_neigh and num_valence features should be changed
data.ligand_context_element = data.ligand_element[context_idx]
data.ligand_context_feature_full = data.ligand_atom_feature_full[context_idx] # For Input
data.ligand_context_pos = data.ligand_pos[context_idx]
# new bond with ligand context atoms
if data.ligand_bond_index.size(1) != 0:
data.ligand_context_bond_index, data.ligand_context_bond_type = subgraph(
context_idx,
data.ligand_bond_index,
edge_attr = data.ligand_bond_type,
relabel_nodes = True,
)
else:
data.ligand_context_bond_index = torch.empty([2, 0], dtype=torch.long)
data.ligand_context_bond_type = torch.empty([0], dtype=torch.long)
# re-calculate atom features that relate to bond
data.ligand_context_num_neighbors = count_neighbors(
data.ligand_context_bond_index,
symmetry=True,
num_nodes = context_idx.size(0),
)
data.ligand_context_valence = count_neighbors(
data.ligand_context_bond_index,
symmetry=True,
valence=data.ligand_context_bond_type,
num_nodes=context_idx.size(0)
)
data.ligand_context_num_bonds = torch.stack([
count_neighbors(
data.ligand_context_bond_index,
symmetry=True,
valence=data.ligand_context_bond_type == i,
num_nodes=context_idx.size(0),
) for i in [1, 2, 3]
], dim = -1)
# re-calculate ligand_context_featrure_full
data.ligand_context_feature_full = change_features_of_neigh(
data.ligand_context_feature_full,
data.ligand_context_num_neighbors,
data.ligand_context_valence,
data.ligand_context_num_bonds,
num_atom_type=num_atom_type
)
if data.ligand_masked_pos.size(0) == 0:
data.y_pos = torch.empty([0,3], dtype=torch.float32)
else:
data.y_pos = data.ligand_masked_pos[0].view(-1,3)
#data.y_pos = data.y_pos.repeat(5, 1)
data.y_pos += torch.randn_like(data.y_pos) * y_pos_std
data.ligand_frontier = (data.ligand_context_num_neighbors < data.ligand_num_neighbors[context_idx])
'''new_step_atom_idx = data.masked_idx[0]
candidate_focal_idx_in_context = torch.LongTensor(data.ligand_nbh_list[new_step_atom_idx.item()])
focal_idx_in_context_mask = (data.context_idx.unsqueeze(1) == candidate_focal_idx_in_context).any(1)
data.focal_idx_in_context = torch.nonzero(focal_idx_in_context_mask).view(-1)
if data.focal_idx_in_context.size(0) == 0:
data.focal_idx_in_context_ = data.focal_idx_in_context
else:
data.focal_idx_in_context_ = data.focal_idx_in_context[torch.multinomial(torch.ones_like(data.focal_idx_in_context).float(), 1)]
data.focal_label = torch.zeros_like(data.context_idx)
data.focal_label[data.focal_idx_in_context] = 1
data.atom_label = data.ligand_element[data.masked_idx[0]].unsqueeze(0)
# get edge label
focal_idx_in_ligand = data.context_idx[focal_idx_in_context_mask]
edge_type = []
for i in focal_idx_in_ligand:
edge_type_mask = torch.stack([data.ligand_bond_index[1]==i, data.ligand_bond_index[0]==new_step_atom_idx], dim=1).all(1)
edge_type.append(torch.unique(data.ligand_bond_type[edge_type_mask]))
data.edge_label = torch.zeros_like(data.context_idx)
data.edge_label[data.focal_idx_in_context] = torch.LongTensor(edge_type)
data.edge_query_index_0 = torch.zeros_like(data.edge_label)
data.edge_query_index_1 = torch.arange(data.edge_label.size(0))'''
return data
def make_pos_label(data, num_real_pos=5, num_fake_pos=5, pos_real_std=0.05, pos_fake_std=2.0, k=16):
ligand_context_pos = data.ligand_context_pos
ligand_masked_pos = data.ligand_masked_pos
protein_pos = data.protein_pos
if ligand_context_pos.size(0) == 0:
# fake position
fake_mode = protein_pos[data.candidate_focal_label_in_protein]
p = np.ones(fake_mode.size(0), dtype=np.float32)/fake_mode.size(0)
pos_fake_idx = np.random.choice(np.arange(fake_mode.size(0)), size=num_fake_pos, p=p)
pos_fake = fake_mode[pos_fake_idx]
pos_fake += torch.randn_like(pos_fake) * pos_fake_std / 2.
# real position
p = np.ones(ligand_masked_pos.size(0), dtype=np.float32)/ligand_masked_pos.size(0)
pos_real_idx = np.random.choice(np.arange(ligand_masked_pos.size(0)), size=num_real_pos, p=p)
pos_real = ligand_masked_pos[pos_real_idx]
pos_real += torch.randn_like(pos_real) * pos_real_std
else:
# fake position
fake_mode = ligand_context_pos[data.ligand_frontier]
p = np.ones(fake_mode.size(0), dtype=np.float32)/fake_mode.size(0)
pos_fake_idx = np.random.choice(np.arange(fake_mode.size(0)), size=num_fake_pos, p=p)
pos_fake = fake_mode[pos_fake_idx]
pos_fake += torch.randn_like(pos_fake) * pos_fake_std / 2.
# real position
p = np.ones(ligand_masked_pos.size(0), dtype=np.float32)/ligand_masked_pos.size(0)
pos_real_idx = np.random.choice(np.arange(ligand_masked_pos.size(0)), size=num_real_pos-1, p=p)
pos_real = ligand_masked_pos[pos_real_idx]
pos_real += torch.randn_like(pos_real) * pos_real_std
pos_real = torch.cat([pos_real, data.y_pos], dim=0)
data.pos_fake = pos_fake
pos_fake_knn_edge_idx = knn(x=data.cpx_pos, y=pos_fake, k=k, num_workers=16)
data.pos_fake_knn_edge_idx_0, data.pos_fake_knn_edge_idx_1 = pos_fake_knn_edge_idx
data.pos_real = pos_real
pos_real_knn_edge_idx = knn(x=data.cpx_pos, y=pos_real, k=k, num_workers=16)
data.pos_real_knn_edge_idx_0, data.pos_real_knn_edge_idx_1 = pos_real_knn_edge_idx
return data
def get_complex_graph(data, len_ligand_ctx, len_compose, num_workers=1, graph_type='knn', knn=16, radius=10.0):
if graph_type == 'rad':
data.cpx_edge_index = radius_graph(data.cpx_pos, radius, flow='target_to_source', num_workers=num_workers)
elif graph_type == 'knn':
data.cpx_edge_index = knn_graph(data.cpx_pos, knn, flow='target_to_source', num_workers=num_workers)
# compose_knn_edge_index: 蛋白和配体原子组合在一起计算knn图的邻接矩阵
id_cpx_edge = data.cpx_edge_index[0, :len_ligand_ctx*knn] * len_compose + data.cpx_edge_index[1, :len_ligand_ctx*knn]
# data.cpx_edge_index[0, :len_ligand_ctx*knn] 将knn的edge中属于ligand_ctx的atom的index选出来
id_ligand_ctx_edge = data.ligand_context_bond_index[0] * len_compose + data.ligand_context_bond_index[1]
idx_edge = [torch.nonzero(id_cpx_edge == id_) for id_ in id_ligand_ctx_edge] # idx_edge是id_ligand_ctx_edge中的元素在id_cpx_edge中的索引
# torch.nonzero(id_ligand_ctx_edge.unsqueeze(1) == id_cpx_edge.unsqueeze(0))[:,1]
idx_edge = torch.tensor([a.squeeze() if len(a) > 0 else torch.tensor(-1) for a in idx_edge], dtype=torch.long)
data.cpx_edge_type = torch.zeros(len(data.cpx_edge_index[0]), dtype=torch.long) # for encoder edge embedding
data.cpx_edge_type[idx_edge[idx_edge>=0]] = data.ligand_context_bond_type[idx_edge>=0]
data.cpx_edge_feature = torch.cat([
torch.ones([len(data.cpx_edge_index[0]), 1], dtype=torch.long),
torch.zeros([len(data.cpx_edge_index[0]), 3], dtype=torch.long),
], dim=-1)
data.cpx_edge_feature[idx_edge[idx_edge>=0]] = F.one_hot(data.ligand_context_bond_type[idx_edge>=0], num_classes=4) # 0 (1,2,3)-onehot
return data
def get_knn_graph(pos, k=16, edge_feat=None, edge_feat_index=None, num_workers=8, graph_type='knn', radius=5.5):
if graph_type == 'rad':
cpx_edge_index = radius_graph(pos, radius, flow='target_to_source', num_workers=num_workers)
if graph_type == 'knn':
cpx_edge_index = knn_graph(pos, k, flow='target_to_source', num_workers=num_workers).long()
if isinstance(edge_feat, torch.Tensor) and isinstance(edge_feat_index, torch.Tensor):
adj_feat_mat = torch.zeros([pos.size(0), pos.size(0)], dtype=torch.long)
adj_feat_mat[edge_feat_index[0],edge_feat_index[1]] = edge_feat
cpx_edge_type = adj_feat_mat[cpx_edge_index[0],cpx_edge_index[1]]
else:
cpx_edge_type = None
return cpx_edge_index, cpx_edge_type
def get_complex_graph_(data, knn=16, num_workers=8, graph_type='knn', radius=5.5):
edge_feat = torch.cat([data.ligand_context_bond_type, data.protein_bond_type]).long()
edge_feat_index = torch.cat(
[data.ligand_context_bond_index, data.protein_bond_index+data.ligand_context_pos.size(0)], dim=1
).long()
knn_edge_index, knn_edge_type = get_knn_graph(
data.cpx_pos, k=knn, edge_feat=edge_feat, edge_feat_index=edge_feat_index,
graph_type=graph_type, num_workers=num_workers, radius=radius
)
data.cpx_edge_index = knn_edge_index
data.cpx_edge_type = knn_edge_type
data.cpx_edge_feature = F.one_hot(knn_edge_type, num_classes=4)
return data
def sample_edge_with_radius(data, r=4.0):
y_pos = data.y_pos
ligand_context_pos = data.ligand_context_pos
context_idx = data.context_idx
masked_idx = data.masked_idx
ligand_bond_index = data.ligand_bond_index
ligand_bond_type = data.ligand_bond_type
# select the atoms whose distance < r between pos_query as edge samples
edge_index_radius = radius(ligand_context_pos, y_pos, r=r, num_workers=16)
# get the labels of edge samples
mask = [i in masked_idx[0] and j in context_idx[edge_index_radius[1]] for i,j in zip(*ligand_bond_index)]
new_idx_1 = torch.nonzero((ligand_bond_index[:,mask][1].view(-1,1) == context_idx).any(0)).view(-1)
real_bond_type_in_edge_index_radius = torch.nonzero((new_idx_1.view(-1,1) == edge_index_radius[1]).any(0)).view(-1)
edge_label = torch.zeros(edge_index_radius.size(1), dtype=torch.long)
edge_label[real_bond_type_in_edge_index_radius] = ligand_bond_type[mask]
data.edge_query_index_0, data.edge_query_index_1 = edge_index_radius
data.edge_label = edge_label
return data
def get_tri_edges(edge_index_query, pos_query, idx_ligand, ligand_bond_index, ligand_bond_type):
row, col = edge_index_query
acc_num_edges = 0
index_real_cps_edge_i_list, index_real_cps_edge_j_list = [], [] # index of real-ctx edge (for attention)
for node in torch.arange(pos_query.size(0)):
num_edges = (row == node).sum()
index_edge_i = torch.arange(num_edges, dtype=torch.long, ) + acc_num_edges
index_edge_i, index_edge_j = torch.meshgrid(index_edge_i, index_edge_i, indexing=None)
index_edge_i, index_edge_j = index_edge_i.flatten(), index_edge_j.flatten()
index_real_cps_edge_i_list.append(index_edge_i)
index_real_cps_edge_j_list.append(index_edge_j)
acc_num_edges += num_edges
index_real_cps_edge_i = torch.cat(index_real_cps_edge_i_list, dim=0).to(pos_query.device) # add len(real_compose_edge_index) in the dataloader for batch
index_real_cps_edge_j = torch.cat(index_real_cps_edge_j_list, dim=0).to(pos_query.device)
node_a_cps_tri_edge = col[index_real_cps_edge_i] # the node of tirangle edge for the edge attention (in the compose)
node_b_cps_tri_edge = col[index_real_cps_edge_j]
n_context = len(idx_ligand)
adj_mat = (torch.zeros([n_context, n_context], dtype=torch.long) - torch.eye(n_context, dtype=torch.long))
adj_mat = adj_mat.to(ligand_bond_index.device)
adj_mat[ligand_bond_index[0], ligand_bond_index[1]] = ligand_bond_type
tri_edge_type = adj_mat[node_a_cps_tri_edge, node_b_cps_tri_edge]
tri_edge_feat = (tri_edge_type.view([-1, 1]) == torch.tensor([[-1, 0, 1, 2, 3]]).to(tri_edge_type.device)).long()
index_real_cps_edge_for_atten = torch.stack([
index_real_cps_edge_i, index_real_cps_edge_j # plus len(real_compose_edge_index_0) for dataloader batch
], dim=0)
tri_edge_index = torch.stack([
node_a_cps_tri_edge, node_b_cps_tri_edge # plus len(compose_pos) for dataloader batch
], dim=0)
return index_real_cps_edge_for_atten, tri_edge_index, tri_edge_feat