Files
PocketFlow/pocket_flow/utils/.ipynb_checkpoints/ParseFile-checkpoint.py
2023-06-16 17:55:38 +08:00

763 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import copy
import numpy as np
from rdkit import Chem, RDConfig
from rdkit.Chem.rdchem import BondType
from rdkit.Chem import ChemicalFeatures
from .residues_base import RESIDUES_TOPO, RESIDUES_TOPO_WITH_H
try:
import pymol
except:
print('we can not compute the atoms on the surface of protein, '\
'because pymol can not be imported')
pass
#from .BaseFeatures import Mol2Graph, RESIDUE_GRAPH_WITHOUT_H, RESIDUE_GRAPH_WITH_H
#from .utils import coordinate_adjusting
#from .Data import LigandData
ATOM_TYPE_WITH_HYBIRD = [
'SP3_C', 'SP2_C', 'SP_C', 'SP3_N', 'SP2_N', 'SP_N', 'SP3_O', 'SP2_O', 'SP3_F', 'SP3_P',
'SP2_P', 'SP3D_P', 'SP3_S', 'SP2_S', 'SP3D_S', 'SP3D2_S', 'SP3_Cl', 'SP3_Br', 'SP3_I'
]
ATOM_MAP = [6, 6, 6, 7, 7, 7, 8, 8, 9, 15, 15, 15, 16, 16, 16, 16, 17, 35, 53]
PT = Chem.GetPeriodicTable()
BACKBONE_SYMBOL = {'N', 'CA', 'C', 'O'}
AMINO_ACID_TYPE = {
'CYS':0, 'GLY':1, 'ALA':2, 'THR':3, 'LYS':4, 'PRO':5, 'VAL':6, 'SER':7, 'ASN':8, 'LEU':9,
'GLN':10, 'MET':11, 'ASP':12, 'TRP':13, 'HIS':14, 'GLU':15, 'ARG':16, 'ILE':17, 'PHE':18,
'TYR':19
}
'''
def coordinate_adjusting(pos, wts):
center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
pos = pos - center_of_mass
I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
I = np.array([[I_xx, I_xy, I_xz],
[I_xy, I_yy, I_yz],
[I_xz, I_yz, I_zz]])
eig_v, eig_m = np.linalg.eigh(I) # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
# faster algorithm that takes advantage of the fact that the matrix is
# symmetric. If you know that your matrix is symmetric, use this function.
am = pos@eig_m # 用本真矢矩阵变换原子坐标,以‘摆正’分子
rotate_around = []
x, y, z = am[0]
if x < 0:
am[..., 0] = am[..., 0] * -1
rotate_around.append(0)
if y < 0:
am[..., 1] = am[..., 1] * -1
rotate_around.append(1)
if z < 0:
am[..., 2] = am[..., 2] * -1
rotate_around.append(2)
return am, eig_m, center_of_mass, rotate_around
'''
def coordinate_adjusting(pos, wts):
center_of_mass = np.sum(pos*wts, axis=0)/wts.sum()
pos = pos - center_of_mass
I_xx = (wts*(pos[...,1]**2 + pos[...,2]**2)).sum()
I_yy = (wts*(pos[...,0]**2 + pos[...,2]**2)).sum()
I_zz = (wts*(pos[...,0]**2 + pos[...,1]**2)).sum()
I_xy = -(wts*(pos[...,0]*pos[...,1])).sum()
I_xz = -(wts*(pos[...,0]*pos[...,2])).sum()
I_yz = -(wts*(pos[...,1]*pos[...,2])).sum()
I = np.array([[I_xx, I_xy, I_xz],
[I_xy, I_yy, I_yz],
[I_xz, I_yz, I_zz]])
eig_v, eig_m = np.linalg.eigh(I) # np.linalg.eigh guarantees you that the eigenvalues are sorted and uses a
# faster algorithm that takes advantage of the fact that the matrix is
# symmetric. If you know that your matrix is symmetric, use this function.
am = pos@eig_m # 用本真矢矩阵变换原子坐标,以‘摆正’分子
return am, eig_m, center_of_mass
class Dict(dict):
__setattr__ = dict.__setitem__
__getattr__ = dict.__getitem__
class Atom(object):
def __init__(self, atom_info):
self.idx = int(atom_info[6:11])
self.name = atom_info[12:16].strip()
self.res_name = atom_info[17:20].strip() # atom_info[17:20].strip()
self.chain = atom_info[21:22].strip()
self.res_idx = int(atom_info[22:26])
self.coord = np.array([float(atom_info[30:38].strip()),
float(atom_info[38:46].strip()),
float(atom_info[46:54].strip())])
self.occupancy = float(atom_info[54:60])
self.temperature_factor = float(atom_info[60:66].strip())
self.seg_id = atom_info[72:76].strip()
self.element = atom_info[76:78].strip()
if self.element == 'SE':
self.element = 'S'
self.name = 'SD'
self.res_name = 'MET'
self.mass = PT.GetAtomicWeight(self.element)
if self.occupancy < 1.0:
self.is_disorder = True
else:
self.is_disorder = False
self.is_surf = atom_info.split()[-1] == 'surf'
@property
def to_string(self):
fmt = '{:6s}{:5d} {:^4s}{:1s}{:3s} {:1s}{:4d}{:1s} {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f} {:<2s}{:2s}' # https://cupnet.net/pdb-format/
out = fmt.format('ATOM', self.idx, self.name, '', self.res_name,
self.chain, self.res_idx,'', self.coord[0],
self.coord[1], self.coord[2], self.occupancy,
self.temperature_factor, self.seg_id, self.element)
return out
@property
def to_dict(self):
return {
'element': PT.GetAtomicNumber(self.element),
'pos': self.coord,
'is_backbone': self.name in BACKBONE_SYMBOL,
'atom_name': self.name,
'atom_to_aa_type': AMINO_ACID_TYPE[self.res_name]
}
def __repr__(self):
info = 'name={}, index={}, res={}, chain={}, is_disorder={}'.format(self.name,
self.idx,
self.res_name+str(self.res_idx),
self.chain,
self.is_disorder)
return '{}({})'.format(self.__class__.__name__, info)
class Residue(object):
def __init__(self, res_info):
self.res_info = res_info
#atoms_ = [Atom(i) for i in res_info]
self.atom_dict = {}
disorder = []
for i in res_info: # 排除disorder原子
atom = Atom(i)
if atom.name in self.atom_dict:
continue
else:
self.atom_dict[atom.name] = atom
disorder.append(atom.is_disorder)
if atom.res_name == 'MSE':
atom.res_name = 'MET' # 把MSE残基改成MET
if True in disorder:
self.is_disorder = True
else:
self.is_disorder = False
self.idx = self.atom_dict[atom.name].res_idx
self.chain = self.atom_dict[atom.name].chain
self.name = self.atom_dict[atom.name].res_name
self.is_perfect = True if len(self.get_heavy_atoms)==len(RESIDUES_TOPO[self.name]) else False
@property
def to_heavy_string(self):
return '\n'.join([a.to_string for a in self.get_heavy_atoms])
@property
def to_string(self):
return '\n'.join([a.to_string for a in self.get_atoms])
@property
def get_coords(self):
return np.array([a.coord for a in self.get_atoms])
@property
def get_atoms(self):
return list(self.atom_dict.values())
@property
def get_heavy_atoms(self):
return [a for a in self.atom_dict.values() if 'H' not in a.element]
@property
def get_heavy_coords(self):
return np.array([a.coord for a in self.get_heavy_atoms])
@property
def center_of_mass(self):
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
@property
def bond_graph(self):
i, j, bt = [], [], []
res_graph = RESIDUES_TOPO[self.name]
atom_names = [i.name for i in self.get_heavy_atoms]
for ix, name in enumerate(atom_names):
for adj in res_graph[name]:
if adj in atom_names:
idx_j = atom_names.index(adj)
i.append(ix)
j.append(idx_j)
bt.append(res_graph[name][adj])
edge_index = np.stack([i,j]).astype(dtype=np.int64)
bt = np.array(bt, dtype=np.int64)
return edge_index, bt
@property
def centroid(self):
return self.get_coords.mean(axis=0)
def __repr__(self):
info = 'name={}, index={}, chain={}, is_disorder={}, is_perfect={}'.format(
self.name, self.idx, self.chain, self.is_disorder, self.is_perfect
)
return '{}({})'.format(self.__class__.__name__, info)
class Chain(object):
def __init__(self, chain_info, ignore_incomplete_res=True, pdb_file=None):
self.pdb_file = pdb_file
self.res_dict = {}
'''if ignore_incomplete_res:
self.residues = {}
for i in chain_info:
res = Residue(chain_info[i])
if len(res.get_heavy_atoms) == len(RESIDUES_TOPO[res.name]):
self.residues[i] = res
else:
self.residues = {i:Residue(chain_info[i]) for i in chain_info}'''
self.residues = {i:Residue(chain_info[i]) for i in chain_info}
#print(self.residues)
self.chain = list(self.residues.values())[0].chain
self.__normalized__ = False
self.center_of_mass_shift = None
self.rotate_matrix = None
self.ignore_incomplete_res =ignore_incomplete_res
def normalize_coord(self):
wts = np.array([i.mass for i in self.get_atoms])
wts = np.expand_dims(wts,axis=1)
pos = self.get_coords
am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
for ix, a in enumerate(self.get_atoms):
a.coord = am[ix]
self.__normalized__ = True
@property
def get_incomplete_residues(self):
return [i for i in self.residues.values() if i.is_perfect==False]
@property
def to_heavy_string(self):
return '\n'.join([res.to_heavy_string for res in self.get_residues])
@property
def to_string(self):
return '\n'.join([res.to_string for res in self.get_residues])
@property
def get_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_atoms)
return atoms
@property
def get_residues(self):
if self.ignore_incomplete_res:
return [i for i in self.residues.values() if i.is_perfect]
else:
return list(self.residues.values())
@property
def get_heavy_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_heavy_atoms)
return atoms
@property
def get_coords(self):
return np.array([i.coord for i in self.get_atoms])
@property
def get_heavy_coords(self):
return np.array([i.coord for i in self.get_heavy_atoms])
@property
def center_of_mass(self):
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
@property
def centroid(self):
return self.get_coords.mean(axis=0)
def compute_surface_atoms(self):
path, filename = os.path.split(self.pdb_file)
chain_file_name = filename.split('.')[0] + '_' + self.chain + '.pdb'
chain_file = path+'/'+chain_file_name
with open(chain_file, 'w') as fw:
fw.write(self.to_heavy_string)
pymol.cmd.load(chain_file)
if path == '':
path = './'
sele = chain_file_name.split('.')[0]
pymol.cmd.remove("({}) and hydro".format(sele))
name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
save_name = path + '/' + sele + '-surface.pdb'
pymol.cmd.save(save_name,((name)))
surf_protein = Protein(save_name, ignore_incomplete_res=False)
surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
for res in self.get_residues:
res_idx = res.idx
if res_idx in surf_res_dict:
for a in surf_res_dict[res_idx].get_heavy_atoms:
res.atom_dict[a.name].is_surf = True
self.has_surf_atom = True
os.remove(save_name)
os.remove(chain_file)
def get_surf_mask(self):
if self.has_surf_atom is False:
self.compute_surface_atoms()
return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
def get_res_by_id(self, res_id):
return self.residues[res_id]
def __repr__(self):
tmp = 'Chain={}, NumResidues={}, NumAtoms={}, NumHeavyAtoms={}'
info = tmp.format(self.chain, len(self.residues), self.get_coords.shape[0], self.get_heavy_coords.shape[0])
return '{}({})'.format(self.__class__.__name__, info)
class Protein(object):
def __init__(self, pdb_file, ignore_incomplete_res=True):
self.ignore_incomplete_res = ignore_incomplete_res
self.name = pdb_file.split('/')[-1].split('.')[0]
self.pdb_file = pdb_file
self.has_surf_atom = False
#self.pdb = [line.strip() for line in open(pdb_file).readlines()]
#atoms = os.popen('grep ATOM {}'.format(pdb_file)).read().strip().split('\n')
with open(pdb_file) as fr:
lines = fr.readlines()
surf_item = lines[0].strip().split()[-1]
if surf_item in {'surf', 'inner'}:
self.has_surf_atom = True
chain_info = {}
for line in lines:
if line.startswith('ATOM'):
line = line.strip()
chain = line[21:22].strip()
res_idx = int(line[22:26].strip())
if chain not in chain_info:
chain_info[chain] = {}
chain_info[chain][res_idx] = [line]
elif res_idx not in chain_info[chain]:
chain_info[chain][res_idx] = [line]
else:
chain_info[chain][res_idx].append(line)
self.chains = {
c:Chain(chain_info[c], ignore_incomplete_res=ignore_incomplete_res, pdb_file=pdb_file) for c in chain_info
}
self.__normalized__ = False
self.center_of_mass_shift = None
self.rotate_matrix = None
def normalize_coord(self):
is_chain_normalized = [c for c in self.chains if self.chains[c].__normalized__==True]
if len(is_chain_normalized) > 0:
chain_normalized = ' '.join(is_chain_normalized)
info = 'Warning: The Chains ({}) have been normalized '\
'independently, this may lead to wrong coordinate '\
'transformation for whole Protein !!!'.format(chain_normalized)
print(info)
wts = np.array([i.mass for i in self.get_atoms])
wts = np.expand_dims(wts,axis=1)
pos = self.get_coords
am, self.rotate_matrix, self.center_of_mass_shift = coordinate_adjusting(pos, wts)
for ix, a in enumerate(self.get_atoms):
a.coord = am[ix]
self.__normalized__ = True
@property
def get_incomplete_residues(self):
res_list = []
for i in self.chains:
res_list += self.chains[i].get_incomplete_residues
return res_list
@property
def to_heavy_string(self):
return '\n'.join([res.to_heavy_string for res in self.get_residues])
@property
def to_string(self):
return '\n'.join([res.to_string for res in self.get_residues])
@property
def get_residues(self,):
res_list = []
for i in self.chains:
res_list += self.chains[i].get_residues
return res_list
@property
def get_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_atoms)
return atoms
@property
def get_heavy_atoms(self):
atoms = []
for res in self.get_residues:
atoms.extend(res.get_heavy_atoms)
return atoms
@property
def get_coords(self):
return np.array([i.coord for i in self.get_atoms])
@property
def get_heavy_coords(self):
return np.array([i.coord for i in self.get_heavy_atoms])
@property
def center_of_mass(self):
atom_mass = np.array([i.mass for i in self.get_atoms]).reshape(-1, 1)
return np.sum(self.get_coords*atom_mass, axis=0)/atom_mass.sum()
@property
def bond_graph(self):
res_list = self.get_residues
bond_index = []
bond_type = []
N_term_list = []
C_term_list = []
cusum = 0
for ix, res in enumerate(res_list):
e_idx, e_type = res.bond_graph
bond_index.append(e_idx + cusum)
bond_type.append(e_type)
N_term_ix = [i.name for i in self.get_heavy_atoms].index('N') + cusum
C_term_ix = [i.name for i in self.get_heavy_atoms].index('C') + cusum
N_term_list.append(N_term_ix)
C_term_list.append(C_term_ix)
cusum += res.get_heavy_coords.shape[0]
if ix != 0:
if res.idx-res_list[ix-1].idx == 1 and res.chain == res_list[ix-1].chain:
bond_idx_between_res = np.array(
[[N_term_ix, C_term_list[ix-1]],[C_term_list[ix-1],N_term_ix]],
dtype=np.long
)
bond_index.append(bond_idx_between_res)
bond_type_between_res = np.array([1,1], dtype=np.long)
bond_type.append(bond_type_between_res)
bond_index = np.concatenate(bond_index, axis=1)
bond_type = np.concatenate(bond_type)
return bond_index, bond_type
@property
def centroid(self):
return self.get_coords.mean(axis=0)
def get_chain(self, chain_id):
return self.chains[chain_id]
def get_res_by_id(self, chain_id, res_id):
return self.chains[chain_id].get_res_by_id(res_id)
def get_atom_dict(self, removeHs=True, get_surf=False):
atom_dict = {
'element': [],
'pos': [],
'is_backbone': [],
'atom_name': [],
'atom_to_aa_type': []
}
for a in self.get_atoms:
if a.element == 'H' and removeHs:
continue
atom_dict['element'].append(a.to_dict['element'])
atom_dict['pos'].append(a.to_dict['pos'])
atom_dict['is_backbone'].append(a.to_dict['is_backbone'])
atom_dict['atom_name'].append(a.to_dict['atom_name'])
atom_dict['atom_to_aa_type'].append(a.to_dict['atom_to_aa_type'])
atom_dict['element'] = np.array(atom_dict['element'], dtype=np.long)
atom_dict['pos'] = np.array(atom_dict['pos'], dtype=np.float32)
atom_dict['is_backbone'] = np.array(atom_dict['is_backbone'], dtype=np.bool)
#atom_dict['atom_name'] = atom_dict['atom_name']
if get_surf:
atom_dict['surface_mask'] = np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)#self.get_surf_mask()
atom_dict['atom_to_aa_type'] = np.array(atom_dict['atom_to_aa_type'], dtype=np.long)
atom_dict['molecule_name'] = None
protein_bond_index, protein_bond_type = self.bond_graph
atom_dict['bond_index'] = protein_bond_index
atom_dict['bond_type'] = protein_bond_type
atom_dict['filename'] = self.pdb_file
return atom_dict
def get_backbone_dict(self, removeHs=True):
atom_dict = self.get_atom_dict(removeHs=removeHs)
backbone_dict = {}
backbone_dict['element'] = atom_dict['element'][atom_dict['is_backbone']]
backbone_dict['pos'] = atom_dict['pos'][atom_dict['is_backbone']]
backbone_dict['is_backbone'] = np.ones(atom_dict['is_backbone'].sum(), dtype=np.bool)
backbone_dict['atom_name'] = np.array(atom_dict['atom_name'])[atom_dict['is_backbone']].tolist()
backbone_dict['atom_to_aa_type'] = atom_dict['atom_to_aa_type'][atom_dict['is_backbone']]
backbone_dict['molecule_name'] = atom_dict['molecule_name']
atom_dict['bond_index'] = np.empty([0,2], dtype=np.long)
atom_dict['bond_type'] = np.empty(0, dtype=np.long)
atom_dict['filename'] = self.pdb_file
return backbone_dict
@property
def get_backbone(self):
atoms = []
for res in self.get_residues:
bkb = [a for a in res.get_atoms if a.name in BACKBONE_SYMBOL]
atoms += bkb
return atoms
def compute_surface_atoms(self):
"""
If the pdb_file is a pocket_file, the surface atoms is not correct!
"""
pymol.cmd.load(self.pdb_file)
path, filename = os.path.split(self.pdb_file)
if path == '':
path = './'
sele = filename.split('.')[0]
pymol.cmd.remove("({}) and hydro".format(sele))
name = pymol.util.find_surface_atoms(sele=sele, _self=pymol.cmd)
save_name = path + '/' + sele + '-surface.pdb'
pymol.cmd.save(save_name,((name)))
surf_protein = Protein(save_name, ignore_incomplete_res=False)
surf_res_dict = {r.idx:r for r in surf_protein.get_residues}
for res in self.get_residues:
res_idx = res.idx
if res_idx in surf_res_dict:
for a in surf_res_dict[res_idx].get_heavy_atoms:
res.atom_dict[a.name].is_surf = True
self.has_surf_atom = True
os.remove(save_name)
def get_surf_mask(self):
if self.has_surf_atom is False:
self.compute_surface_atoms()
return np.array([a.is_surf for a in self.get_heavy_atoms], dtype=np.bool)
@staticmethod
def empty_dict():
empty_pocket_dict = {}
empty_pocket_dict = Dict(empty_pocket_dict)
empty_pocket_dict.element = np.empty(0, dtype=np.int64)
empty_pocket_dict.pos = np.empty([0,3], dtype=np.float32)
empty_pocket_dict.is_backbone = np.empty(0, dtype=bool)
empty_pocket_dict.atom_name = []
empty_pocket_dict.atom_to_aa_type = np.empty(0, dtype=np.int64)
empty_pocket_dict.molecule_name = None
empty_pocket_dict.bond_index = np.empty([2,0], dtype=np.int64)
empty_pocket_dict.bond_type = np.empty(0, dtype=np.int64)
empty_pocket_dict.filename = None
return empty_pocket_dict
def __repr__(self):
num_res = 0
num_atom = 0
for i in self.chains:
res_list = list(self.chains[i].residues.values())
num_res += len(res_list)
for i in res_list:
num_atom += len(i.get_heavy_atoms)
num_incomp = len(self.get_incomplete_residues)
tmp = 'Name={}, NumChains={}, NumResidues={}, NumHeavyAtoms={}, NumIncompleteRes={}'
info = tmp.format(
self.name, len(self.chains), num_res, num_atom, num_incomp
)
return '{}({})'.format(self.__class__.__name__, info)
####################################################################################################################
ATOM_FAMILIES = ['Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe', 'NegIonizable', 'PosIonizable', 'ZnBinder']
ATOM_FAMILIES_ID = {s: i for i, s in enumerate(ATOM_FAMILIES)}
BOND_TYPES = {t: i for i, t in enumerate(BondType.names.values())}
BOND_NAMES = {i: t for i, t in enumerate(BondType.names.keys())}
def is_in_ring(mol):
d = {a:np.array([], dtype=np.int64) for a in range(len(mol.GetAtoms()))}
rings = Chem.GetSymmSSSR(mol)
for a in d:
for r_idx, ring in enumerate(rings):
if a in ring:
d[a] = np.append(d[a], r_idx+1)
else:
d[a] = np.append(d[a], -a)
return d
def parse_sdf_to_dict(mol_file):
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
Chem.Kekulize(rdmol)
ring_info = is_in_ring(rdmol)
conformer = rdmol.GetConformer()
feat_mat = np.zeros([rdmol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
for feat in factory.GetFeaturesForMol(rdmol):
feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
element, pos, atom_mass = [], [], []
for a in rdmol.GetAtoms():
element.append(a.GetAtomicNum())
pos.append(conformer.GetAtomPosition(a.GetIdx()))
atom_mass.append(a.GetMass())
element = np.array(element, dtype=np.int64)
pos = np.array(pos, dtype=np.float32)
atom_mass = np.array(atom_mass, np.float32)
center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
edge_index, edge_type = [], []
for b in rdmol.GetBonds():
row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
edge_index.extend([row, col])
edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
edge_index = np.array(edge_index)
edge_index_perm = edge_index[:,0].argsort()
edge_index = edge_index[edge_index_perm].T
edge_type = np.array(edge_type)[edge_index_perm]
return {'element': element,
'pos': pos,
'bond_index': edge_index,
'bond_type': edge_type,
'center_of_mass': center_of_mass,
'atom_feature': feat_mat,
'ring_info': ring_info,
'filename':mol_file
}
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
class Ligand(object):
def __init__(self, mol_file, removeHs=True, sanitize=True):
if isinstance(mol_file, Chem.rdchem.Mol):
mol = mol_file
self.name = None
self.lig_file = None
else:
mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=sanitize)
if mol is None:
mol = Chem.MolFromMolFile(mol_file, removeHs=removeHs, sanitize=False)
mol.UpdatePropertyCache(strict=False)
Chem.SanitizeMol(
mol, ## 如果报错可以尝试Chem.rdmolops.SanitizeFlags.SANITIZE_FINDRADICALS
Chem.SanitizeFlags.SANITIZE_FINDRADICALS|\
Chem.SanitizeFlags.SANITIZE_KEKULIZE|\
Chem.SanitizeFlags.SANITIZE_SETAROMATICITY| \
Chem.SanitizeFlags.SANITIZE_SETCONJUGATION| \
Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION| \
Chem.SanitizeFlags.SANITIZE_SYMMRINGS,
catchErrors=True
)
self.name = mol_file.split('/')[-1].split('.')[0]
self.lig_file = mol_file
Chem.Kekulize(mol)
self.mol = mol
#self.self_loop = self_loop
self.num_atoms = len(self.mol.GetAtoms())
self.normalized_coords = None
def normalize_pos(self, shift_vector, rotate_matrix):
conformer = self.mol.GetConformer()
coords = np.array([conformer.GetAtomPosition(a.GetIdx()) for a in self.mol.GetAtoms()])
coords = (coords - shift_vector)@rotate_matrix
for ix, pos in enumerate(coords):
conformer.SetAtomPosition(ix, pos)
self.normalized_coords = coords
def mol_block(self):
return Chem.MolToMolBlock(self.mol)
def to_dict(self):
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
#rdmol = next(iter(Chem.SDMolSupplier(mol_file, removeHs=True)))
ring_info = is_in_ring(self.mol)
conformer = self.mol.GetConformer()
feat_mat = np.zeros([self.mol.GetNumAtoms(), len(ATOM_FAMILIES)], dtype=np.int64)
for feat in factory.GetFeaturesForMol(self.mol):
feat_mat[feat.GetAtomIds(), ATOM_FAMILIES_ID[feat.GetFamily()]] = 1
element, pos, atom_mass = [], [], []
for a in self.mol.GetAtoms():
element.append(a.GetAtomicNum())
pos.append(conformer.GetAtomPosition(a.GetIdx()))
atom_mass.append(a.GetMass())
element = np.array(element, dtype=np.int64)
pos = np.array(pos, dtype=np.float32)
atom_mass = np.array(atom_mass, np.float32)
center_of_mass = (pos * atom_mass.reshape(-1,1)).sum(0)/atom_mass.sum()
edge_index, edge_type = [], []
for b in self.mol.GetBonds():
row = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
col = [b.GetEndAtomIdx(), b.GetBeginAtomIdx()]
edge_index.extend([row, col])
edge_type.extend([BOND_TYPES[b.GetBondType()]] * 2)
edge_index = np.array(edge_index)
edge_index_perm = edge_index[:,0].argsort()
edge_index = edge_index[edge_index_perm].T
edge_type = np.array(edge_type)[edge_index_perm]
return {'element': element,
'pos': pos,
'bond_index': edge_index,
'bond_type': edge_type,
'center_of_mass': center_of_mass,
'atom_feature': feat_mat,
'ring_info': ring_info,
'filename':self.lig_file
}
@staticmethod
def empty_dict():
empty_ligand_dict = {}
empty_ligand_dict = Dict(empty_ligand_dict)
empty_ligand_dict.element = np.empty(0, dtype=np.int64)
empty_ligand_dict.pos = np.empty([0,3], dtype=np.float32)
empty_ligand_dict.bond_index = np.empty([2,0], dtype=np.int64)
empty_ligand_dict.bond_type = np.empty(0, dtype=np.int64)
empty_ligand_dict.center_of_mass = np.empty([0,3], dtype=np.float32)
empty_ligand_dict.atom_feature = np.empty([0,8], dtype=np.float32)
empty_ligand_dict.ring_info = {}
empty_ligand_dict.filename = None
return empty_ligand_dict
def __repr__(self):
tmp = 'Name={}, NumAtoms={}'
info = tmp.format(self.name, self.num_atoms)
return '{}({})'.format(self.__class__.__name__, info)