mirror of
https://github.com/jertubiana/ScanNet.git
synced 2026-06-04 13:44:22 +08:00
312 lines
12 KiB
Python
312 lines
12 KiB
Python
import os
|
|
import Bio.PDB
|
|
import warnings
|
|
from utilities.paths import structures_folder
|
|
|
|
from urllib.request import urlretrieve
|
|
from urllib.request import urlcleanup
|
|
import gzip
|
|
|
|
|
|
|
|
def is_PDB_identifier(str):
    """Return True if `str` has the shape of a PDB identifier.

    The check is purely syntactic: exactly four characters, all of them
    alphanumeric (e.g. ``'1a3x'``).
    """
    # Both conditions are evaluated unconditionally before being combined.
    has_four_characters = len(str) == 4
    is_alphanumeric = str.isalnum()
    return has_four_characters and is_alphanumeric
|
|
|
|
def is_UniProt_identifier(str):
    """Return True if `str` has the layout of a UniProt accession number.

    Checks performed:

    * the length is 6 or 10 characters;
    * every character is alphanumeric and there is no lowercase letter;
    * the first character is a letter and the sixth is a digit;
    * for 10-character accessions, additionally: the second character is a
      digit, the seventh is a letter and the last one is a digit.

    :param str: candidate identifier (e.g. ``'Q8WZ42'`` or ``'A0A024R1R8'``).
    :return: True when all checks pass, False otherwise.
    :rtype: bool
    """
    L = len(str)
    if L not in (6, 10):
        return False

    valid_uniprot_id = (
        str.isalnum()
        and str.upper() == str
        and str[0].isalpha()
        and str[5].isnumeric()
    )
    if L == 10:
        # The previous implementation stored `str[1].isnumeric()` in a
        # variable called `last_is_digit`, so the final character was never
        # actually tested. Keep the second-character test (no previously
        # rejected string becomes accepted) and add the missing last one.
        valid_uniprot_id = (
            valid_uniprot_id
            and str[1].isnumeric()
            and str[6].isalpha()
            and str[-1].isnumeric()
        )
    return valid_uniprot_id
|
|
|
|
|
|
|
|
#%% Function for downloading biounit files.
|
|
|
|
class myPDBList(Bio.PDB.PDBList):
    """``Bio.PDB.PDBList`` variant that also fetches biounits and AlphaFold models.

    ``retrieve_pdb_file`` is overridden so that, in addition to the regular
    formats, it accepts the ``'biounit'`` / ``'biounit_mmCif'`` formats for PDB
    identifiers and accepts UniProt identifiers, which are downloaded from
    ``self.alphafold_server``.
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent class with ``pdb`` forced to ``structures_folder``.

        Any ``pdb`` keyword supplied by the caller is overridden.
        """
        kwargs['pdb'] = structures_folder
        super().__init__(*args, **kwargs)
        self.alphafold_server = 'https://alphafold.ebi.ac.uk/'  # entry/Q13469
        # When True, files are written directly in the target folder instead
        # of a PDB-style sub-directory named after code[1:3].
        self.flat_tree = True

    def retrieve_pdb_file(self, code, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch a structure file from the PDB or AlphaFold server and store it locally.

        The structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        NOTE. The default download format has changed from PDB to PDBx/mmCif

        :param code: pdb or uniprot ID
            PDB code: 4-symbols structure Id from PDB (e.g. 3J92).
            Uniprot ID: 6 or 10 symbols (e.g. Q8WZ42).
        :type code: string

        :param file_format:
            File format. Available options for PDB codes:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)
            * "biounit" (biological assembly, format PDB)
            * "biounit_mmCif" (biological assembly, format mmCif)

            Only "pdb" and "mmCif" are accepted for Uniprot IDs.
        :type file_format: string

        :param overwrite: if set to True, existing structure files will be overwritten. Default: False
        :type overwrite: bool

        :param obsolete:
            Has a meaning only for obsolete structures. If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            This option doesn't work for mmtf format as obsoleted structures aren't stored in mmtf.
            Also doesn't have meaning when parameter pdir is specified.
            Note: make sure that you are about to download the really obsolete structure.
            Trying to download non-obsolete structure into obsolete folder will not work
            and you face the "structure doesn't exists" error.
            Default: False
        :type obsolete: bool

        :param pdir: put the file in this directory (default: the local PDB folder of this object)
        :type pdir: string

        :return: filename, or None when the download failed
        :rtype: string

        :raises ValueError: if ``code`` is neither a PDB nor a Uniprot ID, or if
            ``file_format`` is not supported for a PDB code.
        """
        file_format = self._print_default_format_warning(
            file_format)  # Deprecation warning

        is_pdb = is_PDB_identifier(code)
        is_uniprot = is_UniProt_identifier(code)
        if not (is_pdb or is_uniprot):
            # The identifier was previously never interpolated into the message.
            raise ValueError('Identifier %s is neither a valid PDB or Uniprot ID' % (code,))

        if is_pdb:
            code = code.lower()
            # Name of the compressed file on the server, per format.
            archive = {'pdb': 'pdb%s.ent.gz', 'mmCif': '%s.cif.gz', 'xml': '%s.xml.gz', 'mmtf': '%s',
                       'bundle': '%s-pdb-bundle.tar.gz', 'biounit': '%s.pdb1.gz',
                       'biounit_mmCif': '%s-assembly1.cif.gz'}
            # Validate BEFORE indexing the dictionary; the check used to come
            # after the lookup (so it was unreachable) and raised a bare string.
            if file_format not in archive:
                raise ValueError(
                    "Specified file_format %s doesn't exists or is not supported. Maybe a typo. "
                    "Please, use one of the following: %s" % (file_format, ', '.join(archive)))
            archive_fn = archive[file_format] % code

            if file_format in ('pdb', 'mmCif', 'xml'):
                pdb_dir = "divided" if not obsolete else "obsolete"
                file_type = "pdb" if file_format == "pdb" else "mmCIF" if file_format == "mmCif" else "XML"
                url = (self.pdb_server + '/pub/pdb/data/structures/%s/%s/%s/%s' %
                       (pdb_dir, file_type, code[1:3], archive_fn))
            elif file_format == 'bundle':
                url = (self.pdb_server + '/pub/pdb/compatible/pdb_bundle/%s/%s/%s' %
                       (code[1:3], code, archive_fn))
            elif file_format == 'biounit':
                url = (self.pdb_server + '/pub/pdb/data/biounit/PDB/divided/%s/%s' %
                       (code[1:3], archive_fn))
            elif file_format == 'biounit_mmCif':
                url = (self.pdb_server + '/pub/pdb/data/biounit/mmCIF/divided/%s/%s' %
                       (code[1:3], archive_fn))
            else:
                url = ('http://mmtf.rcsb.org/v1.0/full/%s' % code)

            # Name of the decompressed file stored locally, per format.
            final = {'pdb': 'pdb%s.ent', 'mmCif': '%s.cif', 'xml': '%s.xml',
                     'mmtf': '%s.mmtf', 'bundle': '%s-pdb-bundle.tar', 'biounit': 'pdb%s.bioent',
                     'biounit_mmCif': '%s_bioentry.cif'}
        else:
            assert file_format in ['pdb', 'mmCif'], \
                "Only the 'pdb' and 'mmCif' formats are supported for Uniprot identifiers"
            # NOTE(review): alphafold_server already ends with '/', so this URL
            # contains a double slash, and the model version is hard-coded to
            # v1 -- confirm both against the current AlphaFold DB file layout.
            url = self.alphafold_server + '/files/AF-%s-F1-model_v1%s' % (
                code, '.pdb' if file_format == 'pdb' else '.cif')
            archive_fn = url.split('/')[-1]
            final = {'pdb': 'AF_%s.pdb', 'mmCif': 'AF_%s.cif'}

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        # exist_ok avoids the check-then-create race of the former os.access test.
        os.makedirs(path, exist_ok=True)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, final[file_format] % code)

        # Skip download if the file already exists
        if not overwrite and os.path.exists(final_file):
            if self._verbose:
                print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file
        if self._verbose:
            print("Downloading PDB structure '%s'..." % code)
        try:
            urlcleanup()
            urlretrieve(url, filename)
        except IOError:
            # Callers rely on the None return value to try another format.
            print("Desired structure doesn't exists")
            return None

        if is_pdb:
            # PDB server files are gzip-compressed: decompress, then drop the archive.
            with gzip.open(filename, 'rb') as gz, open(final_file, 'wb') as out:
                out.writelines(gz)
            os.remove(filename)
        else:
            # AlphaFold files are downloaded uncompressed; os.replace also
            # works when overwrite=True and the target already exists.
            os.replace(filename, final_file)
        return final_file
|
|
|
|
|
|
|
|
class ChainSelect(Bio.PDB.Select):
    """Selector that keeps only the requested chains of a structure.

    ``selected_chains`` is either one of the keywords ``'all'``, ``'upper'``,
    ``'lower'``, or a collection of ``(model id, chain id)`` pairs.
    """

    def __init__(self, selected_chains, *args, **kwargs):
        self.selected_chains = selected_chains
        super().__init__(*args, **kwargs)

    def accept_model(self, model):
        """Return 1 if `model` may contain selected chains, 0 otherwise."""
        # Keyword selections apply to every model.
        if self.selected_chains in ['upper', 'lower', 'all']:
            return 1
        wanted_models = [pair[0] for pair in self.selected_chains]
        return 1 if model.id in wanted_models else 0

    def accept_chain(self, chain):
        """Return 1 if `chain` belongs to the selection, 0 otherwise."""
        selection = self.selected_chains
        if selection == 'all':
            return 1
        if selection == 'upper':
            # A blank chain identifier is accepted together with uppercase ones.
            chain_id = chain.get_full_id()[2]
            return int(chain_id.isupper() or chain_id == ' ')
        if selection == 'lower':
            return int(chain.get_full_id()[2].islower())
        full_id = chain.get_full_id()
        # Entries 1 and 2 of the full id are matched against the
        # (model id, chain id) pairs of the selection.
        return 1 if (full_id[1], full_id[2]) in selection else 0
|
|
|
|
|
|
def parse_str(str):
    """Split ``'<structure>_<chains>'`` into a structure identifier and chain list.

    The text after the last underscore is taken as the chain specification,
    unless it contains a dot: the whole string is then assumed to be a file
    path (with extension) that happens to contain an underscore.

    Chain specification syntax:

    * ``'AB'``        -> ``[(0, 'A'), (0, 'B')]`` (one chain per character, model 0);
    * ``'0-A+1-B'``   -> ``[(0, 'A'), (1, 'B')]`` (``model-chain`` items joined by ``+``);
    * ``'A+1-B'``     -> ``[(0, 'A'), (1, 'B')]`` (model defaults to 0).

    :return: ``(structure_identifier, chain_identifiers)`` where
        ``chain_identifiers`` is ``'all'`` when no chain is specified.
    """
    prefix, underscore, suffix = str.rpartition('_')

    # No underscore at all, or an underscore that is part of a file name.
    if not underscore or '.' in suffix:
        return str, 'all'

    if '+' in suffix or '-' in suffix:
        chain_identifiers = []
        for item in suffix.split('+'):
            if '-' in item:
                fields = item.split('-')
                chain_identifiers.append((int(fields[0]), fields[1]))
            else:
                chain_identifiers.append((0, item))
    else:
        chain_identifiers = [(0, letter) for letter in suffix]
    return prefix, chain_identifiers
|
|
|
|
|
|
def format_chain_id(x):
    """Render chain pairs as a ``'+'``-joined string of ``first-second`` items.

    Example: ``[(0, 'A'), (1, 'B')]`` -> ``'0-A+1-B'``.
    """
    return '+'.join('%s-%s' % (pair[0], pair[1]) for pair in x)
|
|
|
|
|
|
|
|
def getPDB(identifier_string, biounit=True, structures_folder=structures_folder, verbose=True):
    """Locate a structure file on disk, downloading it first if necessary.

    :param identifier_string: a PDB ID, a UniProt ID or a file path, optionally
        followed by ``_<chains>`` (see ``parse_str``).
    :param biounit: for PDB IDs, use the biological assembly file rather than
        the regular entry. Forced to False for UniProt IDs.
    :param structures_folder: folder searched for existing files and used as
        download target.
    :param verbose: forwarded to ``myPDBList``.
    :return: ``(location, chain)``; ``chain`` is the chain selection returned by
        ``parse_str`` and ``location`` the path of the structure file, or None
        when every download attempt failed.
    :raises AssertionError: if the identifier is a file path that does not exist.
    """
    structure_id, chain = parse_str(identifier_string)
    is_pdb = is_PDB_identifier(structure_id)
    is_uniprot = is_UniProt_identifier(structure_id)
    if is_uniprot:
        biounit = False

    # Neither kind of identifier: treat the string as a path to a local file.
    if not (is_pdb or is_uniprot):
        location = structure_id
        assert os.path.exists(location), 'File not found'
        return location, chain

    # File names under which the structure may already be stored, PDB format
    # first and mmCIF second (same names as written by retrieve_pdb_file).
    if is_pdb:
        pdb_id = structure_id.lower()
        if biounit:
            candidate_names = ('pdb' + pdb_id + '.bioent', pdb_id + '_bioentry.cif')
        else:
            candidate_names = ('pdb' + pdb_id + '.ent', pdb_id + '.cif')
    else:
        candidate_names = ('AF_' + structure_id + '.pdb', 'AF_' + structure_id + '.cif')

    for name in candidate_names:
        # os.path.join (instead of plain string concatenation) also works when
        # structures_folder has no trailing separator, and matches how
        # retrieve_pdb_file builds the path of the files it saves.
        candidate = os.path.join(structures_folder, name)
        if os.path.exists(candidate):
            return candidate, chain

    # Nothing on disk: download, trying the PDB format before mmCIF.
    pdb_downloader = myPDBList(verbose=verbose)
    file_formats = ('biounit', 'biounit_mmCif') if biounit else ('pdb', 'mmCif')
    location = None
    for file_format in file_formats:
        location = pdb_downloader.retrieve_pdb_file(
            structure_id, pdir=structures_folder, file_format=file_format)
        if location is not None:
            break
    return location, chain
|
|
|
|
|
|
def extract_chains(location, chains, final_location):
    """Write the selected chains of a structure file to `final_location`.

    :param location: path of the input structure; parsed as mmCIF when it ends
        with ``.cif`` and as PDB otherwise.
    :param chains: ``'all'`` (the file is copied unchanged), or a chain
        selection understood by ``ChainSelect``: ``'upper'``, ``'lower'`` or a
        list of ``(model, chain)`` pairs.
    :param final_location: path of the output file.
    :return: `final_location`.
    """
    if chains == 'all':
        import shutil  # local import: only needed by this branch

        # Plain file copy. The former `os.system('scp ...')` call broke on
        # paths containing spaces or shell metacharacters and silently ignored
        # failures.
        try:
            shutil.copy(location, final_location)
        except shutil.SameFileError:
            pass  # source and destination are the same file: nothing to do
        return final_location

    # Warnings emitted while parsing/writing are recorded instead of displayed.
    with warnings.catch_warnings(record=True):
        if location.endswith('.cif'):
            parser = Bio.PDB.MMCIFParser()
        else:
            parser = Bio.PDB.PDBParser()
        struct = parser.get_structure('name', location)
        io = Bio.PDB.PDBIO()

        # `and` is required here: the former `isinstance(...) & len(chains)==1`
        # was parsed as `(isinstance(...) & len(chains)) == 1`, which is true
        # for ANY list of odd length, so e.g. a 3-chain selection kept only
        # its first chain.
        if isinstance(chains, list) and len(chains) == 1:
            model, chain = chains[0]
            chain_obj = struct[model][chain]
            if len(chain) > 1:
                # Shorten multi-character chain identifiers to their first character.
                chain_obj.id = chain[0]
            io.set_structure(chain_obj)
            for atom in Bio.PDB.Selection.unfold_entities(io.structure, 'A'):
                atom.disordered_flag = 0
            io.save(final_location)
        else:
            io.set_structure(struct)
            io.save(final_location, ChainSelect(chains))
    return final_location
|