Files
ScanNet/preprocessing/PDBio.py
Jérôme Tubiana 3e216d66f1 Initial commit
2021-08-18 11:18:29 +02:00

312 lines
12 KiB
Python

import os
import Bio.PDB
import warnings
from utilities.paths import structures_folder
from urllib.request import urlretrieve
from urllib.request import urlcleanup
import gzip
def is_PDB_identifier(str):
    """Tell whether `str` has the shape of a PDB entry code.

    The test is purely syntactic: the string must be exactly four
    characters long and made only of alphanumeric characters.

    :param str: candidate identifier
    :type str: string
    :return: True if the string looks like a PDB code, False otherwise
    :rtype: bool
    """
    has_four_chars = len(str) == 4
    all_alphanumeric = str.isalnum()
    return has_four_chars and all_alphanumeric
def is_UniProt_identifier(str):
    """Tell whether `str` has the shape of a UniProt accession.

    The test is purely syntactic. A string is accepted when:

    * it is 6 or 10 characters long,
    * it is alphanumeric and contains no lowercase letter,
    * its first character is a letter and its sixth character is a digit,
    * and, for 10-character strings only, its second character is a digit,
      its seventh character is a letter and its last character is a digit.

    :param str: candidate identifier
    :type str: string
    :return: True if the string looks like a UniProt accession, False otherwise
    :rtype: bool
    """
    length = len(str)
    if length not in (6, 10):
        return False

    valid_uniprot_id = (
        str.isalnum()
        and str.upper() == str
        and str[0].isalpha()
        and str[5].isnumeric()
    )
    if valid_uniprot_id and length == 10:
        valid_uniprot_id = (
            # Second-character check was already enforced for 10-character
            # strings; kept so that nothing previously rejected is accepted.
            str[1].isnumeric()
            and str[6].isalpha()
            # The previous code tested index 1 here, so the last character
            # was never actually validated.
            and str[-1].isnumeric()
        )
    return valid_uniprot_id
#%% PDBList subclass for downloading PDB, biounit and AlphaFold structure files.
class myPDBList(Bio.PDB.PDBList):
    """`Bio.PDB.PDBList` variant used to fetch structure files.

    Compared to the parent class, `retrieve_pdb_file` additionally accepts
    the 'biounit' and 'biounit_mmCif' formats for PDB codes, and accepts
    UniProt identifiers, for which a model is fetched from the AlphaFold
    server. Files are stored in `structures_folder`, in a flat layout.
    """

    PDB_REF = """
The Protein Data Bank: a computer-based archival file for macromolecular structures.
F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
J. Mol. Biol. 112 pp. 535-542 (1977)
http://www.pdb.org/.
"""

    def __init__(self, *args, **kwargs):
        """Build the downloader; the `pdb` keyword is always forced to `structures_folder`."""
        kwargs['pdb'] = structures_folder
        super().__init__(*args, **kwargs)
        self.alphafold_server = 'https://alphafold.ebi.ac.uk/'  # entry/Q13469
        # When True, files are not placed in per-code sub-directories.
        self.flat_tree = True

    def retrieve_pdb_file(self, code, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch a structure file from the PDB or AlphaFold server and store it locally.

        The structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        :param code: pdb or uniprot ID
            PDB code: 4-symbols structure Id from PDB (e.g. 3J92).
            Uniprot ID: 6 or 10 symbols (e.g. Q8WZ42).
        :type code: string

        :param file_format:
            File format. Available options for PDB codes:

            * "mmCif" (PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)
            * "biounit" (first biological assembly, format PDB)
            * "biounit_mmCif" (first biological assembly, format mmCif)

            Only "pdb" and "mmCif" are available for Uniprot IDs.
            The value is first passed through the inherited
            ``_print_default_format_warning`` helper (presumably it supplies
            the default format when None is given - confirm against the
            installed Biopython version).
        :type file_format: string

        :param overwrite: if set to True, existing structure files will be overwritten. Default: False
        :type overwrite: bool

        :param obsolete:
            Has a meaning only for obsolete structures. If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            Also doesn't have meaning when parameter pdir is specified.
            Default: False
        :type obsolete: bool

        :param pdir: put the file in this directory (default: the downloader's local folder)
        :type pdir: string

        :return: path of the local file, or None when the download failed
        :rtype: string

        :raises ValueError: if `code` is neither a PDB nor a Uniprot identifier,
            or if `file_format` is not supported for a PDB code.
        """
        file_format = self._print_default_format_warning(
            file_format)  # Deprecation warning

        is_pdb = is_PDB_identifier(code)
        is_uniprot = is_UniProt_identifier(code)
        if not (is_pdb or is_uniprot):
            raise ValueError('Identifier %s is neither a valid PDB or Uniprot ID' % code)

        if is_pdb:
            code = code.lower()
            # Name of the compressed file on the server, per format.
            archive = {'pdb': 'pdb%s.ent.gz', 'mmCif': '%s.cif.gz', 'xml': '%s.xml.gz', 'mmtf': '%s',
                       'bundle': '%s-pdb-bundle.tar.gz', 'biounit': '%s.pdb1.gz',
                       'biounit_mmCif': '%s-assembly1.cif.gz'}
            # Validate before indexing, otherwise a bare KeyError hides the message.
            if file_format not in archive:
                raise ValueError(
                    "Specified file_format %s doesn't exists or is not supported. Maybe a typo. "
                    "Please, use one of the following: mmCif, pdb, xml, mmtf, bundle, biounit, "
                    "biounit_mmCif" % file_format)
            archive_fn = archive[file_format] % code

            if file_format in ('pdb', 'mmCif', 'xml'):
                pdb_dir = "divided" if not obsolete else "obsolete"
                file_type = "pdb" if file_format == "pdb" else "mmCIF" if file_format == "mmCif" else "XML"
                url = (self.pdb_server + '/pub/pdb/data/structures/%s/%s/%s/%s' %
                       (pdb_dir, file_type, code[1:3], archive_fn))
            elif file_format == 'bundle':
                url = (self.pdb_server + '/pub/pdb/compatible/pdb_bundle/%s/%s/%s' %
                       (code[1:3], code, archive_fn))
            elif file_format == 'biounit':
                url = (self.pdb_server + '/pub/pdb/data/biounit/PDB/divided/%s/%s' %
                       (code[1:3], archive_fn))
            elif file_format == 'biounit_mmCif':
                url = (self.pdb_server + '/pub/pdb/data/biounit/mmCIF/divided/%s/%s' %
                       (code[1:3], archive_fn))
            else:
                url = ('http://mmtf.rcsb.org/v1.0/full/%s' % code)

            # Name of the decompressed local file, per format.
            final = {'pdb': 'pdb%s.ent', 'mmCif': '%s.cif', 'xml': '%s.xml',
                     'mmtf': '%s.mmtf', 'bundle': '%s-pdb-bundle.tar', 'biounit': 'pdb%s.bioent',
                     'biounit_mmCif': '%s_bioentry.cif'}
        else:
            # Uniprot identifier: fetch the AlphaFold model instead.
            assert file_format in ['pdb', 'mmCif'], (
                "Only the 'pdb' and 'mmCif' formats are available for Uniprot identifiers")
            extension = '.pdb' if file_format == 'pdb' else '.cif'
            # The server attribute ends with '/'; strip it to avoid a double slash.
            url = self.alphafold_server.rstrip('/') + '/files/AF-%s-F1-model_v1%s' % (code, extension)
            archive_fn = url.split('/')[-1]
            final = {'pdb': 'AF_%s.pdb', 'mmCif': 'AF_%s.cif'}

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, final[file_format] % code)

        # Skip download if the file already exists
        if not overwrite and os.path.exists(final_file):
            if self._verbose:
                print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file
        if self._verbose:
            print("Downloading PDB structure '%s'..." % code)
        try:
            urlcleanup()
            urlretrieve(url, filename)
        except IOError:
            # Callers use the None return value to fall back to another format.
            print("Desired structure doesn't exists")
            return None

        if is_pdb:
            # Files downloaded for PDB codes are decompressed, then the archive is deleted.
            with gzip.open(filename, 'rb') as gz, open(final_file, 'wb') as out:
                out.writelines(gz)
            os.remove(filename)
        else:
            # AlphaFold files are only renamed; os.replace overwrites an
            # existing target on every platform.
            os.replace(filename, final_file)
        return final_file
class ChainSelect(Bio.PDB.Select):
    """Selector that keeps only the requested models and chains when saving.

    `selected_chains` is either one of the keywords 'all', 'upper' or
    'lower', or a collection of (model id, chain id) pairs.
    """

    def __init__(self, selected_chains, *args, **kwargs):
        """Store the selection and forward any other argument to the parent class."""
        self.selected_chains = selected_chains
        super().__init__(*args, **kwargs)

    def accept_model(self, model):
        """Return 1 if the model should be written, 0 otherwise."""
        selection = self.selected_chains
        # Keyword selections do not restrict models.
        if selection in ['upper', 'lower', 'all']:
            return 1
        requested_models = [entry[0] for entry in selection]
        return int(model.id in requested_models)

    def accept_chain(self, chain):
        """Return 1 if the chain should be written, 0 otherwise."""
        selection = self.selected_chains
        if selection == 'all':
            return 1
        if selection == 'upper':
            # Upper-case chain identifiers, or the blank identifier.
            chain_id = chain.get_full_id()[2]
            return int(chain_id.isupper() or chain_id == ' ')
        if selection == 'lower':
            return int(chain.get_full_id()[2].islower())
        # Explicit selection: match on the (model id, chain id) pair.
        full_id = chain.get_full_id()
        return int((full_id[1], full_id[2]) in selection)
def parse_str(str):
    """Split an identifier string into a structure identifier and a chain selection.

    The text after the last underscore is read as the chain specification,
    unless it contains a dot, in which case the whole string is taken to be
    a file path (with extension) that merely contains an underscore.

    Chain specification syntax:

    * ``AB``        -> chains A and B of model 0
    * ``0-A+1-B``   -> chain A of model 0 and chain B of model 1
    * ``A+1-B``     -> an entry without '-' refers to model 0

    :param str: identifier, optionally followed by ``_<chains>``
    :type str: string
    :return: (structure identifier, list of (model, chain) pairs), or
        (structure identifier, 'all') when no chain is specified
    :rtype: tuple
    """
    prefix, underscore, suffix = str.rpartition('_')

    # No underscore at all, or the suffix is the tail of a file name.
    if not underscore or '.' in suffix:
        return str, 'all'

    if '+' in suffix or '-' in suffix:
        chain_identifiers = []
        for token in suffix.split('+'):
            if '-' in token:
                fields = token.split('-')
                chain_identifiers.append((int(fields[0]), fields[1]))
            else:
                chain_identifiers.append((0, token))
    else:
        # Plain run of single-character chain identifiers, all in model 0.
        chain_identifiers = [(0, letter) for letter in suffix]

    return prefix, chain_identifiers
def format_chain_id(x):
    """Render (model, chain) pairs as a ``model-chain+model-chain`` string.

    :param x: iterable of indexable pairs (model, chain)
    :return: the pairs formatted as 'model-chain' and joined with '+'
    :rtype: string
    """
    formatted_pairs = ('%s-%s' % (pair[0], pair[1]) for pair in x)
    return '+'.join(formatted_pairs)
def getPDB(identifier_string, biounit=True, structures_folder=structures_folder, verbose=True):
    """Locate a structure file on disk, downloading it when it is missing.

    `identifier_string` is parsed with `parse_str`. Its structure part is
    handled in one of three ways:

    * PDB code: looked up in `structures_folder`, else downloaded
      (biological assembly when `biounit` is True, PDB format first, then
      mmCif as a fallback);
    * Uniprot identifier: same, using the AlphaFold model; `biounit` is
      ignored in that case;
    * anything else: treated as a path to an existing file.

    :param identifier_string: identifier or file path, optionally followed by ``_<chains>``
    :type identifier_string: string
    :param biounit: use the biological assembly rather than the asymmetric unit
    :type biounit: bool
    :param structures_folder: folder where structure files are cached
    :type structures_folder: string
    :param verbose: passed to the downloader
    :type verbose: bool
    :return: (path of the structure file, chain selection). The path is
        None when every download attempt failed.
    :rtype: tuple
    :raises AssertionError: if the identifier is treated as a path and the file does not exist
    """
    structure_id, chain = parse_str(identifier_string)
    is_pdb = is_PDB_identifier(structure_id)
    is_uniprot = is_UniProt_identifier(structure_id)
    if is_uniprot:
        biounit = False

    if not (is_pdb or is_uniprot):
        # Not an identifier: treat it as a path to a local file.
        location = structure_id
        assert os.path.exists(location), 'File not found'
        return location, chain

    # Candidate cache file names, PDB format first, then mmCif.
    if is_pdb:
        pdb_id = structure_id.lower()
        if biounit:
            cached_names = ('pdb' + pdb_id + '.bioent', pdb_id + '_bioentry.cif')
        else:
            cached_names = ('pdb' + pdb_id + '.ent', pdb_id + '.cif')
    else:
        cached_names = ('AF_' + structure_id + '.pdb', 'AF_' + structure_id + '.cif')

    for cached_name in cached_names:
        # os.path.join gives the right path whether or not the folder
        # ends with a separator (plain concatenation required one).
        cached_path = os.path.join(structures_folder, cached_name)
        if os.path.exists(cached_path):
            return cached_path, chain

    # Nothing cached: download, falling back to mmCif when the first
    # format is unavailable (the downloader returns None on failure).
    download_formats = ('biounit', 'biounit_mmCif') if biounit else ('pdb', 'mmCif')
    pdb_downloader = myPDBList(verbose=verbose)
    location = None
    for file_format in download_formats:
        location = pdb_downloader.retrieve_pdb_file(
            structure_id, pdir=structures_folder, file_format=file_format)
        if location is not None:
            break
    return location, chain
def extract_chains(location, chains, final_location):
    """Write the selected chains of a structure file to `final_location`.

    * ``chains == 'all'``: the file is copied unchanged.
    * a list holding exactly one (model, chain) pair: only that chain is
      written; a multi-character chain identifier is shortened to its first
      character and the disordered flag of every atom is reset to 0.
    * anything else: the structure is written through `ChainSelect`.

    Files whose name ends with '.cif' are parsed as mmCif, all others as
    PDB. The output is always written with `Bio.PDB.PDBIO`.

    :param location: path of the input structure file
    :type location: string
    :param chains: 'all', 'upper', 'lower' or a list of (model, chain) pairs
    :param final_location: path of the output file
    :type final_location: string
    :return: `final_location`
    :rtype: string
    """
    import shutil

    if chains == 'all':
        # Plain file copy; unlike a shell 'scp' call this handles paths with
        # spaces or shell metacharacters and raises if the copy fails.
        try:
            shutil.copy(location, final_location)
        except shutil.SameFileError:
            pass  # Source and destination are the same file: nothing to do.
        return final_location

    # Warnings emitted while parsing/writing are captured and discarded.
    with warnings.catch_warnings(record=True):
        if location.endswith('.cif'):
            parser = Bio.PDB.MMCIFParser()
        else:
            parser = Bio.PDB.PDBParser()
        struct = parser.get_structure('name', location)
        io = Bio.PDB.PDBIO()

        # `and` is required here: with `&` the test parsed as
        # (isinstance(...) & len(chains)) == 1 and was true for any
        # odd-length list, silently dropping all chains but the first.
        if isinstance(chains, list) and len(chains) == 1:
            model, chain = chains[0]
            chain_obj = struct[model][chain]
            if len(chain) > 1:
                # Shorten multi-character chain identifiers to one character.
                chain_obj.id = chain[0]
            io.set_structure(chain_obj)
            for atom in Bio.PDB.Selection.unfold_entities(io.structure, 'A'):
                atom.disordered_flag = 0
            io.save(final_location)
        else:
            io.set_structure(struct)
            io.save(final_location, ChainSelect(chains))
    return final_location