# ScanNet/preprocessing/PDBio.py

import os
import Bio.PDB
import warnings
from utilities.paths import structures_folder

from urllib.request import urlretrieve
from urllib.request import urlcleanup
import gzip


def is_PDB_identifier(str):
    """Tell whether ``str`` has the shape of a PDB identifier.

    A PDB identifier is taken here to be exactly four alphanumeric
    characters (e.g. ``1a3x``). No lookup is performed.

    :param str: candidate identifier
    :return: True if the candidate is four characters long and purely alphanumeric
    """
    has_four_characters = len(str) == 4
    all_alphanumeric = str.isalnum()
    return has_four_characters & all_alphanumeric

def is_UniProt_identifier(str):
    """Tell whether ``str`` has the shape of a UniProt accession.

    Accepted shapes:

    * 6 characters: upper-case alphanumeric, first character a letter,
      sixth character a digit (e.g. ``Q8WZ42``);
    * 10 characters: same as above, plus seventh character a letter,
      second character a digit and last character a digit
      (e.g. ``A0A024R161``).

    This is a purely syntactic check; no database lookup is performed.

    :param str: candidate identifier
    :return: True if the candidate matches one of the shapes above, False otherwise
    """
    L = len(str)
    if L not in (6, 10):
        return False

    valid_uniprot_id = (
        str.isalnum()
        & (str.upper() == str)
        & str[0].isalpha()
        & str[5].isnumeric()
    )
    if L == 10:
        seven_is_letter = str[6].isalpha()
        # The second-character test was already enforced for 10-character
        # identifiers; it is kept so that the validation is not loosened.
        second_is_digit = str[1].isnumeric()
        # The last character must be a digit (previously index 1 was tested
        # by mistake, so identifiers ending with a letter were accepted).
        last_is_digit = str[-1].isnumeric()
        valid_uniprot_id = valid_uniprot_id & seven_is_letter & second_is_digit & last_is_digit
    return valid_uniprot_id


#%% Function for downloading biounit files.

class myPDBList(Bio.PDB.PDBList):
    """``Bio.PDB.PDBList`` variant that also downloads biounit files and AlphaFold models.

    Downloads are stored in ``structures_folder``; ``flat_tree`` is set so that,
    when no explicit ``pdir`` is given, files are not spread over PDB-style
    sub-directories.
    """
    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """
    def __init__(self,*args, **kwargs):
        # The local PDB folder is always the project-wide structures folder,
        # overriding any 'pdb' keyword supplied by the caller.
        kwargs['pdb'] = structures_folder
        super().__init__(*args,**kwargs)
        self.alphafold_server = 'https://alphafold.ebi.ac.uk/' # entry/Q13469
        self.flat_tree = True

    def retrieve_pdb_file(self, code, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch a structure file from the PDB or AlphaFold server, and store it locally.

        The structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        NOTE. The default download format has changed from PDB to PDBx/mmCif

        :param code: pdb or uniprot ID
         PDB code: 4-symbols structure Id from PDB (e.g. 3J92).
         Uniprot ID: 6 or 10 symbols (e.g. Q8WZ42).

        :type code: string

        :param file_format:
            File format. Available options for a PDB code:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)
            * "biounit" (first biological assembly, format PDB)
            * "biounit_mmCif" (first biological assembly, format mmCif)

            For a Uniprot ID only "pdb" and "mmCif" are accepted.

        :type file_format: string

        :param overwrite: if set to True, existing structure files will be overwritten. Default: False
        :type overwrite: bool

        :param obsolete:
            Has a meaning only for obsolete structures. If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            This option doesn't work for mmtf format as obsoleted structures aren't stored in mmtf.
            Also doesn't have meaning when parameter pdir is specified.
            Note: make sure that you are about to download the really obsolete structure.
            Trying to download non-obsolete structure into obsolete folder will not work
            and you face the "structure doesn't exists" error.
            Default: False

        :type obsolete: bool

        :param pdir: put the file in this directory (default: the local PDB folder of this
            object, with a PDB-style sub-directory only if ``flat_tree`` is False)
        :type pdir: string

        :return: filename, or None if the download failed
        :rtype: string

        :raises ValueError: if ``code`` is neither a PDB nor a Uniprot identifier, or if
            ``file_format`` is not supported for a PDB code
        :raises AssertionError: if ``code`` is a Uniprot identifier and ``file_format`` is
            neither "pdb" nor "mmCif"
        """
        file_format = self._print_default_format_warning(
            file_format)  # Deprecation warning

        is_pdb = is_PDB_identifier(code)
        is_uniprot = is_UniProt_identifier(code)
        if not (is_pdb | is_uniprot):
            raise ValueError('Identifier %s is neither a valid PDB or Uniprot ID' % (code,))

        if is_pdb:
            code = code.lower()

            # Name of the compressed file on the server, per format.
            archive = {'pdb': 'pdb%s.ent.gz', 'mmCif': '%s.cif.gz', 'xml': '%s.xml.gz', 'mmtf': '%s',
                       'bundle': '%s-pdb-bundle.tar.gz', 'biounit': '%s.pdb1.gz', 'biounit_mmCif': '%s-assembly1.cif.gz'}

            # Validate the format before indexing the table, so that a typo yields
            # an explicit message instead of a bare KeyError.
            if file_format not in archive:
                raise ValueError("Specified file_format %s doesn't exists or is not supported. Maybe a typo. "
                                 "Please, use one of the following: mmCif, pdb, xml, mmtf, bundle, biounit" % file_format)
            archive_fn = archive[file_format] % code

            if file_format in ('pdb', 'mmCif', 'xml'):
                pdb_dir = "divided" if not obsolete else "obsolete"
                file_type = "pdb" if file_format == "pdb" else "mmCIF" if file_format == "mmCif" else "XML"
                url = (self.pdb_server + '/pub/pdb/data/structures/%s/%s/%s/%s' %
                       (pdb_dir, file_type, code[1:3], archive_fn))
            elif file_format == 'bundle':
                url = (self.pdb_server + '/pub/pdb/compatible/pdb_bundle/%s/%s/%s' %
                       (code[1:3], code, archive_fn))
            elif file_format == 'biounit':
                url = (self.pdb_server + '/pub/pdb/data/biounit/PDB/divided/%s/%s' %
                       (code[1:3], archive_fn))
            elif file_format == 'biounit_mmCif':
                url = (self.pdb_server + '/pub/pdb/data/biounit/mmCIF/divided/%s/%s' %
                       (code[1:3], archive_fn))
            else:
                url = ('http://mmtf.rcsb.org/v1.0/full/%s' % code)

            # Name of the decompressed file stored locally, per format.
            final = {'pdb': 'pdb%s.ent', 'mmCif': '%s.cif', 'xml': '%s.xml',
                     'mmtf': '%s.mmtf', 'bundle': '%s-pdb-bundle.tar', 'biounit': 'pdb%s.bioent', 'biounit_mmCif': '%s_bioentry.cif'}
        else:
            # Uniprot identifier: fetch the AlphaFold model (served uncompressed).
            assert file_format in ['pdb','mmCif']
            url = self.alphafold_server + '/files/AF-%s-F1-model_v1%s'%(code, '.pdb' if file_format == 'pdb' else '.cif')
            archive_fn = url.split('/')[-1]
            final = {'pdb':'AF_%s.pdb','mmCif':'AF_%s.cif'}

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)
        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, final[file_format] % code)

        # Skip download if the file already exists
        if not overwrite:
            if os.path.exists(final_file):
                if self._verbose:
                    print("Structure exists: '%s' " % final_file)
                return final_file

        # Retrieve the file
        if self._verbose:
            print("Downloading PDB structure '%s'..." % code)
        try:
            urlcleanup()
            urlretrieve(url, filename)
        except IOError:
            # Callers rely on the None return value to try another format.
            print("Desired structure doesn't exists")
            return
        else:
            if is_pdb:
                # PDB server files are gzip-compressed: decompress, then drop the archive.
                with gzip.open(filename, 'rb') as gz:
                    with open(final_file, 'wb') as out:
                        out.writelines(gz)
                os.remove(filename)
            else:
                os.rename(filename,final_file)
        return final_file


class ChainSelect(Bio.PDB.Select):
    """Selection filter that keeps only the requested models/chains when saving.

    ``selected_chains`` is either one of the keywords ``'all'``, ``'upper'``,
    ``'lower'``, or a collection of pairs that are compared against
    entries 1 and 2 of each chain's full id (presumably model id and chain id).
    """

    def __init__(self,selected_chains,*args,**kwargs):
        self.selected_chains = selected_chains
        super().__init__(*args,**kwargs)

    def accept_model(self,model):
        """Return 1 if the model must be kept, 0 otherwise."""
        wanted = self.selected_chains
        # Keyword selections never filter on the model.
        if wanted in ['upper','lower','all']:
            return 1
        requested_models = [pair[0] for pair in wanted]
        return 1 if model.id in requested_models else 0

    def accept_chain(self, chain):
        """Return 1 if the chain must be kept, 0 otherwise."""
        wanted = self.selected_chains
        if wanted == 'all':
            return 1
        if wanted == 'upper':
            # Upper-case chain identifiers, or the blank identifier.
            chain_id = chain.get_full_id()[2]
            return int(chain_id.isupper() | (chain_id == ' '))
        if wanted == 'lower':
            return int(chain.get_full_id()[2].islower())
        # Explicit selection: entries 1 and 2 of the full id must be listed.
        full_id = chain.get_full_id()
        return 1 if (full_id[1], full_id[2]) in wanted else 0


def parse_str(str):
    """Split an identifier string into a structure identifier and a chain selection.

    The text after the last underscore is read as the chain specification,
    unless it contains a dot (then the whole string is assumed to be a file
    path with an extension). Chain specifications are either a run of
    one-letter chain names (``AB``), all assigned to model 0, or a
    ``+``-separated list of ``model-chain`` / ``chain`` items (``0-A+1-B``).

    :param str: identifier string, e.g. ``1a3x``, ``1a3x_A`` or ``1a3x_0-A+1-B``
    :return: ``(structure_identifier, chain_identifiers)`` where
        ``chain_identifiers`` is ``'all'`` or a list of ``(model, chain)`` tuples
    """
    head, underscore, tail = str.rpartition('_')

    # No underscore at all, or the last piece carries a file extension:
    # the whole string is the identifier and every chain is selected.
    if (not underscore) or ('.' in tail):
        return str, 'all'

    if ('+' in tail) or ('-' in tail):
        chain_identifiers = []
        for token in tail.split('+'):
            if '-' in token:
                fields = token.split('-')
                chain_identifiers.append((int(fields[0]), fields[1]))
            else:
                chain_identifiers.append((0, token))
    else:
        # Plain run of characters: one chain per character, all in model 0.
        chain_identifiers = [(0, letter) for letter in tail]
    return head, chain_identifiers


def format_chain_id(x):
    """Render chain identifiers as a ``+``-separated string of ``model-chain`` items.

    :param x: iterable of indexable items; entries 0 and 1 of each item are used
    :return: e.g. ``'0-A+1-B'`` for ``[(0, 'A'), (1, 'B')]``; empty string for no item
    """
    pieces = []
    for item in x:
        pieces.append('%s-%s' % (item[0], item[1]))
    return '+'.join(pieces)


def getPDB(identifier_string,biounit=True,structures_folder=structures_folder,verbose=True):
    """Locate a structure file on disk, downloading it first if necessary.

    ``identifier_string`` is parsed with ``parse_str``. The structure part is either
    a PDB identifier, a UniProt identifier (AlphaFold model), or a path to an
    existing file.

    :param identifier_string: identifier with optional chain specification, e.g. ``1a3x_A``
    :param biounit: for PDB identifiers, use the biological assembly file rather than
        the asymmetric unit. Ignored (forced to False) for UniProt identifiers.
    :param structures_folder: folder searched for, and receiving, structure files
    :param verbose: forwarded to the downloader
    :return: ``(location, chain)``; ``location`` is the file path, or None when every
        download attempt failed; ``chain`` is the chain selection returned by ``parse_str``
    :raises AssertionError: if the identifier is a path that does not exist
    """
    structure_id,chain = parse_str(identifier_string)
    is_pdb = is_PDB_identifier(structure_id)
    is_uniprot = is_UniProt_identifier(structure_id)
    if is_uniprot:
        biounit=False

    # Neither kind of identifier: the string must be a path to a local file.
    if not (is_pdb | is_uniprot):
        location = structure_id
        assert os.path.exists(location),'File not found'
        return location,chain

    # Candidate cached files. os.path.join is used so that the lookup matches the
    # path written by the downloader whether or not the folder ends with a separator.
    if is_pdb:
        pdb_id = structure_id.lower()
        if biounit:
            location1 = os.path.join(structures_folder, 'pdb' + pdb_id + '.bioent')
            location2 = os.path.join(structures_folder, pdb_id + '_bioentry.cif')
        else:
            location1 = os.path.join(structures_folder, 'pdb' + pdb_id + '.ent')
            location2 = os.path.join(structures_folder, pdb_id + '.cif')  # New format
    else:
        uniprot_id = structure_id
        location1 = os.path.join(structures_folder, 'AF_' + uniprot_id + '.pdb')
        location2 = os.path.join(structures_folder, 'AF_' + uniprot_id + '.cif')

    if os.path.exists(location1):
        location = location1
    elif os.path.exists(location2):
        location = location2
    else:
        # Not cached: try the PDB-format file first, then fall back to mmCif.
        pdb_downloader = myPDBList(verbose=verbose)
        if biounit:
            location = pdb_downloader.retrieve_pdb_file(
                structure_id, pdir=structures_folder, file_format='biounit')
            if location is None:
                location = pdb_downloader.retrieve_pdb_file(
                    structure_id, pdir=structures_folder, file_format='biounit_mmCif')
        else:
            location = pdb_downloader.retrieve_pdb_file(
                structure_id, pdir=structures_folder, file_format='pdb')
            if location is None:
                location = pdb_downloader.retrieve_pdb_file(
                    structure_id, pdir=structures_folder, file_format='mmCif')
    return location,chain


def extract_chains(location,chains,final_location):
    """Write the selected chains of a structure file to ``final_location``.

    :param location: path of the input structure file; parsed as mmCif when the path
        ends with ``.cif``, as PDB otherwise
    :param chains: ``'all'`` (the file is copied unchanged), another keyword understood
        by ``ChainSelect``, or a list of ``(model, chain)`` tuples
    :param final_location: path of the output file (written in PDB format unless
        ``chains == 'all'``)
    :return: ``final_location``
    :raises OSError: if ``chains == 'all'`` and the file cannot be copied
    """
    if chains == 'all':
        # Local import: a plain file copy replaces the former shell call, which
        # broke on paths containing spaces and ignored failures.
        import shutil
        try:
            shutil.copy(location, final_location)
        except shutil.SameFileError:
            pass  # source and destination are the same file: nothing to do
        return final_location

    # Parser/writer warnings are captured and discarded.
    with warnings.catch_warnings(record=True):
        if location.endswith('.cif'):
            parser = Bio.PDB.MMCIFParser()
        else:
            parser = Bio.PDB.PDBParser()
        struct = parser.get_structure('name',location)
        io = Bio.PDB.PDBIO()
        # `and` (not `&`): `a & b == 1` parses as `(a & b) == 1`, which was true
        # for every odd-length list and dropped all chains but the first.
        if isinstance(chains, list) and len(chains) == 1:
            model,chain = chains[0]
            chain_obj = struct[model][chain]
            if len(chain) > 1:
                # Multi-character chain names are shortened to their first character
                # before writing.
                chain_obj.id = chain[0]
            io.set_structure(chain_obj)
            for atom in Bio.PDB.Selection.unfold_entities(io.structure, 'A'):
                atom.disordered_flag = 0
            io.save(final_location)
        else:
            io.set_structure(struct)
            io.save(final_location, ChainSelect(chains))
    return final_location