mirror of
https://github.com/samsledje/D-SCRIPT.git
synced 2026-06-04 15:04:24 +08:00
790 lines
24 KiB
Python
790 lines
24 KiB
Python
from Bio import SeqIO
|
|
from Bio import SeqRecord
|
|
from Bio import Seq
|
|
from Bio import PDB
|
|
from Bio import pairwise2
|
|
import h5py
|
|
import numpy as np
|
|
import pandas as pd
|
|
import csv
|
|
import argparse
|
|
from datetime import datetime
|
|
import time
|
|
from pathlib import Path
|
|
|
|
MAX_D = 25
|
|
|
|
|
|
def add_args(parser):
|
|
"""
|
|
Create parser for command line utility.
|
|
|
|
:meta private:
|
|
"""
|
|
data_grp = parser.add_argument_group("Data Processing files")
|
|
data_grp.add_argument(
|
|
"--pdb_file",
|
|
required=True,
|
|
help="Plain text file, where each newline contains a FULL PATH to a pdb file (ex. pdb_directory/1AON.pdb)",
|
|
)
|
|
data_grp.add_argument(
|
|
"--filter_chain_minlen",
|
|
required=True,
|
|
help="Filtering step: enter desired minimum length of chain",
|
|
)
|
|
data_grp.add_argument(
|
|
"--filter_chain_maxlen",
|
|
required=True,
|
|
help="Filtering step: enter desired maximum length of chain",
|
|
)
|
|
data_grp.add_argument(
|
|
"--filter_number_of_chains",
|
|
required=False,
|
|
help="Filtering step: enter the desired number(s) of chains as an unseparated group, ex. 1234 keeps pdbs with exactly 1, 2, 3 or 4 chains",
|
|
)
|
|
data_grp.add_argument(
|
|
"--h5_name",
|
|
required=True,
|
|
help="Full path of output H5 (for contact maps)",
|
|
)
|
|
data_grp.add_argument(
|
|
"--fasta",
|
|
required=True,
|
|
help="Full path of output fasta (for sequences)",
|
|
)
|
|
data_grp.add_argument(
|
|
"--tsv",
|
|
required=True,
|
|
help="Full path of output tsv (for PPIs)",
|
|
)
|
|
data_grp.add_argument(
|
|
"--distance_threshold",
|
|
required=False,
|
|
default=25,
|
|
help="enter distance threshold for calculating distances between residues (angstroms) (e.g. 25)",
|
|
)
|
|
data_grp.add_argument(
|
|
"--discontinuity_threshold",
|
|
required=False,
|
|
default=75,
|
|
help="enter percentage threshold for contact map discontinuity (e.g. 75, meaning capped at 75 percent discontinuity)",
|
|
)
|
|
data_grp.add_argument(
|
|
"--interaction_threshold",
|
|
required=False,
|
|
default=8,
|
|
help="enter angstrom threshold to classify as interaction",
|
|
)
|
|
return parser
|
|
|
|
|
|
def get_pdb_list(pdb_file):
|
|
"""
|
|
Returns a list of pdb file paths given a text file containing pdb paths.
|
|
|
|
:param pdb_files: Name of txt file containing pdb full paths.
|
|
:type version: string
|
|
:return: list of pdb files (full path)
|
|
:rtype: list
|
|
"""
|
|
pdb_list = []
|
|
with open(f"{pdb_file}", "r") as pdb_f:
|
|
pdb_list = pdb_f.read().split("\n")
|
|
if "" in pdb_list:
|
|
pdb_list.remove("")
|
|
return pdb_list
|
|
|
|
|
|
def get_sequences_from_chains(chains, pair):
|
|
"""
|
|
Returns a list of Atom Sequences chains
|
|
|
|
:param chains: list of all chains in pdb
|
|
:type version: list
|
|
:param pair: tuple pair of indices taken from chains
|
|
:type version: string
|
|
:return: list of atom sequences
|
|
:rtype: list
|
|
"""
|
|
substitutions = {
|
|
"2AS": "ASP",
|
|
"3AH": "HIS",
|
|
"5HP": "GLU",
|
|
"ACL": "ARG",
|
|
"AGM": "ARG",
|
|
"AIB": "ALA",
|
|
"ALM": "ALA",
|
|
"ALO": "THR",
|
|
"ALY": "LYS",
|
|
"ARM": "ARG",
|
|
"ASA": "ASP",
|
|
"ASB": "ASP",
|
|
"ASK": "ASP",
|
|
"ASL": "ASP",
|
|
"ASQ": "ASP",
|
|
"AYA": "ALA",
|
|
"BCS": "CYS",
|
|
"BHD": "ASP",
|
|
"BMT": "THR",
|
|
"BNN": "ALA",
|
|
"BUC": "CYS",
|
|
"BUG": "LEU",
|
|
"C5C": "CYS",
|
|
"C6C": "CYS",
|
|
"CAS": "CYS",
|
|
"CCS": "CYS",
|
|
"CEA": "CYS",
|
|
"CGU": "GLU",
|
|
"CHG": "ALA",
|
|
"CLE": "LEU",
|
|
"CME": "CYS",
|
|
"CSD": "ALA",
|
|
"CSO": "CYS",
|
|
"CSP": "CYS",
|
|
"CSS": "CYS",
|
|
"CSW": "CYS",
|
|
"CSX": "CYS",
|
|
"CXM": "MET",
|
|
"CY1": "CYS",
|
|
"CY3": "CYS",
|
|
"CYG": "CYS",
|
|
"CYM": "CYS",
|
|
"CYQ": "CYS",
|
|
"DAH": "PHE",
|
|
"DAL": "ALA",
|
|
"DAR": "ARG",
|
|
"DAS": "ASP",
|
|
"DCY": "CYS",
|
|
"DGL": "GLU",
|
|
"DGN": "GLN",
|
|
"DHA": "ALA",
|
|
"DHI": "HIS",
|
|
"DIL": "ILE",
|
|
"DIV": "VAL",
|
|
"DLE": "LEU",
|
|
"DLY": "LYS",
|
|
"DNP": "ALA",
|
|
"DPN": "PHE",
|
|
"DPR": "PRO",
|
|
"DSN": "SER",
|
|
"DSP": "ASP",
|
|
"DTH": "THR",
|
|
"DTR": "TRP",
|
|
"DTY": "TYR",
|
|
"DVA": "VAL",
|
|
"EFC": "CYS",
|
|
"FLA": "ALA",
|
|
"FME": "MET",
|
|
"GGL": "GLU",
|
|
"GL3": "GLY",
|
|
"GLZ": "GLY",
|
|
"GMA": "GLU",
|
|
"GSC": "GLY",
|
|
"HAC": "ALA",
|
|
"HAR": "ARG",
|
|
"HIC": "HIS",
|
|
"HIP": "HIS",
|
|
"HMR": "ARG",
|
|
"HPQ": "PHE",
|
|
"HTR": "TRP",
|
|
"HYP": "PRO",
|
|
"IAS": "ASP",
|
|
"IIL": "ILE",
|
|
"IYR": "TYR",
|
|
"KCX": "LYS",
|
|
"LLP": "LYS",
|
|
"LLY": "LYS",
|
|
"LTR": "TRP",
|
|
"LYM": "LYS",
|
|
"LYZ": "LYS",
|
|
"MAA": "ALA",
|
|
"MEN": "ASN",
|
|
"MHS": "HIS",
|
|
"MIS": "SER",
|
|
"MLE": "LEU",
|
|
"MPQ": "GLY",
|
|
"MSA": "GLY",
|
|
"MSE": "MET",
|
|
"MVA": "VAL",
|
|
"NEM": "HIS",
|
|
"NEP": "HIS",
|
|
"NLE": "LEU",
|
|
"NLN": "LEU",
|
|
"NLP": "LEU",
|
|
"NMC": "GLY",
|
|
"OAS": "SER",
|
|
"OCS": "CYS",
|
|
"OMT": "MET",
|
|
"PAQ": "TYR",
|
|
"PCA": "GLU",
|
|
"PEC": "CYS",
|
|
"PHI": "PHE",
|
|
"PHL": "PHE",
|
|
"PR3": "CYS",
|
|
"PRR": "ALA",
|
|
"PTR": "TYR",
|
|
"PYX": "CYS",
|
|
"SAC": "SER",
|
|
"SAR": "GLY",
|
|
"SCH": "CYS",
|
|
"SCS": "CYS",
|
|
"SCY": "CYS",
|
|
"SEL": "SER",
|
|
"SEP": "SER",
|
|
"SET": "SER",
|
|
"SHC": "CYS",
|
|
"SHR": "LYS",
|
|
"SMC": "CYS",
|
|
"SOC": "CYS",
|
|
"STY": "TYR",
|
|
"SVA": "SER",
|
|
"TIH": "ALA",
|
|
"TPL": "TRP",
|
|
"TPO": "THR",
|
|
"TPQ": "ALA",
|
|
"TRG": "LYS",
|
|
"TRO": "TRP",
|
|
"TYB": "TYR",
|
|
"TYI": "TYR",
|
|
"TYQ": "TYR",
|
|
"TYS": "TYR",
|
|
"TYY": "TYR",
|
|
}
|
|
residues = [
|
|
"ALA",
|
|
"CYS",
|
|
"ASP",
|
|
"GLU",
|
|
"PHE",
|
|
"GLY",
|
|
"HIS",
|
|
"ILE",
|
|
"LYS",
|
|
"LEU",
|
|
"MET",
|
|
"ASN",
|
|
"PRO",
|
|
"GLN",
|
|
"ARG",
|
|
"SER",
|
|
"THR",
|
|
"TRP",
|
|
"TYR",
|
|
"VAL",
|
|
]
|
|
|
|
records = []
|
|
# chains = chains[:2]
|
|
chains_two = []
|
|
chains_two.append(chains[pair[0]])
|
|
chains_two.append(chains[pair[1]])
|
|
|
|
for chain in chains_two:
|
|
chain_string = ""
|
|
for residue in chain:
|
|
if residue.has_id("CA"):
|
|
if residue.get_resname() in substitutions.keys():
|
|
chain_string += "" + PDB.Polypeptide.three_to_one(
|
|
substitutions[residue.get_resname()]
|
|
)
|
|
elif residue.get_resname() == "CA":
|
|
None
|
|
elif residue.get_resname() not in residues:
|
|
# print(residue.get_resname())
|
|
chain_invalid = chain
|
|
chain_string = None
|
|
break
|
|
else:
|
|
chain_string += "" + PDB.Polypeptide.three_to_one(
|
|
residue.get_resname()
|
|
)
|
|
if chain_string is None:
|
|
return [records, chain_invalid]
|
|
chain_seq = Seq.Seq(chain_string)
|
|
chain_record = SeqRecord.SeqRecord(
|
|
chain_seq, name=chain.id, id=chain.id
|
|
)
|
|
records.append(chain_record)
|
|
return records
|
|
|
|
|
|
def get_sequences(pdb, chains, pair):
|
|
"""
|
|
Gets atom and seqres sequences from the pdb file.
|
|
|
|
:param pdb: full path of pdb file
|
|
:type version: string
|
|
:param chains: list of all chains in pdb
|
|
:type version: list
|
|
:param pair: tuple pair of indices taken from chains
|
|
:type version: string
|
|
:return: list containing sequence from seq-res and sequence from atom
|
|
:rtype: list
|
|
"""
|
|
atoms_recs = get_sequences_from_chains(chains, pair)
|
|
if not atoms_recs[0] or len(atoms_recs[0]) == 1:
|
|
return [None, atoms_recs[1]]
|
|
seqs_short = atoms_recs
|
|
seqres_recs = list(SeqIO.parse(pdb, "pdb-seqres"))
|
|
seqs_long = []
|
|
seqs_long.append(seqres_recs[pair[0]])
|
|
seqs_long.append(seqres_recs[pair[1]])
|
|
return [seqs_long, seqs_short]
|
|
|
|
|
|
def check_sequences_valid(seqs_long, seqs_short):
|
|
"""
|
|
Checks if sequences are both valid amino acid chains, not "XX...XX".
|
|
|
|
:param seqs_long: list of seq-res sequences for chains 0 and 1
|
|
:type version: string
|
|
:param seqs_short: list of atom sequences for chains 0 and 1
|
|
:type version: string
|
|
:return: whether both sequences exist and are valid
|
|
:rtype: Boolean
|
|
"""
|
|
# print((seqs_long[0].seq, seqs_long[1].seq))
|
|
# print((seqs_short[0].seq, seqs_short[1].seq))
|
|
# print("\n")
|
|
return (
|
|
len(seqs_short) < 2
|
|
or len(seqs_long) < 2
|
|
or len(str(seqs_long[0].seq)) == 0
|
|
or len(str(seqs_long[1].seq)) == 0
|
|
or (
|
|
str(seqs_long[0].seq)
|
|
== len(str(seqs_long[0].seq)) * str(seqs_long[0].seq)[0]
|
|
)
|
|
or (
|
|
str(seqs_long[1].seq)
|
|
== len(str(seqs_long[1].seq)) * str(seqs_long[1].seq)[0]
|
|
)
|
|
)
|
|
|
|
|
|
def get_chains_prelim_filtering(
|
|
chains, pdb_id, chain_minlen, chain_maxlen, chain_error, pair
|
|
):
|
|
"""
|
|
Gets atom and seqres sequences from the pdb file.
|
|
|
|
:param chains: list of chains contained in pdb file
|
|
:type version: list
|
|
:param pdb_id: name of pdb file
|
|
:type version: string
|
|
:param chain_minlen: minimum length of chain filtered out
|
|
:type version: int
|
|
:param chain_maxlen: maximum length of chain filtered out
|
|
:type version: int
|
|
:param chain_error: list of pdbs that don't satisfy length conditions
|
|
:type version: list
|
|
:param pair: tuple pair of indices taken from chains
|
|
:type version: string
|
|
:return: a list of filtered chains and two lists of pdbs that don't satisfy filtering conditions
|
|
:rtype: list
|
|
"""
|
|
chains_two = []
|
|
chains_two.append(chains[pair[0]])
|
|
chains_two.append(chains[pair[1]])
|
|
if len(chains_two[0]) < chain_minlen or len(chains_two[0]) > chain_maxlen:
|
|
if f"{pdb_id}:{str(chains_two[0].get_id())}" not in chain_error:
|
|
chain_error.append(f"{pdb_id}:{str(chains_two[0].get_id())}")
|
|
print(
|
|
f"REMOVED Chain {str(chains_two[0].get_id())} - Length Constraints"
|
|
)
|
|
return None
|
|
if len(chains_two[1]) < chain_minlen or len(chains_two[1]) > chain_maxlen:
|
|
if f"{pdb_id}:{str(chains_two[1].get_id())}" not in chain_error:
|
|
chain_error.append(f"{pdb_id}:{str(chains_two[1].get_id())}")
|
|
print(
|
|
f"REMOVED Chain {str(chains_two[1].get_id())} - Length Constraints"
|
|
)
|
|
return None
|
|
return [chains_two, chain_error]
|
|
|
|
|
|
def make_fasta_and_tsv(
|
|
tsv_name, fasta_name, valid_pdb, pdbs, chain_error, invalid_resname
|
|
):
|
|
"""
|
|
Takes in a list of valid pdb file paths and creates an output fasta and tsv file.
|
|
|
|
:param tsv_name: path of tsv file
|
|
:type version: string
|
|
:param fasta_name: path of fasta file
|
|
:type version: string
|
|
:param valid_pdb: list of valid pdbs that passed filtering
|
|
:type version: list
|
|
:param pdbs: list of all valid pdb
|
|
:type version: list
|
|
:param chain_error: list of all pdb chains that have invalid lengths
|
|
:type version: list
|
|
:param invalid_resname: list of all pdbs that have invalid residues
|
|
:type version: list
|
|
"""
|
|
with open(f"{tsv_name}", "w+") as tsv_f, open(
|
|
f"{fasta_name}", "w+"
|
|
) as fasta_f:
|
|
for pdb in pdbs:
|
|
sequences = list(SeqIO.parse(pdb, "pdb-seqres"))
|
|
for record in sequences:
|
|
# print(record)
|
|
if (
|
|
record.id not in chain_error
|
|
and record.id not in invalid_resname
|
|
and record.name != "<unknown name>"
|
|
and len(record.seq) > 0
|
|
and record.seq != str(record.seq[0]) * len(str(record.seq))
|
|
):
|
|
fasta_f.write(record.format("fasta-2line"))
|
|
for pdb_pair in valid_pdb.keys():
|
|
tsv_writer = csv.writer(tsv_f, delimiter="\t")
|
|
tsv_writer.writerow([pdb_pair[0:6], pdb_pair[7:13], "1"])
|
|
|
|
|
|
def remove_CA_from_chains(chain_list):
|
|
"""
|
|
Filters chains to remove all HETATM (non-amino acid) residues.
|
|
|
|
:param chain_list: pass in list of original chains
|
|
:type version: list
|
|
:return: list of filtered chains
|
|
:rtype: list
|
|
"""
|
|
chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
|
|
return chains_f
|
|
|
|
|
|
def split_sequences(seqs_long, seqs_short, chains_filtered):
|
|
"""
|
|
Gets actual sequences (long = seq-res) and (short = atom) for the two chains
|
|
|
|
:param seqs_long: list of seq-res sequences for chains 0 and 1
|
|
:type version: string
|
|
:param seqs_short: list of atom sequences for chains 0 and 1
|
|
:type version: string
|
|
:param chains_filtered: list of filtered chains
|
|
:type version: list
|
|
:return: list containing: seq-res sequence for chain0, atom sequence for chain0, chain0, seq-res sequence for chain1, atom sequence for chain1, chain1
|
|
:rtype: list
|
|
"""
|
|
seq0_long = seqs_long[0].seq
|
|
seq0_short = seqs_short[0].seq
|
|
chain0 = chains_filtered[0]
|
|
seq1_long = seqs_long[1].seq
|
|
seq1_short = seqs_short[1].seq
|
|
chain1 = chains_filtered[1]
|
|
return [seq0_long, seq0_short, chain0, seq1_long, seq1_short, chain1]
|
|
|
|
|
|
def get_aligned_seqs(seq0_long, seq0_short, seq1_long, seq1_short):
|
|
"""
|
|
Gets sequence0 and sequence1 after sequence alignment (containing "-").
|
|
|
|
:param seq0_long: seq-res sequence for chain 0
|
|
:type version: string
|
|
:param seq0_short: atom sequence for chain 0
|
|
:type version: string
|
|
:param seq1_long: seq-res sequence for chain 1
|
|
:type version: string
|
|
:param seq1_short: atom sequence for chain 1
|
|
:type version: string
|
|
:return: list of aligned seq-res sequence0, aligned atom sequence0, aligned seq-res sequence1, aligned atom sequence1
|
|
:rtype: list
|
|
"""
|
|
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
|
|
align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
|
|
# print(pairwise2.format_alignment(*align0[0]))
|
|
# print(pairwise2.format_alignment(*align1[0]))
|
|
seq0_long_f = align0[0].seqA
|
|
seq0_short_f = align0[0].seqB
|
|
seq1_long_f = align1[0].seqA
|
|
seq1_short_f = align1[0].seqB
|
|
return [seq0_long_f, seq0_short_f, seq1_long_f, seq1_short_f]
|
|
|
|
|
|
def residue_distance(res0, res1, max_d):
|
|
"""
|
|
Calculates Euclidean distance between two amino acid residues, with a maximum distance threshold.
|
|
|
|
:param res0: first residue
|
|
:type version: Residue Object
|
|
:param res1: second residue
|
|
:type version: Residue Object
|
|
:param max_d: distance
|
|
:type version: float
|
|
:return: distance between the two residues
|
|
:rtype: Numpy float
|
|
"""
|
|
diff_vector = res0["CA"].coord - res1["CA"].coord
|
|
distance = np.sqrt(np.sum(diff_vector ** 2))
|
|
return min(distance, max_d)
|
|
|
|
|
|
def calc_dist_matrix(
|
|
pdb_id,
|
|
chain0,
|
|
chain1,
|
|
seq0_long,
|
|
seq1_long,
|
|
seq0_long_f,
|
|
seq0_short_f,
|
|
seq1_long_f,
|
|
seq1_short_f,
|
|
dist_thresh,
|
|
):
|
|
"""
|
|
Generates contact map between two chains using sequence alignment.
|
|
|
|
:param pdb_id: pdb formatted name of protein
|
|
:type version: string
|
|
:param chain0: first chain
|
|
:type version: Chain Object
|
|
:param chain1: second chain
|
|
:type version: Chain Object
|
|
:param seq0_long: original sequence for chain 0
|
|
:type version: Bio.Seq.Seq
|
|
:param seq1_long: original sequence for chain 1
|
|
:type version: Bio.Seq.Seq
|
|
:param seq0_long_f: long alignment sequence for chain 0
|
|
:type version: string
|
|
:param seq0_short_f: short alignment sequence for chain 0
|
|
:type version: string
|
|
:param seq1_long_f: long alignment sequence for chain 1
|
|
:type version: string
|
|
:param seq1_short_f: short alignment sequence for chain 1
|
|
:type version: string
|
|
:param dist_thresh: argument for distance threshold
|
|
:type version: float
|
|
:return: generated distance matrix between two chains
|
|
:rtype: Numpy matrix
|
|
"""
|
|
D = np.zeros((len(seq0_long), len(seq1_long)))
|
|
D = D + 25
|
|
ch0_it = iter(chain0)
|
|
x = -1
|
|
y = 0
|
|
for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
|
|
|
|
if res0S == "-":
|
|
x += 1
|
|
continue
|
|
elif res0L == "-":
|
|
res0 = next(ch0_it)
|
|
continue
|
|
else:
|
|
x += 1
|
|
res0 = next(ch0_it)
|
|
|
|
ch1_it = iter(chain1)
|
|
for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
|
|
if res1S == "-":
|
|
y += 1
|
|
continue
|
|
elif res1L == "-":
|
|
res1 = next(ch1_it)
|
|
continue
|
|
else:
|
|
res1 = next(ch1_it)
|
|
D[x, y] = residue_distance(res0, res1, dist_thresh)
|
|
y += 1
|
|
y = 0
|
|
df = pd.DataFrame(D, index=list(seq0_long), columns=list(seq1_long))
|
|
# print(df)
|
|
return D
|
|
|
|
|
|
def log(m, file=None, timestamped=True, print_also=False):
|
|
"""
|
|
Outputs a log.
|
|
|
|
:param m: information to be logged
|
|
:type version: Object
|
|
:param file: file name for log to be written to
|
|
:type version: string
|
|
:param timestamped: whether or not log will include timestamp
|
|
:type version: boolean
|
|
:param print_also: whether or not log is printed to standard output as well as written to file
|
|
:type version: boolean
|
|
"""
|
|
curr_time = f"[{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}] "
|
|
log_string = f"{curr_time if timestamped else ''}{m}"
|
|
if file is None:
|
|
print(log_string)
|
|
else:
|
|
print(log_string, file=file)
|
|
if print_also:
|
|
print(log_string)
|
|
file.flush()
|
|
|
|
|
|
def main(args):
|
|
start = time.perf_counter()
|
|
h5_name = args.h5_name
|
|
fasta_name = args.fasta
|
|
tsv_name = args.tsv
|
|
pdb_text = args.pdb_file
|
|
if args.filter_number_of_chains is not None:
|
|
chain_lengths_allowed = list(args.filter_number_of_chains)
|
|
chain_lengths_allowed = [int(i) for i in chain_lengths_allowed]
|
|
pdb_list = get_pdb_list(pdb_text)
|
|
chain_minlen = int(args.filter_chain_minlen)
|
|
chain_maxlen = int(args.filter_chain_maxlen)
|
|
dist_thresh = float(args.distance_threshold)
|
|
discont_thresh = float(args.discontinuity_threshold) / 100
|
|
interact_thresh = int(args.interaction_threshold)
|
|
|
|
with h5py.File(f"{h5_name}", "w") as hf_pair:
|
|
total = 0
|
|
valid_pdb = {}
|
|
pdbs = []
|
|
chain_error = []
|
|
seq_error = []
|
|
invalid_resname = []
|
|
chainlen_unsatisfied = []
|
|
unknown_name = []
|
|
|
|
for pdb in pdb_list:
|
|
total += 1
|
|
print(f"Total: {total}")
|
|
print(pdb)
|
|
pdb_id = Path(pdb).stem
|
|
|
|
structure = PDB.PDBParser().get_structure(pdb_id, pdb)
|
|
sequences = list(SeqIO.parse(pdb, "pdb-seqres"))
|
|
flag = False
|
|
for item in sequences:
|
|
if item.name == "<unknown name>":
|
|
flag = True
|
|
if pdb_id not in unknown_name:
|
|
unknown_name.append(pdb_id)
|
|
if flag:
|
|
continue
|
|
chains = list(structure.get_chains())
|
|
if len(chains) != len(sequences):
|
|
seq_error.append(pdb_id)
|
|
continue
|
|
# print(chains)
|
|
if args.filter_number_of_chains is not None:
|
|
if len(chains) not in chain_lengths_allowed:
|
|
chainlen_unsatisfied.append(pdb_id)
|
|
continue
|
|
chains.sort(key=lambda chain: chain.id)
|
|
chain_pairing = [item for item in range(0, len(chains))]
|
|
pair = [
|
|
(a, b)
|
|
for idx, a in enumerate(chain_pairing)
|
|
for b in chain_pairing[idx + 1 :]
|
|
]
|
|
# print(pair)
|
|
for item in pair:
|
|
seqences_pair = get_sequences(pdb, chains, item)
|
|
if seqences_pair[0] is None:
|
|
if (
|
|
f"{pdb_id}:{str(seqences_pair[1].id)}"
|
|
not in invalid_resname
|
|
):
|
|
invalid_resname.append(
|
|
f"{pdb_id}:{str(seqences_pair[1].id)}"
|
|
)
|
|
print("REMOVED - Invalid Residue Name")
|
|
continue
|
|
[seqs_long, seqs_short] = seqences_pair
|
|
|
|
seq_verify = check_sequences_valid(seqs_short, seqs_long)
|
|
if seq_verify is True:
|
|
if pdb_id not in seq_error:
|
|
seq_error.append(pdb_id)
|
|
print("REMOVED - <2 Sequences")
|
|
continue
|
|
|
|
output = get_chains_prelim_filtering(
|
|
chains,
|
|
pdb_id,
|
|
chain_minlen,
|
|
chain_maxlen,
|
|
chain_error,
|
|
item,
|
|
)
|
|
if output is None:
|
|
continue
|
|
[chains_two, chain_error] = output
|
|
|
|
# print(chains_two)
|
|
# print(seqs_long)
|
|
# print(seqs_short)
|
|
|
|
if pdb not in pdbs:
|
|
pdbs.append(pdb)
|
|
|
|
chains_filtered = remove_CA_from_chains(chains_two)
|
|
[
|
|
seq0_long,
|
|
seq0_short,
|
|
chain0,
|
|
seq1_long,
|
|
seq1_short,
|
|
chain1,
|
|
] = split_sequences(seqs_long, seqs_short, chains_filtered)
|
|
|
|
[
|
|
seq0_long_f,
|
|
seq0_short_f,
|
|
seq1_long_f,
|
|
seq1_short_f,
|
|
] = get_aligned_seqs(
|
|
seq0_long, seq0_short, seq1_long, seq1_short
|
|
)
|
|
D = calc_dist_matrix(
|
|
pdb_id,
|
|
chain0,
|
|
chain1,
|
|
seq0_long,
|
|
seq1_long,
|
|
seq0_long_f,
|
|
seq0_short_f,
|
|
seq1_long_f,
|
|
seq1_short_f,
|
|
dist_thresh,
|
|
)
|
|
count_discontinuities = (D == -1).sum()
|
|
if (
|
|
count_discontinuities
|
|
< discont_thresh * D.shape[0] * D.shape[1]
|
|
):
|
|
interactions = ((D < interact_thresh)).sum()
|
|
if interactions > 0:
|
|
# D = (D <= 8).astype(float)
|
|
print(D)
|
|
hf_pair.create_dataset(
|
|
f"{pdb_id.upper()}:{str(chains_two[0].get_id())}x{pdb_id.upper()}:{str(chains_two[1].get_id())}",
|
|
data=D,
|
|
)
|
|
valid_pdb[
|
|
f"{pdb_id.upper()}:{str(chains_two[0].get_id())}x{pdb_id.upper()}:{str(chains_two[1].get_id())}"
|
|
] = [pdb, chains_two]
|
|
|
|
make_fasta_and_tsv(
|
|
tsv_name, fasta_name, valid_pdb, pdbs, chain_error, invalid_resname
|
|
)
|
|
log(f"PDB chains that <{chain_minlen} or >{chain_maxlen}: {chain_error}")
|
|
log(
|
|
f"PDBs with an invalid # of sequences (<2 or != # chains): {seq_error}"
|
|
)
|
|
log(
|
|
f"PDB chains with invalid residue names (e.g. MSE, PLM): {invalid_resname}"
|
|
)
|
|
log(
|
|
f"PDBs with chain lengths that don't satisfy user-entered constraints: {chainlen_unsatisfied}"
|
|
)
|
|
log(f"PDBs with unknown names/descriptions: {unknown_name}")
|
|
end = time.perf_counter()
|
|
print(f"Elapsed {(end-start)/60} minutes.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
add_args(parser)
|
|
print(parser.parse_args())
|
|
main(parser.parse_args())
|