made updates to processing.py, incorporating/debugging filtering and sequence alginment code

This commit is contained in:
Lynn Tao
2022-09-05 06:45:33 -04:00
parent 091a8936c7
commit bde88ced0c
13 changed files with 281 additions and 370 deletions

View File

@@ -1,20 +0,0 @@
"""
# UNFINISHED
# convert batches of downloaded pdb gzip files into parsable pdb protein files.
"""
import gzip
import matplotlib.pyplot as plt
import os
import pandas as pd
path = "dscript/pdbsNEW/batch-download-structures-1661040401355"
zips = os.listdir(path)
if ".DS_Store" in zips:
zips.remove(".DS_Store")
for item in zips:
name = item[3:7]
op = open(f"dscript/pdbsNEW/{name}.pdb", "w")
with gzip.open(f"{path}/{item}", "rb") as fi:
op.write(fi.read().decode("utf-8"))

View File

@@ -1,123 +0,0 @@
"""
# parsing PDB files into pairwise and binary contact maps
# REMOVE ANY CA ERROR PDBS
"""
from pickle import FALSE
import Bio.PDB
import numpy
import matplotlib.pyplot as plt
import h5py
import os
import pandas as pd
# SOURCE CODE: https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/protein_contact_map/
def has_CA(residue):
"""
Return whether or not residue contains an alpha-carbon atom.
:param residue: pass in an amino acid
:type version: Residue object
:return: if amino acid residue has alpha-carbons
:rtype: boolean
"""
return residue.has_id("CA")
def filter_chains(chain_list):
chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
return chains_f
def calc_residue_dist(residue_one, residue_two, limit):
"""
Returns the C-alpha distance between two residues
:param residue_one: pass in an amino acid residue object
:type residue_one: Residue object
:param residue_two: pass in an amino acid residue object
:type residue_two: Residue object
:param limit: pass in a distance threshold
:type limit: float
:return: C-alpha distance between two residues
:rtype: float
"""
diff_vector = residue_one["CA"].coord - residue_two["CA"].coord
distance = numpy.sqrt(numpy.sum(diff_vector * diff_vector))
if distance >= float(limit):
return float(limit)
return distance
def calc_dist_matrix(chain_one, chain_two, limit):
"""
Returns a contact map matrix of C-alpha distances between two chains
:param chain_one: pass in a chain 1
:type chain_one: Chain object
:param chain_two: pass in a chain 2
:type chain_two: Chain object
:param limit: pass in a distance threshold
:type limit: float
:return: a contact map matrix of C-alpha distances between two chains
:rtype: Numpy matrix
"""
chains = filter_chains([chain_one, chain_two])
chain1 = chains[0]
chain2 = chains[1]
# chain2 = filter_chains(chain_two)
# for residue_one in chain_one:
# if has_CA(residue_one):
# chain1.append(residue_one)
# for residue_two in chain_two:
# if has_CA(residue_two):
# chain2.append(residue_two)
answer = numpy.zeros((len(chain1), len(chain2)), float)
print(answer.shape)
for i in range(0, len(chain1)):
for j in range(0, len(chain2)):
# print((chain1[i].get_resname(), chain2[j].get_resname()))
answer[i][j] = calc_residue_dist(chain1[i], chain2[j], limit)
return answer
def main():
files = os.listdir("dscript/pdbsNEW")
if ".DS_Store" in files:
files.remove(".DS_Store")
for i in range(0, len(files)):
files[i] = files[i][:4]
hf_pair = h5py.File(f"data/paircmaps_train.h5", "w")
count = 0
for protein in files:
count += 1
print(protein)
print(count)
pdb_code = protein.upper()
pdb_filename = f"dscript/pdbsNEW/{protein}.pdb"
structure = Bio.PDB.PDBParser().get_structure(pdb_code, pdb_filename)
model = structure[0]
# GET PROTEIN CHAINS
chain = []
for chains in structure.get_chains():
chain.append(str(chains.get_id()))
chain = chain[:2]
dist_matrix = calc_dist_matrix(
model[chain[0]], model[chain[1]], 25.000
)
print(f"{pdb_code}:{chain[0]}x{pdb_code}:{chain[1]}")
hf_pair.create_dataset(
f"{pdb_code}:{chain[0]}x{pdb_code}:{chain[1]}", data=dist_matrix
)
if __name__ == "__main__":
main()

View File

@@ -1,31 +0,0 @@
"""
# converting protein PDB files into fasta sequence format
"""
import Bio
from Bio import SeqIO
import os
entries = os.listdir("dscript/pdbsNEW")
if ".DS_Store" in entries:
entries.remove(".DS_Store")
# print(entries)
# CONVERT ALL TO FASTAS
for i in range(0, len(entries)):
# print(entries[i])
if entries[i][0] != ".":
with open(f"dscript/fastasNEW/{entries[i][:-4]}.fasta", "w") as f:
# pdb-atom
for record in SeqIO.parse(
f"dscript/pdbsNEW/{entries[i]}", "pdb-seqres"
):
f.write(record.format("fasta-2line"))
f.close()
# TEST A SINGLE
# with open(f'dscript/proteins/15c8.fasta', 'w') as f:
# for record in SeqIO.parse(f"dscript/15c8.pdb", "pdb-seqres"):
# # print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
# # print(record.format("fasta"))
# f.write(record.format("fasta-2line"))
# f.close()

View File

@@ -1,50 +0,0 @@
from pickle import FALSE
import Bio.PDB
import numpy
import matplotlib.pyplot as plt
import h5py
import random
import os
import pandas as pd
import torch
fi = h5py.File("data/paircmaps_train.h5", "r")
ke = list(fi.keys())
# print(ke[0][:4].lower())
print(fi[ke[0]].shape)
fastas = os.listdir("dscript/fastasNEW")
# fastas.remove(".DS_Store")
# print(len(fastas))
deletes = []
for item in ke:
# print(fi[item].shape)
if os.path.exists(f"dscript/fastasNEW/{item[:4].lower()}.fasta"):
with open(
f"dscript/fastasNEW/{item[:4].lower()}.fasta", "r"
) as in_file:
l = in_file.read().splitlines()
seq1 = len(l[1])
seq2 = len(l[3])
print((seq1, seq2))
if (seq1 == fi[item].shape[0] and seq2 == fi[item].shape[1]) or (
seq2 == fi[item].shape[0] and seq1 == fi[item].shape[1]
):
# print(f"{item} MATCHES")
None
else:
# print(f"{item} DELETE")
deletes.append(item[:4].lower())
# print()
print(deletes)
# deletes = ['1fmh', '1dzb', '1dnw', '1dnu', '1fxv', '1d5l', '1d7w', '1dtd', '1e8n']
# for item in deletes:
# if os.path.exists(f"dscript/pdbs/{item}.pdb"):
# os.remove(f"dscript/pdbs/{item}.pdb")
# for item in deletes:
# if os.path.exists(f"dscript/fastas/{item}.fasta"):
# os.remove(f"dscript/fastas/{item}.fasta")

View File

@@ -1,32 +0,0 @@
"""
# CLEAR CMAP.FASTA BEFORE RUNNING THIS FILE
# making fasta ids simpler to match the contact maps, putting them all together in data/seqs as 1 fasta file
"""
import Bio.PDB
import numpy as np
import matplotlib.pyplot as plt
import h5py
import random
import os
import pandas as pd
import torch
import csv
files = os.listdir("dscript/fastasTEST")
if ".DS_Store" in files:
files.remove(".DS_Store")
# print(files)
for item in files:
with open('data/seqs/cmap.fasta', 'a') as out_file:
with open(f'dscript/fastasTEST/{item}','r') as in_file:
# print(item)
l = in_file.read().splitlines()
# print(l)
for i in range(0, len(l)):
if i%2 == 0:
l[i] = l[i][:7]
# print(l)
for line in l:
out_file.write(line)
out_file.write('\n')

View File

@@ -1,28 +0,0 @@
"""
creating a training cmap tsv file with all the contact map protein pairs, all interaction = 1
"""
import Bio.PDB
import numpy as np
import matplotlib.pyplot as plt
import h5py
import random
import os
import pandas as pd
import torch
import csv
# files = os.listdir("dscript/bincmaps")
# files.remove(".DS_Store")
# print(files)
fi = h5py.File("data/bincmaps_train.h5","r")
ke = list(fi.keys())
with open('data/pairs/cmap_train.tsv', 'wt') as out_file:
tsv_writer = csv.writer(out_file, delimiter='\t')
for item in ke:
# print(item)
prot1 = item[:item.index(":")+2]
prot2 = item[item.index(":")+3:]
# print((prot1, prot2))
tsv_writer.writerow([prot1, prot2, '1'])

View File

@@ -1,22 +0,0 @@
import os
fastas = os.listdir("dscript/fastas")
if ".DS_Store" in fastas:
fastas.remove(".DS_Store")
# print(len(fastas))
deletes = []
for item in fastas:
# print(fi[item].shape)
with open(f"dscript/fastas/{item}", "r") as in_file:
l = in_file.read().splitlines()
if len(l) != 4:
deletes.append(item[:4])
if len(l) == 4:
if len(l[1]) >= 694 or len(l[3]) >= 694:
# print(item)
deletes.append(item[:4])
if len(l[1]) <= 50 or len(l[3]) <= 50:
deletes.append(item[:4])
print(deletes)

View File

@@ -1,13 +1,14 @@
import os
# deletes = ["1B6H", "2Z59", "2N73", "6W0V", "5Z2W", "6S3W", "7JOD", "7N1N", "7PKU", "1B6H", "5JYQ", "6GU0", "7JOE", "6E4D", "2Z59", "7B2B", "5U4K", "6KK3", "2N73", "7AA4", "6W9K", "7JQV", "6W9M", "7JOW"]
# new = []
# for item in deletes:
# new.append(f"{item.lower()}.pdb")
# print(new)
# deletes = ['1cf4', '1irs', '1e0a', '1j4p', '1j4k', '1sdx', '2m6i', '1k2n', '3ugw', '1shc', '1z3m', '1b3g', '5H7P', '1l0a', '2a3i', '4ZXL', '1dow', '5D7E', '1oo4', '2n8j', '5J7J', '4P6W', '4fjp', '2r02', '1z3p', '5GXW', '1f3r', '1htr', '1b2h', '4R8T', '2rod', '2z7f', '1ym0', '1k2m', '2rs9', '1z3l', '1b3f', '1zbc', '1hag', '4XOE', '4SRN', '2yka', '2roc', '1ees', '1d5s', '1d5h', '2roz', '1n7t', '2r05', '1cqg', '1j4q', '2n2h', '4PID', '2rnw', '1f1w', '2kdc', '1fu5', '2aos', '1wlp', '1qg1', '1tce', '1ppe', '1fev', '3n3x', '1smh', '4WB8', '1t01', '2rmx', '2r03', '1d7q', '1j4l', '1b8q', '1zbk', '2vu8', '1jsp', '2pon', '2rny', '2my3', '1ddm', '3llz', '2yu7', '2n0y', '5APR', '2oq1', '2ror', '2qos', '1k3n', '3n2d', '1uoo', '1o0p', '2a0t', '4Q5U', '2uzv', '1l0n', '1uop', '1zbv', '1b3h', '2xjy', '1k3q', '1ssc', '3u8q', '1d5d', '2rqw', '2xs4', '1r8u', '1d4t', '5duo', '2uzt', '1xr0', '4TT2', '1opi', '1qmb', '1b32', '2my2', '1jm4', '1fhr', '2phg', '2rnx', '2m6x', '3h8k', '1zl1', '1wkw', '1fpr', '3m7u', '1b9j', '2uzu', '1cqh', '2n9p', '1t37', '1zbw', '1b4z', '2uzw', '1d5e', '1tg4', '4W8P']
deletes = ['7KLR', '4bs3', '4lnp', '6MHA', '1kbh', '1rgj', '6KBV', '6ES5', '2ofq', '5XBO', '2n2v', '1om2', '1i8h', '5Y20', '6ES7', '1hoy', '2ain', '6KBO', '2mv7', '7KN0', '6MGN', '5X9X', '1q69', '6B7G', '1b18', '2rqg', '7OJ9', '5L85', '2n4q', '1l2z', '1jh4', '4Q6H', '2rpn', '5D5E', '1s5q', '5MIZ', '7E0B', '1ncp', '7JQ8', '1kuz', '1hv2', '4NIB', '6X4X', '5J6Z', '1u0i', '1eci', '2rn5', '1bph', '6XMN', '6ES6', '1iog', '1q68', '1s7p', '1oqp', '6F55', '3t2a', '1kmf', '1pnb', '1k4u', '2n2w', '1sdb', '1hls', '2zpp', '1bxp', '1xgl', '1s5r', '6K59', '1q5w', '6QXZ', '1bzv', '1cph', '1b19', '1u38', '2rsy', '2n9y', '2rm0', '1zsg', '1jmq', '6BGH', '4i5y', '6VES', '4iuz', '1sse', '1t1k', '1uel', '1hiq', '1t1p', '6KH8', '1b2g', '1qh2', '1i5h', '5LIS', '3u4n', '1sf1', '1b2e', '4m5s', '1his', '1b17', '2rqh', '5YC4', '1q0w', '2a4j', '2mvc', '1mhi', '5VIZ', '1hui', '1vkt', '1otr', '1i8g', '4YYP', '1sb0', '1bon', '6KHA', '1bh9', '2mwn', '1pmx', '1tkq', '1io6', '1zgx', '6Q00', '2rly', '6Q8Q', '7QDW', '2n55', '5D53', '2pq4', '5EN9', '1b2d', '1g1e', '9INS', '5GWM', '1sjt', '4ihn', '5ENA', '6C0A', '1jbd', '5D54', '4i5z', '7A2Y', '7CWH', '3i40', '1kdx', '2os6', '7JYN', '1pd7', '1mv0', '5MHD', '1jco', '6KH9', '1b2f', '1t1q', '1k3m', '6K5R', '1bom', '1d5g', '7NHU', '1hit', '1jgn', '5YC3', '2mur', '5D52', '6S3F', '1kup', '1mhj', '2mzd', '2pku', '6NSX', '7F7X', '2mvd', '2wfu', '1dph', '2n2x', '4iyd', '2pdz', '1qfn', '7AC4', '4iyf', '5I22', '3i3z', '5MWQ', '6K5T', '2rol', '7EDP', '1ioh', '1bh8']
string = "3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x"
deletes = []
for i in range(0, int(len(string) / 4)):
i = i * 4
deletes.append(string[i : i + 4])
print(deletes)
print(len(deletes))
for item in deletes:
if os.path.exists(f"dscript/pdbs/{item}.pdb"):
@@ -15,4 +16,3 @@ for item in deletes:
for item in deletes:
if os.path.exists(f"dscript/fastas/{item}.fasta"):
os.remove(f"dscript/fastas/{item}.fasta")

View File

@@ -1,23 +0,0 @@
import os
files = os.listdir("dscript/pdbsTEST")
# fastas = os.listdir("dscript/fastasNEW")
# fastas.remove(".DS_Store")
if ".DS_Store" in files:
files.remove(".DS_Store")
# print(len(files))
# print(files)
new_files = []
for item in files:
new_files.append(item[:4].lower())
# print((new_files))
string = "'1r8o', '1nsg', '1pc8', '1ktp', '1puu', '1mlb', '1p6a', '1oo9', '1plg', '1jgv', '1jql', '1kfu', '1nld', '1q9k', '1kem', '1kel', '1jlt', '1qbl', '1mim', '1pqz', '1nlb', '1nl4', '1phn', '1jnn', '1qav', '1qbm', '1jnl', '1nme', '1op9', '1jtt', '1ppf', '1n7m', '1ncw', '1mh2', '1nbv', '1rf8', '1jv5', '1opg', '1n94', '1p69', '1p4q', '1kn4', '1oz7', '1on7', '1kcu', '1ngy', '1kn2', '1kcv', '1ngz', '1pum'"
new_string = ""
for item in string:
# print(item)
if item != "'" and item != " ":
new_string += item[:4]
print(new_string)

View File

@@ -0,0 +1 @@
3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x

File diff suppressed because one or more lines are too long

View File

@@ -67,35 +67,49 @@ def main():
pdb_file = f"dscript/pdbsNEW/{pdb_id}.pdb"
seqres_recs = list(SeqIO.parse(pdb_file, "pdb-seqres"))
# print(seqres_recs)
atoms_recs = list(SeqIO.parse(pdb_file, "pdb-atom"))
# print(atoms_recs)
seqs_long = seqres_recs[:2]
# print(seqs_long)
seqs_short = atoms_recs[:2]
# print(seqs_short)
structure = PDB.PDBParser().get_structure(pdb_id, pdb_file)
chains = list(structure.get_chains())
# if len(chains) > 2:
# print(pdb_id)
# continue
chains = list(structure.get_chains())[:2]
# print(chains)
with open(f"dscript/proteins.fasta", "a") as f:
for record in seqs_long:
f.write(record.format("fasta-2line"))
chains_filtered = filter_chains(chains)
# print(chains_filtered)
# which sequence (seqres or atom) does chain match to? seqres?
# the chain length should be equal to the pdb-atom sequence length, right? len(chain) = len(seq_short)
# but for some reason, in a few cases chains aren't the same length as either of the generated pdb-seqres or pdb-atom sequences,
# which can throw a stopiteration error
# seqres = sequence portion near start of pdb
# atom = sequence portion in midway of pdb using distance coordinates and atomic information, can skip residue numbers leadering to X's
# chain = same as atom sequence, but only amino acid residues and not counting skips, so may be shorter than seqres and/or atom
seq0_long = seqs_long[0].seq
# print(len(seq0_long))
seq0_short = seqs_short[0].seq
# print(len(seq0_short))
chain0 = chains_filtered[0]
# print((chain0))
# print(len(chain0))
seq1_long = seqs_long[1].seq
# print(len(seq1_long))
seq1_short = seqs_short[1].seq
# print(len(seq1_short))
chain1 = chains_filtered[1]
# print((chain1))
# print(seq0_long)
# print(seq0_short)
# print([len(seq0_long), len(seq0_short)])
# print(seq1_long)
# print(seq1_short)
# print([len(seq1_long), len(seq1_short)])
# how to get sequence from chain?
# print(len(chain0))
# print(len(chain1))
if len(seq1_short) > len(seq1_long) or len(seq0_short) > len(
@@ -103,18 +117,33 @@ def main():
):
count += 1
# print(seq0_short)
# print(seq0_long)
# print([len(seq0_short), len(seq0_long)])
# print(seq1_short)
# print(seq1_long)
# print([len(seq1_short), len(seq1_long)])
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
print(pairwise2.format_alignment(*align0[0]))
# print(pairwise2.format_alignment(*align0[0]))
align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
print(pairwise2.format_alignment(*align1[0]))
# print(pairwise2.format_alignment(*align1[0]))
if len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short):
if (
len(chain0) != len(seq0_long) or len(chain0) != len(seq0_short)
) and (
len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short)
):
print("switched")
temp = chain1.copy()
chain1 = chain0
chain0 = temp
print(len(chain0))
print(len(chain1))
# print(len(chain0))
# print(len(chain1))
if len(chain0) != len(seq0_long) or len(chain1) != len(seq1_long):
None
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
# print(pairwise2.format_alignment(*align0[0]))
@@ -150,22 +179,40 @@ def main():
# print(len(chain0))
# print(len(chain1))
# ch0_it = iter(chain0)
# for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
# print((i, (res0L, res0S)))
# if res0S == "-" or res0L == "-":
# continue
# else:
# res0 = next(ch0_it)
ch0_it = iter(chain0)
x = -1
y = 0
for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
if res0S == "-":
x += 1
continue
if res0L == "-":
continue
else:
# print((x, (res0L, res0S)))
x += 1
res0 = next(ch0_it)
# ch1_it = iter(chain1)
# for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
# print([x,y])
ch1_it = iter(chain1)
for (j, (res1L, res1S)) in enumerate(
zip(seq1_long_f, seq1_short_f)
):
if res1S == "-":
y += 1
continue
if res1L == "-":
continue
else:
res1 = next(ch1_it)
# print((y, (res1L, res1S)))
# print([x,y])
D[x, y] = residue_distance(res0, res1, max_d=MAX_D)
y += 1
# if res1S == "-" or res1L == "-":
# continue
# else:
# res1 = next(ch1_it)
y = 0
# print([i,j])
# D[i,j] = residue_distance(res0, res1, max_d = MAX_D)
# D = calc_dist_matrix(chain0, chain1, seq0_long, seq0_short, seq1_long, seq1_short)

View File

@@ -0,0 +1,192 @@
from Bio import SeqIO
from Bio import PDB
from Bio import pairwise2
import h5py
import numpy as np
import os
import pandas as pd
import csv
MAX_D = 25
def filter_chains(chain_list):
chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
return chains_f
def residue_distance(res0, res1, max_d=25.0):
diff_vector = res0["CA"].coord - res1["CA"].coord
distance = np.sqrt(np.sum(diff_vector ** 2))
return min(distance, max_d)
def make_tsv(pdb_id, chains, name):
with open(f"data/{name}.tsv", "a") as out_file:
tsv_writer = csv.writer(out_file, delimiter="\t")
prot1 = f"{pdb_id.upper()}:{str(chains[0].get_id()).upper()}"
prot2 = f"{pdb_id.upper()}:{str(chains[1].get_id()).upper()}"
print((prot1, prot2))
tsv_writer.writerow([prot1, prot2, "1"])
def make_fasta(pdb_id, seqs_long, fasta_name):
with open(f"data/{fasta_name}.fasta", "a") as f:
for record in seqs_long:
f.write(record.format("fasta-2line"))
# None
def calc_dist_matrix(
pdb_id,
chain0,
chain1,
seq0_long,
seq1_long,
seq0_long_f,
seq0_short_f,
seq1_long_f,
seq1_short_f,
):
D = np.zeros((len(seq0_long), len(seq1_long)))
D = D - 1
ch0_it = iter(chain0)
x = -1
y = 0
for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
if res0S == "-":
x += 1
continue
if res0L == "-":
continue
else:
x += 1
try:
res0 = next(ch0_it)
except StopIteration:
file1 = open("dscript/processing/pdb_errors.txt", "a")
file1.write(pdb_id)
file1.close()
# res0 = next(ch0_it)
ch1_it = iter(chain1)
for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
if res1S == "-":
y += 1
continue
if res1L == "-":
continue
else:
try:
res1 = next(ch1_it)
except StopIteration:
file1 = open("dscript/processing/pdb_errors.txt", "a")
file1.write(pdb_id)
file1.close()
# res1 = next(ch1_it)
D[x, y] = residue_distance(res0, res1, max_d=MAX_D)
y += 1
y = 0
return D
def main():
pdb_directory = input("Name of directory containing pdb files: ")
files = os.listdir(f"dscript/{pdb_directory}")
if ".DS_Store" in files:
files.remove(".DS_Store")
for i in range(0, len(files)):
files[i] = files[i][:4]
total = 0
long = []
h5_name = input("Name of output H5 (for contact maps): ")
hf_pair = h5py.File(f"data/{h5_name}.h5", "w")
fasta_name = input("Name of output fasta (for sequences): ")
tsv_name = input("Name of output tsv (for PPIs): ")
for pdb_id in files:
total += 1
print(f"Total: {total}")
print(pdb_id)
pdb_file = f"dscript/{pdb_directory}/{pdb_id}.pdb"
seqres_recs = list(SeqIO.parse(pdb_file, "pdb-seqres"))
atoms_recs = list(SeqIO.parse(pdb_file, "pdb-atom"))
seqs_long = seqres_recs[:2]
seqs_short = atoms_recs[:2]
structure = PDB.PDBParser().get_structure(pdb_id, pdb_file)
# chains = list(structure.get_chains())
# if len(chains) > 2:
# print(chains)
# print(pdb_id)
# continue
chains = list(structure.get_chains())[:2]
if (
len(chains[0]) > 800
or len(chains[1]) > 800
or len(chains[0]) < 50
or len(chains[1]) < 50
):
long.append(pdb_id)
continue
make_fasta(pdb_id, seqs_long, fasta_name)
make_tsv(pdb_id, chains, tsv_name)
chains_filtered = filter_chains(chains)
seq0_long = seqs_long[0].seq
seq0_short = seqs_short[0].seq
chain0 = chains_filtered[0]
seq1_long = seqs_long[1].seq
seq1_short = seqs_short[1].seq
chain1 = chains_filtered[1]
# if chain lengths switched (chain0 = seq1 and chain1 = seq0)
if (
len(chain0) != len(seq0_long) or len(chain0) != len(seq0_short)
) and (
len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short)
):
temp = chain1.copy()
chain1 = chain0
chain0 = temp
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
# print(pairwise2.format_alignment(*align0[0]))
align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
# print(pairwise2.format_alignment(*align1[0]))
seq0_long_f = align0[0].seqA
seq0_short_f = align0[0].seqB
seq1_long_f = align1[0].seqA
seq1_short_f = align1[0].seqB
D = calc_dist_matrix(
pdb_id,
chain0,
chain1,
seq0_long,
seq1_long,
seq0_long_f,
seq0_short_f,
seq1_long_f,
seq1_short_f,
)
df = pd.DataFrame(D, index=list(seq0_long), columns=list(seq1_long))
# print(df)
# print(f'{pdb_id}:{str(chains[0].get_id())}x{pdb_id}:{str(chains[1].get_id())}')
hf_pair.create_dataset(
f"{pdb_id}:{str(chains[0].get_id())}x{pdb_id}:{str(chains[1].get_id())}",
data=D,
)
print(long)
if __name__ == "__main__":
main()