mirror of
https://github.com/samsledje/D-SCRIPT.git
synced 2026-06-04 15:04:24 +08:00
made updates to processing.py, incorporating/debugging filtering and sequence alginment code
This commit is contained in:
@@ -1,20 +0,0 @@
|
||||
"""
|
||||
# UNFINISHED
|
||||
# convert batches of downloaded pdb gzip files into parsable pdb protein files.
|
||||
"""
|
||||
import gzip
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
path = "dscript/pdbsNEW/batch-download-structures-1661040401355"
|
||||
zips = os.listdir(path)
|
||||
if ".DS_Store" in zips:
|
||||
zips.remove(".DS_Store")
|
||||
|
||||
for item in zips:
|
||||
name = item[3:7]
|
||||
op = open(f"dscript/pdbsNEW/{name}.pdb", "w")
|
||||
|
||||
with gzip.open(f"{path}/{item}", "rb") as fi:
|
||||
op.write(fi.read().decode("utf-8"))
|
||||
@@ -1,123 +0,0 @@
|
||||
"""
|
||||
# parsing PDB files into pairwise and binary contact maps
|
||||
# REMOVE ANY CA ERROR PDBS
|
||||
"""
|
||||
from pickle import FALSE
|
||||
import Bio.PDB
|
||||
import numpy
|
||||
import matplotlib.pyplot as plt
|
||||
import h5py
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
# SOURCE CODE: https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/protein_contact_map/
|
||||
def has_CA(residue):
|
||||
"""
|
||||
Return whether or not residue contains an alpha-carbon atom.
|
||||
|
||||
:param residue: pass in an amino acid
|
||||
:type version: Residue object
|
||||
:return: if amino acid residue has alpha-carbons
|
||||
:rtype: boolean
|
||||
"""
|
||||
return residue.has_id("CA")
|
||||
|
||||
|
||||
def filter_chains(chain_list):
|
||||
chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
|
||||
return chains_f
|
||||
|
||||
|
||||
def calc_residue_dist(residue_one, residue_two, limit):
|
||||
"""
|
||||
Returns the C-alpha distance between two residues
|
||||
|
||||
:param residue_one: pass in an amino acid residue object
|
||||
:type residue_one: Residue object
|
||||
:param residue_two: pass in an amino acid residue object
|
||||
:type residue_two: Residue object
|
||||
:param limit: pass in a distance threshold
|
||||
:type limit: float
|
||||
:return: C-alpha distance between two residues
|
||||
:rtype: float
|
||||
"""
|
||||
diff_vector = residue_one["CA"].coord - residue_two["CA"].coord
|
||||
distance = numpy.sqrt(numpy.sum(diff_vector * diff_vector))
|
||||
if distance >= float(limit):
|
||||
return float(limit)
|
||||
return distance
|
||||
|
||||
|
||||
def calc_dist_matrix(chain_one, chain_two, limit):
|
||||
"""
|
||||
Returns a contact map matrix of C-alpha distances between two chains
|
||||
|
||||
:param chain_one: pass in a chain 1
|
||||
:type chain_one: Chain object
|
||||
:param chain_two: pass in a chain 2
|
||||
:type chain_two: Chain object
|
||||
:param limit: pass in a distance threshold
|
||||
:type limit: float
|
||||
:return: a contact map matrix of C-alpha distances between two chains
|
||||
:rtype: Numpy matrix
|
||||
"""
|
||||
chains = filter_chains([chain_one, chain_two])
|
||||
chain1 = chains[0]
|
||||
chain2 = chains[1]
|
||||
|
||||
# chain2 = filter_chains(chain_two)
|
||||
# for residue_one in chain_one:
|
||||
# if has_CA(residue_one):
|
||||
# chain1.append(residue_one)
|
||||
# for residue_two in chain_two:
|
||||
# if has_CA(residue_two):
|
||||
# chain2.append(residue_two)
|
||||
|
||||
answer = numpy.zeros((len(chain1), len(chain2)), float)
|
||||
print(answer.shape)
|
||||
|
||||
for i in range(0, len(chain1)):
|
||||
for j in range(0, len(chain2)):
|
||||
# print((chain1[i].get_resname(), chain2[j].get_resname()))
|
||||
answer[i][j] = calc_residue_dist(chain1[i], chain2[j], limit)
|
||||
|
||||
return answer
|
||||
|
||||
|
||||
def main():
|
||||
files = os.listdir("dscript/pdbsNEW")
|
||||
if ".DS_Store" in files:
|
||||
files.remove(".DS_Store")
|
||||
|
||||
for i in range(0, len(files)):
|
||||
files[i] = files[i][:4]
|
||||
|
||||
hf_pair = h5py.File(f"data/paircmaps_train.h5", "w")
|
||||
count = 0
|
||||
for protein in files:
|
||||
count += 1
|
||||
print(protein)
|
||||
print(count)
|
||||
pdb_code = protein.upper()
|
||||
pdb_filename = f"dscript/pdbsNEW/{protein}.pdb"
|
||||
structure = Bio.PDB.PDBParser().get_structure(pdb_code, pdb_filename)
|
||||
model = structure[0]
|
||||
|
||||
# GET PROTEIN CHAINS
|
||||
chain = []
|
||||
for chains in structure.get_chains():
|
||||
chain.append(str(chains.get_id()))
|
||||
chain = chain[:2]
|
||||
|
||||
dist_matrix = calc_dist_matrix(
|
||||
model[chain[0]], model[chain[1]], 25.000
|
||||
)
|
||||
|
||||
print(f"{pdb_code}:{chain[0]}x{pdb_code}:{chain[1]}")
|
||||
hf_pair.create_dataset(
|
||||
f"{pdb_code}:{chain[0]}x{pdb_code}:{chain[1]}", data=dist_matrix
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,31 +0,0 @@
|
||||
"""
|
||||
# converting protein PDB files into fasta sequence format
|
||||
"""
|
||||
import Bio
|
||||
from Bio import SeqIO
|
||||
import os
|
||||
|
||||
entries = os.listdir("dscript/pdbsNEW")
|
||||
if ".DS_Store" in entries:
|
||||
entries.remove(".DS_Store")
|
||||
# print(entries)
|
||||
|
||||
# CONVERT ALL TO FASTAS
|
||||
for i in range(0, len(entries)):
|
||||
# print(entries[i])
|
||||
if entries[i][0] != ".":
|
||||
with open(f"dscript/fastasNEW/{entries[i][:-4]}.fasta", "w") as f:
|
||||
# pdb-atom
|
||||
for record in SeqIO.parse(
|
||||
f"dscript/pdbsNEW/{entries[i]}", "pdb-seqres"
|
||||
):
|
||||
f.write(record.format("fasta-2line"))
|
||||
f.close()
|
||||
|
||||
# TEST A SINGLE
|
||||
# with open(f'dscript/proteins/15c8.fasta', 'w') as f:
|
||||
# for record in SeqIO.parse(f"dscript/15c8.pdb", "pdb-seqres"):
|
||||
# # print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
|
||||
# # print(record.format("fasta"))
|
||||
# f.write(record.format("fasta-2line"))
|
||||
# f.close()
|
||||
@@ -1,50 +0,0 @@
|
||||
from pickle import FALSE
|
||||
import Bio.PDB
|
||||
import numpy
|
||||
import matplotlib.pyplot as plt
|
||||
import h5py
|
||||
import random
|
||||
import os
|
||||
import pandas as pd
|
||||
import torch
|
||||
|
||||
fi = h5py.File("data/paircmaps_train.h5", "r")
|
||||
ke = list(fi.keys())
|
||||
# print(ke[0][:4].lower())
|
||||
print(fi[ke[0]].shape)
|
||||
|
||||
fastas = os.listdir("dscript/fastasNEW")
|
||||
# fastas.remove(".DS_Store")
|
||||
# print(len(fastas))
|
||||
|
||||
deletes = []
|
||||
|
||||
for item in ke:
|
||||
# print(fi[item].shape)
|
||||
if os.path.exists(f"dscript/fastasNEW/{item[:4].lower()}.fasta"):
|
||||
with open(
|
||||
f"dscript/fastasNEW/{item[:4].lower()}.fasta", "r"
|
||||
) as in_file:
|
||||
l = in_file.read().splitlines()
|
||||
seq1 = len(l[1])
|
||||
seq2 = len(l[3])
|
||||
print((seq1, seq2))
|
||||
|
||||
if (seq1 == fi[item].shape[0] and seq2 == fi[item].shape[1]) or (
|
||||
seq2 == fi[item].shape[0] and seq1 == fi[item].shape[1]
|
||||
):
|
||||
# print(f"{item} MATCHES")
|
||||
None
|
||||
else:
|
||||
# print(f"{item} DELETE")
|
||||
deletes.append(item[:4].lower())
|
||||
# print()
|
||||
|
||||
print(deletes)
|
||||
# deletes = ['1fmh', '1dzb', '1dnw', '1dnu', '1fxv', '1d5l', '1d7w', '1dtd', '1e8n']
|
||||
# for item in deletes:
|
||||
# if os.path.exists(f"dscript/pdbs/{item}.pdb"):
|
||||
# os.remove(f"dscript/pdbs/{item}.pdb")
|
||||
# for item in deletes:
|
||||
# if os.path.exists(f"dscript/fastas/{item}.fasta"):
|
||||
# os.remove(f"dscript/fastas/{item}.fasta")
|
||||
@@ -1,32 +0,0 @@
|
||||
"""
|
||||
# CLEAR CMAP.FASTA BEFORE RUNNING THIS FILE
|
||||
# making fasta ids simpler to match the contact maps, putting them all together in data/seqs as 1 fasta file
|
||||
"""
|
||||
import Bio.PDB
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import h5py
|
||||
import random
|
||||
import os
|
||||
import pandas as pd
|
||||
import torch
|
||||
import csv
|
||||
|
||||
files = os.listdir("dscript/fastasTEST")
|
||||
if ".DS_Store" in files:
|
||||
files.remove(".DS_Store")
|
||||
# print(files)
|
||||
|
||||
for item in files:
|
||||
with open('data/seqs/cmap.fasta', 'a') as out_file:
|
||||
with open(f'dscript/fastasTEST/{item}','r') as in_file:
|
||||
# print(item)
|
||||
l = in_file.read().splitlines()
|
||||
# print(l)
|
||||
for i in range(0, len(l)):
|
||||
if i%2 == 0:
|
||||
l[i] = l[i][:7]
|
||||
# print(l)
|
||||
for line in l:
|
||||
out_file.write(line)
|
||||
out_file.write('\n')
|
||||
@@ -1,28 +0,0 @@
|
||||
"""
|
||||
creating a training cmap tsv file with all the contact map protein pairs, all interaction = 1
|
||||
"""
|
||||
import Bio.PDB
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import h5py
|
||||
import random
|
||||
import os
|
||||
import pandas as pd
|
||||
import torch
|
||||
import csv
|
||||
|
||||
# files = os.listdir("dscript/bincmaps")
|
||||
# files.remove(".DS_Store")
|
||||
# print(files)
|
||||
fi = h5py.File("data/bincmaps_train.h5","r")
|
||||
ke = list(fi.keys())
|
||||
|
||||
with open('data/pairs/cmap_train.tsv', 'wt') as out_file:
|
||||
tsv_writer = csv.writer(out_file, delimiter='\t')
|
||||
for item in ke:
|
||||
# print(item)
|
||||
prot1 = item[:item.index(":")+2]
|
||||
prot2 = item[item.index(":")+3:]
|
||||
# print((prot1, prot2))
|
||||
tsv_writer.writerow([prot1, prot2, '1'])
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
import os
|
||||
|
||||
fastas = os.listdir("dscript/fastas")
|
||||
if ".DS_Store" in fastas:
|
||||
fastas.remove(".DS_Store")
|
||||
# print(len(fastas))
|
||||
|
||||
deletes = []
|
||||
|
||||
for item in fastas:
|
||||
# print(fi[item].shape)
|
||||
with open(f"dscript/fastas/{item}", "r") as in_file:
|
||||
l = in_file.read().splitlines()
|
||||
if len(l) != 4:
|
||||
deletes.append(item[:4])
|
||||
if len(l) == 4:
|
||||
if len(l[1]) >= 694 or len(l[3]) >= 694:
|
||||
# print(item)
|
||||
deletes.append(item[:4])
|
||||
if len(l[1]) <= 50 or len(l[3]) <= 50:
|
||||
deletes.append(item[:4])
|
||||
print(deletes)
|
||||
@@ -1,13 +1,14 @@
|
||||
import os
|
||||
|
||||
# deletes = ["1B6H", "2Z59", "2N73", "6W0V", "5Z2W", "6S3W", "7JOD", "7N1N", "7PKU", "1B6H", "5JYQ", "6GU0", "7JOE", "6E4D", "2Z59", "7B2B", "5U4K", "6KK3", "2N73", "7AA4", "6W9K", "7JQV", "6W9M", "7JOW"]
|
||||
# new = []
|
||||
# for item in deletes:
|
||||
# new.append(f"{item.lower()}.pdb")
|
||||
# print(new)
|
||||
|
||||
# deletes = ['1cf4', '1irs', '1e0a', '1j4p', '1j4k', '1sdx', '2m6i', '1k2n', '3ugw', '1shc', '1z3m', '1b3g', '5H7P', '1l0a', '2a3i', '4ZXL', '1dow', '5D7E', '1oo4', '2n8j', '5J7J', '4P6W', '4fjp', '2r02', '1z3p', '5GXW', '1f3r', '1htr', '1b2h', '4R8T', '2rod', '2z7f', '1ym0', '1k2m', '2rs9', '1z3l', '1b3f', '1zbc', '1hag', '4XOE', '4SRN', '2yka', '2roc', '1ees', '1d5s', '1d5h', '2roz', '1n7t', '2r05', '1cqg', '1j4q', '2n2h', '4PID', '2rnw', '1f1w', '2kdc', '1fu5', '2aos', '1wlp', '1qg1', '1tce', '1ppe', '1fev', '3n3x', '1smh', '4WB8', '1t01', '2rmx', '2r03', '1d7q', '1j4l', '1b8q', '1zbk', '2vu8', '1jsp', '2pon', '2rny', '2my3', '1ddm', '3llz', '2yu7', '2n0y', '5APR', '2oq1', '2ror', '2qos', '1k3n', '3n2d', '1uoo', '1o0p', '2a0t', '4Q5U', '2uzv', '1l0n', '1uop', '1zbv', '1b3h', '2xjy', '1k3q', '1ssc', '3u8q', '1d5d', '2rqw', '2xs4', '1r8u', '1d4t', '5duo', '2uzt', '1xr0', '4TT2', '1opi', '1qmb', '1b32', '2my2', '1jm4', '1fhr', '2phg', '2rnx', '2m6x', '3h8k', '1zl1', '1wkw', '1fpr', '3m7u', '1b9j', '2uzu', '1cqh', '2n9p', '1t37', '1zbw', '1b4z', '2uzw', '1d5e', '1tg4', '4W8P']
|
||||
deletes = ['7KLR', '4bs3', '4lnp', '6MHA', '1kbh', '1rgj', '6KBV', '6ES5', '2ofq', '5XBO', '2n2v', '1om2', '1i8h', '5Y20', '6ES7', '1hoy', '2ain', '6KBO', '2mv7', '7KN0', '6MGN', '5X9X', '1q69', '6B7G', '1b18', '2rqg', '7OJ9', '5L85', '2n4q', '1l2z', '1jh4', '4Q6H', '2rpn', '5D5E', '1s5q', '5MIZ', '7E0B', '1ncp', '7JQ8', '1kuz', '1hv2', '4NIB', '6X4X', '5J6Z', '1u0i', '1eci', '2rn5', '1bph', '6XMN', '6ES6', '1iog', '1q68', '1s7p', '1oqp', '6F55', '3t2a', '1kmf', '1pnb', '1k4u', '2n2w', '1sdb', '1hls', '2zpp', '1bxp', '1xgl', '1s5r', '6K59', '1q5w', '6QXZ', '1bzv', '1cph', '1b19', '1u38', '2rsy', '2n9y', '2rm0', '1zsg', '1jmq', '6BGH', '4i5y', '6VES', '4iuz', '1sse', '1t1k', '1uel', '1hiq', '1t1p', '6KH8', '1b2g', '1qh2', '1i5h', '5LIS', '3u4n', '1sf1', '1b2e', '4m5s', '1his', '1b17', '2rqh', '5YC4', '1q0w', '2a4j', '2mvc', '1mhi', '5VIZ', '1hui', '1vkt', '1otr', '1i8g', '4YYP', '1sb0', '1bon', '6KHA', '1bh9', '2mwn', '1pmx', '1tkq', '1io6', '1zgx', '6Q00', '2rly', '6Q8Q', '7QDW', '2n55', '5D53', '2pq4', '5EN9', '1b2d', '1g1e', '9INS', '5GWM', '1sjt', '4ihn', '5ENA', '6C0A', '1jbd', '5D54', '4i5z', '7A2Y', '7CWH', '3i40', '1kdx', '2os6', '7JYN', '1pd7', '1mv0', '5MHD', '1jco', '6KH9', '1b2f', '1t1q', '1k3m', '6K5R', '1bom', '1d5g', '7NHU', '1hit', '1jgn', '5YC3', '2mur', '5D52', '6S3F', '1kup', '1mhj', '2mzd', '2pku', '6NSX', '7F7X', '2mvd', '2wfu', '1dph', '2n2x', '4iyd', '2pdz', '1qfn', '7AC4', '4iyf', '5I22', '3i3z', '5MWQ', '6K5T', '2rol', '7EDP', '1ioh', '1bh8']
|
||||
string = "3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x"
|
||||
|
||||
deletes = []
|
||||
for i in range(0, int(len(string) / 4)):
|
||||
i = i * 4
|
||||
deletes.append(string[i : i + 4])
|
||||
|
||||
print(deletes)
|
||||
|
||||
print(len(deletes))
|
||||
for item in deletes:
|
||||
if os.path.exists(f"dscript/pdbs/{item}.pdb"):
|
||||
@@ -15,4 +16,3 @@ for item in deletes:
|
||||
for item in deletes:
|
||||
if os.path.exists(f"dscript/fastas/{item}.fasta"):
|
||||
os.remove(f"dscript/fastas/{item}.fasta")
|
||||
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
import os
|
||||
|
||||
files = os.listdir("dscript/pdbsTEST")
|
||||
# fastas = os.listdir("dscript/fastasNEW")
|
||||
# fastas.remove(".DS_Store")
|
||||
if ".DS_Store" in files:
|
||||
files.remove(".DS_Store")
|
||||
# print(len(files))
|
||||
# print(files)
|
||||
|
||||
new_files = []
|
||||
for item in files:
|
||||
new_files.append(item[:4].lower())
|
||||
# print((new_files))
|
||||
|
||||
string = "'1r8o', '1nsg', '1pc8', '1ktp', '1puu', '1mlb', '1p6a', '1oo9', '1plg', '1jgv', '1jql', '1kfu', '1nld', '1q9k', '1kem', '1kel', '1jlt', '1qbl', '1mim', '1pqz', '1nlb', '1nl4', '1phn', '1jnn', '1qav', '1qbm', '1jnl', '1nme', '1op9', '1jtt', '1ppf', '1n7m', '1ncw', '1mh2', '1nbv', '1rf8', '1jv5', '1opg', '1n94', '1p69', '1p4q', '1kn4', '1oz7', '1on7', '1kcu', '1ngy', '1kn2', '1kcv', '1ngz', '1pum'"
|
||||
new_string = ""
|
||||
for item in string:
|
||||
# print(item)
|
||||
if item != "'" and item != " ":
|
||||
new_string += item[:4]
|
||||
|
||||
print(new_string)
|
||||
1
dscript/processing/pdb_errors.txt
Normal file
1
dscript/processing/pdb_errors.txt
Normal file
@@ -0,0 +1 @@
|
||||
3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x
|
||||
File diff suppressed because one or more lines are too long
@@ -67,35 +67,49 @@ def main():
|
||||
pdb_file = f"dscript/pdbsNEW/{pdb_id}.pdb"
|
||||
|
||||
seqres_recs = list(SeqIO.parse(pdb_file, "pdb-seqres"))
|
||||
# print(seqres_recs)
|
||||
atoms_recs = list(SeqIO.parse(pdb_file, "pdb-atom"))
|
||||
# print(atoms_recs)
|
||||
|
||||
seqs_long = seqres_recs[:2]
|
||||
# print(seqs_long)
|
||||
seqs_short = atoms_recs[:2]
|
||||
# print(seqs_short)
|
||||
|
||||
structure = PDB.PDBParser().get_structure(pdb_id, pdb_file)
|
||||
chains = list(structure.get_chains())
|
||||
# if len(chains) > 2:
|
||||
# print(pdb_id)
|
||||
# continue
|
||||
chains = list(structure.get_chains())[:2]
|
||||
# print(chains)
|
||||
|
||||
with open(f"dscript/proteins.fasta", "a") as f:
|
||||
for record in seqs_long:
|
||||
f.write(record.format("fasta-2line"))
|
||||
|
||||
chains_filtered = filter_chains(chains)
|
||||
# print(chains_filtered)
|
||||
# which sequence (seqres or atom) does chain match to? seqres?
|
||||
# the chain length should be equal to the pdb-atom sequence length, right? len(chain) = len(seq_short)
|
||||
# but for some reason, in a few cases chains aren't the same length as either of the generated pdb-seqres or pdb-atom sequences,
|
||||
# which can throw a stopiteration error
|
||||
|
||||
# seqres = sequence portion near start of pdb
|
||||
# atom = sequence portion in midway of pdb using distance coordinates and atomic information, can skip residue numbers leadering to X's
|
||||
# chain = same as atom sequence, but only amino acid residues and not counting skips, so may be shorter than seqres and/or atom
|
||||
|
||||
seq0_long = seqs_long[0].seq
|
||||
# print(len(seq0_long))
|
||||
seq0_short = seqs_short[0].seq
|
||||
# print(len(seq0_short))
|
||||
chain0 = chains_filtered[0]
|
||||
# print((chain0))
|
||||
# print(len(chain0))
|
||||
|
||||
seq1_long = seqs_long[1].seq
|
||||
# print(len(seq1_long))
|
||||
seq1_short = seqs_short[1].seq
|
||||
# print(len(seq1_short))
|
||||
chain1 = chains_filtered[1]
|
||||
# print((chain1))
|
||||
|
||||
# print(seq0_long)
|
||||
# print(seq0_short)
|
||||
# print([len(seq0_long), len(seq0_short)])
|
||||
# print(seq1_long)
|
||||
# print(seq1_short)
|
||||
# print([len(seq1_long), len(seq1_short)])
|
||||
|
||||
# how to get sequence from chain?
|
||||
# print(len(chain0))
|
||||
# print(len(chain1))
|
||||
|
||||
if len(seq1_short) > len(seq1_long) or len(seq0_short) > len(
|
||||
@@ -103,18 +117,33 @@ def main():
|
||||
):
|
||||
count += 1
|
||||
|
||||
# print(seq0_short)
|
||||
# print(seq0_long)
|
||||
# print([len(seq0_short), len(seq0_long)])
|
||||
# print(seq1_short)
|
||||
# print(seq1_long)
|
||||
# print([len(seq1_short), len(seq1_long)])
|
||||
|
||||
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
|
||||
print(pairwise2.format_alignment(*align0[0]))
|
||||
# print(pairwise2.format_alignment(*align0[0]))
|
||||
|
||||
align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
|
||||
print(pairwise2.format_alignment(*align1[0]))
|
||||
# print(pairwise2.format_alignment(*align1[0]))
|
||||
|
||||
if len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short):
|
||||
if (
|
||||
len(chain0) != len(seq0_long) or len(chain0) != len(seq0_short)
|
||||
) and (
|
||||
len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short)
|
||||
):
|
||||
print("switched")
|
||||
temp = chain1.copy()
|
||||
chain1 = chain0
|
||||
chain0 = temp
|
||||
print(len(chain0))
|
||||
print(len(chain1))
|
||||
# print(len(chain0))
|
||||
# print(len(chain1))
|
||||
|
||||
if len(chain0) != len(seq0_long) or len(chain1) != len(seq1_long):
|
||||
None
|
||||
|
||||
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
|
||||
# print(pairwise2.format_alignment(*align0[0]))
|
||||
@@ -150,22 +179,40 @@ def main():
|
||||
# print(len(chain0))
|
||||
# print(len(chain1))
|
||||
|
||||
# ch0_it = iter(chain0)
|
||||
# for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
|
||||
# print((i, (res0L, res0S)))
|
||||
# if res0S == "-" or res0L == "-":
|
||||
# continue
|
||||
# else:
|
||||
# res0 = next(ch0_it)
|
||||
ch0_it = iter(chain0)
|
||||
x = -1
|
||||
y = 0
|
||||
for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
|
||||
if res0S == "-":
|
||||
x += 1
|
||||
continue
|
||||
if res0L == "-":
|
||||
continue
|
||||
else:
|
||||
# print((x, (res0L, res0S)))
|
||||
x += 1
|
||||
res0 = next(ch0_it)
|
||||
|
||||
# ch1_it = iter(chain1)
|
||||
# for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
|
||||
# print([x,y])
|
||||
ch1_it = iter(chain1)
|
||||
for (j, (res1L, res1S)) in enumerate(
|
||||
zip(seq1_long_f, seq1_short_f)
|
||||
):
|
||||
if res1S == "-":
|
||||
y += 1
|
||||
continue
|
||||
if res1L == "-":
|
||||
continue
|
||||
else:
|
||||
res1 = next(ch1_it)
|
||||
# print((y, (res1L, res1S)))
|
||||
# print([x,y])
|
||||
D[x, y] = residue_distance(res0, res1, max_d=MAX_D)
|
||||
y += 1
|
||||
|
||||
# if res1S == "-" or res1L == "-":
|
||||
# continue
|
||||
# else:
|
||||
# res1 = next(ch1_it)
|
||||
y = 0
|
||||
|
||||
# print([i,j])
|
||||
# D[i,j] = residue_distance(res0, res1, max_d = MAX_D)
|
||||
|
||||
# D = calc_dist_matrix(chain0, chain1, seq0_long, seq0_short, seq1_long, seq1_short)
|
||||
192
dscript/processing/process_v2.py
Normal file
192
dscript/processing/process_v2.py
Normal file
@@ -0,0 +1,192 @@
|
||||
from Bio import SeqIO
|
||||
from Bio import PDB
|
||||
from Bio import pairwise2
|
||||
import h5py
|
||||
import numpy as np
|
||||
import os
|
||||
import pandas as pd
|
||||
import csv
|
||||
|
||||
MAX_D = 25
|
||||
|
||||
|
||||
def filter_chains(chain_list):
|
||||
chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
|
||||
return chains_f
|
||||
|
||||
|
||||
def residue_distance(res0, res1, max_d=25.0):
|
||||
diff_vector = res0["CA"].coord - res1["CA"].coord
|
||||
distance = np.sqrt(np.sum(diff_vector ** 2))
|
||||
return min(distance, max_d)
|
||||
|
||||
|
||||
def make_tsv(pdb_id, chains, name):
|
||||
with open(f"data/{name}.tsv", "a") as out_file:
|
||||
tsv_writer = csv.writer(out_file, delimiter="\t")
|
||||
prot1 = f"{pdb_id.upper()}:{str(chains[0].get_id()).upper()}"
|
||||
prot2 = f"{pdb_id.upper()}:{str(chains[1].get_id()).upper()}"
|
||||
print((prot1, prot2))
|
||||
tsv_writer.writerow([prot1, prot2, "1"])
|
||||
|
||||
|
||||
def make_fasta(pdb_id, seqs_long, fasta_name):
|
||||
with open(f"data/{fasta_name}.fasta", "a") as f:
|
||||
for record in seqs_long:
|
||||
f.write(record.format("fasta-2line"))
|
||||
# None
|
||||
|
||||
|
||||
def calc_dist_matrix(
|
||||
pdb_id,
|
||||
chain0,
|
||||
chain1,
|
||||
seq0_long,
|
||||
seq1_long,
|
||||
seq0_long_f,
|
||||
seq0_short_f,
|
||||
seq1_long_f,
|
||||
seq1_short_f,
|
||||
):
|
||||
D = np.zeros((len(seq0_long), len(seq1_long)))
|
||||
D = D - 1
|
||||
ch0_it = iter(chain0)
|
||||
x = -1
|
||||
y = 0
|
||||
for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
|
||||
if res0S == "-":
|
||||
x += 1
|
||||
continue
|
||||
if res0L == "-":
|
||||
continue
|
||||
else:
|
||||
x += 1
|
||||
try:
|
||||
res0 = next(ch0_it)
|
||||
except StopIteration:
|
||||
file1 = open("dscript/processing/pdb_errors.txt", "a")
|
||||
file1.write(pdb_id)
|
||||
file1.close()
|
||||
# res0 = next(ch0_it)
|
||||
|
||||
ch1_it = iter(chain1)
|
||||
for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
|
||||
if res1S == "-":
|
||||
y += 1
|
||||
continue
|
||||
if res1L == "-":
|
||||
continue
|
||||
else:
|
||||
try:
|
||||
res1 = next(ch1_it)
|
||||
except StopIteration:
|
||||
file1 = open("dscript/processing/pdb_errors.txt", "a")
|
||||
file1.write(pdb_id)
|
||||
file1.close()
|
||||
# res1 = next(ch1_it)
|
||||
D[x, y] = residue_distance(res0, res1, max_d=MAX_D)
|
||||
y += 1
|
||||
y = 0
|
||||
|
||||
return D
|
||||
|
||||
|
||||
def main():
|
||||
pdb_directory = input("Name of directory containing pdb files: ")
|
||||
files = os.listdir(f"dscript/{pdb_directory}")
|
||||
if ".DS_Store" in files:
|
||||
files.remove(".DS_Store")
|
||||
for i in range(0, len(files)):
|
||||
files[i] = files[i][:4]
|
||||
|
||||
total = 0
|
||||
long = []
|
||||
|
||||
h5_name = input("Name of output H5 (for contact maps): ")
|
||||
hf_pair = h5py.File(f"data/{h5_name}.h5", "w")
|
||||
fasta_name = input("Name of output fasta (for sequences): ")
|
||||
tsv_name = input("Name of output tsv (for PPIs): ")
|
||||
|
||||
for pdb_id in files:
|
||||
total += 1
|
||||
print(f"Total: {total}")
|
||||
print(pdb_id)
|
||||
|
||||
pdb_file = f"dscript/{pdb_directory}/{pdb_id}.pdb"
|
||||
seqres_recs = list(SeqIO.parse(pdb_file, "pdb-seqres"))
|
||||
atoms_recs = list(SeqIO.parse(pdb_file, "pdb-atom"))
|
||||
seqs_long = seqres_recs[:2]
|
||||
seqs_short = atoms_recs[:2]
|
||||
|
||||
structure = PDB.PDBParser().get_structure(pdb_id, pdb_file)
|
||||
# chains = list(structure.get_chains())
|
||||
# if len(chains) > 2:
|
||||
# print(chains)
|
||||
# print(pdb_id)
|
||||
# continue
|
||||
chains = list(structure.get_chains())[:2]
|
||||
if (
|
||||
len(chains[0]) > 800
|
||||
or len(chains[1]) > 800
|
||||
or len(chains[0]) < 50
|
||||
or len(chains[1]) < 50
|
||||
):
|
||||
long.append(pdb_id)
|
||||
continue
|
||||
|
||||
make_fasta(pdb_id, seqs_long, fasta_name)
|
||||
make_tsv(pdb_id, chains, tsv_name)
|
||||
|
||||
chains_filtered = filter_chains(chains)
|
||||
|
||||
seq0_long = seqs_long[0].seq
|
||||
seq0_short = seqs_short[0].seq
|
||||
chain0 = chains_filtered[0]
|
||||
seq1_long = seqs_long[1].seq
|
||||
seq1_short = seqs_short[1].seq
|
||||
chain1 = chains_filtered[1]
|
||||
|
||||
# if chain lengths switched (chain0 = seq1 and chain1 = seq0)
|
||||
if (
|
||||
len(chain0) != len(seq0_long) or len(chain0) != len(seq0_short)
|
||||
) and (
|
||||
len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short)
|
||||
):
|
||||
temp = chain1.copy()
|
||||
chain1 = chain0
|
||||
chain0 = temp
|
||||
|
||||
align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
|
||||
# print(pairwise2.format_alignment(*align0[0]))
|
||||
align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
|
||||
# print(pairwise2.format_alignment(*align1[0]))
|
||||
seq0_long_f = align0[0].seqA
|
||||
seq0_short_f = align0[0].seqB
|
||||
seq1_long_f = align1[0].seqA
|
||||
seq1_short_f = align1[0].seqB
|
||||
|
||||
D = calc_dist_matrix(
|
||||
pdb_id,
|
||||
chain0,
|
||||
chain1,
|
||||
seq0_long,
|
||||
seq1_long,
|
||||
seq0_long_f,
|
||||
seq0_short_f,
|
||||
seq1_long_f,
|
||||
seq1_short_f,
|
||||
)
|
||||
df = pd.DataFrame(D, index=list(seq0_long), columns=list(seq1_long))
|
||||
# print(df)
|
||||
|
||||
# print(f'{pdb_id}:{str(chains[0].get_id())}x{pdb_id}:{str(chains[1].get_id())}')
|
||||
hf_pair.create_dataset(
|
||||
f"{pdb_id}:{str(chains[0].get_id())}x{pdb_id}:{str(chains[1].get_id())}",
|
||||
data=D,
|
||||
)
|
||||
|
||||
print(long)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user