made updates to processing.py, incorporating/debugging filtering and sequence alginment code

2026-06-04 15:04:24 +08:00 · 2022-09-05 06:45:33 -04:00
parent 091a8936c7
commit bde88ced0c
13 changed files with 281 additions and 370 deletions
--- a/dscript/processing/1_gzip_to_pdb.py
+++ b/dscript/processing/1_gzip_to_pdb.py
@@ -1,20 +0,0 @@
-"""
-    # UNFINISHED
-    # convert batches of downloaded pdb gzip files into parsable pdb protein files.
-"""
-import gzip
-import matplotlib.pyplot as plt
-import os
-import pandas as pd
-
-path = "dscript/pdbsNEW/batch-download-structures-1661040401355"
-zips = os.listdir(path)
-if ".DS_Store" in zips:
-    zips.remove(".DS_Store")
-
-for item in zips:
-    name = item[3:7]
-    op = open(f"dscript/pdbsNEW/{name}.pdb", "w")
-
-    with gzip.open(f"{path}/{item}", "rb") as fi:
-        op.write(fi.read().decode("utf-8"))
--- a/dscript/processing/2_pdb_to_cmap.py
+++ b/dscript/processing/2_pdb_to_cmap.py
@@ -1,123 +0,0 @@
-"""
-    # parsing PDB files into pairwise and binary contact maps
-    # REMOVE ANY CA ERROR PDBS
-"""
-from pickle import FALSE
-import Bio.PDB
-import numpy
-import matplotlib.pyplot as plt
-import h5py
-import os
-import pandas as pd
-
-# SOURCE CODE: https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/protein_contact_map/
-def has_CA(residue):
-    """
-    Return whether or not residue contains an alpha-carbon atom.
-
-    :param residue: pass in an amino acid
-    :type version: Residue object
-    :return: if amino acid residue has alpha-carbons
-    :rtype: boolean
-    """
-    return residue.has_id("CA")
-
-
-def filter_chains(chain_list):
-    chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
-    return chains_f
-
-
-def calc_residue_dist(residue_one, residue_two, limit):
-    """
-    Returns the C-alpha distance between two residues
-
-    :param residue_one: pass in an amino acid residue object
-    :type residue_one: Residue object
-    :param residue_two: pass in an amino acid residue object
-    :type residue_two: Residue object
-    :param limit: pass in a distance threshold
-    :type limit: float
-    :return: C-alpha distance between two residues
-    :rtype: float
-    """
-    diff_vector = residue_one["CA"].coord - residue_two["CA"].coord
-    distance = numpy.sqrt(numpy.sum(diff_vector * diff_vector))
-    if distance >= float(limit):
-        return float(limit)
-    return distance
-
-
-def calc_dist_matrix(chain_one, chain_two, limit):
-    """
-    Returns a contact map matrix of C-alpha distances between two chains
-
-    :param chain_one: pass in a chain 1
-    :type chain_one: Chain object
-    :param chain_two: pass in a chain 2
-    :type chain_two: Chain object
-    :param limit: pass in a distance threshold
-    :type limit: float
-    :return: a contact map matrix of C-alpha distances between two chains
-    :rtype: Numpy matrix
-    """
-    chains = filter_chains([chain_one, chain_two])
-    chain1 = chains[0]
-    chain2 = chains[1]
-
-    # chain2 = filter_chains(chain_two)
-    # for residue_one in chain_one:
-    #     if has_CA(residue_one):
-    #         chain1.append(residue_one)
-    # for residue_two in chain_two:
-    #     if has_CA(residue_two):
-    #         chain2.append(residue_two)
-
-    answer = numpy.zeros((len(chain1), len(chain2)), float)
-    print(answer.shape)
-
-    for i in range(0, len(chain1)):
-        for j in range(0, len(chain2)):
-            # print((chain1[i].get_resname(), chain2[j].get_resname()))
-            answer[i][j] = calc_residue_dist(chain1[i], chain2[j], limit)
-
-    return answer
-
-
-def main():
-    files = os.listdir("dscript/pdbsNEW")
-    if ".DS_Store" in files:
-        files.remove(".DS_Store")
-
-    for i in range(0, len(files)):
-        files[i] = files[i][:4]
-
-    hf_pair = h5py.File(f"data/paircmaps_train.h5", "w")
-    count = 0
-    for protein in files:
-        count += 1
-        print(protein)
-        print(count)
-        pdb_code = protein.upper()
-        pdb_filename = f"dscript/pdbsNEW/{protein}.pdb"
-        structure = Bio.PDB.PDBParser().get_structure(pdb_code, pdb_filename)
-        model = structure[0]
-
-        # GET PROTEIN CHAINS
-        chain = []
-        for chains in structure.get_chains():
-            chain.append(str(chains.get_id()))
-        chain = chain[:2]
-
-        dist_matrix = calc_dist_matrix(
-            model[chain[0]], model[chain[1]], 25.000
-        )
-
-        print(f"{pdb_code}:{chain[0]}x{pdb_code}:{chain[1]}")
-        hf_pair.create_dataset(
-            f"{pdb_code}:{chain[0]}x{pdb_code}:{chain[1]}", data=dist_matrix
-        )
-
-
-if __name__ == "__main__":
-    main()
--- a/dscript/processing/3_pdb_to_fasta.py
+++ b/dscript/processing/3_pdb_to_fasta.py
@@ -1,31 +0,0 @@
-"""
-    # converting protein PDB files into fasta sequence format
-"""
-import Bio
-from Bio import SeqIO
-import os
-
-entries = os.listdir("dscript/pdbsNEW")
-if ".DS_Store" in entries:
-    entries.remove(".DS_Store")
-# print(entries)
-
-# CONVERT ALL TO FASTAS
-for i in range(0, len(entries)):
-    # print(entries[i])
-    if entries[i][0] != ".":
-        with open(f"dscript/fastasNEW/{entries[i][:-4]}.fasta", "w") as f:
-            # pdb-atom
-            for record in SeqIO.parse(
-                f"dscript/pdbsNEW/{entries[i]}", "pdb-seqres"
-            ):
-                f.write(record.format("fasta-2line"))
-        f.close()
-
-# TEST A SINGLE
-# with open(f'dscript/proteins/15c8.fasta', 'w') as f:
-#         for record in SeqIO.parse(f"dscript/15c8.pdb", "pdb-seqres"):
-#             # print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
-#             # print(record.format("fasta"))
-#             f.write(record.format("fasta-2line"))
-# f.close()
--- a/dscript/processing/4_match_cmap_fasta.py
+++ b/dscript/processing/4_match_cmap_fasta.py
@@ -1,50 +0,0 @@
-from pickle import FALSE
-import Bio.PDB
-import numpy
-import matplotlib.pyplot as plt
-import h5py
-import random
-import os
-import pandas as pd
-import torch
-
-fi = h5py.File("data/paircmaps_train.h5", "r")
-ke = list(fi.keys())
-# print(ke[0][:4].lower())
-print(fi[ke[0]].shape)
-
-fastas = os.listdir("dscript/fastasNEW")
-# fastas.remove(".DS_Store")
-# print(len(fastas))
-
-deletes = []
-
-for item in ke:
-    # print(fi[item].shape)
-    if os.path.exists(f"dscript/fastasNEW/{item[:4].lower()}.fasta"):
-        with open(
-            f"dscript/fastasNEW/{item[:4].lower()}.fasta", "r"
-        ) as in_file:
-            l = in_file.read().splitlines()
-            seq1 = len(l[1])
-            seq2 = len(l[3])
-            print((seq1, seq2))
-
-        if (seq1 == fi[item].shape[0] and seq2 == fi[item].shape[1]) or (
-            seq2 == fi[item].shape[0] and seq1 == fi[item].shape[1]
-        ):
-            # print(f"{item} MATCHES")
-            None
-        else:
-            # print(f"{item} DELETE")
-            deletes.append(item[:4].lower())
-        # print()
-
-print(deletes)
-# deletes = ['1fmh', '1dzb', '1dnw', '1dnu', '1fxv', '1d5l', '1d7w', '1dtd', '1e8n']
-# for item in deletes:
-#     if os.path.exists(f"dscript/pdbs/{item}.pdb"):
-#         os.remove(f"dscript/pdbs/{item}.pdb")
-# for item in deletes:
-#     if os.path.exists(f"dscript/fastas/{item}.fasta"):
-#         os.remove(f"dscript/fastas/{item}.fasta")
--- a/dscript/processing/5_combine_fastas.py
+++ b/dscript/processing/5_combine_fastas.py
@@ -1,32 +0,0 @@
-"""    
-    # CLEAR CMAP.FASTA BEFORE RUNNING THIS FILE
-    # making fasta ids simpler to match the contact maps, putting them all together in data/seqs as 1 fasta file
-"""
-import Bio.PDB
-import numpy as np
-import matplotlib.pyplot as plt
-import h5py
-import random
-import os
-import pandas as pd
-import torch
-import csv
-
-files = os.listdir("dscript/fastasTEST")
-if ".DS_Store" in files:    
-    files.remove(".DS_Store")
-# print(files)
-
-for item in files:
-    with open('data/seqs/cmap.fasta', 'a') as out_file:
-        with open(f'dscript/fastasTEST/{item}','r') as in_file:
-            # print(item)
-            l = in_file.read().splitlines()
-            # print(l)
-            for i in range(0, len(l)):
-                if i%2 == 0:
-                    l[i] = l[i][:7]
-            # print(l)
-            for line in l:
-                out_file.write(line)
-                out_file.write('\n')
--- a/dscript/processing/6_cmap_make_tsv.py
+++ b/dscript/processing/6_cmap_make_tsv.py
@@ -1,28 +0,0 @@
-"""
-    creating a training cmap tsv file with all the contact map protein pairs, all interaction = 1
-"""
-import Bio.PDB
-import numpy as np
-import matplotlib.pyplot as plt
-import h5py
-import random
-import os
-import pandas as pd
-import torch
-import csv
-
-# files = os.listdir("dscript/bincmaps")
-# files.remove(".DS_Store")
-# print(files)
-fi = h5py.File("data/bincmaps_train.h5","r")
-ke = list(fi.keys())
-
-with open('data/pairs/cmap_train.tsv', 'wt') as out_file:
-    tsv_writer = csv.writer(out_file, delimiter='\t')
-    for item in ke:
-        # print(item)
-        prot1 = item[:item.index(":")+2]
-        prot2 = item[item.index(":")+3:]
-        # print((prot1, prot2))
-        tsv_writer.writerow([prot1, prot2, '1'])
-    
--- a/dscript/processing/check_chain.py
+++ b/dscript/processing/check_chain.py
@@ -1,22 +0,0 @@
-import os
-
-fastas = os.listdir("dscript/fastas")
-if ".DS_Store" in fastas:
-    fastas.remove(".DS_Store")
-# print(len(fastas))
-
-deletes = []
-
-for item in fastas:
-    # print(fi[item].shape)
-    with open(f"dscript/fastas/{item}", "r") as in_file:
-        l = in_file.read().splitlines()
-        if len(l) != 4:
-            deletes.append(item[:4])
-        if len(l) == 4:
-            if len(l[1]) >= 694 or len(l[3]) >= 694:
-                # print(item)
-                deletes.append(item[:4])
-            if len(l[1]) <= 50 or len(l[3]) <= 50:
-                deletes.append(item[:4])
-print(deletes)
--- a/dscript/processing/delete.py
+++ b/dscript/processing/delete.py
@@ -1,13 +1,14 @@
 import os
- 
-# deletes = ["1B6H",  "2Z59", "2N73", "6W0V", "5Z2W", "6S3W", "7JOD", "7N1N", "7PKU", "1B6H", "5JYQ", "6GU0", "7JOE", "6E4D", "2Z59", "7B2B", "5U4K", "6KK3", "2N73", "7AA4", "6W9K", "7JQV", "6W9M", "7JOW"]
-# new = []
-# for item in deletes:
-#     new.append(f"{item.lower()}.pdb")
-# print(new)

-# deletes = ['1cf4', '1irs', '1e0a', '1j4p', '1j4k', '1sdx', '2m6i', '1k2n', '3ugw', '1shc', '1z3m', '1b3g', '5H7P', '1l0a', '2a3i', '4ZXL', '1dow', '5D7E', '1oo4', '2n8j', '5J7J', '4P6W', '4fjp', '2r02', '1z3p', '5GXW', '1f3r', '1htr', '1b2h', '4R8T', '2rod', '2z7f', '1ym0', '1k2m', '2rs9', '1z3l', '1b3f', '1zbc', '1hag', '4XOE', '4SRN', '2yka', '2roc', '1ees', '1d5s', '1d5h', '2roz', '1n7t', '2r05', '1cqg', '1j4q', '2n2h', '4PID', '2rnw', '1f1w', '2kdc', '1fu5', '2aos', '1wlp', '1qg1', '1tce', '1ppe', '1fev', '3n3x', '1smh', '4WB8', '1t01', '2rmx', '2r03', '1d7q', '1j4l', '1b8q', '1zbk', '2vu8', '1jsp', '2pon', '2rny', '2my3', '1ddm', '3llz', '2yu7', '2n0y', '5APR', '2oq1', '2ror', '2qos', '1k3n', '3n2d', '1uoo', '1o0p', '2a0t', '4Q5U', '2uzv', '1l0n', '1uop', '1zbv', '1b3h', '2xjy', '1k3q', '1ssc', '3u8q', '1d5d', '2rqw', '2xs4', '1r8u', '1d4t', '5duo', '2uzt', '1xr0', '4TT2', '1opi', '1qmb', '1b32', '2my2', '1jm4', '1fhr', '2phg', '2rnx', '2m6x', '3h8k', '1zl1', '1wkw', '1fpr', '3m7u', '1b9j', '2uzu', '1cqh', '2n9p', '1t37', '1zbw', '1b4z', '2uzw', '1d5e', '1tg4', '4W8P']
-deletes = ['7KLR', '4bs3', '4lnp', '6MHA', '1kbh', '1rgj', '6KBV', '6ES5', '2ofq', '5XBO', '2n2v', '1om2', '1i8h', '5Y20', '6ES7', '1hoy', '2ain', '6KBO', '2mv7', '7KN0', '6MGN', '5X9X', '1q69', '6B7G', '1b18', '2rqg', '7OJ9', '5L85', '2n4q', '1l2z', '1jh4', '4Q6H', '2rpn', '5D5E', '1s5q', '5MIZ', '7E0B', '1ncp', '7JQ8', '1kuz', '1hv2', '4NIB', '6X4X', '5J6Z', '1u0i', '1eci', '2rn5', '1bph', '6XMN', '6ES6', '1iog', '1q68', '1s7p', '1oqp', '6F55', '3t2a', '1kmf', '1pnb', '1k4u', '2n2w', '1sdb', '1hls', '2zpp', '1bxp', '1xgl', '1s5r', '6K59', '1q5w', '6QXZ', '1bzv', '1cph', '1b19', '1u38', '2rsy', '2n9y', '2rm0', '1zsg', '1jmq', '6BGH', '4i5y', '6VES', '4iuz', '1sse', '1t1k', '1uel', '1hiq', '1t1p', '6KH8', '1b2g', '1qh2', '1i5h', '5LIS', '3u4n', '1sf1', '1b2e', '4m5s', '1his', '1b17', '2rqh', '5YC4', '1q0w', '2a4j', '2mvc', '1mhi', '5VIZ', '1hui', '1vkt', '1otr', '1i8g', '4YYP', '1sb0', '1bon', '6KHA', '1bh9', '2mwn', '1pmx', '1tkq', '1io6', '1zgx', '6Q00', '2rly', '6Q8Q', '7QDW', '2n55', '5D53', '2pq4', '5EN9', '1b2d', '1g1e', '9INS', '5GWM', '1sjt', '4ihn', '5ENA', '6C0A', '1jbd', '5D54', '4i5z', '7A2Y', '7CWH', '3i40', '1kdx', '2os6', '7JYN', '1pd7', '1mv0', '5MHD', '1jco', '6KH9', '1b2f', '1t1q', '1k3m', '6K5R', '1bom', '1d5g', '7NHU', '1hit', '1jgn', '5YC3', '2mur', '5D52', '6S3F', '1kup', '1mhj', '2mzd', '2pku', '6NSX', '7F7X', '2mvd', '2wfu', '1dph', '2n2x', '4iyd', '2pdz', '1qfn', '7AC4', '4iyf', '5I22', '3i3z', '5MWQ', '6K5T', '2rol', '7EDP', '1ioh', '1bh8']
+string = "3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x"
+
+deletes = []
+for i in range(0, int(len(string) / 4)):
+    i = i * 4
+    deletes.append(string[i : i + 4])
+
+print(deletes)
+
 print(len(deletes))
 for item in deletes:
    if os.path.exists(f"dscript/pdbs/{item}.pdb"):
@@ -15,4 +16,3 @@ for item in deletes:
 for item in deletes:
    if os.path.exists(f"dscript/fastas/{item}.fasta"):
        os.remove(f"dscript/fastas/{item}.fasta")
-
--- a/dscript/processing/display_files.py
+++ b/dscript/processing/display_files.py
@@ -1,23 +0,0 @@
-import os
-
-files = os.listdir("dscript/pdbsTEST")
-# fastas = os.listdir("dscript/fastasNEW")
-# fastas.remove(".DS_Store")
-if ".DS_Store" in files:
-    files.remove(".DS_Store")
-# print(len(files))
-# print(files)
-
-new_files = []
-for item in files:
-    new_files.append(item[:4].lower())
-# print((new_files))
-
-string = "'1r8o', '1nsg', '1pc8', '1ktp', '1puu', '1mlb', '1p6a', '1oo9', '1plg', '1jgv', '1jql', '1kfu', '1nld', '1q9k', '1kem', '1kel', '1jlt', '1qbl', '1mim', '1pqz', '1nlb', '1nl4', '1phn', '1jnn', '1qav', '1qbm', '1jnl', '1nme', '1op9', '1jtt', '1ppf', '1n7m', '1ncw', '1mh2', '1nbv', '1rf8', '1jv5', '1opg', '1n94', '1p69', '1p4q', '1kn4', '1oz7', '1on7', '1kcu', '1ngy', '1kn2', '1kcv', '1ngz', '1pum'"
-new_string = ""
-for item in string:
-    # print(item)
-    if item != "'" and item != " ":
-        new_string += item[:4]
-
-print(new_string)
--- a/dscript/processing/pdb_errors.txt
+++ b/dscript/processing/pdb_errors.txt
@@ -0,0 +1 @@
+3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x
--- a/dscript/processing/pdb_ids.txt
+++ b/dscript/processing/pdb_ids.txt
--- a/dscript/processing/align_and_parse.py
+++ b/dscript/processing/align_and_parse.py
@@ -67,35 +67,49 @@ def main():
        pdb_file = f"dscript/pdbsNEW/{pdb_id}.pdb"

        seqres_recs = list(SeqIO.parse(pdb_file, "pdb-seqres"))
-        # print(seqres_recs)
        atoms_recs = list(SeqIO.parse(pdb_file, "pdb-atom"))
-        # print(atoms_recs)

        seqs_long = seqres_recs[:2]
-        # print(seqs_long)
        seqs_short = atoms_recs[:2]
-        # print(seqs_short)

        structure = PDB.PDBParser().get_structure(pdb_id, pdb_file)
+        chains = list(structure.get_chains())
+        # if len(chains) > 2:
+        #     print(pdb_id)
+        #     continue
        chains = list(structure.get_chains())[:2]
-        # print(chains)
+
+        with open(f"dscript/proteins.fasta", "a") as f:
+            for record in seqs_long:
+                f.write(record.format("fasta-2line"))
+
        chains_filtered = filter_chains(chains)
-        # print(chains_filtered)
+        # which sequence (seqres or atom) does chain match to? seqres?
+        # the chain length should be equal to the pdb-atom sequence length, right? len(chain) = len(seq_short)
+        # but for some reason, in a few cases chains aren't the same length as either of the generated pdb-seqres or pdb-atom sequences,
+        # which can throw a stopiteration error
+
+        # seqres = sequence portion near start of pdb
+        # atom = sequence portion in midway of pdb using distance coordinates and atomic information, can skip residue numbers leadering to X's
+        # chain = same as atom sequence, but only amino acid residues and not counting skips, so may be shorter than seqres and/or atom

        seq0_long = seqs_long[0].seq
-        # print(len(seq0_long))
        seq0_short = seqs_short[0].seq
-        # print(len(seq0_short))
        chain0 = chains_filtered[0]
-        # print((chain0))
-        # print(len(chain0))

        seq1_long = seqs_long[1].seq
-        # print(len(seq1_long))
        seq1_short = seqs_short[1].seq
-        # print(len(seq1_short))
        chain1 = chains_filtered[1]
-        # print((chain1))
+
+        # print(seq0_long)
+        # print(seq0_short)
+        # print([len(seq0_long), len(seq0_short)])
+        # print(seq1_long)
+        # print(seq1_short)
+        # print([len(seq1_long), len(seq1_short)])
+
+        # how to get sequence from chain?
+        # print(len(chain0))
        # print(len(chain1))

        if len(seq1_short) > len(seq1_long) or len(seq0_short) > len(
@@ -103,18 +117,33 @@ def main():
        ):
            count += 1

+            # print(seq0_short)
+            # print(seq0_long)
+            # print([len(seq0_short), len(seq0_long)])
+            # print(seq1_short)
+            # print(seq1_long)
+            # print([len(seq1_short), len(seq1_long)])
+
            align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
-            print(pairwise2.format_alignment(*align0[0]))
+            # print(pairwise2.format_alignment(*align0[0]))

            align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
-            print(pairwise2.format_alignment(*align1[0]))
+            # print(pairwise2.format_alignment(*align1[0]))

-        if len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short):
+        if (
+            len(chain0) != len(seq0_long) or len(chain0) != len(seq0_short)
+        ) and (
+            len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short)
+        ):
+            print("switched")
            temp = chain1.copy()
            chain1 = chain0
            chain0 = temp
-            print(len(chain0))
-            print(len(chain1))
+            # print(len(chain0))
+            # print(len(chain1))
+
+        if len(chain0) != len(seq0_long) or len(chain1) != len(seq1_long):
+            None

        align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
        # print(pairwise2.format_alignment(*align0[0]))
@@ -150,22 +179,40 @@ def main():
        # print(len(chain0))
        # print(len(chain1))

-        # ch0_it = iter(chain0)
-        # for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
-        #     print((i, (res0L, res0S)))
-        #     if res0S == "-" or res0L == "-":
-        #         continue
-        #     else:
-        #         res0 = next(ch0_it)
+        ch0_it = iter(chain0)
+        x = -1
+        y = 0
+        for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
+            if res0S == "-":
+                x += 1
+                continue
+            if res0L == "-":
+                continue
+            else:
+                # print((x, (res0L, res0S)))
+                x += 1
+                res0 = next(ch0_it)

-        #     ch1_it = iter(chain1)
-        #     for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
+            # print([x,y])
+            ch1_it = iter(chain1)
+            for (j, (res1L, res1S)) in enumerate(
+                zip(seq1_long_f, seq1_short_f)
+            ):
+                if res1S == "-":
+                    y += 1
+                    continue
+                if res1L == "-":
+                    continue
+                else:
+                    res1 = next(ch1_it)
+                    # print((y, (res1L, res1S)))
+                    # print([x,y])
+                    D[x, y] = residue_distance(res0, res1, max_d=MAX_D)
+                    y += 1

-        #         if res1S == "-" or res1L == "-":
-        #             continue
-        #         else:
-        #             res1 = next(ch1_it)
+            y = 0

+        #         print([i,j])
        #         D[i,j] = residue_distance(res0, res1, max_d = MAX_D)

        # D = calc_dist_matrix(chain0, chain1, seq0_long, seq0_short, seq1_long, seq1_short)
--- a/dscript/processing/process_v2.py
+++ b/dscript/processing/process_v2.py
@@ -0,0 +1,192 @@
+from Bio import SeqIO
+from Bio import PDB
+from Bio import pairwise2
+import h5py
+import numpy as np
+import os
+import pandas as pd
+import csv
+
+MAX_D = 25
+
+
+def filter_chains(chain_list):
+    chains_f = [[r for r in c if r.has_id("CA")] for c in chain_list]
+    return chains_f
+
+
+def residue_distance(res0, res1, max_d=25.0):
+    diff_vector = res0["CA"].coord - res1["CA"].coord
+    distance = np.sqrt(np.sum(diff_vector ** 2))
+    return min(distance, max_d)
+
+
+def make_tsv(pdb_id, chains, name):
+    with open(f"data/{name}.tsv", "a") as out_file:
+        tsv_writer = csv.writer(out_file, delimiter="\t")
+        prot1 = f"{pdb_id.upper()}:{str(chains[0].get_id()).upper()}"
+        prot2 = f"{pdb_id.upper()}:{str(chains[1].get_id()).upper()}"
+        print((prot1, prot2))
+        tsv_writer.writerow([prot1, prot2, "1"])
+
+
+def make_fasta(pdb_id, seqs_long, fasta_name):
+    with open(f"data/{fasta_name}.fasta", "a") as f:
+        for record in seqs_long:
+            f.write(record.format("fasta-2line"))
+            # None
+
+
+def calc_dist_matrix(
+    pdb_id,
+    chain0,
+    chain1,
+    seq0_long,
+    seq1_long,
+    seq0_long_f,
+    seq0_short_f,
+    seq1_long_f,
+    seq1_short_f,
+):
+    D = np.zeros((len(seq0_long), len(seq1_long)))
+    D = D - 1
+    ch0_it = iter(chain0)
+    x = -1
+    y = 0
+    for (i, (res0L, res0S)) in enumerate(zip(seq0_long_f, seq0_short_f)):
+        if res0S == "-":
+            x += 1
+            continue
+        if res0L == "-":
+            continue
+        else:
+            x += 1
+            try:
+                res0 = next(ch0_it)
+            except StopIteration:
+                file1 = open("dscript/processing/pdb_errors.txt", "a")
+                file1.write(pdb_id)
+                file1.close()
+            # res0 = next(ch0_it)
+
+        ch1_it = iter(chain1)
+        for (j, (res1L, res1S)) in enumerate(zip(seq1_long_f, seq1_short_f)):
+            if res1S == "-":
+                y += 1
+                continue
+            if res1L == "-":
+                continue
+            else:
+                try:
+                    res1 = next(ch1_it)
+                except StopIteration:
+                    file1 = open("dscript/processing/pdb_errors.txt", "a")
+                    file1.write(pdb_id)
+                    file1.close()
+                # res1 = next(ch1_it)
+                D[x, y] = residue_distance(res0, res1, max_d=MAX_D)
+                y += 1
+        y = 0
+
+    return D
+
+
+def main():
+    pdb_directory = input("Name of directory containing pdb files: ")
+    files = os.listdir(f"dscript/{pdb_directory}")
+    if ".DS_Store" in files:
+        files.remove(".DS_Store")
+    for i in range(0, len(files)):
+        files[i] = files[i][:4]
+
+    total = 0
+    long = []
+
+    h5_name = input("Name of output H5 (for contact maps): ")
+    hf_pair = h5py.File(f"data/{h5_name}.h5", "w")
+    fasta_name = input("Name of output fasta (for sequences): ")
+    tsv_name = input("Name of output tsv (for PPIs): ")
+
+    for pdb_id in files:
+        total += 1
+        print(f"Total: {total}")
+        print(pdb_id)
+
+        pdb_file = f"dscript/{pdb_directory}/{pdb_id}.pdb"
+        seqres_recs = list(SeqIO.parse(pdb_file, "pdb-seqres"))
+        atoms_recs = list(SeqIO.parse(pdb_file, "pdb-atom"))
+        seqs_long = seqres_recs[:2]
+        seqs_short = atoms_recs[:2]
+
+        structure = PDB.PDBParser().get_structure(pdb_id, pdb_file)
+        # chains = list(structure.get_chains())
+        # if len(chains) > 2:
+        #     print(chains)
+        #     print(pdb_id)
+        #     continue
+        chains = list(structure.get_chains())[:2]
+        if (
+            len(chains[0]) > 800
+            or len(chains[1]) > 800
+            or len(chains[0]) < 50
+            or len(chains[1]) < 50
+        ):
+            long.append(pdb_id)
+            continue
+
+        make_fasta(pdb_id, seqs_long, fasta_name)
+        make_tsv(pdb_id, chains, tsv_name)
+
+        chains_filtered = filter_chains(chains)
+
+        seq0_long = seqs_long[0].seq
+        seq0_short = seqs_short[0].seq
+        chain0 = chains_filtered[0]
+        seq1_long = seqs_long[1].seq
+        seq1_short = seqs_short[1].seq
+        chain1 = chains_filtered[1]
+
+        # if chain lengths switched (chain0 = seq1 and chain1 = seq0)
+        if (
+            len(chain0) != len(seq0_long) or len(chain0) != len(seq0_short)
+        ) and (
+            len(chain0) == len(seq1_long) or len(chain0) == len(seq1_short)
+        ):
+            temp = chain1.copy()
+            chain1 = chain0
+            chain0 = temp
+
+        align0 = pairwise2.align.globalxx(seq0_long, seq0_short)
+        # print(pairwise2.format_alignment(*align0[0]))
+        align1 = pairwise2.align.globalxx(seq1_long, seq1_short)
+        # print(pairwise2.format_alignment(*align1[0]))
+        seq0_long_f = align0[0].seqA
+        seq0_short_f = align0[0].seqB
+        seq1_long_f = align1[0].seqA
+        seq1_short_f = align1[0].seqB
+
+        D = calc_dist_matrix(
+            pdb_id,
+            chain0,
+            chain1,
+            seq0_long,
+            seq1_long,
+            seq0_long_f,
+            seq0_short_f,
+            seq1_long_f,
+            seq1_short_f,
+        )
+        df = pd.DataFrame(D, index=list(seq0_long), columns=list(seq1_long))
+        # print(df)
+
+        # print(f'{pdb_id}:{str(chains[0].get_id())}x{pdb_id}:{str(chains[1].get_id())}')
+        hf_pair.create_dataset(
+            f"{pdb_id}:{str(chains[0].get_id())}x{pdb_id}:{str(chains[1].get_id())}",
+            data=D,
+        )
+
+    print(long)
+
+
+if __name__ == "__main__":
+    main()
				`@@ -0,0 +1 @@`
				`3srb1abt6Z8M1aot1b2a5D6J1a931b2b1a0n1kfu1ubo4m1j2omx1ubm1ubl1ai61ubh1ai71ai51ubk1ubj1ai41ajn1a2x1wul4U9H1ubr4U9I6XE31wuj1wuk1ajq2omw1wui1ubu1ubt1wuh1ajp1h2r3fju1abj4WKT1a7f1a9x`