From 2bfee227e6b69afb4212e77f9419919d40145f02 Mon Sep 17 00:00:00 2001
From: Lynn Tao <lynntao@31-35-227.wireless.csail.mit.edu>
Date: Wed, 29 Jun 2022 17:10:41 -0400
Subject: [PATCH] Initial commit from Lynn on contact supervision branch

---
 .gitignore                       |  4 ++
 dscript/1a0n.tsv                 |  1 +
 dscript/__init__.py              |  3 +-
 dscript/__main__.py              |  8 ++--
 dscript/alphabets.py             |  2 +
 dscript/commands/embed.py        |  2 +-
 dscript/commands/evaluate.py     |  2 +-
 dscript/commands/predict.py      |  5 ++-
 dscript/commands/train.py        |  8 ++--
 dscript/contact_map.py           | 76 ++++++++++++++++++++++++++++++++
 dscript/data_collection.py       | 22 +++++++++
 dscript/fasta.py                 |  2 +-
 dscript/language_model.py        |  4 +-
 dscript/models/contact.py        |  4 +-
 dscript/models/embedding.py      |  8 ++--
 dscript/models/interaction.py    |  2 +
 dscript/pretrained.py            |  4 +-
 dscript/tests/test_pretrained.py |  2 +-
 dscript/utils.py                 |  2 +-
 environment.yml                  |  1 -
 setup.py                         | 18 ++++----
 21 files changed, 148 insertions(+), 32 deletions(-)
 create mode 100644 dscript/1a0n.tsv
 create mode 100644 dscript/contact_map.py
 create mode 100644 dscript/data_collection.py

diff --git a/.gitignore b/.gitignore
index 0cb3b0b..1d2d254 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+dscript/proteins
+dscript/pdbs
+
 RUN_DSCRIPT.sh
 build/*
 scratch/*
@@ -8,6 +11,7 @@ dev*/*
 *.egg-info
 **/*.sav
 **/*_log.txt
+**.DS_Store
 **/*.h5
 **/.ipynb_checkpoints/**
 **/__pycache__/**
diff --git a/dscript/1a0n.tsv b/dscript/1a0n.tsv
new file mode 100644
index 0000000..4544743
--- /dev/null
+++ b/dscript/1a0n.tsv
@@ -0,0 +1 @@
+1A0N:A UNP:P27986 P85A_HUMAN	1A0N:B UNP:P06241 FYN_HUMAN	1
diff --git a/dscript/__init__.py b/dscript/__init__.py
index a2cf8c6..78fb8ad 100644
--- a/dscript/__init__.py
+++ b/dscript/__init__.py
@@ -1,4 +1,5 @@
-__version__ = "0.1.9"
+# goes to this version with relative imports
+__version__ = "0.1.9-Lynn"
 __citation__ = """Sledzieski, Singh, Cowen, Berger. Sequence-based prediction of protein-protein interactions: a structure-aware interpretable deep learning model. Cell Systems, 2021."""
 from . import alphabets, commands, fasta, language_model, models, pretrained
 
diff --git a/dscript/__main__.py b/dscript/__main__.py
index 761cac7..4f1420e 100644
--- a/dscript/__main__.py
+++ b/dscript/__main__.py
@@ -1,11 +1,12 @@
 """
 D-SCRIPT: Structure Aware PPI Prediction
 """
-import argparse
-import os
+import argparse  # command line run
+import os # manages operating system (system software)
 import sys
 
 
+# CLASS prints citation (from initial file)
 class CitationAction(argparse.Action):
     def __init__(self, option_strings, dest, **kwargs):
         super(CitationAction, self).__init__(option_strings, dest, **kwargs)
@@ -17,8 +18,9 @@ class CitationAction(argparse.Action):
         setattr(namespace, self.dest, values)
         sys.exit(0)
 
-
+# Compiles a summary of the dscript information: version, citation, commdands, modules
 def main():
+    # print("Hello World!")
     from . import __version__
 
     parser = argparse.ArgumentParser(description=__doc__)
diff --git a/dscript/alphabets.py b/dscript/alphabets.py
index 7eda3e2..58c41cc 100644
--- a/dscript/alphabets.py
+++ b/dscript/alphabets.py
@@ -3,6 +3,7 @@ from __future__ import print_function, division
 import numpy as np
 
 
+# encodes amino acid seqence into mathematical indices and vice versa
 class Alphabet:
     """
     From `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.
@@ -60,6 +61,7 @@ class Alphabet:
         string = self.chars[x]
         return string.tobytes()
 
+# ***
     def unpack(self, h, k):
         """ unpack integer h into array of this alphabet with length k """
         n = self.size
diff --git a/dscript/commands/embed.py b/dscript/commands/embed.py
index a6ed04e..fe9c89b 100644
--- a/dscript/commands/embed.py
+++ b/dscript/commands/embed.py
@@ -5,7 +5,7 @@ Generate new embeddings using pre-trained language model.
 import argparse
 from ..language_model import embed_from_fasta
 
-
+# *** make new embeddings --> this vs. embedding.py (models) vs language_model.py
 def add_args(parser):
     """
     Create parser for command line utility.
diff --git a/dscript/commands/evaluate.py b/dscript/commands/evaluate.py
index acc8422..aa71d16 100644
--- a/dscript/commands/evaluate.py
+++ b/dscript/commands/evaluate.py
@@ -44,7 +44,7 @@ def add_args(parser):
     )
     return parser
 
-
+# *** plot positive and negative predictions?? 
 def plot_eval_predictions(labels, predictions, path="figure"):
     """
     Plot histogram of positive and negative predictions, precision-recall curve, and receiver operating characteristic curve.
diff --git a/dscript/commands/predict.py b/dscript/commands/predict.py
index 6ecf84c..f0b246f 100644
--- a/dscript/commands/predict.py
+++ b/dscript/commands/predict.py
@@ -44,13 +44,16 @@ def add_args(parser):
     )
     return parser
 
-
+# *** simple prediction, interpretation?
 def main(args):
     """
     Run new prediction from arguments.
 
     :meta private:
     """
+    # print("Hello World")
+    # sys.exit(0)
+    
     if args.seqs is None and args.embeddings is None:
         print("One of --seqs or --embeddings is required.")
         sys.exit(0)
diff --git a/dscript/commands/train.py b/dscript/commands/train.py
index 5c5ea03..fa9b35e 100644
--- a/dscript/commands/train.py
+++ b/dscript/commands/train.py
@@ -182,7 +182,7 @@ def predict_cmap_interaction(model, n0, n1, tensors, use_cuda):
     :param use_cuda: Whether to use GPU
     :type use_cuda: bool
     """
-
+    
     b = len(n0)
 
     p_hat = []
@@ -200,7 +200,7 @@ def predict_cmap_interaction(model, n0, n1, tensors, use_cuda):
     c_map_mag = torch.stack(c_map_mag, 0)
     return c_map_mag, p_hat
 
-
+# *** list and list interactions? from cmap predict interactions
 def predict_interaction(model, n0, n1, tensors, use_cuda):
     """
     Predict whether a list of protein pairs will interact.
@@ -243,6 +243,7 @@ def interaction_grad(model, n0, n1, y, tensors, weight=0.35, use_cuda=True):
     :rtype: (torch.Tensor, int, torch.Tensor, int)
     """
 
+
     c_map_mag, p_hat = predict_cmap_interaction(
         model, n0, n1, tensors, use_cuda
     )
@@ -252,6 +253,7 @@ def interaction_grad(model, n0, n1, y, tensors, weight=0.35, use_cuda=True):
 
     p_hat = p_hat.float()
     bce_loss = F.binary_cross_entropy(p_hat.float(), y.float())
+    
     cmap_loss = torch.mean(c_map_mag)
     loss = (weight * bce_loss) + ((1 - weight) * cmap_loss)
     b = len(p_hat)
@@ -397,7 +399,7 @@ def train_model(args, output):
     for prot_name in tqdm(all_proteins):
         embeddings[prot_name] = torch.from_numpy(h5fi[prot_name][:, :])
 
-    if args.checkpoint is None:
+    if args.checkpoint is None: 
 
         # Create embedding model
         input_dim = args.input_dim
diff --git a/dscript/contact_map.py b/dscript/contact_map.py
new file mode 100644
index 0000000..b68366e
--- /dev/null
+++ b/dscript/contact_map.py
@@ -0,0 +1,76 @@
+import Bio.PDB
+import numpy
+import matplotlib.pyplot as plt
+import h5py
+from models.interaction import ModelInteraction
+# from language_model import embed_from_fasta
+
+fi = h5py.File("2022-06-27-06:26.predictions.cmaps.h5","r")
+ke = list(fi.keys())
+# print(fi[ke[0]])
+cmap = fi[ke[0]]
+plt.imshow(cmap)
+plt.show()
+
+# SOURCE CODE: https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/protein_contact_map/
+# pdb_code = "15c8"
+# pdb_filename = "dscript/pdbs/15c8.pdb"
+# pdb_code = "1xi4"
+# pdb_filename = "dscript/1xi4.pdb" #not the full cage!
+
+
+# def calc_residue_dist(residue_one, residue_two) :
+#     """Returns the C-alpha distance between two residues"""
+#     diff_vector  = residue_one["CA"].coord - residue_two["CA"].coord
+#     return numpy.sqrt(numpy.sum(diff_vector * diff_vector))
+
+
+# def calc_dist_matrix(chain_one, chain_two) :
+#     """Returns a matrix of C-alpha distances between two chains"""
+#     answer = numpy.zeros((len(chain_one), len(chain_two)), numpy.float)
+#     for row, residue_one in enumerate(chain_one) :
+#         for col, residue_two in enumerate(chain_two) :
+#             answer[row, col] = calc_residue_dist(residue_one, residue_two)
+#     return answer
+
+    
+# structure = Bio.PDB.PDBParser().get_structure(pdb_code, pdb_filename)
+# print(structure)
+
+# model = structure[0]
+# # print(list(model["H"]))
+
+# # chain1 = model["H"]
+# # for residue in chain1:
+# #     if residue.id[0] != ' ':
+# #         chain1.detach_child(residue.id)
+
+# # chain2 = model["L"]
+# # for residue in chain2:
+# #     if residue.id[0] != ' ':
+# #         chain2.detach_child(residue.id)
+    
+# dist_matrix = calc_dist_matrix(model["D"], model["M"])
+# contact_map = dist_matrix < 12.0
+
+# print(contact_map)
+# print(numpy.min(dist_matrix))
+# print(numpy.max(dist_matrix))
+
+# pylab.matshow(numpy.transpose(dist_matrix))
+# pylab.colorbar()
+# pylab.show()
+
+# pylab.imshow(numpy.transpose(contact_map))
+# pylab.show()
+
+
+# DSCRIPT CONTACT MAPPING
+# embed_from_fasta("dscript/proteins/1a0n.fasta", "dscript/1a0n_embed.h5", device=0, verbose=False)
+# h5fi = h5py.File("dscript/1a0n_embed.h5", "r")
+# z_a = h5fi["1A0N:A UNP:P27986 P85A_HUMAN"]
+# z_b = h5fi["1A0N:B UNP:P06241 FYN_HUMAN"]
+# self = ModelInteraction()
+# cm, ph = self.map_predict(z_a, z_b)
+# print(cm)
+# print(ph)
\ No newline at end of file
diff --git a/dscript/data_collection.py b/dscript/data_collection.py
new file mode 100644
index 0000000..cffc295
--- /dev/null
+++ b/dscript/data_collection.py
@@ -0,0 +1,22 @@
+# converting PDB CIF to sequence format
+import Bio
+from Bio import SeqIO
+import os
+
+entries = os.listdir('dscript/pdbs')
+
+# CONVERT ALL
+for i in range(0, len(entries)):
+    if entries[i][0] != ".":
+        with open(f'dscript/proteins/{entries[i][:-4]}.fasta', 'w') as f:
+            for record in SeqIO.parse(f"dscript/pdbs/{entries[i]}", "pdb-seqres"):
+                f.write(record.format("fasta-2line"))
+        f.close()
+
+# TEST A SINGLE
+# with open(f'dscript/proteins/15c8.fasta', 'w') as f:
+#         for record in SeqIO.parse(f"dscript/15c8.pdb", "pdb-seqres"):
+#             # print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
+#             # print(record.format("fasta"))
+#             f.write(record.format("fasta-2line"))
+# f.close()
\ No newline at end of file
diff --git a/dscript/fasta.py b/dscript/fasta.py
index 58f28ef..a2d8be6 100644
--- a/dscript/fasta.py
+++ b/dscript/fasta.py
@@ -1,6 +1,6 @@
 from __future__ import print_function, division
 
-
+# deal with formatting of fasta files (header id : protein sequence)
 def parse_stream(f, comment=b"#"):
 
     name = None
diff --git a/dscript/language_model.py b/dscript/language_model.py
index aa6686a..4d047c0 100644
--- a/dscript/language_model.py
+++ b/dscript/language_model.py
@@ -1,6 +1,6 @@
 import os
 import sys
-import subprocess as sp
+import subprocess as sp # handle subprocesses
 import random
 import torch
 import h5py
@@ -12,7 +12,7 @@ from .models.embedding import SkipLSTM
 from .utils import log
 from datetime import datetime
 
-
+# using pre-trained language model to embed protein sequences
 def lm_embed(sequence, use_cuda=False):
     """
     Embed a single sequence using pre-trained language model from `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.
diff --git a/dscript/models/contact.py b/dscript/models/contact.py
index 2c2ab54..f0f8dd9 100644
--- a/dscript/models/contact.py
+++ b/dscript/models/contact.py
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.functional as F
 
-
+# *** broadcast tensor?
 class FullyConnected(nn.Module):
     """
     Performs part 1 of Contact Prediction Module. Takes embeddings from Projection module and produces broadcast tensor.
@@ -54,7 +54,7 @@ class FullyConnected(nn.Module):
 
         return c
 
-
+# *** part 2 of contact model?
 class ContactCNN(nn.Module):
     """
     Residue Contact Prediction Module. Takes embeddings from Projection module and produces contact map, output of Contact module.
diff --git a/dscript/models/embedding.py b/dscript/models/embedding.py
index 88537db..fee226c 100644
--- a/dscript/models/embedding.py
+++ b/dscript/models/embedding.py
@@ -5,7 +5,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.utils.rnn import PackedSequence
 
-
+# language model --> embedding --> contact
 class IdentityEmbed(nn.Module):
     """
     Does not reduce the dimension of the language model embeddings, just passes them through to the contact model.
@@ -20,7 +20,7 @@ class IdentityEmbed(nn.Module):
         """
         return x
 
-
+# amino acids --> low dimensional projection
 class FullyConnectedEmbed(nn.Module):
     """
     Protein Projection Module. Takes embedding from language model and outputs low-dimensional interaction aware projection.
@@ -57,7 +57,7 @@ class FullyConnectedEmbed(nn.Module):
         t = self.drop(t)
         return t
 
-
+# uses (bidirectional) LSTM as part of embedding into vector process
 class LSTMEmbed(nn.Module):
     def __init__(self, nout, activation="ReLU", sparse=False, p=0.5):
         super(LSTMEmbed, self).__init__()
@@ -104,6 +104,7 @@ class SkipLSTM(nn.Module):
 
     Loaded with pre-trained weights in embedding function.
 
+#  *** 21, not 20? reference said something to explain this
     :param nin: Input dimension of amino acid one-hot [default: 21]
     :type nin: int
     :param nout: Output dimension of final layer [default: 100]
@@ -169,6 +170,7 @@ class SkipLSTM(nn.Module):
             one_hot.scatter_(2, x.unsqueeze(2), 1)
         return one_hot
 
+# *** powerpoint slide, what's going on here?
     def transform(self, x):
         """
         :param x: Input numeric amino acid encoding :math:`(N)`
diff --git a/dscript/models/interaction.py b/dscript/models/interaction.py
index 3b04f28..1c96978 100644
--- a/dscript/models/interaction.py
+++ b/dscript/models/interaction.py
@@ -62,6 +62,7 @@ class ModelInteraction(nn.Module):
         lambda_init=0,
         gamma_init=0,
     ):
+        # *** pooling operations? reference
         """
         Main D-SCRIPT model. Contains an embedding and contact model and offers access to those models. Computes pooling operations on contact map to generate interaction probability.
 
@@ -122,6 +123,7 @@ class ModelInteraction(nn.Module):
 
         self.gamma.data.clamp_(min=0)
 
+# *** explain 'Project down input language model embeddings into low dimension using projection module"?
     def embed(self, x):
         """
         Project down input language model embeddings into low dimension using projection module
diff --git a/dscript/pretrained.py b/dscript/pretrained.py
index 87d92a6..4fa3638 100644
--- a/dscript/pretrained.py
+++ b/dscript/pretrained.py
@@ -7,12 +7,12 @@ from .models.contact import ContactCNN
 from .models.embedding import FullyConnectedEmbed, SkipLSTM
 from .models.interaction import ModelInteraction
 
-
+# create an lstm and a human_v1 model 
 def build_lm_1(state_dict_path):
     """
     :meta private:
     """
-    model = SkipLSTM(21, 100, 1024, 3)
+    model = SkipLSTM(21, 100, 1024, 3) # creates bidirection lstm, read in reference
     state_dict = torch.load(state_dict_path)
     model.load_state_dict(state_dict)
     model.eval()
diff --git a/dscript/tests/test_pretrained.py b/dscript/tests/test_pretrained.py
index c17a915..b100cd0 100644
--- a/dscript/tests/test_pretrained.py
+++ b/dscript/tests/test_pretrained.py
@@ -15,7 +15,7 @@ MODEL_VERSIONS = [
 
 print(dscript.__version__)
 
-
+# checker method to test if pretrained models exist and are working
 def test_get_state_dict():
     for mv in MODEL_VERSIONS:
         sd = get_state_dict(mv, verbose=True)
diff --git a/dscript/utils.py b/dscript/utils.py
index 14496d1..3031d30 100644
--- a/dscript/utils.py
+++ b/dscript/utils.py
@@ -7,7 +7,7 @@ import numpy as np
 import pandas as pd
 import subprocess as sp
 import sys
-import gzip as gz
+import gzip as gz # handles unix/linux gzip files
 
 from datetime import datetime
 
diff --git a/environment.yml b/environment.yml
index fda1760..8f09dde 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,7 +7,6 @@ channels:
 
 dependencies:
   - pip=20.0
-  - cudatoolkit=10.2
   - python=3.7
   - pytorch=1.5
   - h5py
diff --git a/setup.py b/setup.py
index 6948dc9..1e56448 100644
--- a/setup.py
+++ b/setup.py
@@ -19,14 +19,14 @@ setup(
     },
     include_package_data=True,
     install_requires=[
-        "numpy",
-        "scipy",
-        "pandas",
-        "torch",
-        "matplotlib",
-        "seaborn",
-        "tqdm",
-        "scikit-learn",
-        "h5py",
+        "numpy", # mathematical operations/representations 
+        "scipy", # linear algebra
+        "pandas", # data analysis tools
+        "torch", #nn training/creation
+        "matplotlib", # visualization
+        "seaborn", # matplotlib 2.0
+        "tqdm", # progress bar
+        "scikit-learn", # machine learning
+        "h5py", # store lots of (binary) data
     ],
 )