From 2bfee227e6b69afb4212e77f9419919d40145f02 Mon Sep 17 00:00:00 2001 From: Lynn Tao Date: Wed, 29 Jun 2022 17:10:41 -0400 Subject: [PATCH] Initial commit from Lynn on contact supervision branch --- .gitignore | 4 ++ dscript/1a0n.tsv | 1 + dscript/__init__.py | 3 +- dscript/__main__.py | 8 ++-- dscript/alphabets.py | 2 + dscript/commands/embed.py | 2 +- dscript/commands/evaluate.py | 2 +- dscript/commands/predict.py | 5 ++- dscript/commands/train.py | 8 ++-- dscript/contact_map.py | 76 ++++++++++++++++++++++++++++++++ dscript/data_collection.py | 22 +++++++++ dscript/fasta.py | 2 +- dscript/language_model.py | 4 +- dscript/models/contact.py | 4 +- dscript/models/embedding.py | 8 ++-- dscript/models/interaction.py | 2 + dscript/pretrained.py | 4 +- dscript/tests/test_pretrained.py | 2 +- dscript/utils.py | 2 +- environment.yml | 1 - setup.py | 18 ++++---- 21 files changed, 148 insertions(+), 32 deletions(-) create mode 100644 dscript/1a0n.tsv create mode 100644 dscript/contact_map.py create mode 100644 dscript/data_collection.py diff --git a/.gitignore b/.gitignore index 0cb3b0b..1d2d254 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +dscript/proteins +dscript/pdbs + RUN_DSCRIPT.sh build/* scratch/* @@ -8,6 +11,7 @@ dev*/* *.egg-info **/*.sav **/*_log.txt +**.DS_Store **/*.h5 **/.ipynb_checkpoints/** **/__pycache__/** diff --git a/dscript/1a0n.tsv b/dscript/1a0n.tsv new file mode 100644 index 0000000..4544743 --- /dev/null +++ b/dscript/1a0n.tsv @@ -0,0 +1 @@ +1A0N:A UNP:P27986 P85A_HUMAN 1A0N:B UNP:P06241 FYN_HUMAN 1 diff --git a/dscript/__init__.py b/dscript/__init__.py index a2cf8c6..78fb8ad 100644 --- a/dscript/__init__.py +++ b/dscript/__init__.py @@ -1,4 +1,5 @@ -__version__ = "0.1.9" +# goes to this version with relative imports +__version__ = "0.1.9-Lynn" __citation__ = """Sledzieski, Singh, Cowen, Berger. Sequence-based prediction of protein-protein interactions: a structure-aware interpretable deep learning model. Cell Systems, 2021.""" from . import alphabets, commands, fasta, language_model, models, pretrained diff --git a/dscript/__main__.py b/dscript/__main__.py index 761cac7..4f1420e 100644 --- a/dscript/__main__.py +++ b/dscript/__main__.py @@ -1,11 +1,12 @@ """ D-SCRIPT: Structure Aware PPI Prediction """ -import argparse -import os +import argparse # command line run +import os # manages operating system (system software) import sys +# CLASS prints citation (from initial file) class CitationAction(argparse.Action): def __init__(self, option_strings, dest, **kwargs): super(CitationAction, self).__init__(option_strings, dest, **kwargs) @@ -17,8 +18,9 @@ class CitationAction(argparse.Action): setattr(namespace, self.dest, values) sys.exit(0) - +# Compiles a summary of the dscript information: version, citation, commdands, modules def main(): + # print("Hello World!") from . import __version__ parser = argparse.ArgumentParser(description=__doc__) diff --git a/dscript/alphabets.py b/dscript/alphabets.py index 7eda3e2..58c41cc 100644 --- a/dscript/alphabets.py +++ b/dscript/alphabets.py @@ -3,6 +3,7 @@ from __future__ import print_function, division import numpy as np +# encodes amino acid seqence into mathematical indices and vice versa class Alphabet: """ From `Bepler & Berger `_. @@ -60,6 +61,7 @@ class Alphabet: string = self.chars[x] return string.tobytes() +# *** def unpack(self, h, k): """ unpack integer h into array of this alphabet with length k """ n = self.size diff --git a/dscript/commands/embed.py b/dscript/commands/embed.py index a6ed04e..fe9c89b 100644 --- a/dscript/commands/embed.py +++ b/dscript/commands/embed.py @@ -5,7 +5,7 @@ Generate new embeddings using pre-trained language model. import argparse from ..language_model import embed_from_fasta - +# *** make new embeddings --> this vs. embedding.py (models) vs language_model.py def add_args(parser): """ Create parser for command line utility. diff --git a/dscript/commands/evaluate.py b/dscript/commands/evaluate.py index acc8422..aa71d16 100644 --- a/dscript/commands/evaluate.py +++ b/dscript/commands/evaluate.py @@ -44,7 +44,7 @@ def add_args(parser): ) return parser - +# *** plot positive and negative predictions?? def plot_eval_predictions(labels, predictions, path="figure"): """ Plot histogram of positive and negative predictions, precision-recall curve, and receiver operating characteristic curve. diff --git a/dscript/commands/predict.py b/dscript/commands/predict.py index 6ecf84c..f0b246f 100644 --- a/dscript/commands/predict.py +++ b/dscript/commands/predict.py @@ -44,13 +44,16 @@ def add_args(parser): ) return parser - +# *** simple prediction, interpretation? def main(args): """ Run new prediction from arguments. :meta private: """ + # print("Hello World") + # sys.exit(0) + if args.seqs is None and args.embeddings is None: print("One of --seqs or --embeddings is required.") sys.exit(0) diff --git a/dscript/commands/train.py b/dscript/commands/train.py index 5c5ea03..fa9b35e 100644 --- a/dscript/commands/train.py +++ b/dscript/commands/train.py @@ -182,7 +182,7 @@ def predict_cmap_interaction(model, n0, n1, tensors, use_cuda): :param use_cuda: Whether to use GPU :type use_cuda: bool """ - + b = len(n0) p_hat = [] @@ -200,7 +200,7 @@ def predict_cmap_interaction(model, n0, n1, tensors, use_cuda): c_map_mag = torch.stack(c_map_mag, 0) return c_map_mag, p_hat - +# *** list and list interactions? from cmap predict interactions def predict_interaction(model, n0, n1, tensors, use_cuda): """ Predict whether a list of protein pairs will interact. @@ -243,6 +243,7 @@ def interaction_grad(model, n0, n1, y, tensors, weight=0.35, use_cuda=True): :rtype: (torch.Tensor, int, torch.Tensor, int) """ + c_map_mag, p_hat = predict_cmap_interaction( model, n0, n1, tensors, use_cuda ) @@ -252,6 +253,7 @@ def interaction_grad(model, n0, n1, y, tensors, weight=0.35, use_cuda=True): p_hat = p_hat.float() bce_loss = F.binary_cross_entropy(p_hat.float(), y.float()) + cmap_loss = torch.mean(c_map_mag) loss = (weight * bce_loss) + ((1 - weight) * cmap_loss) b = len(p_hat) @@ -397,7 +399,7 @@ def train_model(args, output): for prot_name in tqdm(all_proteins): embeddings[prot_name] = torch.from_numpy(h5fi[prot_name][:, :]) - if args.checkpoint is None: + if args.checkpoint is None: # Create embedding model input_dim = args.input_dim diff --git a/dscript/contact_map.py b/dscript/contact_map.py new file mode 100644 index 0000000..b68366e --- /dev/null +++ b/dscript/contact_map.py @@ -0,0 +1,76 @@ +import Bio.PDB +import numpy +import matplotlib.pyplot as plt +import h5py +from models.interaction import ModelInteraction +# from language_model import embed_from_fasta + +fi = h5py.File("2022-06-27-06:26.predictions.cmaps.h5","r") +ke = list(fi.keys()) +# print(fi[ke[0]]) +cmap = fi[ke[0]] +plt.imshow(cmap) +plt.show() + +# SOURCE CODE: https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/protein_contact_map/ +# pdb_code = "15c8" +# pdb_filename = "dscript/pdbs/15c8.pdb" +# pdb_code = "1xi4" +# pdb_filename = "dscript/1xi4.pdb" #not the full cage! + + +# def calc_residue_dist(residue_one, residue_two) : +# """Returns the C-alpha distance between two residues""" +# diff_vector = residue_one["CA"].coord - residue_two["CA"].coord +# return numpy.sqrt(numpy.sum(diff_vector * diff_vector)) + + +# def calc_dist_matrix(chain_one, chain_two) : +# """Returns a matrix of C-alpha distances between two chains""" +# answer = numpy.zeros((len(chain_one), len(chain_two)), numpy.float) +# for row, residue_one in enumerate(chain_one) : +# for col, residue_two in enumerate(chain_two) : +# answer[row, col] = calc_residue_dist(residue_one, residue_two) +# return answer + + +# structure = Bio.PDB.PDBParser().get_structure(pdb_code, pdb_filename) +# print(structure) + +# model = structure[0] +# # print(list(model["H"])) + +# # chain1 = model["H"] +# # for residue in chain1: +# # if residue.id[0] != ' ': +# # chain1.detach_child(residue.id) + +# # chain2 = model["L"] +# # for residue in chain2: +# # if residue.id[0] != ' ': +# # chain2.detach_child(residue.id) + +# dist_matrix = calc_dist_matrix(model["D"], model["M"]) +# contact_map = dist_matrix < 12.0 + +# print(contact_map) +# print(numpy.min(dist_matrix)) +# print(numpy.max(dist_matrix)) + +# pylab.matshow(numpy.transpose(dist_matrix)) +# pylab.colorbar() +# pylab.show() + +# pylab.imshow(numpy.transpose(contact_map)) +# pylab.show() + + +# DSCRIPT CONTACT MAPPING +# embed_from_fasta("dscript/proteins/1a0n.fasta", "dscript/1a0n_embed.h5", device=0, verbose=False) +# h5fi = h5py.File("dscript/1a0n_embed.h5", "r") +# z_a = h5fi["1A0N:A UNP:P27986 P85A_HUMAN"] +# z_b = h5fi["1A0N:B UNP:P06241 FYN_HUMAN"] +# self = ModelInteraction() +# cm, ph = self.map_predict(z_a, z_b) +# print(cm) +# print(ph) \ No newline at end of file diff --git a/dscript/data_collection.py b/dscript/data_collection.py new file mode 100644 index 0000000..cffc295 --- /dev/null +++ b/dscript/data_collection.py @@ -0,0 +1,22 @@ +# converting PDB CIF to sequence format +import Bio +from Bio import SeqIO +import os + +entries = os.listdir('dscript/pdbs') + +# CONVERT ALL +for i in range(0, len(entries)): + if entries[i][0] != ".": + with open(f'dscript/proteins/{entries[i][:-4]}.fasta', 'w') as f: + for record in SeqIO.parse(f"dscript/pdbs/{entries[i]}", "pdb-seqres"): + f.write(record.format("fasta-2line")) + f.close() + +# TEST A SINGLE +# with open(f'dscript/proteins/15c8.fasta', 'w') as f: +# for record in SeqIO.parse(f"dscript/15c8.pdb", "pdb-seqres"): +# # print("Record id %s, chain %s" % (record.id, record.annotations["chain"])) +# # print(record.format("fasta")) +# f.write(record.format("fasta-2line")) +# f.close() \ No newline at end of file diff --git a/dscript/fasta.py b/dscript/fasta.py index 58f28ef..a2d8be6 100644 --- a/dscript/fasta.py +++ b/dscript/fasta.py @@ -1,6 +1,6 @@ from __future__ import print_function, division - +# deal with formatting of fasta files (header id : protein sequence) def parse_stream(f, comment=b"#"): name = None diff --git a/dscript/language_model.py b/dscript/language_model.py index aa6686a..4d047c0 100644 --- a/dscript/language_model.py +++ b/dscript/language_model.py @@ -1,6 +1,6 @@ import os import sys -import subprocess as sp +import subprocess as sp # handle subprocesses import random import torch import h5py @@ -12,7 +12,7 @@ from .models.embedding import SkipLSTM from .utils import log from datetime import datetime - +# using pre-trained language model to embed protein sequences def lm_embed(sequence, use_cuda=False): """ Embed a single sequence using pre-trained language model from `Bepler & Berger `_. diff --git a/dscript/models/contact.py b/dscript/models/contact.py index 2c2ab54..f0f8dd9 100644 --- a/dscript/models/contact.py +++ b/dscript/models/contact.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import torch.functional as F - +# *** broadcast tensor? class FullyConnected(nn.Module): """ Performs part 1 of Contact Prediction Module. Takes embeddings from Projection module and produces broadcast tensor. @@ -54,7 +54,7 @@ class FullyConnected(nn.Module): return c - +# *** part 2 of contact model? class ContactCNN(nn.Module): """ Residue Contact Prediction Module. Takes embeddings from Projection module and produces contact map, output of Contact module. diff --git a/dscript/models/embedding.py b/dscript/models/embedding.py index 88537db..fee226c 100644 --- a/dscript/models/embedding.py +++ b/dscript/models/embedding.py @@ -5,7 +5,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.nn.utils.rnn import PackedSequence - +# language model --> embedding --> contact class IdentityEmbed(nn.Module): """ Does not reduce the dimension of the language model embeddings, just passes them through to the contact model. @@ -20,7 +20,7 @@ class IdentityEmbed(nn.Module): """ return x - +# amino acids --> low dimensional projection class FullyConnectedEmbed(nn.Module): """ Protein Projection Module. Takes embedding from language model and outputs low-dimensional interaction aware projection. @@ -57,7 +57,7 @@ class FullyConnectedEmbed(nn.Module): t = self.drop(t) return t - +# uses (bidirectional) LSTM as part of embedding into vector process class LSTMEmbed(nn.Module): def __init__(self, nout, activation="ReLU", sparse=False, p=0.5): super(LSTMEmbed, self).__init__() @@ -104,6 +104,7 @@ class SkipLSTM(nn.Module): Loaded with pre-trained weights in embedding function. +# *** 21, not 20? reference said something to explain this :param nin: Input dimension of amino acid one-hot [default: 21] :type nin: int :param nout: Output dimension of final layer [default: 100] @@ -169,6 +170,7 @@ class SkipLSTM(nn.Module): one_hot.scatter_(2, x.unsqueeze(2), 1) return one_hot +# *** powerpoint slide, what's going on here? def transform(self, x): """ :param x: Input numeric amino acid encoding :math:`(N)` diff --git a/dscript/models/interaction.py b/dscript/models/interaction.py index 3b04f28..1c96978 100644 --- a/dscript/models/interaction.py +++ b/dscript/models/interaction.py @@ -62,6 +62,7 @@ class ModelInteraction(nn.Module): lambda_init=0, gamma_init=0, ): + # *** pooling operations? reference """ Main D-SCRIPT model. Contains an embedding and contact model and offers access to those models. Computes pooling operations on contact map to generate interaction probability. @@ -122,6 +123,7 @@ class ModelInteraction(nn.Module): self.gamma.data.clamp_(min=0) +# *** explain 'Project down input language model embeddings into low dimension using projection module"? def embed(self, x): """ Project down input language model embeddings into low dimension using projection module diff --git a/dscript/pretrained.py b/dscript/pretrained.py index 87d92a6..4fa3638 100644 --- a/dscript/pretrained.py +++ b/dscript/pretrained.py @@ -7,12 +7,12 @@ from .models.contact import ContactCNN from .models.embedding import FullyConnectedEmbed, SkipLSTM from .models.interaction import ModelInteraction - +# create an lstm and a human_v1 model def build_lm_1(state_dict_path): """ :meta private: """ - model = SkipLSTM(21, 100, 1024, 3) + model = SkipLSTM(21, 100, 1024, 3) # creates bidirection lstm, read in reference state_dict = torch.load(state_dict_path) model.load_state_dict(state_dict) model.eval() diff --git a/dscript/tests/test_pretrained.py b/dscript/tests/test_pretrained.py index c17a915..b100cd0 100644 --- a/dscript/tests/test_pretrained.py +++ b/dscript/tests/test_pretrained.py @@ -15,7 +15,7 @@ MODEL_VERSIONS = [ print(dscript.__version__) - +# checker method to test if pretrained models exist and are working def test_get_state_dict(): for mv in MODEL_VERSIONS: sd = get_state_dict(mv, verbose=True) diff --git a/dscript/utils.py b/dscript/utils.py index 14496d1..3031d30 100644 --- a/dscript/utils.py +++ b/dscript/utils.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd import subprocess as sp import sys -import gzip as gz +import gzip as gz # handles unix/linux gzip files from datetime import datetime diff --git a/environment.yml b/environment.yml index fda1760..8f09dde 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,6 @@ channels: dependencies: - pip=20.0 - - cudatoolkit=10.2 - python=3.7 - pytorch=1.5 - h5py diff --git a/setup.py b/setup.py index 6948dc9..1e56448 100644 --- a/setup.py +++ b/setup.py @@ -19,14 +19,14 @@ setup( }, include_package_data=True, install_requires=[ - "numpy", - "scipy", - "pandas", - "torch", - "matplotlib", - "seaborn", - "tqdm", - "scikit-learn", - "h5py", + "numpy", # mathematical operations/representations + "scipy", # linear algebra + "pandas", # data analysis tools + "torch", #nn training/creation + "matplotlib", # visualization + "seaborn", # matplotlib 2.0 + "tqdm", # progress bar + "scikit-learn", # machine learning + "h5py", # store lots of (binary) data ], )