From f9fdbaab34774a4da50c05f58475fe54152e4404 Mon Sep 17 00:00:00 2001 From: Mert Erden Date: Mon, 27 Jun 2022 14:44:31 -0400 Subject: [PATCH 01/15] fix: added biopython as dependency in setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 6948dc9..27b1944 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ setup( "scipy", "pandas", "torch", + "biopython", "matplotlib", "seaborn", "tqdm", From 128d360c03189cd33e9466f8dd9f9dfbd1bf5ac7 Mon Sep 17 00:00:00 2001 From: samsledje Date: Tue, 28 Jun 2022 11:31:59 -0400 Subject: [PATCH 02/15] update setup.py requirements --- CHANGELOG.md | 3 +++ dscript/__init__.py | 2 +- setup.py | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f010afc..7dc2c46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ ## v0.2 +### v0.2.1 +- Add biopython to setup.py + ### v0.2.0 - Integrate Topsy-Turvy to allow for top-down supervision diff --git a/dscript/__init__.py b/dscript/__init__.py index 014e231..7435f05 100644 --- a/dscript/__init__.py +++ b/dscript/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.0" +__version__ = "0.2.1" __citation__ = """Sledzieski, Singh, Cowen, Berger. "D-SCRIPT translates genome to phenome with sequence-based, structure-aware, genome-scale predictions of protein-protein interactions." Cell Systems 12, no. 10 (2021): 969-982. Devkota, Singh, Sledzieski, Berger, Cowen, Topsy-Turvy: integrating a global view into sequence-based PPI prediction, Bioinformatics, In Press.""" diff --git a/setup.py b/setup.py index 27b1944..39f2848 100644 --- a/setup.py +++ b/setup.py @@ -22,8 +22,8 @@ setup( "numpy", "scipy", "pandas", - "torch", - "biopython", + "torch>=1.11", + "biopython", "matplotlib", "seaborn", "tqdm", From f4b9b1f961b644c7725db56e7240a00b976cce7a Mon Sep 17 00:00:00 2001 From: samsledje Date: Tue, 28 Jun 2022 11:48:32 -0400 Subject: [PATCH 03/15] This file is tab separated, not comma separated. act like it. --- dscript/tests/test.tsv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 dscript/tests/test.tsv diff --git a/dscript/tests/test.tsv b/dscript/tests/test.tsv new file mode 100644 index 0000000..0937e24 --- /dev/null +++ b/dscript/tests/test.tsv @@ -0,0 +1,3 @@ +seq1 seq2 1 +seq1 seq3 0 +seq2 seq3 1 From e2db487adc87259083838300908c714d0a0c95b6 Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Wed, 29 Jun 2022 12:53:59 -0400 Subject: [PATCH 04/15] Update __main__.py --- dscript/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dscript/__main__.py b/dscript/__main__.py index fec04e5..e319205 100644 --- a/dscript/__main__.py +++ b/dscript/__main__.py @@ -37,7 +37,7 @@ def main(): subparsers = parser.add_subparsers(title="D-SCRIPT Commands", dest="cmd") subparsers.required = True - from .commands import train, embed, evaluate, predict, predict_parallel + from .commands import train, embed, evaluate, predict modules = { "train": train, From e4adc459a30d7d31cdd50228a247d0f56ae030ff Mon Sep 17 00:00:00 2001 From: Mert Erden Date: Wed, 29 Jun 2022 14:09:44 -0400 Subject: [PATCH 05/15] enhancement: added typing documentation & automatic retrying for corrupted models --- .pre-commit-config.yaml | 3 ++- dscript/__main__.py | 14 +++++++++-- dscript/commands/embed.py | 11 ++++++++ dscript/commands/evaluate.py | 12 ++++++++- dscript/commands/predict.py | 14 +++++++++++ dscript/commands/train.py | 32 +++++++++++++++++++++++ dscript/pretrained.py | 49 +++++++++++++++++++++++++++++++----- 7 files changed, 125 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 322f16a..5d08ee2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,8 @@ repos: rev: 21.6b0 hooks: - id: black - language_version: python3.8 + language_version: python3.7 + additional_dependencies: ['click==8.0.4'] - repo: https://gitlab.com/pycqa/flake8 rev: 3.9.2 hooks: diff --git a/dscript/__main__.py b/dscript/__main__.py index fec04e5..39400e0 100644 --- a/dscript/__main__.py +++ b/dscript/__main__.py @@ -4,6 +4,16 @@ D-SCRIPT: Structure Aware PPI Prediction import argparse import os import sys +from typing import Union + +from .commands.embed import EmbeddingArguments +from .commands.evaluate import EvaluateArguments +from .commands.predict import PredictionArguments +from .commands.train import TrainArguments + +DScriptArguments = Union[ + EmbeddingArguments, EvaluateArguments, PredictionArguments, TrainArguments +] class CitationAction(argparse.Action): @@ -37,7 +47,7 @@ def main(): subparsers = parser.add_subparsers(title="D-SCRIPT Commands", dest="cmd") subparsers.required = True - from .commands import train, embed, evaluate, predict, predict_parallel + from .commands import train, embed, evaluate, predict modules = { "train": train, @@ -51,7 +61,7 @@ def main(): module.add_args(sp) sp.set_defaults(func=module.main) - args = parser.parse_args() + args: DScriptArguments = parser.parse_args() args.func(args) diff --git a/dscript/commands/embed.py b/dscript/commands/embed.py index a6ed04e..c1218f0 100644 --- a/dscript/commands/embed.py +++ b/dscript/commands/embed.py @@ -2,9 +2,20 @@ Generate new embeddings using pre-trained language model. """ +from __future__ import annotations import argparse from ..language_model import embed_from_fasta +from typing import Callable, NamedTuple + + +class EmbeddingArguments(NamedTuple): + cmd: str + device: int + outfile: str + seqs: str + func: Callable[[EmbeddingArguments], None] + def add_args(parser): """ diff --git a/dscript/commands/evaluate.py b/dscript/commands/evaluate.py index 3583c19..7c510e9 100644 --- a/dscript/commands/evaluate.py +++ b/dscript/commands/evaluate.py @@ -2,10 +2,11 @@ Evaluate a trained model. """ +from __future__ import annotations import argparse import datetime -import os import sys +from typing import Callable, NamedTuple import h5py import matplotlib @@ -26,6 +27,15 @@ from ..utils import log, load_hdf5_parallel matplotlib.use("Agg") +class EvaluateArguments(NamedTuple): + cmd: str + device: int + model: str + embedding: str + test: str + func: Callable[[EvaluateArguments], None] + + def add_args(parser): """ Create parser for command line utility. diff --git a/dscript/commands/predict.py b/dscript/commands/predict.py index 4ccaabf..351a60b 100644 --- a/dscript/commands/predict.py +++ b/dscript/commands/predict.py @@ -1,6 +1,7 @@ """ Make new predictions with a pre-trained model. One of --seqs or --embeddings is required. """ +from __future__ import annotations import argparse import datetime import os @@ -12,6 +13,8 @@ import pandas as pd import torch from scipy.special import comb from tqdm import tqdm +from typing import Callable, NamedTuple, Optional + from ..alphabets import Uniprot21 from ..fasta import parse @@ -19,6 +22,17 @@ from ..language_model import lm_embed from ..utils import log, load_hdf5_parallel +class PredictionArguments(NamedTuple): + cmd: str + device: int + embeddings: Optional[str] + outfile: Optional[str] + seqs: str + model: str + thresh: Optional[float] + func: Callable[[PredictionArguments], None] + + def add_args(parser): """ Create parser for command line utility diff --git a/dscript/commands/train.py b/dscript/commands/train.py index 0310bbf..d41db55 100644 --- a/dscript/commands/train.py +++ b/dscript/commands/train.py @@ -2,6 +2,7 @@ Train a new model. """ +from __future__ import annotations import torch import torch.nn as nn import torch.nn.functional as F @@ -9,6 +10,7 @@ from torch.autograd import Variable from torch.utils.data import IterableDataset, DataLoader from sklearn.metrics import average_precision_score as average_precision from tqdm import tqdm +from typing import Callable, NamedTuple, Optional import sys import argparse @@ -32,6 +34,36 @@ from ..models.contact import ContactCNN from ..models.interaction import ModelInteraction +class TrainArguments(NamedTuple): + cmd: str + device: int + train: str + test: str + embedding: str + no_augment: bool + input_dim: int + projection_dim: int + dropout: float + hidden_dim: int + kernel_width: int + no_w: bool + no_sigmoid: bool + do_pool: bool + pool_width: int + num_epochs: int + batch_size: int + weight_decay: float + lr: float + interaction_weight: float + run_tt: bool + glider_weight: float + glider_thresh: float + outfile: Optional[str] + save_prefix: Optional[str] + checkpoint: Optional[str] + func: Callable[[TrainArguments], None] + + def add_args(parser): """ Create parser for command line utility. diff --git a/dscript/pretrained.py b/dscript/pretrained.py index 87d92a6..da6a00f 100644 --- a/dscript/pretrained.py +++ b/dscript/pretrained.py @@ -1,4 +1,6 @@ +from functools import wraps, partial import os +import os.path import sys import torch @@ -42,6 +44,16 @@ def build_human_1(state_dict_path): VALID_MODELS = {"lm_v1": build_lm_1, "human_v1": build_human_1} +STATE_DICT_BASENAME = "dscript_{version}.pt" + + +def get_state_dict_path(version: str) -> str: + state_dict_basedir = os.path.dirname(os.path.realpath(__file__)) + state_dict_fullname = ( + f"{state_dict_basedir}/{STATE_DICT_BASENAME.format(version=version)}" + ) + return state_dict_fullname + def get_state_dict(version="human_v1", verbose=True): """ @@ -54,12 +66,8 @@ def get_state_dict(version="human_v1", verbose=True): :return: Path to state dictionary for pre-trained language model :rtype: str """ - state_dict_basename = f"dscript_{version}.pt" - state_dict_basedir = os.path.dirname(os.path.realpath(__file__)) - state_dict_fullname = f"{state_dict_basedir}/{state_dict_basename}" - state_dict_url = ( - f"http://cb.csail.mit.edu/cb/dscript/data/models/{state_dict_basename}" - ) + state_dict_fullname = get_state_dict_path(version) + state_dict_url = f"http://cb.csail.mit.edu/cb/dscript/data/models/{STATE_DICT_BASENAME.format(version=version)}" if not os.path.exists(state_dict_fullname): try: import shutil @@ -77,6 +85,35 @@ def get_state_dict(version="human_v1", verbose=True): return state_dict_fullname +def retry(retry_count: int): + def decorate(func): + @wraps(func) + def retry_wrapper(*args, **kwargs): + attempt = 0 + version = args[0] + while attempt < retry_count: + try: + result = func(*args, **kwargs) + return result + except RuntimeError as e: + print( + f"\033[93mLoading {version} from disk failed. Retrying download attempt: {attempt + 1}\033[0m" + ) + if e.args[0].startswith("unexpected EOF"): + state_dict_fullname = get_state_dict_path(version) + if os.path.exists(state_dict_fullname): + os.remove(state_dict_fullname) + else: + raise e + attempt += 1 + raise Exception(f"Failed to download {version}") + + return retry_wrapper + + return decorate + + +@retry(3) def get_pretrained(version="human_v1"): """ Get pre-trained model object. From 9ad03503e56897477fb66d657b8089b975f0f1be Mon Sep 17 00:00:00 2001 From: Mert Erden Date: Wed, 29 Jun 2022 16:39:26 -0400 Subject: [PATCH 06/15] fix: replace print usage with log in pretrained.py --- dscript/pretrained.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dscript/pretrained.py b/dscript/pretrained.py index da6a00f..c037488 100644 --- a/dscript/pretrained.py +++ b/dscript/pretrained.py @@ -8,6 +8,7 @@ import torch from .models.contact import ContactCNN from .models.embedding import FullyConnectedEmbed, SkipLSTM from .models.interaction import ModelInteraction +from .utils import log def build_lm_1(state_dict_path): @@ -74,13 +75,13 @@ def get_state_dict(version="human_v1", verbose=True): import urllib.request if verbose: - print(f"Downloading model {version} from {state_dict_url}...") + log(f"Downloading model {version} from {state_dict_url}...") with urllib.request.urlopen(state_dict_url) as response, open( state_dict_fullname, "wb" ) as out_file: shutil.copyfileobj(response, out_file) except Exception as e: - print("Unable to download model - {}".format(e)) + log("Unable to download model - {}".format(e)) sys.exit(1) return state_dict_fullname @@ -96,7 +97,7 @@ def retry(retry_count: int): result = func(*args, **kwargs) return result except RuntimeError as e: - print( + log( f"\033[93mLoading {version} from disk failed. Retrying download attempt: {attempt + 1}\033[0m" ) if e.args[0].startswith("unexpected EOF"): From a4eaad0f1b1ee1533cb2dcae44dba5b51a6a34cd Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:26:10 -0400 Subject: [PATCH 07/15] Create automatic test runner --- .github/workflows/python-app.yml | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/python-app.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 0000000..0c93108 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.7 + uses: actions/setup-python@v3 + with: + python-version: "3.7" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python setup.py install + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From f0d03d32088bb395de16cc27320e24eebfa9e29c Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:31:28 -0400 Subject: [PATCH 08/15] Update and rename python-app.yml to autorun-tests.yml --- .../{python-app.yml => autorun-tests.yml} | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) rename .github/workflows/{python-app.yml => autorun-tests.yml} (58%) diff --git a/.github/workflows/python-app.yml b/.github/workflows/autorun-tests.yml similarity index 58% rename from .github/workflows/python-app.yml rename to .github/workflows/autorun-tests.yml index 0c93108..3f1f93c 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/autorun-tests.yml @@ -1,7 +1,4 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Python application +name: Automatically run tests on: push: @@ -9,32 +6,33 @@ on: pull_request: branches: [ "main" ] -permissions: - contents: read - jobs: - build: - + build-linux: runs-on: ubuntu-latest + strategy: + max-parallel: 5 steps: - uses: actions/checkout@v3 - name: Set up Python 3.7 uses: actions/setup-python@v3 with: - python-version: "3.7" + python-version: 3.7 + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - python setup.py install + conda env update --file environment.yml --name base - name: Lint with flake8 run: | + conda install flake8 # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | + conda install pytest pytest From c4e003f05361412d443129dd6816a3e15d91f90b Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:45:38 -0400 Subject: [PATCH 09/15] Create requirements.txt --- requirements.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f93b35a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +pytorch=1.11 +biopython +h5py +matplotlib +numpy +pandas +scikit-learn +scipy +seaborn +setuptools +tqdm From 9be9dadae1f16846de632172e1ea273ee17fd390 Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:46:34 -0400 Subject: [PATCH 10/15] Update autorun-tests.yml --- .github/workflows/autorun-tests.yml | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/autorun-tests.yml b/.github/workflows/autorun-tests.yml index 3f1f93c..0c93108 100644 --- a/.github/workflows/autorun-tests.yml +++ b/.github/workflows/autorun-tests.yml @@ -1,4 +1,7 @@ -name: Automatically run tests +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application on: push: @@ -6,33 +9,32 @@ on: pull_request: branches: [ "main" ] +permissions: + contents: read + jobs: - build-linux: + build: + runs-on: ubuntu-latest - strategy: - max-parallel: 5 steps: - uses: actions/checkout@v3 - name: Set up Python 3.7 uses: actions/setup-python@v3 with: - python-version: 3.7 - - name: Add conda to system path - run: | - # $CONDA is an environment variable pointing to the root of the miniconda directory - echo $CONDA/bin >> $GITHUB_PATH + python-version: "3.7" - name: Install dependencies run: | - conda env update --file environment.yml --name base + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python setup.py install - name: Lint with flake8 run: | - conda install flake8 # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - conda install pytest pytest From 540f9f79774afd093efd6ba98388ff9470c070ee Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:47:23 -0400 Subject: [PATCH 11/15] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f93b35a..05153db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pytorch=1.11 +pytorch==1.11 biopython h5py matplotlib From 487b5e3f4fa56d0abff1ef9b04d33c295e18e33a Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:51:11 -0400 Subject: [PATCH 12/15] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 05153db..52cb6be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pytorch==1.11 +torch==1.11 biopython h5py matplotlib From 37759877f52d87160c6d8712734b7f45af7850de Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:53:37 -0400 Subject: [PATCH 13/15] Delete dscript/legacy directory --- dscript/legacy/alphabets_legacy.py | 77 ---- dscript/legacy/contact_legacy.py | 132 ------ dscript/legacy/embedding_legacy.py | 185 -------- dscript/legacy/fasta_legacy.py | 78 ---- dscript/legacy/interaction_legacy.py | 221 ---------- dscript/legacy/train_legacy.py | 616 --------------------------- dscript/legacy/utils_legacy.py | 170 -------- 7 files changed, 1479 deletions(-) delete mode 100644 dscript/legacy/alphabets_legacy.py delete mode 100644 dscript/legacy/contact_legacy.py delete mode 100644 dscript/legacy/embedding_legacy.py delete mode 100644 dscript/legacy/fasta_legacy.py delete mode 100644 dscript/legacy/interaction_legacy.py delete mode 100644 dscript/legacy/train_legacy.py delete mode 100644 dscript/legacy/utils_legacy.py diff --git a/dscript/legacy/alphabets_legacy.py b/dscript/legacy/alphabets_legacy.py deleted file mode 100644 index d38bfbf..0000000 --- a/dscript/legacy/alphabets_legacy.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import print_function, division - -import numpy as np - - -class Alphabet: - """ - From `Bepler & Berger `_. - - :param chars: List of characters in alphabet - :type chars: byte str - :param encoding: Mapping of characters to numbers [default: encoding] - :type encoding: np.ndarray - :param mask: Set encoding mask [default: False] - :type mask: bool - :param missing: Number to use for a value outside the alphabet [default: 255] - :type missing: int - """ - - def __init__(self, chars, encoding=None, mask=False, missing=255): - self.chars = np.frombuffer(chars, dtype=np.uint8) - self.encoding = np.zeros(256, dtype=np.uint8) + missing - if encoding is None: - self.encoding[self.chars] = np.arange(len(self.chars)) - self.size = len(self.chars) - else: - self.encoding[self.chars] = encoding - self.size = encoding.max() + 1 - self.mask = mask - if mask: - self.size -= 1 - - def __len__(self): - return self.size - - def __getitem__(self, i): - return chr(self.chars[i]) - - def encode(self, x): - """ - Encode a byte string into alphabet indices - - :param x: Amino acid string - :type x: byte str - :return: Numeric encoding - :rtype: np.ndarray - """ - x = np.frombuffer(x, dtype=np.uint8) - return self.encoding[x] - - def decode(self, x): - """ - Decode numeric encoding to byte string of this alphabet - - :param x: Numeric encoding - :type x: np.ndarray - :return: Amino acid string - :rtype: byte str - """ - string = self.chars[x] - return string.tobytes() - - -class Uniprot21(Alphabet): - """ - Uniprot 21 Amino Acid Encoding. - - From `Bepler & Berger `_. - """ - - def __init__(self, mask=False): - chars = b"ARNDCQEGHILKMFPSTWYVXOUBZ" - encoding = np.arange(len(chars)) - encoding[21:] = [11, 4, 20, 20] # encode 'OUBZ' as synonyms - super(Uniprot21, self).__init__( - chars, encoding=encoding, mask=mask, missing=20 - ) diff --git a/dscript/legacy/contact_legacy.py b/dscript/legacy/contact_legacy.py deleted file mode 100644 index c0468c7..0000000 --- a/dscript/legacy/contact_legacy.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Contact model classes. -""" - -import torch -import torch.nn as nn -import torch.functional as F - - -class FullyConnected(nn.Module): - """ - Performs part 1 of Contact Prediction Module. Takes embeddings from Projection module and produces broadcast tensor. - - Input embeddings of dimension :math:`d` are combined into a :math:`2d` length MLP input :math:`z_{cat}`, where :math:`z_{cat} = [z_0 \\ominus z_1 | z_0 \\odot z_1]` - - :param embed_dim: Output dimension of `dscript.models.embedding <#module-dscript.models.embedding>`_ model :math:`d` [default: 100] - :type embed_dim: int - :param hidden_dim: Hidden dimension :math:`h` [default: 50] - :type hidden_dim: int - :param activation: Activation function for broadcast tensor [default: torch.nn.ReLU()] - :type activation: torch.nn.Module - """ - - def __init__(self, embed_dim, hidden_dim, activation=nn.ReLU()): - super(FullyConnected, self).__init__() - - self.D = embed_dim - self.H = hidden_dim - self.conv = nn.Conv2d(2 * self.D, self.H, 1) - self.batchnorm = nn.BatchNorm2d(self.H) - self.activation = activation - - def forward(self, z0, z1): - """ - :param z0: Projection module embedding :math:`(b \\times N \\times d)` - :type z0: torch.Tensor - :param z1: Projection module embedding :math:`(b \\times M \\times d)` - :type z1: torch.Tensor - :return: Predicted broadcast tensor :math:`(b \\times N \\times M \\times h)` - :rtype: torch.Tensor - """ - - # z0 is (b,N,d), z1 is (b,M,d) - z0 = z0.transpose(1, 2) - z1 = z1.transpose(1, 2) - # z0 is (b,d,N), z1 is (b,d,M) - - z_dif = torch.abs(z0.unsqueeze(3) - z1.unsqueeze(2)) - z_mul = z0.unsqueeze(3) * z1.unsqueeze(2) - z_cat = torch.cat([z_dif, z_mul], 1) - - b = self.conv(z_cat) - b = self.activation(b) - b = self.batchnorm(b) - - return b - - -class ContactCNN(nn.Module): - """ - Residue Contact Prediction Module. Takes embeddings from Projection module and produces contact map, output of Contact module. - - :param embed_dim: Output dimension of `dscript.models.embedding <#module-dscript.models.embedding>`_ model :math:`d` [default: 100] - :type embed_dim: int - :param hidden_dim: Hidden dimension :math:`h` [default: 50] - :type hidden_dim: int - :param width: Width of convolutional filter :math:`2w+1` [default: 7] - :type width: int - :param activation: Activation function for final contact map [default: torch.nn.Sigmoid()] - :type activation: torch.nn.Module - """ - - def __init__( - self, embed_dim=100, hidden_dim=50, width=7, activation=nn.Sigmoid() - ): - super(ContactCNN, self).__init__() - - self.hidden = FullyConnected(embed_dim, hidden_dim) - self.conv = nn.Conv2d(hidden_dim, 1, width, padding=width // 2) - self.batchnorm = nn.BatchNorm2d(1) - self.activation = activation - self.clip() - - def clip(self): - """ - Force the convolutional layer to be transpose invariant. - - :meta private: - """ - - w = self.conv.weight - self.conv.weight.data[:] = 0.5 * (w + w.transpose(2, 3)) - - def forward(self, z0, z1): - """ - :param z0: Projection module embedding :math:`(b \\times N \\times d)` - :type z0: torch.Tensor - :param z1: Projection module embedding :math:`(b \\times M \\times d)` - :type z1: torch.Tensor - :return: Predicted contact map :math:`(b \\times N \\times M)` - :rtype: torch.Tensor - """ - B = self.broadcast(z0, z1) - return self.predict(B) - - def broadcast(self, z0, z1): - """ - Calls `dscript.models.contact.FullyConnected <#module-dscript.models.contact.FullyConnected>`_. - - :param z0: Projection module embedding :math:`(b \\times N \\times d)` - :type z0: torch.Tensor - :param z1: Projection module embedding :math:`(b \\times M \\times d)` - :type z1: torch.Tensor - :return: Predicted contact broadcast tensor :math:`(b \\times N \\times M \\times h)` - :rtype: torch.Tensor - """ - B = self.hidden(z0, z1) - return B - - def predict(self, B): - """ - Predict contact map from broadcast tensor. - - :param B: Predicted contact broadcast :math:`(b \\times N \\times M \\times h)` - :type B: torch.Tensor - :return: Predicted contact map :math:`(b \\times N \\times M)` - :rtype: torch.Tensor - """ - C = self.conv(B) - C = self.batchnorm(C) - C = self.activation(C) - return C diff --git a/dscript/legacy/embedding_legacy.py b/dscript/legacy/embedding_legacy.py deleted file mode 100644 index 80e66d6..0000000 --- a/dscript/legacy/embedding_legacy.py +++ /dev/null @@ -1,185 +0,0 @@ -""" -Embedding model classes. -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.utils.rnn import PackedSequence - - -class IdentityEmbed(nn.Module): - """ - Does not reduce the dimension of the language model embeddings, just passes them through to the contact model. - """ - - def forward(self, x): - """ - :param x: Input language model embedding :math:`(b \\times N \\times d_0)` - :type x: torch.Tensor - :return: Same embedding - :rtype: torch.Tensor - """ - return x - - -class FullyConnectedEmbed(nn.Module): - """ - Protein Projection Module. Takes embedding from language model and outputs low-dimensional interaction aware projection. - - :param nin: Size of language model output - :type nin: int - :param nout: Dimension of projection - :type nout: int - :param dropout: Proportion of weights to drop out [default: 0.5] - :type dropout: float - :param activation: Activation for linear projection model - :type activation: torch.nn.Module - """ - - def __init__(self, nin, nout, dropout=0.5, activation=nn.ReLU()): - super(FullyConnectedEmbed, self).__init__() - self.nin = nin - self.nout = nout - self.dropout_p = dropout - - self.transform = nn.Linear(nin, nout) - self.drop = nn.Dropout(p=self.dropout_p) - self.activation = activation - - def forward(self, x): - """ - :param x: Input language model embedding :math:`(b \\times N \\times d_0)` - :type x: torch.Tensor - :return: Low dimensional projection of embedding - :rtype: torch.Tensor - """ - t = self.transform(x) - t = self.activation(t) - t = self.drop(t) - return t - - -class SkipLSTM(nn.Module): - """ - Language model from `Bepler & Berger `_. - - Loaded with pre-trained weights in embedding function. - - :param nin: Input dimension of amino acid one-hot [default: 21] - :type nin: int - :param nout: Output dimension of final layer [default: 100] - :type nout: int - :param hidden_dim: Size of hidden dimension [default: 1024] - :type hidden_dim: int - :param num_layers: Number of stacked LSTM models [default: 3] - :type num_layers: int - :param dropout: Proportion of weights to drop out [default: 0] - :type dropout: float - :param bidirectional: Whether to use biLSTM vs. LSTM - :type bidirectional: bool - """ - - def __init__( - self, - nin=21, - nout=100, - hidden_dim=1024, - num_layers=3, - dropout=0, - bidirectional=True, - ): - super(SkipLSTM, self).__init__() - - self.nin = nin - self.nout = nout - - self.dropout = nn.Dropout(p=dropout) - - self.layers = nn.ModuleList() - dim = nin - for i in range(num_layers): - f = nn.LSTM( - dim, - hidden_dim, - 1, - batch_first=True, - bidirectional=bidirectional, - ) - self.layers.append(f) - if bidirectional: - dim = 2 * hidden_dim - else: - dim = hidden_dim - - n = hidden_dim * num_layers + nin - if bidirectional: - n = 2 * hidden_dim * num_layers + nin - - self.proj = nn.Linear(n, nout) - - def to_one_hot(self, x): - """ - Transform numeric encoded amino acid vector to one-hot encoded vector - - :param x: Input numeric amino acid encoding :math:`(N)` - :type x: torch.Tensor - :return: One-hot encoding vector :math:`(N \\times n_{in})` - :rtype: torch.Tensor - """ - packed = type(x) is PackedSequence - if packed: - one_hot = x.data.new(x.data.size(0), self.nin).float().zero_() - one_hot.scatter_(1, x.data.unsqueeze(1), 1) - one_hot = PackedSequence(one_hot, x.batch_sizes) - else: - one_hot = x.new(x.size(0), x.size(1), self.nin).float().zero_() - one_hot.scatter_(2, x.unsqueeze(2), 1) - return one_hot - - def transform(self, x): - """ - :param x: Input numeric amino acid encoding :math:`(N)` - :type x: torch.Tensor - :return: Concatenation of all hidden layers :math:`(N \\times (n_{in} + 2 \\times \\text{num_layers} \\times \\text{hidden_dim}))` - :rtype: torch.Tensor - """ - one_hot = self.to_one_hot(x) - hs = [one_hot] # [] - h_ = one_hot - for f in self.layers: - h, _ = f(h_) - # h = self.dropout(h) - hs.append(h) - h_ = h - if type(x) is PackedSequence: - h = torch.cat([z.data for z in hs], 1) - h = PackedSequence(h, x.batch_sizes) - else: - h = torch.cat([z for z in hs], 2) - return h - - def forward(self, x): - """ - :meta private: - """ - one_hot = self.to_one_hot(x) - hs = [one_hot] - h_ = one_hot - - for f in self.layers: - h, _ = f(h_) - # h = self.dropout(h) - hs.append(h) - h_ = h - - if type(x) is PackedSequence: - h = torch.cat([z.data for z in hs], 1) - z = self.proj(h) - z = PackedSequence(z, x.batch_sizes) - else: - h = torch.cat([z for z in hs], 2) - z = self.proj(h.view(-1, h.size(2))) - z = z.view(x.size(0), x.size(1), -1) - - return z diff --git a/dscript/legacy/fasta_legacy.py b/dscript/legacy/fasta_legacy.py deleted file mode 100644 index 579eeb7..0000000 --- a/dscript/legacy/fasta_legacy.py +++ /dev/null @@ -1,78 +0,0 @@ -def parse(f, comment="#"): - """ - Parse a file in ``.fasta`` format. - - :param f: Input file object - :type f: _io.TextIOWrapper - :param comment: Character used for comments - :type comment: str - - :return: names, sequence - :rtype: list[str], list[str] - """ - starter = ">" - empty = "" - if "b" in f.mode: - comment = b"#" - starter = b">" - empty = b"" - names = [] - sequences = [] - name = None - sequence = [] - for line in f: - if line.startswith(comment): - continue - line = line.strip() - if line.startswith(starter): - if name is not None: - names.append(name) - sequences.append(empty.join(sequence)) - name = line[1:] - sequence = [] - else: - sequence.append(line.upper()) - if name is not None: - names.append(name) - sequences.append(empty.join(sequence)) - - return names, sequences - - -def parse_directory(directory, extension=".seq"): - """ - Parse all files in a directory ending with ``extension``. - - :param directory: Input directory - :type directory: str - :param extension: Extension of all files to read in - :type extension: str - - :return: names, sequence - :rtype: list[str], list[str] - """ - names = [] - sequences = [] - - for seqPath in os.listdir(directory): - if seqPath.endswith(extension): - n, s = parse(open(f"{directory}/{seqPath}", "rb")) - names.append(n[0].decode("utf-8").strip()) - sequences.append(s[0].decode("utf-8").strip()) - return names, sequences - - -def write(nam, seq, f): - """ - Write a file in ``.fasta`` format. - - :param nam: List of names - :type nam: list[str] - :param seq: List of sequences - :type seq: list[str] - :param f: Output file object - :type f: _io.TextIOWrapper - """ - for n, s in zip(nam, seq): - f.write(">{}\n".format(n)) - f.write("{}\n".format(s)) diff --git a/dscript/legacy/interaction_legacy.py b/dscript/legacy/interaction_legacy.py deleted file mode 100644 index 07e572c..0000000 --- a/dscript/legacy/interaction_legacy.py +++ /dev/null @@ -1,221 +0,0 @@ -""" -Interaction model classes. -""" - -import numpy as np - -import torch -import torch.nn as nn -import torch.functional as F - - -class LogisticActivation(nn.Module): - """ - Implementation of Generalized Sigmoid - Applies the element-wise function: - - :math:`\\sigma(x) = \\frac{1}{1 + \\exp(-k(x-x_0))}` - - :param x0: The value of the sigmoid midpoint - :type x0: float - :param k: The slope of the sigmoid - trainable - :math:`k \\geq 0` - :type k: float - :param train: Whether :math:`k` is a trainable parameter - :type train: bool - """ - - def __init__(self, x0=0, k=1, train=False): - super(LogisticActivation, self).__init__() - self.x0 = x0 - self.k = nn.Parameter(torch.FloatTensor([float(k)])) - self.k.requiresGrad = train - - def forward(self, x): - """ - Applies the function to the input elementwise - - :param x: :math:`(N \\times *)` where :math:`*` means, any number of additional dimensions - :type x: torch.Tensor - :return: :math:`(N \\times *)`, same shape as the input - :rtype: torch.Tensor - """ - out = torch.clamp( - 1 / (1 + torch.exp(-self.k * (x - self.x0))), min=0, max=1 - ).squeeze() - return out - - def clip(self): - """ - Restricts sigmoid slope :math:`k` to be greater than or equal to 0, if :math:`k` is trained. - - :meta private: - """ - self.k.data.clamp_(min=0) - - -class ModelInteraction(nn.Module): - """ - Main D-SCRIPT model. Contains an embedding and contact model and offers access to those models. Computes pooling operations on contact map to generate interaction probability. - - :param embedding: Embedding model - :type embedding: dscript.models.embedding.FullyConnectedEmbed - :param contact: Contact model - :type contact: dscript.models.contact.ContactCNN - :param use_cuda: Whether the model should be run on GPU - :type use_cuda: bool - :param pool_size: width of max-pool [default 9] - :type pool_size: bool - :param theta_init: initialization value of :math:`\\theta` for weight matrix [default: 1] - :type theta_init: float - :param lambda_init: initialization value of :math:`\\lambda` for weight matrix [default: 0] - :type lambda_init: float - :param gamma_init: initialization value of :math:`\\gamma` for global pooling [default: 0] - :type gamma_init: float - :param use_W: whether to use the weighting matrix [default: True] - :type use_W: bool - """ - - def __init__( - self, - embedding, - contact, - pool_size=9, - theta_init=1, - lambda_init=0, - gamma_init=0, - use_W=True, - ): - super(ModelInteraction, self).__init__() - self.use_W = use_W - self.activation = LogisticActivation(x0=0.5, k=20) - - self.embedding = embedding - self.contact = contact - - if self.use_W: - self.theta = nn.Parameter(torch.FloatTensor([theta_init])) - self.lambda_ = nn.Parameter(torch.FloatTensor([lambda_init])) - - self.maxPool = nn.MaxPool2d(pool_size, padding=pool_size // 2) - self.gamma = nn.Parameter(torch.FloatTensor([gamma_init])) - - self.clip() - - def clip(self): - """ - Clamp model values - - :meta private: - """ - self.contact.clip() - - if self.use_W: - self.theta.data.clamp_(min=0, max=1) - self.lambda_.data.clamp_(min=0) - - self.gamma.data.clamp_(min=0) - - def embed(self, z): - """ - Project down input language model embeddings into low dimension using projection module - - :param z: Language model embedding :math:`(b \\times N \\times d_0)` - :type z: torch.Tensor - :return: D-SCRIPT projection :math:`(b \\times N \\times d)` - :rtype: torch.Tensor - """ - if self.embedding is None: - return z - else: - return self.embedding(z) - - def cpred(self, z0, z1): - """ - Project down input language model embeddings into low dimension using projection module - - :param z0: Language model embedding :math:`(b \\times N \\times d_0)` - :type z0: torch.Tensor - :param z1: Language model embedding :math:`(b \\times N \\times d_0)` - :type z1: torch.Tensor - :return: Predicted contact map :math:`(b \\times N \\times M)` - :rtype: torch.Tensor - """ - e0 = self.embed(z0) - e1 = self.embed(z1) - B = self.contact.broadcast(e0, e1) - C = self.contact.predict(B) - return C - - def map_predict(self, z0, z1): - """ - Project down input language model embeddings into low dimension using projection module - - :param z0: Language model embedding :math:`(b \\times N \\times d_0)` - :type z0: torch.Tensor - :param z1: Language model embedding :math:`(b \\times N \\times d_0)` - :type z1: torch.Tensor - :return: Predicted contact map, predicted probability of interaction :math:`(b \\times N \\times d_0), (1)` - :rtype: torch.Tensor, torch.Tensor - """ - - C = self.cpred(z0, z1) - - if self.use_W: - # Create contact weighting matrix - N, M = C.shape[2:] - - x1 = torch.from_numpy( - -1 - * ((np.arange(N) + 1 - ((N + 1) / 2)) / (-1 * ((N + 1) / 2))) - ** 2 - ).float() - if self.gamma.device.type == "cuda": - x1 = x1.cuda() - x1 = torch.exp(self.lambda_ * x1) - - x2 = torch.from_numpy( - -1 - * ((np.arange(M) + 1 - ((M + 1) / 2)) / (-1 * ((M + 1) / 2))) - ** 2 - ).float() - if self.gamma.device.type == "cuda": - x2 = x2.cuda() - x2 = torch.exp(self.lambda_ * x2) - - W = x1.unsqueeze(1) * x2 - W = (1 - self.theta) * W + self.theta - - yhat = C * W - - else: - yhat = C - - yhat = self.maxPool(yhat) - - # Mean of contact predictions where p_ij > mu + gamma*sigma - mu = torch.mean(yhat) - sigma = torch.var(yhat) - Q = torch.relu(yhat - mu - (self.gamma * sigma)) - phat = torch.sum(Q) / (torch.sum(torch.sign(Q)) + 1) - phat = self.activation(phat) - return C, phat - - def predict(self, z0, z1): - """ - Project down input language model embeddings into low dimension using projection module - - :param z0: Language model embedding :math:`(b \\times N \\times d_0)` - :type z0: torch.Tensor - :param z1: Language model embedding :math:`(b \\times N \\times d_0)` - :type z1: torch.Tensor - :return: Predicted probability of interaction - :rtype: torch.Tensor, torch.Tensor - """ - _, phat = self.map_predict(z0, z1) - return phat - - def forward(self, z0, z1): - """ - :meta private: - """ - return self.predict(z0, z1) diff --git a/dscript/legacy/train_legacy.py b/dscript/legacy/train_legacy.py deleted file mode 100644 index aa1c68b..0000000 --- a/dscript/legacy/train_legacy.py +++ /dev/null @@ -1,616 +0,0 @@ -""" -Train a new model. -""" - -import sys -import argparse -import h5py -import datetime -import subprocess as sp -import numpy as np -import pandas as pd -import gzip as gz -from tqdm import tqdm - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torch.autograd import Variable -from torch.utils.data import IterableDataset, DataLoader -from sklearn.metrics import average_precision_score as average_precision - -import dscript -from dscript.utils import PairedDataset, collate_paired_sequences -from dscript.models.embedding import ( - IdentityEmbed, - FullyConnectedEmbed, -) -from dscript.models.contact import ContactCNN -from dscript.models.interaction import ModelInteraction - - -def add_args(parser): - """ - Create parser for command line utility. - - :meta private: - """ - - data_grp = parser.add_argument_group("Data") - proj_grp = parser.add_argument_group("Projection Module") - contact_grp = parser.add_argument_group("Contact Module") - inter_grp = parser.add_argument_group("Interaction Module") - train_grp = parser.add_argument_group("Training") - misc_grp = parser.add_argument_group("Output and Device") - - # Data - data_grp.add_argument("--train", help="Training data", required=True) - data_grp.add_argument("--val", help="Validation data", required=True) - data_grp.add_argument( - "--embedding", help="h5 file with embedded sequences", required=True - ) - data_grp.add_argument( - "--no-augment", - action="store_false", - dest="augment", - help="Set flag to not augment data by adding (B A) for all pairs (A B)", - ) - - # Embedding model - proj_grp.add_argument( - "--projection-dim", - type=int, - default=100, - help="Dimension of embedding projection layer (default: 100)", - ) - proj_grp.add_argument( - "--dropout-p", - type=float, - default=0.5, - help="Parameter p for embedding dropout layer (default: 0.5)", - ) - - # Contact model - contact_grp.add_argument( - "--hidden-dim", - type=int, - default=50, - help="Number of hidden units for comparison layer in contact prediction (default: 50)", - ) - contact_grp.add_argument( - "--kernel-width", - type=int, - default=7, - help="Width of convolutional filter for contact prediction (default: 7)", - ) - - # Interaction Model - inter_grp.add_argument( - "--no-w", - action="store_false", - dest="use_w", - help="Don't use weight matrix in interaction prediction model", - ) - inter_grp.add_argument( - "--pool-width", - type=int, - default=9, - help="Size of max-pool in interaction model (default: 9)", - ) - - # Training - train_grp.add_argument( - "--negative-ratio", - type=int, - default=10, - help="Number of negative training samples for each positive training sample (default: 10)", - ) - train_grp.add_argument( - "--epoch-scale", - type=int, - default=1, - help="Report heldout performance every this many epochs (default: 1)", - ) - train_grp.add_argument( - "--num-epochs", - type=int, - default=10, - help="Number of epochs (default: 10)", - ) - train_grp.add_argument( - "--batch-size", - type=int, - default=25, - help="Minibatch size (default: 25)", - ) - train_grp.add_argument( - "--weight-decay", - type=float, - default=0, - help="L2 regularization (default: 0)", - ) - train_grp.add_argument( - "--lr", - type=float, - default=0.001, - help="Learning rate (default: 0.001)", - ) - train_grp.add_argument( - "--lambda", - dest="lambda_", - type=float, - default=0.35, - help="Weight on the similarity objective (default: 0.35)", - ) - - # Output - misc_grp.add_argument( - "-o", "--outfile", help="Output file path (default: stdout)" - ) - misc_grp.add_argument( - "--save-prefix", help="Path prefix for saving models" - ) - misc_grp.add_argument( - "-d", "--device", type=int, default=-1, help="Compute device to use" - ) - misc_grp.add_argument( - "--checkpoint", help="Checkpoint model to start training from" - ) - - return parser - - -def predict_interaction(model, n0, n1, tensors, use_cuda): - """ - Predict whether a list of protein pairs will interact. - - :param model: Model to be trained - :type model: dscript.models.interaction.ModelInteraction - :param n0: First protein names - :type n0: list[str] - :param n1: Second protein names - :type n1: list[str] - :param tensors: Dictionary of protein names to embeddings - :type tensors: dict[str, torch.Tensor] - :param use_cuda: Whether to use GPU - :type use_cuda: bool - """ - - b = len(n0) - - p_hat = [] - for i in range(b): - z_a = tensors[n0[i]] - z_b = tensors[n1[i]] - if use_cuda: - z_a = z_a.cuda() - z_b = z_b.cuda() - - p_hat.append(model.predict(z_a, z_b)) - p_hat = torch.stack(p_hat, 0) - return p_hat - - -def predict_cmap_interaction(model, n0, n1, tensors, use_cuda): - """ - Predict whether a list of protein pairs will interact, as well as their contact map. - - :param model: Model to be trained - :type model: dscript.models.interaction.ModelInteraction - :param n0: First protein names - :type n0: list[str] - :param n1: Second protein names - :type n1: list[str] - :param tensors: Dictionary of protein names to embeddings - :type tensors: dict[str, torch.Tensor] - :param use_cuda: Whether to use GPU - :type use_cuda: bool - """ - - b = len(n0) - - p_hat = [] - c_map_mag = [] - for i in range(b): - z_a = tensors[n0[i]] - z_b = tensors[n1[i]] - if use_cuda: - z_a = z_a.cuda() - z_b = z_b.cuda() - - cm, ph = model.map_predict(z_a, z_b) - p_hat.append(ph) - c_map_mag.append(torch.mean(cm)) - p_hat = torch.stack(p_hat, 0) - c_map_mag = torch.stack(c_map_mag, 0) - return c_map_mag, p_hat - - -def interaction_grad(model, n0, n1, y, tensors, use_cuda, weight=0.35): - """ - Compute gradient and backpropagate loss for a batch. - - :param model: Model to be trained - :type model: dscript.models.interaction.ModelInteraction - :param n0: First protein names - :type n0: list[str] - :param n1: Second protein names - :type n1: list[str] - :param y: Interaction labels - :type y: torch.Tensor - :param tensors: Dictionary of protein names to embeddings - :type tensors: dict[str, torch.Tensor] - :param use_cuda: Whether to use GPU - :type use_cuda: bool - :param weight: Weight on the contact map magnitude objective. BCE loss is :math:`1 - \\text{weight}`. - :type weight: float - - :return: (Loss, number correct, mean square error, batch size) - :rtype: (torch.Tensor, int, torch.Tensor, int) - """ - - c_map_mag, p_hat = predict_cmap_interaction( - model, n0, n1, tensors, use_cuda - ) - if use_cuda: - y = y.cuda() - y = Variable(y) - - bce_loss = F.binary_cross_entropy(p_hat.float(), y.float()) - cmap_loss = torch.mean(c_map_mag) - loss = (weight * bce_loss) + ((1 - weight) * cmap_loss) - b = len(p_hat) - - # backprop loss - loss.backward() - - if use_cuda: - y = y.cpu() - p_hat = p_hat.cpu() - - with torch.no_grad(): - guess_cutoff = 0.5 - p_hat = p_hat.float() - p_guess = (guess_cutoff * torch.ones(b) < p_hat).float() - y = y.float() - correct = torch.sum(p_guess == y).item() - mse = torch.mean((y.float() - p_hat) ** 2).item() - - return loss, correct, mse, b - - -def interaction_eval(model, test_iterator, tensors, use_cuda): - """ - Evaluate test data set performance. - - :param model: Model to be trained - :type model: dscript.models.interaction.ModelInteraction - :param test_iterator: Test data iterator - :type test_iterator: torch.utils.data.DataLoader - :param tensors: Dictionary of protein names to embeddings - :type tensors: dict[str, torch.Tensor] - :param use_cuda: Whether to use GPU - :type use_cuda: bool - - :return: (Loss, number correct, mean square error, precision, recall, F1 Score, AUPR) - :rtype: (torch.Tensor, int, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor) - """ - p_hat = [] - true_y = [] - - for n0, n1, y in test_iterator: - p_hat.append(predict_interaction(model, n0, n1, tensors, use_cuda)) - true_y.append(y) - - y = torch.cat(true_y, 0) - p_hat = torch.cat(p_hat, 0) - - if use_cuda: - y.cuda() - p_hat = torch.Tensor([x.cuda() for x in p_hat]) - p_hat.cuda() - - loss = F.binary_cross_entropy(p_hat.float(), y.float()).item() - b = len(y) - - with torch.no_grad(): - guess_cutoff = torch.Tensor([0.5]).float() - p_hat = p_hat.float() - y = y.float() - p_guess = (guess_cutoff * torch.ones(b) < p_hat).float() - correct = torch.sum(p_guess == y).item() - mse = torch.mean((y.float() - p_hat) ** 2).item() - - tp = torch.sum(y * p_hat).item() - pr = tp / torch.sum(p_hat).item() - re = tp / torch.sum(y).item() - f1 = 2 * pr * re / (pr + re) - - y = y.cpu().numpy() - p_hat = p_hat.data.cpu().numpy() - - aupr = average_precision(y, p_hat) - - return loss, correct, mse, pr, re, f1, aupr - - -def main(args): - """ - Run training from arguments. - - :meta private: - """ - - output = args.outfile - if output is None: - output = sys.stdout - else: - output = open(output, "w") - - print(f'# Called as: {" ".join(sys.argv)}', file=output) - if output is not sys.stdout: - print(f'Called as: {" ".join(sys.argv)}') - - # Set device - device = args.device - use_cuda = (device >= 0) and torch.cuda.is_available() - if use_cuda: - torch.cuda.set_device(device) - print( - f"# Using CUDA device {device} - {torch.cuda.get_device_name(device)}", - file=output, - ) - else: - print("# Using CPU", file=output) - device = "cpu" - - batch_size = args.batch_size - - train_fi = args.train - test_fi = args.val - augment = args.augment - embedding_h5 = args.embedding - h5fi = h5py.File(embedding_h5, "r") - - print(f"# Loading training pairs from {train_fi}...", file=output) - output.flush() - - train_df = pd.read_csv(train_fi, sep="\t", header=None) - if augment: - train_n0 = pd.concat((train_df[0], train_df[1]), axis=0).reset_index( - drop=True - ) - train_n1 = pd.concat((train_df[1], train_df[0]), axis=0).reset_index( - drop=True - ) - train_y = torch.from_numpy( - pd.concat((train_df[2], train_df[2])).values - ) - else: - train_n0, train_n1 = train_df[0], train_df[1] - train_y = torch.from_numpy(train_df[2].values) - - print(f"# Loading testing pairs from {test_fi}...", file=output) - output.flush() - - test_df = pd.read_csv(test_fi, sep="\t", header=None) - test_n0, test_n1 = test_df[0], test_df[1] - test_y = torch.from_numpy(test_df[2].values) - output.flush() - - train_pairs = PairedDataset(train_n0, train_n1, train_y) - pairs_train_iterator = torch.utils.data.DataLoader( - train_pairs, - batch_size=batch_size, - collate_fn=collate_paired_sequences, - shuffle=True, - ) - - test_pairs = PairedDataset(test_n0, test_n1, test_y) - pairs_test_iterator = torch.utils.data.DataLoader( - test_pairs, - batch_size=batch_size, - collate_fn=collate_paired_sequences, - shuffle=True, - ) - - output.flush() - - print(f"# Loading embeddings", file=output) - tensors = {} - all_proteins = ( - set(train_n0) - .union(set(train_n1)) - .union(set(test_n0)) - .union(set(test_n1)) - ) - for prot_name in tqdm(all_proteins): - tensors[prot_name] = torch.from_numpy(h5fi[prot_name][:, :]) - - use_cuda = (args.device > -1) and torch.cuda.is_available() - - if args.checkpoint is None: - - projection_dim = args.projection_dim - dropout_p = args.dropout_p - embedding = FullyConnectedEmbed( - 6165, projection_dim, dropout=dropout_p - ) - print("# Initializing embedding model with:", file=output) - print(f"\tprojection_dim: {projection_dim}", file=output) - print(f"\tdropout_p: {dropout_p}", file=output) - - # Create contact model - hidden_dim = args.hidden_dim - kernel_width = args.kernel_width - print("# Initializing contact model with:", file=output) - print(f"\thidden_dim: {hidden_dim}", file=output) - print(f"\tkernel_width: {kernel_width}", file=output) - - contact = ContactCNN(projection_dim, hidden_dim, kernel_width) - - # Create the full model - use_W = args.use_w - pool_width = args.pool_width - print("# Initializing interaction model with:", file=output) - print(f"\tpool_width: {pool_width}", file=output) - print(f"\tuse_w: {use_W}", file=output) - model = ModelInteraction( - embedding, contact, use_W=use_W, pool_size=pool_width - ) - - print(model, file=output) - - else: - print( - "# Loading model from checkpoint {}".format(args.checkpoint), - file=output, - ) - model = torch.load(args.checkpoint) - model.use_cuda = use_cuda - - if use_cuda: - model = model.cuda() - - # Train the model - lr = args.lr - wd = args.weight_decay - num_epochs = args.num_epochs - batch_size = args.batch_size - report_steps = args.epoch_scale - inter_weight = args.lambda_ - cmap_weight = 1 - inter_weight - digits = int(np.floor(np.log10(num_epochs))) + 1 - save_prefix = args.save_prefix - if save_prefix is None: - save_prefix = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") - - params = [p for p in model.parameters() if p.requires_grad] - optim = torch.optim.Adam(params, lr=lr, weight_decay=wd) - - print(f'# Using save prefix "{save_prefix}"', file=output) - print(f"# Training with Adam: lr={lr}, weight_decay={wd}", file=output) - print(f"\tnum_epochs: {num_epochs}", file=output) - print(f"\tepoch_scale: {report_steps}", file=output) - print(f"\tbatch_size: {batch_size}", file=output) - print(f"\tinteraction weight: {inter_weight}", file=output) - print(f"\tcontact map weight: {cmap_weight}", file=output) - output.flush() - - batch_report_fmt = ( - "# [{}/{}] training {:.1%}: Loss={:.6}, Accuracy={:.3%}, MSE={:.6}" - ) - epoch_report_fmt = "# Finished Epoch {}/{}: Loss={:.6}, Accuracy={:.3%}, MSE={:.6}, Precision={:.6}, Recall={:.6}, F1={:.6}, AUPR={:.6}" - - N = len(pairs_train_iterator) * batch_size - for epoch in range(num_epochs): - - model.train() - - n = 0 - loss_accum = 0 - acc_accum = 0 - mse_accum = 0 - - # Train batches - for (z0, z1, y) in tqdm( - pairs_train_iterator, - desc=f"Epoch {epoch+1}/{num_epochs}", - total=len(pairs_train_iterator), - ): - - loss, correct, mse, b = interaction_grad( - model, z0, z1, y, tensors, use_cuda, weight=inter_weight - ) - - n += b - delta = b * (loss - loss_accum) - loss_accum += delta / n - - delta = correct - b * acc_accum - acc_accum += delta / n - - delta = b * (mse - mse_accum) - mse_accum += delta / n - - report = (n - b) // 100 < n // 100 - - optim.step() - optim.zero_grad() - model.clip() - - if report: - tokens = [ - epoch + 1, - num_epochs, - n / N, - loss_accum, - acc_accum, - mse_accum, - ] - if output is not sys.stdout: - print(batch_report_fmt.format(*tokens), file=output) - output.flush() - - if (epoch + 1) % report_steps == 0: - model.eval() - - with torch.no_grad(): - - ( - inter_loss, - inter_correct, - inter_mse, - inter_pr, - inter_re, - inter_f1, - inter_aupr, - ) = interaction_eval( - model, pairs_test_iterator, tensors, use_cuda - ) - tokens = [ - epoch + 1, - num_epochs, - inter_loss, - inter_correct / (len(pairs_test_iterator) * batch_size), - inter_mse, - inter_pr, - inter_re, - inter_f1, - inter_aupr, - ] - print(epoch_report_fmt.format(*tokens), file=output) - output.flush() - - # Save the model - if save_prefix is not None: - save_path = ( - save_prefix - + "_epoch" - + str(epoch + 1).zfill(digits) - + ".sav" - ) - print(f"# Saving model to {save_path}", file=output) - model.cpu() - torch.save(model, save_path) - if use_cuda: - model.cuda() - - output.flush() - - if save_prefix is not None: - save_path = save_prefix + "_final.sav" - print(f"# Saving final model to {save_path}", file=output) - model.cpu() - torch.save(model, save_path) - if use_cuda: - model.cuda() - - output.close() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - add_args(parser) - main(parser.parse_args()) diff --git a/dscript/legacy/utils_legacy.py b/dscript/legacy/utils_legacy.py deleted file mode 100644 index 67d23a7..0000000 --- a/dscript/legacy/utils_legacy.py +++ /dev/null @@ -1,170 +0,0 @@ -import torch -import torch.utils.data - -import numpy as np -import pandas as pd -import subprocess as sp -import sys -import gzip as gz -from datetime import datetime -from .fasta import parse - - -def log(msg, file=sys.stderr): - """ - Log datetime-stamped message to file - - :param msg: Message to log - :param f: Writable file object to log message to - """ - timestr = datetime.utcnow().isoformat(sep="-", timespec="milliseconds") - file.write(f"[{timestr}] {msg}\n") - file.flush() - - -def plot_PR_curve(y, phat, saveFile=None): - """ - Plot precision-recall curve. - - :param y: Labels - :type y: np.ndarray - :param phat: Predicted probabilities - :type phat: np.ndarray - :param saveFile: File for plot of curve to be saved to - :type saveFile: str - """ - import matplotlib.pyplot as plt - from sklearn.metrics import precision_recall_curve, average_precision_score - - aupr = average_precision_score(y, phat) - precision, recall, _ = precision_recall_curve(y, phat) - - plt.step(recall, precision, color="b", alpha=0.2, where="post") - plt.fill_between(recall, precision, step="post", alpha=0.2, color="b") - plt.xlabel("Recall") - plt.ylabel("Precision") - plt.ylim([0.0, 1.05]) - plt.xlim([0.0, 1.0]) - plt.title("Precision-Recall (AUPR: {:.3})".format(aupr)) - if saveFile: - plt.savefig(saveFile) - else: - plt.show() - - -def plot_ROC_curve(y, phat, saveFile=None): - """ - Plot receiver operating characteristic curve. - - :param y: Labels - :type y: np.ndarray - :param phat: Predicted probabilities - :type phat: np.ndarray - :param saveFile: File for plot of curve to be saved to - :type saveFile: str - """ - import matplotlib.pyplot as plt - from sklearn.metrics import roc_curve, roc_auc_score - - auroc = roc_auc_score(y, phat) - - fpr, tpr, roc_thresh = roc_curve(y, phat) - print("AUROC:", auroc) - - plt.step(fpr, tpr, color="b", alpha=0.2, where="post") - plt.fill_between(fpr, tpr, step="post", alpha=0.2, color="b") - plt.xlabel("FPR") - plt.ylabel("TPR") - plt.ylim([0.0, 1.05]) - plt.xlim([0.0, 1.0]) - plt.title("Receiver Operating Characteristic (AUROC: {:.3})".format(auroc)) - if saveFile: - plt.savefig(saveFile) - else: - plt.show() - - -def RBF(D, sigma=None): - """ - Convert distance matrix into similarity matrix using Radial Basis Function (RBF) Kernel. - - :math:`RBF(x,x') = \\exp{\\frac{-(x - x')^{2}}{2\\sigma^{2}}}` - - :param D: Distance matrix - :type D: np.ndarray - :param sigma: Bandwith of RBF Kernel [default: :math:`\\sqrt{\\text{max}(D)}`] - :type sigma: float - :return: Similarity matrix - :rtype: np.ndarray - """ - sigma = sigma or np.sqrt(np.max(D)) - return np.exp(-1 * (np.square(D) / (2 * sigma ** 2))) - - -def gpu_mem(device): - """ - Get current memory usage for GPU. - - :param device: GPU device number - :type device: int - :return: memory used, memory total - :rtype: int, int - """ - result = sp.check_output( - [ - "nvidia-smi", - "--query-gpu=memory.used,memory.total", - "--format=csv,nounits,noheader", - "--id={}".format(device), - ], - encoding="utf-8", - ) - gpu_memory = [int(x) for x in result.strip().split(",")] - return gpu_memory[0], gpu_memory[1] - - -class PairedDataset(torch.utils.data.Dataset): - """ - Dataset to be used by the PyTorch data loader for pairs of sequences and their labels. - - :param X0: List of first item in the pair - :param X1: List of second item in the pair - :param Y: List of labels - """ - - def __init__(self, X0, X1, Y): - self.X0 = X0 - self.X1 = X1 - self.Y = Y - assert len(X0) == len(X1), ( - "X0: " - + str(len(X0)) - + " X1: " - + str(len(X1)) - + " Y: " - + str(len(Y)) - ) - assert len(X0) == len(Y), ( - "X0: " - + str(len(X0)) - + " X1: " - + str(len(X1)) - + " Y: " - + str(len(Y)) - ) - - def __len__(self): - return len(self.X0) - - def __getitem__(self, i): - return self.X0[i], self.X1[i], self.Y[i] - - -def collate_paired_sequences(args): - """ - Collate function for PyTorch data loader. - """ - x0 = [a[0] for a in args] - x1 = [a[1] for a in args] - y = [a[2] for a in args] - return x0, x1, torch.stack(y, 0) From 23731a6e911749381463acb18cbc5b4ba4d09c68 Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 11:55:09 -0400 Subject: [PATCH 14/15] Update embedding.py --- dscript/models/embedding.py | 41 ------------------------------------- 1 file changed, 41 deletions(-) diff --git a/dscript/models/embedding.py b/dscript/models/embedding.py index 88537db..e30510d 100644 --- a/dscript/models/embedding.py +++ b/dscript/models/embedding.py @@ -57,47 +57,6 @@ class FullyConnectedEmbed(nn.Module): t = self.drop(t) return t - -class LSTMEmbed(nn.Module): - def __init__(self, nout, activation="ReLU", sparse=False, p=0.5): - super(LSTMEmbed, self).__init__() - self.activation = activation - self.sparse = sparse - self.p = p - - self.embedding = SkipLSTM(21, nout, 1024, 3) - self.embedding.load_state_dict(torch.load(EMBEDDING_STATE_DICT)) - - for param in self.embedding.parameters(): - param.requires_grad = False - torch.nn.init.normal_(self.embedding.proj.weight) - torch.nn.init.uniform_(self.embedding.proj.bias, 0, 0) - self.embedding.proj.weight.requires_grad = True - self.embedding.proj.bias.requires_grad = True - - self.activationDict = nn.ModuleDict( - { - "None": IdentityEmbed(), - "ReLU": nn.ReLU(), - "Sigmoid": nn.Sigmoid(), - } - ) - self.dropout = nn.Dropout(p=self.p) - - def forward(self, x): - - t = self.embedding(x) - if self.activation: - t = self.activationDict[self.activation](t) - if self.sparse: - t = self.dropout(t) - - return t - - def long_embed(self, x): - return self.embedding.transform(x) - - class SkipLSTM(nn.Module): """ Language model from `Bepler & Berger `_. From 11997309de2664e9ad63c91470b7c6b888f9feb4 Mon Sep 17 00:00:00 2001 From: Samuel Sledzieski Date: Thu, 30 Jun 2022 12:05:15 -0400 Subject: [PATCH 15/15] Create pypi_publish.yml --- .github/workflows/pypi_publish.yml | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/pypi_publish.yml diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml new file mode 100644 index 0000000..2fbdf5b --- /dev/null +++ b/.github/workflows/pypi_publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Build package + run: python setup.py sdist bdist_wheel + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }}