From f9fdbaab34774a4da50c05f58475fe54152e4404 Mon Sep 17 00:00:00 2001
From: Mert Erden <merden01@eecs.tufts.edu>
Date: Mon, 27 Jun 2022 14:44:31 -0400
Subject: [PATCH 01/15] fix: added biopython as dependency in setup.py

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 6948dc9..27b1944 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@ setup(
         "scipy",
         "pandas",
         "torch",
+	"biopython",
         "matplotlib",
         "seaborn",
         "tqdm",

From 128d360c03189cd33e9466f8dd9f9dfbd1bf5ac7 Mon Sep 17 00:00:00 2001
From: samsledje <samsledje@gmail.com>
Date: Tue, 28 Jun 2022 11:31:59 -0400
Subject: [PATCH 02/15] update setup.py requirements

---
 CHANGELOG.md        | 3 +++
 dscript/__init__.py | 2 +-
 setup.py            | 4 ++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f010afc..7dc2c46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@
 
 ## v0.2
 
+### v0.2.1
+- Add biopython to setup.py
+
 ### v0.2.0
 
 - Integrate Topsy-Turvy to allow for top-down supervision
diff --git a/dscript/__init__.py b/dscript/__init__.py
index 014e231..7435f05 100644
--- a/dscript/__init__.py
+++ b/dscript/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.0"
+__version__ = "0.2.1"
 __citation__ = """Sledzieski, Singh, Cowen, Berger. "D-SCRIPT translates genome to phenome with sequence-based, structure-aware, genome-scale predictions of protein-protein interactions." Cell Systems 12, no. 10 (2021): 969-982.
 
 Devkota, Singh, Sledzieski, Berger, Cowen, Topsy-Turvy: integrating a global view into sequence-based PPI prediction, Bioinformatics, In Press."""
diff --git a/setup.py b/setup.py
index 27b1944..39f2848 100644
--- a/setup.py
+++ b/setup.py
@@ -22,8 +22,8 @@ setup(
         "numpy",
         "scipy",
         "pandas",
-        "torch",
-	"biopython",
+        "torch>=1.11",
+        "biopython",
         "matplotlib",
         "seaborn",
         "tqdm",

From f4b9b1f961b644c7725db56e7240a00b976cce7a Mon Sep 17 00:00:00 2001
From: samsledje <samsledje@gmail.com>
Date: Tue, 28 Jun 2022 11:48:32 -0400
Subject: [PATCH 03/15] This file is tab separated, not comma separated. act
 like it.

---
 dscript/tests/test.tsv | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 dscript/tests/test.tsv

diff --git a/dscript/tests/test.tsv b/dscript/tests/test.tsv
new file mode 100644
index 0000000..0937e24
--- /dev/null
+++ b/dscript/tests/test.tsv
@@ -0,0 +1,3 @@
+seq1	seq2	1
+seq1	seq3	0
+seq2	seq3	1

From e2db487adc87259083838300908c714d0a0c95b6 Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Wed, 29 Jun 2022 12:53:59 -0400
Subject: [PATCH 04/15] Update __main__.py

---
 dscript/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dscript/__main__.py b/dscript/__main__.py
index fec04e5..e319205 100644
--- a/dscript/__main__.py
+++ b/dscript/__main__.py
@@ -37,7 +37,7 @@ def main():
     subparsers = parser.add_subparsers(title="D-SCRIPT Commands", dest="cmd")
     subparsers.required = True
 
-    from .commands import train, embed, evaluate, predict, predict_parallel
+    from .commands import train, embed, evaluate, predict
 
     modules = {
         "train": train,

From e4adc459a30d7d31cdd50228a247d0f56ae030ff Mon Sep 17 00:00:00 2001
From: Mert Erden <merden01@eecs.tufts.edu>
Date: Wed, 29 Jun 2022 14:09:44 -0400
Subject: [PATCH 05/15] enhancement: added typing documentation & automatic
 retrying for corrupted models

---
 .pre-commit-config.yaml      |  3 ++-
 dscript/__main__.py          | 14 +++++++++--
 dscript/commands/embed.py    | 11 ++++++++
 dscript/commands/evaluate.py | 12 ++++++++-
 dscript/commands/predict.py  | 14 +++++++++++
 dscript/commands/train.py    | 32 +++++++++++++++++++++++
 dscript/pretrained.py        | 49 +++++++++++++++++++++++++++++++-----
 7 files changed, 125 insertions(+), 10 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 322f16a..5d08ee2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,8 @@ repos:
     rev: 21.6b0
     hooks:
     - id: black
-      language_version: python3.8
+      language_version: python3.7
+      additional_dependencies: ['click==8.0.4']
 -   repo: https://gitlab.com/pycqa/flake8
     rev: 3.9.2
     hooks:
diff --git a/dscript/__main__.py b/dscript/__main__.py
index fec04e5..39400e0 100644
--- a/dscript/__main__.py
+++ b/dscript/__main__.py
@@ -4,6 +4,16 @@ D-SCRIPT: Structure Aware PPI Prediction
 import argparse
 import os
 import sys
+from typing import Union
+
+from .commands.embed import EmbeddingArguments
+from .commands.evaluate import EvaluateArguments
+from .commands.predict import PredictionArguments
+from .commands.train import TrainArguments
+
+DScriptArguments = Union[
+    EmbeddingArguments, EvaluateArguments, PredictionArguments, TrainArguments
+]
 
 
 class CitationAction(argparse.Action):
@@ -37,7 +47,7 @@ def main():
     subparsers = parser.add_subparsers(title="D-SCRIPT Commands", dest="cmd")
     subparsers.required = True
 
-    from .commands import train, embed, evaluate, predict, predict_parallel
+    from .commands import train, embed, evaluate, predict
 
     modules = {
         "train": train,
@@ -51,7 +61,7 @@ def main():
         module.add_args(sp)
         sp.set_defaults(func=module.main)
 
-    args = parser.parse_args()
+    args: DScriptArguments = parser.parse_args()
     args.func(args)
 
 
diff --git a/dscript/commands/embed.py b/dscript/commands/embed.py
index a6ed04e..c1218f0 100644
--- a/dscript/commands/embed.py
+++ b/dscript/commands/embed.py
@@ -2,9 +2,20 @@
 Generate new embeddings using pre-trained language model.
 """
 
+from __future__ import annotations
 import argparse
 from ..language_model import embed_from_fasta
 
+from typing import Callable, NamedTuple
+
+
+class EmbeddingArguments(NamedTuple):
+    cmd: str
+    device: int
+    outfile: str
+    seqs: str
+    func: Callable[[EmbeddingArguments], None]
+
 
 def add_args(parser):
     """
diff --git a/dscript/commands/evaluate.py b/dscript/commands/evaluate.py
index 3583c19..7c510e9 100644
--- a/dscript/commands/evaluate.py
+++ b/dscript/commands/evaluate.py
@@ -2,10 +2,11 @@
 Evaluate a trained model.
 """
 
+from __future__ import annotations
 import argparse
 import datetime
-import os
 import sys
+from typing import Callable, NamedTuple
 
 import h5py
 import matplotlib
@@ -26,6 +27,15 @@ from ..utils import log, load_hdf5_parallel
 matplotlib.use("Agg")
 
 
+class EvaluateArguments(NamedTuple):
+    cmd: str
+    device: int
+    model: str
+    embedding: str
+    test: str
+    func: Callable[[EvaluateArguments], None]
+
+
 def add_args(parser):
     """
     Create parser for command line utility.
diff --git a/dscript/commands/predict.py b/dscript/commands/predict.py
index 4ccaabf..351a60b 100644
--- a/dscript/commands/predict.py
+++ b/dscript/commands/predict.py
@@ -1,6 +1,7 @@
 """
 Make new predictions with a pre-trained model. One of --seqs or --embeddings is required.
 """
+from __future__ import annotations
 import argparse
 import datetime
 import os
@@ -12,6 +13,8 @@ import pandas as pd
 import torch
 from scipy.special import comb
 from tqdm import tqdm
+from typing import Callable, NamedTuple, Optional
+
 
 from ..alphabets import Uniprot21
 from ..fasta import parse
@@ -19,6 +22,17 @@ from ..language_model import lm_embed
 from ..utils import log, load_hdf5_parallel
 
 
+class PredictionArguments(NamedTuple):
+    cmd: str
+    device: int
+    embeddings: Optional[str]
+    outfile: Optional[str]
+    seqs: str
+    model: str
+    thresh: Optional[float]
+    func: Callable[[PredictionArguments], None]
+
+
 def add_args(parser):
     """
     Create parser for command line utility
diff --git a/dscript/commands/train.py b/dscript/commands/train.py
index 0310bbf..d41db55 100644
--- a/dscript/commands/train.py
+++ b/dscript/commands/train.py
@@ -2,6 +2,7 @@
 Train a new model.
 """
 
+from __future__ import annotations
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -9,6 +10,7 @@ from torch.autograd import Variable
 from torch.utils.data import IterableDataset, DataLoader
 from sklearn.metrics import average_precision_score as average_precision
 from tqdm import tqdm
+from typing import Callable, NamedTuple, Optional
 
 import sys
 import argparse
@@ -32,6 +34,36 @@ from ..models.contact import ContactCNN
 from ..models.interaction import ModelInteraction
 
 
+class TrainArguments(NamedTuple):
+    cmd: str
+    device: int
+    train: str
+    test: str
+    embedding: str
+    no_augment: bool
+    input_dim: int
+    projection_dim: int
+    dropout: float
+    hidden_dim: int
+    kernel_width: int
+    no_w: bool
+    no_sigmoid: bool
+    do_pool: bool
+    pool_width: int
+    num_epochs: int
+    batch_size: int
+    weight_decay: float
+    lr: float
+    interaction_weight: float
+    run_tt: bool
+    glider_weight: float
+    glider_thresh: float
+    outfile: Optional[str]
+    save_prefix: Optional[str]
+    checkpoint: Optional[str]
+    func: Callable[[TrainArguments], None]
+
+
 def add_args(parser):
     """
     Create parser for command line utility.
diff --git a/dscript/pretrained.py b/dscript/pretrained.py
index 87d92a6..da6a00f 100644
--- a/dscript/pretrained.py
+++ b/dscript/pretrained.py
@@ -1,4 +1,6 @@
+from functools import wraps, partial
 import os
+import os.path
 import sys
 
 import torch
@@ -42,6 +44,26 @@ def build_human_1(state_dict_path):
 
 VALID_MODELS = {"lm_v1": build_lm_1, "human_v1": build_human_1}
 
+STATE_DICT_BASENAME = "dscript_{version}.pt"
+
+
+def get_state_dict_path(version: str) -> str:
+    """
+    Get the local path of the state dictionary file for a model version.
+
+    The file name is ``STATE_DICT_BASENAME`` formatted with ``version``, located
+    in the directory containing this module. The file is not required to exist.
+
+    :param version: Version of pre-trained model
+    :type version: str
+    :return: Path to the state dictionary file
+    :rtype: str
+    """
+    state_dict_basedir = os.path.dirname(os.path.realpath(__file__))
+    return os.path.join(
+        state_dict_basedir, STATE_DICT_BASENAME.format(version=version)
+    )
+
 
 def get_state_dict(version="human_v1", verbose=True):
     """
@@ -54,12 +66,8 @@ def get_state_dict(version="human_v1", verbose=True):
     :return: Path to state dictionary for pre-trained language model
     :rtype: str
     """
-    state_dict_basename = f"dscript_{version}.pt"
-    state_dict_basedir = os.path.dirname(os.path.realpath(__file__))
-    state_dict_fullname = f"{state_dict_basedir}/{state_dict_basename}"
-    state_dict_url = (
-        f"http://cb.csail.mit.edu/cb/dscript/data/models/{state_dict_basename}"
-    )
+    state_dict_fullname = get_state_dict_path(version)
+    state_dict_url = f"http://cb.csail.mit.edu/cb/dscript/data/models/{STATE_DICT_BASENAME.format(version=version)}"
     if not os.path.exists(state_dict_fullname):
         try:
             import shutil
@@ -77,6 +85,53 @@ def get_state_dict(version="human_v1", verbose=True):
     return state_dict_fullname
 
 
+def retry(retry_count: int):
+    """
+    Build a decorator that retries a model-loading function.
+
+    The wrapped function is called up to ``retry_count`` times. If it raises a
+    ``RuntimeError`` whose message starts with "unexpected EOF", the state
+    dictionary file for the requested version is removed (if present) and the
+    call is attempted again; any other ``RuntimeError`` is re-raised.
+
+    :param retry_count: Maximum number of attempts
+    :type retry_count: int
+    :return: Decorator for a function that takes the model version as its first argument
+    :rtype: Callable
+    :raises Exception: Raised by the wrapper once every attempt has failed
+    """
+
+    def decorate(func):
+        @wraps(func)
+        def retry_wrapper(*args, **kwargs):
+            attempt = 0
+            # The version can arrive positionally, by keyword, or not at all;
+            # indexing args[0] unconditionally raised IndexError for the last
+            # two. The fallback mirrors the default of get_pretrained.
+            version = args[0] if args else kwargs.get("version", "human_v1")
+            while attempt < retry_count:
+                try:
+                    result = func(*args, **kwargs)
+                    return result
+                except RuntimeError as e:
+                    print(
+                        f"\033[93mLoading {version} from disk failed. Retrying download attempt: {attempt + 1}\033[0m"
+                    )
+                    if e.args[0].startswith("unexpected EOF"):
+                        state_dict_fullname = get_state_dict_path(version)
+                        if os.path.exists(state_dict_fullname):
+                            os.remove(state_dict_fullname)
+                    else:
+                        raise e
+                attempt += 1
+            raise Exception(f"Failed to download {version}")
+
+        return retry_wrapper
+
+    return decorate
+
+
+@retry(3)
 def get_pretrained(version="human_v1"):
     """
     Get pre-trained model object.

From 9ad03503e56897477fb66d657b8089b975f0f1be Mon Sep 17 00:00:00 2001
From: Mert Erden <merden01@eecs.tufts.edu>
Date: Wed, 29 Jun 2022 16:39:26 -0400
Subject: [PATCH 06/15] fix: replace print usage with log in pretrained.py

---
 dscript/pretrained.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dscript/pretrained.py b/dscript/pretrained.py
index da6a00f..c037488 100644
--- a/dscript/pretrained.py
+++ b/dscript/pretrained.py
@@ -8,6 +8,7 @@ import torch
 from .models.contact import ContactCNN
 from .models.embedding import FullyConnectedEmbed, SkipLSTM
 from .models.interaction import ModelInteraction
+from .utils import log
 
 
 def build_lm_1(state_dict_path):
@@ -74,13 +75,13 @@ def get_state_dict(version="human_v1", verbose=True):
             import urllib.request
 
             if verbose:
-                print(f"Downloading model {version} from {state_dict_url}...")
+                log(f"Downloading model {version} from {state_dict_url}...")
             with urllib.request.urlopen(state_dict_url) as response, open(
                 state_dict_fullname, "wb"
             ) as out_file:
                 shutil.copyfileobj(response, out_file)
         except Exception as e:
-            print("Unable to download model - {}".format(e))
+            log("Unable to download model - {}".format(e))
             sys.exit(1)
     return state_dict_fullname
 
@@ -96,7 +97,7 @@ def retry(retry_count: int):
                     result = func(*args, **kwargs)
                     return result
                 except RuntimeError as e:
-                    print(
+                    log(
                         f"\033[93mLoading {version} from disk failed. Retrying download attempt: {attempt + 1}\033[0m"
                     )
                     if e.args[0].startswith("unexpected EOF"):

From a4eaad0f1b1ee1533cb2dcae44dba5b51a6a34cd Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:26:10 -0400
Subject: [PATCH 07/15] Create automatic test runner

---
 .github/workflows/python-app.yml | 40 ++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/python-app.yml

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
new file mode 100644
index 0000000..0c93108
--- /dev/null
+++ b/.github/workflows/python-app.yml
@@ -0,0 +1,40 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Python application
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.7
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.7"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python setup.py install
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest

From f0d03d32088bb395de16cc27320e24eebfa9e29c Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:31:28 -0400
Subject: [PATCH 08/15] Update and rename python-app.yml to autorun-tests.yml

---
 .../{python-app.yml => autorun-tests.yml}     | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)
 rename .github/workflows/{python-app.yml => autorun-tests.yml} (58%)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/autorun-tests.yml
similarity index 58%
rename from .github/workflows/python-app.yml
rename to .github/workflows/autorun-tests.yml
index 0c93108..3f1f93c 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/autorun-tests.yml
@@ -1,7 +1,4 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Python application
+name: Automatically run tests
 
 on:
   push:
@@ -9,32 +6,33 @@ on:
   pull_request:
     branches: [ "main" ]
 
-permissions:
-  contents: read
-
 jobs:
-  build:
-
+  build-linux:
     runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5
 
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python 3.7
       uses: actions/setup-python@v3
       with:
-        python-version: "3.7"
+        python-version: 3.7
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-        python setup.py install
+        conda env update --file environment.yml --name base
     - name: Lint with flake8
       run: |
+        conda install flake8
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
+        conda install pytest
         pytest

From c4e003f05361412d443129dd6816a3e15d91f90b Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:45:38 -0400
Subject: [PATCH 09/15] Create requirements.txt

---
 requirements.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f93b35a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+pytorch=1.11
+biopython
+h5py
+matplotlib
+numpy
+pandas
+scikit-learn
+scipy
+seaborn
+setuptools
+tqdm

From 9be9dadae1f16846de632172e1ea273ee17fd390 Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:46:34 -0400
Subject: [PATCH 10/15] Update autorun-tests.yml

---
 .github/workflows/autorun-tests.yml | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/autorun-tests.yml b/.github/workflows/autorun-tests.yml
index 3f1f93c..0c93108 100644
--- a/.github/workflows/autorun-tests.yml
+++ b/.github/workflows/autorun-tests.yml
@@ -1,4 +1,7 @@
-name: Automatically run tests
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Python application
 
 on:
   push:
@@ -6,33 +9,32 @@ on:
   pull_request:
     branches: [ "main" ]
 
+permissions:
+  contents: read
+
 jobs:
-  build-linux:
+  build:
+
     runs-on: ubuntu-latest
-    strategy:
-      max-parallel: 5
 
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python 3.7
       uses: actions/setup-python@v3
       with:
-        python-version: 3.7
-    - name: Add conda to system path
-      run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
+        python-version: "3.7"
     - name: Install dependencies
       run: |
-        conda env update --file environment.yml --name base
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python setup.py install
     - name: Lint with flake8
       run: |
-        conda install flake8
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        conda install pytest
         pytest

From 540f9f79774afd093efd6ba98388ff9470c070ee Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:47:23 -0400
Subject: [PATCH 11/15] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f93b35a..05153db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-pytorch=1.11
+pytorch==1.11
 biopython
 h5py
 matplotlib

From 487b5e3f4fa56d0abff1ef9b04d33c295e18e33a Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:51:11 -0400
Subject: [PATCH 12/15] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 05153db..52cb6be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-pytorch==1.11
+torch==1.11
 biopython
 h5py
 matplotlib

From 37759877f52d87160c6d8712734b7f45af7850de Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:53:37 -0400
Subject: [PATCH 13/15] Delete dscript/legacy directory

---
 dscript/legacy/alphabets_legacy.py   |  77 ----
 dscript/legacy/contact_legacy.py     | 132 ------
 dscript/legacy/embedding_legacy.py   | 185 --------
 dscript/legacy/fasta_legacy.py       |  78 ----
 dscript/legacy/interaction_legacy.py | 221 ----------
 dscript/legacy/train_legacy.py       | 616 ---------------------------
 dscript/legacy/utils_legacy.py       | 170 --------
 7 files changed, 1479 deletions(-)
 delete mode 100644 dscript/legacy/alphabets_legacy.py
 delete mode 100644 dscript/legacy/contact_legacy.py
 delete mode 100644 dscript/legacy/embedding_legacy.py
 delete mode 100644 dscript/legacy/fasta_legacy.py
 delete mode 100644 dscript/legacy/interaction_legacy.py
 delete mode 100644 dscript/legacy/train_legacy.py
 delete mode 100644 dscript/legacy/utils_legacy.py

diff --git a/dscript/legacy/alphabets_legacy.py b/dscript/legacy/alphabets_legacy.py
deleted file mode 100644
index d38bfbf..0000000
--- a/dscript/legacy/alphabets_legacy.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from __future__ import print_function, division
-
-import numpy as np
-
-
-class Alphabet:
-    """
-    From `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.
-
-    :param chars: List of characters in alphabet
-    :type chars: byte str
-    :param encoding: Mapping of characters to numbers [default: encoding]
-    :type encoding: np.ndarray
-    :param mask: Set encoding mask [default: False]
-    :type mask: bool
-    :param missing: Number to use for a value outside the alphabet [default: 255]
-    :type missing: int
-    """
-
-    def __init__(self, chars, encoding=None, mask=False, missing=255):
-        self.chars = np.frombuffer(chars, dtype=np.uint8)
-        self.encoding = np.zeros(256, dtype=np.uint8) + missing
-        if encoding is None:
-            self.encoding[self.chars] = np.arange(len(self.chars))
-            self.size = len(self.chars)
-        else:
-            self.encoding[self.chars] = encoding
-            self.size = encoding.max() + 1
-        self.mask = mask
-        if mask:
-            self.size -= 1
-
-    def __len__(self):
-        return self.size
-
-    def __getitem__(self, i):
-        return chr(self.chars[i])
-
-    def encode(self, x):
-        """
-        Encode a byte string into alphabet indices
-
-        :param x: Amino acid string
-        :type x: byte str
-        :return: Numeric encoding
-        :rtype: np.ndarray
-        """
-        x = np.frombuffer(x, dtype=np.uint8)
-        return self.encoding[x]
-
-    def decode(self, x):
-        """
-        Decode numeric encoding to byte string of this alphabet
-
-        :param x: Numeric encoding
-        :type x: np.ndarray
-        :return: Amino acid string
-        :rtype: byte str
-        """
-        string = self.chars[x]
-        return string.tobytes()
-
-
-class Uniprot21(Alphabet):
-    """
-    Uniprot 21 Amino Acid Encoding.
-
-    From `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.
-    """
-
-    def __init__(self, mask=False):
-        chars = b"ARNDCQEGHILKMFPSTWYVXOUBZ"
-        encoding = np.arange(len(chars))
-        encoding[21:] = [11, 4, 20, 20]  # encode 'OUBZ' as synonyms
-        super(Uniprot21, self).__init__(
-            chars, encoding=encoding, mask=mask, missing=20
-        )
diff --git a/dscript/legacy/contact_legacy.py b/dscript/legacy/contact_legacy.py
deleted file mode 100644
index c0468c7..0000000
--- a/dscript/legacy/contact_legacy.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""
-Contact model classes.
-"""
-
-import torch
-import torch.nn as nn
-import torch.functional as F
-
-
-class FullyConnected(nn.Module):
-    """
-    Performs part 1 of Contact Prediction Module. Takes embeddings from Projection module and produces broadcast tensor.
-
-    Input embeddings of dimension :math:`d` are combined into a :math:`2d` length MLP input :math:`z_{cat}`, where :math:`z_{cat} = [z_0 \\ominus z_1 | z_0 \\odot z_1]`
-
-    :param embed_dim: Output dimension of `dscript.models.embedding <#module-dscript.models.embedding>`_ model :math:`d` [default: 100]
-    :type embed_dim: int
-    :param hidden_dim: Hidden dimension :math:`h` [default: 50]
-    :type hidden_dim: int
-    :param activation: Activation function for broadcast tensor [default: torch.nn.ReLU()]
-    :type activation: torch.nn.Module
-    """
-
-    def __init__(self, embed_dim, hidden_dim, activation=nn.ReLU()):
-        super(FullyConnected, self).__init__()
-
-        self.D = embed_dim
-        self.H = hidden_dim
-        self.conv = nn.Conv2d(2 * self.D, self.H, 1)
-        self.batchnorm = nn.BatchNorm2d(self.H)
-        self.activation = activation
-
-    def forward(self, z0, z1):
-        """
-        :param z0: Projection module embedding :math:`(b \\times N \\times d)`
-        :type z0: torch.Tensor
-        :param z1: Projection module embedding :math:`(b \\times M \\times d)`
-        :type z1: torch.Tensor
-        :return: Predicted broadcast tensor :math:`(b \\times N \\times M \\times h)`
-        :rtype: torch.Tensor
-        """
-
-        # z0 is (b,N,d), z1 is (b,M,d)
-        z0 = z0.transpose(1, 2)
-        z1 = z1.transpose(1, 2)
-        # z0 is (b,d,N), z1 is (b,d,M)
-
-        z_dif = torch.abs(z0.unsqueeze(3) - z1.unsqueeze(2))
-        z_mul = z0.unsqueeze(3) * z1.unsqueeze(2)
-        z_cat = torch.cat([z_dif, z_mul], 1)
-
-        b = self.conv(z_cat)
-        b = self.activation(b)
-        b = self.batchnorm(b)
-
-        return b
-
-
-class ContactCNN(nn.Module):
-    """
-    Residue Contact Prediction Module. Takes embeddings from Projection module and produces contact map, output of Contact module.
-
-    :param embed_dim: Output dimension of `dscript.models.embedding <#module-dscript.models.embedding>`_ model :math:`d` [default: 100]
-    :type embed_dim: int
-    :param hidden_dim: Hidden dimension :math:`h` [default: 50]
-    :type hidden_dim: int
-    :param width: Width of convolutional filter :math:`2w+1` [default: 7]
-    :type width: int
-    :param activation: Activation function for final contact map [default: torch.nn.Sigmoid()]
-    :type activation: torch.nn.Module
-    """
-
-    def __init__(
-        self, embed_dim=100, hidden_dim=50, width=7, activation=nn.Sigmoid()
-    ):
-        super(ContactCNN, self).__init__()
-
-        self.hidden = FullyConnected(embed_dim, hidden_dim)
-        self.conv = nn.Conv2d(hidden_dim, 1, width, padding=width // 2)
-        self.batchnorm = nn.BatchNorm2d(1)
-        self.activation = activation
-        self.clip()
-
-    def clip(self):
-        """
-        Force the convolutional layer to be transpose invariant.
-
-        :meta private:
-        """
-
-        w = self.conv.weight
-        self.conv.weight.data[:] = 0.5 * (w + w.transpose(2, 3))
-
-    def forward(self, z0, z1):
-        """
-        :param z0: Projection module embedding :math:`(b \\times N \\times d)`
-        :type z0: torch.Tensor
-        :param z1: Projection module embedding :math:`(b \\times M \\times d)`
-        :type z1: torch.Tensor
-        :return: Predicted contact map :math:`(b \\times N \\times M)`
-        :rtype: torch.Tensor
-        """
-        B = self.broadcast(z0, z1)
-        return self.predict(B)
-
-    def broadcast(self, z0, z1):
-        """
-        Calls `dscript.models.contact.FullyConnected <#module-dscript.models.contact.FullyConnected>`_.
-
-        :param z0: Projection module embedding :math:`(b \\times N \\times d)`
-        :type z0: torch.Tensor
-        :param z1: Projection module embedding :math:`(b \\times M \\times d)`
-        :type z1: torch.Tensor
-        :return: Predicted contact broadcast tensor :math:`(b \\times N \\times M \\times h)`
-        :rtype: torch.Tensor
-        """
-        B = self.hidden(z0, z1)
-        return B
-
-    def predict(self, B):
-        """
-        Predict contact map from broadcast tensor.
-
-        :param B: Predicted contact broadcast :math:`(b \\times N \\times M \\times h)`
-        :type B: torch.Tensor
-        :return: Predicted contact map :math:`(b \\times N \\times M)`
-        :rtype: torch.Tensor
-        """
-        C = self.conv(B)
-        C = self.batchnorm(C)
-        C = self.activation(C)
-        return C
diff --git a/dscript/legacy/embedding_legacy.py b/dscript/legacy/embedding_legacy.py
deleted file mode 100644
index 80e66d6..0000000
--- a/dscript/legacy/embedding_legacy.py
+++ /dev/null
@@ -1,185 +0,0 @@
-"""
-Embedding model classes.
-"""
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.utils.rnn import PackedSequence
-
-
-class IdentityEmbed(nn.Module):
-    """
-    Does not reduce the dimension of the language model embeddings, just passes them through to the contact model.
-    """
-
-    def forward(self, x):
-        """
-        :param x: Input language model embedding :math:`(b \\times N \\times d_0)`
-        :type x: torch.Tensor
-        :return: Same embedding
-        :rtype: torch.Tensor
-        """
-        return x
-
-
-class FullyConnectedEmbed(nn.Module):
-    """
-    Protein Projection Module. Takes embedding from language model and outputs low-dimensional interaction aware projection.
-
-    :param nin: Size of language model output
-    :type nin: int
-    :param nout: Dimension of projection
-    :type nout: int
-    :param dropout: Proportion of weights to drop out [default: 0.5]
-    :type dropout: float
-    :param activation: Activation for linear projection model
-    :type activation: torch.nn.Module
-    """
-
-    def __init__(self, nin, nout, dropout=0.5, activation=nn.ReLU()):
-        super(FullyConnectedEmbed, self).__init__()
-        self.nin = nin
-        self.nout = nout
-        self.dropout_p = dropout
-
-        self.transform = nn.Linear(nin, nout)
-        self.drop = nn.Dropout(p=self.dropout_p)
-        self.activation = activation
-
-    def forward(self, x):
-        """
-        :param x: Input language model embedding :math:`(b \\times N \\times d_0)`
-        :type x: torch.Tensor
-        :return: Low dimensional projection of embedding
-        :rtype: torch.Tensor
-        """
-        t = self.transform(x)
-        t = self.activation(t)
-        t = self.drop(t)
-        return t
-
-
-class SkipLSTM(nn.Module):
-    """
-    Language model from `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.
-
-    Loaded with pre-trained weights in embedding function.
-
-    :param nin: Input dimension of amino acid one-hot [default: 21]
-    :type nin: int
-    :param nout: Output dimension of final layer [default: 100]
-    :type nout: int
-    :param hidden_dim: Size of hidden dimension [default: 1024]
-    :type hidden_dim: int
-    :param num_layers: Number of stacked LSTM models [default: 3]
-    :type num_layers: int
-    :param dropout: Proportion of weights to drop out [default: 0]
-    :type dropout: float
-    :param bidirectional: Whether to use biLSTM vs. LSTM
-    :type bidirectional: bool
-    """
-
-    def __init__(
-        self,
-        nin=21,
-        nout=100,
-        hidden_dim=1024,
-        num_layers=3,
-        dropout=0,
-        bidirectional=True,
-    ):
-        super(SkipLSTM, self).__init__()
-
-        self.nin = nin
-        self.nout = nout
-
-        self.dropout = nn.Dropout(p=dropout)
-
-        self.layers = nn.ModuleList()
-        dim = nin
-        for i in range(num_layers):
-            f = nn.LSTM(
-                dim,
-                hidden_dim,
-                1,
-                batch_first=True,
-                bidirectional=bidirectional,
-            )
-            self.layers.append(f)
-            if bidirectional:
-                dim = 2 * hidden_dim
-            else:
-                dim = hidden_dim
-
-        n = hidden_dim * num_layers + nin
-        if bidirectional:
-            n = 2 * hidden_dim * num_layers + nin
-
-        self.proj = nn.Linear(n, nout)
-
-    def to_one_hot(self, x):
-        """
-        Transform numeric encoded amino acid vector to one-hot encoded vector
-
-        :param x: Input numeric amino acid encoding :math:`(N)`
-        :type x: torch.Tensor
-        :return: One-hot encoding vector :math:`(N \\times n_{in})`
-        :rtype: torch.Tensor
-        """
-        packed = type(x) is PackedSequence
-        if packed:
-            one_hot = x.data.new(x.data.size(0), self.nin).float().zero_()
-            one_hot.scatter_(1, x.data.unsqueeze(1), 1)
-            one_hot = PackedSequence(one_hot, x.batch_sizes)
-        else:
-            one_hot = x.new(x.size(0), x.size(1), self.nin).float().zero_()
-            one_hot.scatter_(2, x.unsqueeze(2), 1)
-        return one_hot
-
-    def transform(self, x):
-        """
-        :param x: Input numeric amino acid encoding :math:`(N)`
-        :type x: torch.Tensor
-        :return: Concatenation of all hidden layers :math:`(N \\times (n_{in} + 2 \\times \\text{num_layers} \\times \\text{hidden_dim}))`
-        :rtype: torch.Tensor
-        """
-        one_hot = self.to_one_hot(x)
-        hs = [one_hot]  # []
-        h_ = one_hot
-        for f in self.layers:
-            h, _ = f(h_)
-            # h = self.dropout(h)
-            hs.append(h)
-            h_ = h
-        if type(x) is PackedSequence:
-            h = torch.cat([z.data for z in hs], 1)
-            h = PackedSequence(h, x.batch_sizes)
-        else:
-            h = torch.cat([z for z in hs], 2)
-        return h
-
-    def forward(self, x):
-        """
-        :meta private:
-        """
-        one_hot = self.to_one_hot(x)
-        hs = [one_hot]
-        h_ = one_hot
-
-        for f in self.layers:
-            h, _ = f(h_)
-            # h = self.dropout(h)
-            hs.append(h)
-            h_ = h
-
-        if type(x) is PackedSequence:
-            h = torch.cat([z.data for z in hs], 1)
-            z = self.proj(h)
-            z = PackedSequence(z, x.batch_sizes)
-        else:
-            h = torch.cat([z for z in hs], 2)
-            z = self.proj(h.view(-1, h.size(2)))
-            z = z.view(x.size(0), x.size(1), -1)
-
-        return z
diff --git a/dscript/legacy/fasta_legacy.py b/dscript/legacy/fasta_legacy.py
deleted file mode 100644
index 579eeb7..0000000
--- a/dscript/legacy/fasta_legacy.py
+++ /dev/null
@@ -1,78 +0,0 @@
-def parse(f, comment="#"):
-    """
-    Parse a file in ``.fasta`` format.
-
-    :param f: Input file object
-    :type f: _io.TextIOWrapper
-    :param comment: Character used for comments
-    :type comment: str
-
-    :return: names, sequence
-    :rtype: list[str], list[str]
-    """
-    starter = ">"
-    empty = ""
-    if "b" in f.mode:
-        comment = b"#"
-        starter = b">"
-        empty = b""
-    names = []
-    sequences = []
-    name = None
-    sequence = []
-    for line in f:
-        if line.startswith(comment):
-            continue
-        line = line.strip()
-        if line.startswith(starter):
-            if name is not None:
-                names.append(name)
-                sequences.append(empty.join(sequence))
-            name = line[1:]
-            sequence = []
-        else:
-            sequence.append(line.upper())
-    if name is not None:
-        names.append(name)
-        sequences.append(empty.join(sequence))
-
-    return names, sequences
-
-
-def parse_directory(directory, extension=".seq"):
-    """
-    Parse all files in a directory ending with ``extension``.
-
-    :param directory: Input directory
-    :type directory: str
-    :param extension: Extension of all files to read in
-    :type extension: str
-
-    :return: names, sequence
-    :rtype: list[str], list[str]
-    """
-    names = []
-    sequences = []
-
-    for seqPath in os.listdir(directory):
-        if seqPath.endswith(extension):
-            n, s = parse(open(f"{directory}/{seqPath}", "rb"))
-            names.append(n[0].decode("utf-8").strip())
-            sequences.append(s[0].decode("utf-8").strip())
-    return names, sequences
-
-
-def write(nam, seq, f):
-    """
-    Write a file in ``.fasta`` format.
-
-    :param nam: List of names
-    :type nam: list[str]
-    :param seq: List of sequences
-    :type seq: list[str]
-    :param f: Output file object
-    :type f: _io.TextIOWrapper
-    """
-    for n, s in zip(nam, seq):
-        f.write(">{}\n".format(n))
-        f.write("{}\n".format(s))
diff --git a/dscript/legacy/interaction_legacy.py b/dscript/legacy/interaction_legacy.py
deleted file mode 100644
index 07e572c..0000000
--- a/dscript/legacy/interaction_legacy.py
+++ /dev/null
@@ -1,221 +0,0 @@
-"""
-Interaction model classes.
-"""
-
-import numpy as np
-
-import torch
-import torch.nn as nn
-import torch.functional as F
-
-
-class LogisticActivation(nn.Module):
-    """
-    Implementation of Generalized Sigmoid
-    Applies the element-wise function:
-
-    :math:`\\sigma(x) = \\frac{1}{1 + \\exp(-k(x-x_0))}`
-
-    :param x0: The value of the sigmoid midpoint
-    :type x0: float
-    :param k: The slope of the sigmoid - trainable -  :math:`k \\geq 0`
-    :type k: float
-    :param train: Whether :math:`k` is a trainable parameter
-    :type train: bool
-    """
-
-    def __init__(self, x0=0, k=1, train=False):
-        super(LogisticActivation, self).__init__()
-        self.x0 = x0
-        self.k = nn.Parameter(torch.FloatTensor([float(k)]))
-        self.k.requiresGrad = train
-
-    def forward(self, x):
-        """
-        Applies the function to the input elementwise
-
-        :param x: :math:`(N \\times *)` where :math:`*` means, any number of additional dimensions
-        :type x: torch.Tensor
-        :return: :math:`(N \\times *)`, same shape as the input
-        :rtype: torch.Tensor
-        """
-        out = torch.clamp(
-            1 / (1 + torch.exp(-self.k * (x - self.x0))), min=0, max=1
-        ).squeeze()
-        return out
-
-    def clip(self):
-        """
-        Restricts sigmoid slope :math:`k` to be greater than or equal to 0, if :math:`k` is trained.
-
-        :meta private:
-        """
-        self.k.data.clamp_(min=0)
-
-
-class ModelInteraction(nn.Module):
-    """
-    Main D-SCRIPT model. Contains an embedding and contact model and offers access to those models. Computes pooling operations on contact map to generate interaction probability.
-
-    :param embedding: Embedding model
-    :type embedding: dscript.models.embedding.FullyConnectedEmbed
-    :param contact: Contact model
-    :type contact: dscript.models.contact.ContactCNN
-    :param use_cuda: Whether the model should be run on GPU
-    :type use_cuda: bool
-    :param pool_size: width of max-pool [default 9]
-    :type pool_size: bool
-    :param theta_init: initialization value of :math:`\\theta` for weight matrix [default: 1]
-    :type theta_init: float
-    :param lambda_init: initialization value of :math:`\\lambda` for weight matrix [default: 0]
-    :type lambda_init: float
-    :param gamma_init: initialization value of :math:`\\gamma` for global pooling [default: 0]
-    :type gamma_init: float
-    :param use_W: whether to use the weighting matrix [default: True]
-    :type use_W: bool
-    """
-
-    def __init__(
-        self,
-        embedding,
-        contact,
-        pool_size=9,
-        theta_init=1,
-        lambda_init=0,
-        gamma_init=0,
-        use_W=True,
-    ):
-        super(ModelInteraction, self).__init__()
-        self.use_W = use_W
-        self.activation = LogisticActivation(x0=0.5, k=20)
-
-        self.embedding = embedding
-        self.contact = contact
-
-        if self.use_W:
-            self.theta = nn.Parameter(torch.FloatTensor([theta_init]))
-            self.lambda_ = nn.Parameter(torch.FloatTensor([lambda_init]))
-
-        self.maxPool = nn.MaxPool2d(pool_size, padding=pool_size // 2)
-        self.gamma = nn.Parameter(torch.FloatTensor([gamma_init]))
-
-        self.clip()
-
-    def clip(self):
-        """
-        Clamp model values
-
-        :meta private:
-        """
-        self.contact.clip()
-
-        if self.use_W:
-            self.theta.data.clamp_(min=0, max=1)
-            self.lambda_.data.clamp_(min=0)
-
-        self.gamma.data.clamp_(min=0)
-
-    def embed(self, z):
-        """
-        Project down input language model embeddings into low dimension using projection module
-
-        :param z: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z: torch.Tensor
-        :return: D-SCRIPT projection :math:`(b \\times N \\times d)`
-        :rtype: torch.Tensor
-        """
-        if self.embedding is None:
-            return z
-        else:
-            return self.embedding(z)
-
-    def cpred(self, z0, z1):
-        """
-        Project down input language model embeddings into low dimension using projection module
-
-        :param z0: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z0: torch.Tensor
-        :param z1: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z1: torch.Tensor
-        :return: Predicted contact map :math:`(b \\times N \\times M)`
-        :rtype: torch.Tensor
-        """
-        e0 = self.embed(z0)
-        e1 = self.embed(z1)
-        B = self.contact.broadcast(e0, e1)
-        C = self.contact.predict(B)
-        return C
-
-    def map_predict(self, z0, z1):
-        """
-        Project down input language model embeddings into low dimension using projection module
-
-        :param z0: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z0: torch.Tensor
-        :param z1: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z1: torch.Tensor
-        :return: Predicted contact map, predicted probability of interaction :math:`(b \\times N \\times d_0), (1)`
-        :rtype: torch.Tensor, torch.Tensor
-        """
-
-        C = self.cpred(z0, z1)
-
-        if self.use_W:
-            # Create contact weighting matrix
-            N, M = C.shape[2:]
-
-            x1 = torch.from_numpy(
-                -1
-                * ((np.arange(N) + 1 - ((N + 1) / 2)) / (-1 * ((N + 1) / 2)))
-                ** 2
-            ).float()
-            if self.gamma.device.type == "cuda":
-                x1 = x1.cuda()
-            x1 = torch.exp(self.lambda_ * x1)
-
-            x2 = torch.from_numpy(
-                -1
-                * ((np.arange(M) + 1 - ((M + 1) / 2)) / (-1 * ((M + 1) / 2)))
-                ** 2
-            ).float()
-            if self.gamma.device.type == "cuda":
-                x2 = x2.cuda()
-            x2 = torch.exp(self.lambda_ * x2)
-
-            W = x1.unsqueeze(1) * x2
-            W = (1 - self.theta) * W + self.theta
-
-            yhat = C * W
-
-        else:
-            yhat = C
-
-        yhat = self.maxPool(yhat)
-
-        # Mean of contact predictions where p_ij > mu + gamma*sigma
-        mu = torch.mean(yhat)
-        sigma = torch.var(yhat)
-        Q = torch.relu(yhat - mu - (self.gamma * sigma))
-        phat = torch.sum(Q) / (torch.sum(torch.sign(Q)) + 1)
-        phat = self.activation(phat)
-        return C, phat
-
-    def predict(self, z0, z1):
-        """
-        Project down input language model embeddings into low dimension using projection module
-
-        :param z0: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z0: torch.Tensor
-        :param z1: Language model embedding :math:`(b \\times N \\times d_0)`
-        :type z1: torch.Tensor
-        :return: Predicted probability of interaction
-        :rtype: torch.Tensor, torch.Tensor
-        """
-        _, phat = self.map_predict(z0, z1)
-        return phat
-
-    def forward(self, z0, z1):
-        """
-        :meta private:
-        """
-        return self.predict(z0, z1)
diff --git a/dscript/legacy/train_legacy.py b/dscript/legacy/train_legacy.py
deleted file mode 100644
index aa1c68b..0000000
--- a/dscript/legacy/train_legacy.py
+++ /dev/null
@@ -1,616 +0,0 @@
-"""
-Train a new model.
-"""
-
-import sys
-import argparse
-import h5py
-import datetime
-import subprocess as sp
-import numpy as np
-import pandas as pd
-import gzip as gz
-from tqdm import tqdm
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.autograd import Variable
-from torch.utils.data import IterableDataset, DataLoader
-from sklearn.metrics import average_precision_score as average_precision
-
-import dscript
-from dscript.utils import PairedDataset, collate_paired_sequences
-from dscript.models.embedding import (
-    IdentityEmbed,
-    FullyConnectedEmbed,
-)
-from dscript.models.contact import ContactCNN
-from dscript.models.interaction import ModelInteraction
-
-
-def add_args(parser):
-    """
-    Create parser for command line utility.
-
-    :meta private:
-    """
-
-    data_grp = parser.add_argument_group("Data")
-    proj_grp = parser.add_argument_group("Projection Module")
-    contact_grp = parser.add_argument_group("Contact Module")
-    inter_grp = parser.add_argument_group("Interaction Module")
-    train_grp = parser.add_argument_group("Training")
-    misc_grp = parser.add_argument_group("Output and Device")
-
-    # Data
-    data_grp.add_argument("--train", help="Training data", required=True)
-    data_grp.add_argument("--val", help="Validation data", required=True)
-    data_grp.add_argument(
-        "--embedding", help="h5 file with embedded sequences", required=True
-    )
-    data_grp.add_argument(
-        "--no-augment",
-        action="store_false",
-        dest="augment",
-        help="Set flag to not augment data by adding (B A) for all pairs (A B)",
-    )
-
-    # Embedding model
-    proj_grp.add_argument(
-        "--projection-dim",
-        type=int,
-        default=100,
-        help="Dimension of embedding projection layer (default: 100)",
-    )
-    proj_grp.add_argument(
-        "--dropout-p",
-        type=float,
-        default=0.5,
-        help="Parameter p for embedding dropout layer (default: 0.5)",
-    )
-
-    # Contact model
-    contact_grp.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=50,
-        help="Number of hidden units for comparison layer in contact prediction (default: 50)",
-    )
-    contact_grp.add_argument(
-        "--kernel-width",
-        type=int,
-        default=7,
-        help="Width of convolutional filter for contact prediction (default: 7)",
-    )
-
-    # Interaction Model
-    inter_grp.add_argument(
-        "--no-w",
-        action="store_false",
-        dest="use_w",
-        help="Don't use weight matrix in interaction prediction model",
-    )
-    inter_grp.add_argument(
-        "--pool-width",
-        type=int,
-        default=9,
-        help="Size of max-pool in interaction model (default: 9)",
-    )
-
-    # Training
-    train_grp.add_argument(
-        "--negative-ratio",
-        type=int,
-        default=10,
-        help="Number of negative training samples for each positive training sample (default: 10)",
-    )
-    train_grp.add_argument(
-        "--epoch-scale",
-        type=int,
-        default=1,
-        help="Report heldout performance every this many epochs (default: 1)",
-    )
-    train_grp.add_argument(
-        "--num-epochs",
-        type=int,
-        default=10,
-        help="Number of epochs (default: 10)",
-    )
-    train_grp.add_argument(
-        "--batch-size",
-        type=int,
-        default=25,
-        help="Minibatch size (default: 25)",
-    )
-    train_grp.add_argument(
-        "--weight-decay",
-        type=float,
-        default=0,
-        help="L2 regularization (default: 0)",
-    )
-    train_grp.add_argument(
-        "--lr",
-        type=float,
-        default=0.001,
-        help="Learning rate (default: 0.001)",
-    )
-    train_grp.add_argument(
-        "--lambda",
-        dest="lambda_",
-        type=float,
-        default=0.35,
-        help="Weight on the similarity objective (default: 0.35)",
-    )
-
-    # Output
-    misc_grp.add_argument(
-        "-o", "--outfile", help="Output file path (default: stdout)"
-    )
-    misc_grp.add_argument(
-        "--save-prefix", help="Path prefix for saving models"
-    )
-    misc_grp.add_argument(
-        "-d", "--device", type=int, default=-1, help="Compute device to use"
-    )
-    misc_grp.add_argument(
-        "--checkpoint", help="Checkpoint model to start training from"
-    )
-
-    return parser
-
-
-def predict_interaction(model, n0, n1, tensors, use_cuda):
-    """
-    Predict whether a list of protein pairs will interact.
-
-    :param model: Model to be trained
-    :type model: dscript.models.interaction.ModelInteraction
-    :param n0: First protein names
-    :type n0: list[str]
-    :param n1: Second protein names
-    :type n1: list[str]
-    :param tensors: Dictionary of protein names to embeddings
-    :type tensors: dict[str, torch.Tensor]
-    :param use_cuda: Whether to use GPU
-    :type use_cuda: bool
-    """
-
-    b = len(n0)
-
-    p_hat = []
-    for i in range(b):
-        z_a = tensors[n0[i]]
-        z_b = tensors[n1[i]]
-        if use_cuda:
-            z_a = z_a.cuda()
-            z_b = z_b.cuda()
-
-        p_hat.append(model.predict(z_a, z_b))
-    p_hat = torch.stack(p_hat, 0)
-    return p_hat
-
-
-def predict_cmap_interaction(model, n0, n1, tensors, use_cuda):
-    """
-    Predict whether a list of protein pairs will interact, as well as their contact map.
-
-    :param model: Model to be trained
-    :type model: dscript.models.interaction.ModelInteraction
-    :param n0: First protein names
-    :type n0: list[str]
-    :param n1: Second protein names
-    :type n1: list[str]
-    :param tensors: Dictionary of protein names to embeddings
-    :type tensors: dict[str, torch.Tensor]
-    :param use_cuda: Whether to use GPU
-    :type use_cuda: bool
-    """
-
-    b = len(n0)
-
-    p_hat = []
-    c_map_mag = []
-    for i in range(b):
-        z_a = tensors[n0[i]]
-        z_b = tensors[n1[i]]
-        if use_cuda:
-            z_a = z_a.cuda()
-            z_b = z_b.cuda()
-
-        cm, ph = model.map_predict(z_a, z_b)
-        p_hat.append(ph)
-        c_map_mag.append(torch.mean(cm))
-    p_hat = torch.stack(p_hat, 0)
-    c_map_mag = torch.stack(c_map_mag, 0)
-    return c_map_mag, p_hat
-
-
-def interaction_grad(model, n0, n1, y, tensors, use_cuda, weight=0.35):
-    """
-    Compute gradient and backpropagate loss for a batch.
-
-    :param model: Model to be trained
-    :type model: dscript.models.interaction.ModelInteraction
-    :param n0: First protein names
-    :type n0: list[str]
-    :param n1: Second protein names
-    :type n1: list[str]
-    :param y: Interaction labels
-    :type y: torch.Tensor
-    :param tensors: Dictionary of protein names to embeddings
-    :type tensors: dict[str, torch.Tensor]
-    :param use_cuda: Whether to use GPU
-    :type use_cuda: bool
-    :param weight: Weight on the contact map magnitude objective. BCE loss is :math:`1 - \\text{weight}`.
-    :type weight: float
-
-    :return: (Loss, number correct, mean square error, batch size)
-    :rtype: (torch.Tensor, int, torch.Tensor, int)
-    """
-
-    c_map_mag, p_hat = predict_cmap_interaction(
-        model, n0, n1, tensors, use_cuda
-    )
-    if use_cuda:
-        y = y.cuda()
-    y = Variable(y)
-
-    bce_loss = F.binary_cross_entropy(p_hat.float(), y.float())
-    cmap_loss = torch.mean(c_map_mag)
-    loss = (weight * bce_loss) + ((1 - weight) * cmap_loss)
-    b = len(p_hat)
-
-    # backprop loss
-    loss.backward()
-
-    if use_cuda:
-        y = y.cpu()
-        p_hat = p_hat.cpu()
-
-    with torch.no_grad():
-        guess_cutoff = 0.5
-        p_hat = p_hat.float()
-        p_guess = (guess_cutoff * torch.ones(b) < p_hat).float()
-        y = y.float()
-        correct = torch.sum(p_guess == y).item()
-        mse = torch.mean((y.float() - p_hat) ** 2).item()
-
-    return loss, correct, mse, b
-
-
-def interaction_eval(model, test_iterator, tensors, use_cuda):
-    """
-    Evaluate test data set performance.
-
-    :param model: Model to be trained
-    :type model: dscript.models.interaction.ModelInteraction
-    :param test_iterator: Test data iterator
-    :type test_iterator: torch.utils.data.DataLoader
-    :param tensors: Dictionary of protein names to embeddings
-    :type tensors: dict[str, torch.Tensor]
-    :param use_cuda: Whether to use GPU
-    :type use_cuda: bool
-
-    :return: (Loss, number correct, mean square error, precision, recall, F1 Score, AUPR)
-    :rtype: (torch.Tensor, int, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)
-    """
-    p_hat = []
-    true_y = []
-
-    for n0, n1, y in test_iterator:
-        p_hat.append(predict_interaction(model, n0, n1, tensors, use_cuda))
-        true_y.append(y)
-
-    y = torch.cat(true_y, 0)
-    p_hat = torch.cat(p_hat, 0)
-
-    if use_cuda:
-        y.cuda()
-        p_hat = torch.Tensor([x.cuda() for x in p_hat])
-        p_hat.cuda()
-
-    loss = F.binary_cross_entropy(p_hat.float(), y.float()).item()
-    b = len(y)
-
-    with torch.no_grad():
-        guess_cutoff = torch.Tensor([0.5]).float()
-        p_hat = p_hat.float()
-        y = y.float()
-        p_guess = (guess_cutoff * torch.ones(b) < p_hat).float()
-        correct = torch.sum(p_guess == y).item()
-        mse = torch.mean((y.float() - p_hat) ** 2).item()
-
-        tp = torch.sum(y * p_hat).item()
-        pr = tp / torch.sum(p_hat).item()
-        re = tp / torch.sum(y).item()
-        f1 = 2 * pr * re / (pr + re)
-
-    y = y.cpu().numpy()
-    p_hat = p_hat.data.cpu().numpy()
-
-    aupr = average_precision(y, p_hat)
-
-    return loss, correct, mse, pr, re, f1, aupr
-
-
-def main(args):
-    """
-    Run training from arguments.
-
-    :meta private:
-    """
-
-    output = args.outfile
-    if output is None:
-        output = sys.stdout
-    else:
-        output = open(output, "w")
-
-    print(f'# Called as: {" ".join(sys.argv)}', file=output)
-    if output is not sys.stdout:
-        print(f'Called as: {" ".join(sys.argv)}')
-
-    # Set device
-    device = args.device
-    use_cuda = (device >= 0) and torch.cuda.is_available()
-    if use_cuda:
-        torch.cuda.set_device(device)
-        print(
-            f"# Using CUDA device {device} - {torch.cuda.get_device_name(device)}",
-            file=output,
-        )
-    else:
-        print("# Using CPU", file=output)
-        device = "cpu"
-
-    batch_size = args.batch_size
-
-    train_fi = args.train
-    test_fi = args.val
-    augment = args.augment
-    embedding_h5 = args.embedding
-    h5fi = h5py.File(embedding_h5, "r")
-
-    print(f"# Loading training pairs from {train_fi}...", file=output)
-    output.flush()
-
-    train_df = pd.read_csv(train_fi, sep="\t", header=None)
-    if augment:
-        train_n0 = pd.concat((train_df[0], train_df[1]), axis=0).reset_index(
-            drop=True
-        )
-        train_n1 = pd.concat((train_df[1], train_df[0]), axis=0).reset_index(
-            drop=True
-        )
-        train_y = torch.from_numpy(
-            pd.concat((train_df[2], train_df[2])).values
-        )
-    else:
-        train_n0, train_n1 = train_df[0], train_df[1]
-        train_y = torch.from_numpy(train_df[2].values)
-
-    print(f"# Loading testing pairs from {test_fi}...", file=output)
-    output.flush()
-
-    test_df = pd.read_csv(test_fi, sep="\t", header=None)
-    test_n0, test_n1 = test_df[0], test_df[1]
-    test_y = torch.from_numpy(test_df[2].values)
-    output.flush()
-
-    train_pairs = PairedDataset(train_n0, train_n1, train_y)
-    pairs_train_iterator = torch.utils.data.DataLoader(
-        train_pairs,
-        batch_size=batch_size,
-        collate_fn=collate_paired_sequences,
-        shuffle=True,
-    )
-
-    test_pairs = PairedDataset(test_n0, test_n1, test_y)
-    pairs_test_iterator = torch.utils.data.DataLoader(
-        test_pairs,
-        batch_size=batch_size,
-        collate_fn=collate_paired_sequences,
-        shuffle=True,
-    )
-
-    output.flush()
-
-    print(f"# Loading embeddings", file=output)
-    tensors = {}
-    all_proteins = (
-        set(train_n0)
-        .union(set(train_n1))
-        .union(set(test_n0))
-        .union(set(test_n1))
-    )
-    for prot_name in tqdm(all_proteins):
-        tensors[prot_name] = torch.from_numpy(h5fi[prot_name][:, :])
-
-    use_cuda = (args.device > -1) and torch.cuda.is_available()
-
-    if args.checkpoint is None:
-
-        projection_dim = args.projection_dim
-        dropout_p = args.dropout_p
-        embedding = FullyConnectedEmbed(
-            6165, projection_dim, dropout=dropout_p
-        )
-        print("# Initializing embedding model with:", file=output)
-        print(f"\tprojection_dim: {projection_dim}", file=output)
-        print(f"\tdropout_p: {dropout_p}", file=output)
-
-        # Create contact model
-        hidden_dim = args.hidden_dim
-        kernel_width = args.kernel_width
-        print("# Initializing contact model with:", file=output)
-        print(f"\thidden_dim: {hidden_dim}", file=output)
-        print(f"\tkernel_width: {kernel_width}", file=output)
-
-        contact = ContactCNN(projection_dim, hidden_dim, kernel_width)
-
-        # Create the full model
-        use_W = args.use_w
-        pool_width = args.pool_width
-        print("# Initializing interaction model with:", file=output)
-        print(f"\tpool_width: {pool_width}", file=output)
-        print(f"\tuse_w: {use_W}", file=output)
-        model = ModelInteraction(
-            embedding, contact, use_W=use_W, pool_size=pool_width
-        )
-
-        print(model, file=output)
-
-    else:
-        print(
-            "# Loading model from checkpoint {}".format(args.checkpoint),
-            file=output,
-        )
-        model = torch.load(args.checkpoint)
-        model.use_cuda = use_cuda
-
-    if use_cuda:
-        model = model.cuda()
-
-    # Train the model
-    lr = args.lr
-    wd = args.weight_decay
-    num_epochs = args.num_epochs
-    batch_size = args.batch_size
-    report_steps = args.epoch_scale
-    inter_weight = args.lambda_
-    cmap_weight = 1 - inter_weight
-    digits = int(np.floor(np.log10(num_epochs))) + 1
-    save_prefix = args.save_prefix
-    if save_prefix is None:
-        save_prefix = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
-
-    params = [p for p in model.parameters() if p.requires_grad]
-    optim = torch.optim.Adam(params, lr=lr, weight_decay=wd)
-
-    print(f'# Using save prefix "{save_prefix}"', file=output)
-    print(f"# Training with Adam: lr={lr}, weight_decay={wd}", file=output)
-    print(f"\tnum_epochs: {num_epochs}", file=output)
-    print(f"\tepoch_scale: {report_steps}", file=output)
-    print(f"\tbatch_size: {batch_size}", file=output)
-    print(f"\tinteraction weight: {inter_weight}", file=output)
-    print(f"\tcontact map weight: {cmap_weight}", file=output)
-    output.flush()
-
-    batch_report_fmt = (
-        "# [{}/{}] training {:.1%}: Loss={:.6}, Accuracy={:.3%}, MSE={:.6}"
-    )
-    epoch_report_fmt = "# Finished Epoch {}/{}: Loss={:.6}, Accuracy={:.3%}, MSE={:.6}, Precision={:.6}, Recall={:.6}, F1={:.6}, AUPR={:.6}"
-
-    N = len(pairs_train_iterator) * batch_size
-    for epoch in range(num_epochs):
-
-        model.train()
-
-        n = 0
-        loss_accum = 0
-        acc_accum = 0
-        mse_accum = 0
-
-        # Train batches
-        for (z0, z1, y) in tqdm(
-            pairs_train_iterator,
-            desc=f"Epoch {epoch+1}/{num_epochs}",
-            total=len(pairs_train_iterator),
-        ):
-
-            loss, correct, mse, b = interaction_grad(
-                model, z0, z1, y, tensors, use_cuda, weight=inter_weight
-            )
-
-            n += b
-            delta = b * (loss - loss_accum)
-            loss_accum += delta / n
-
-            delta = correct - b * acc_accum
-            acc_accum += delta / n
-
-            delta = b * (mse - mse_accum)
-            mse_accum += delta / n
-
-            report = (n - b) // 100 < n // 100
-
-            optim.step()
-            optim.zero_grad()
-            model.clip()
-
-            if report:
-                tokens = [
-                    epoch + 1,
-                    num_epochs,
-                    n / N,
-                    loss_accum,
-                    acc_accum,
-                    mse_accum,
-                ]
-                if output is not sys.stdout:
-                    print(batch_report_fmt.format(*tokens), file=output)
-                    output.flush()
-
-        if (epoch + 1) % report_steps == 0:
-            model.eval()
-
-            with torch.no_grad():
-
-                (
-                    inter_loss,
-                    inter_correct,
-                    inter_mse,
-                    inter_pr,
-                    inter_re,
-                    inter_f1,
-                    inter_aupr,
-                ) = interaction_eval(
-                    model, pairs_test_iterator, tensors, use_cuda
-                )
-                tokens = [
-                    epoch + 1,
-                    num_epochs,
-                    inter_loss,
-                    inter_correct / (len(pairs_test_iterator) * batch_size),
-                    inter_mse,
-                    inter_pr,
-                    inter_re,
-                    inter_f1,
-                    inter_aupr,
-                ]
-                print(epoch_report_fmt.format(*tokens), file=output)
-                output.flush()
-
-            # Save the model
-            if save_prefix is not None:
-                save_path = (
-                    save_prefix
-                    + "_epoch"
-                    + str(epoch + 1).zfill(digits)
-                    + ".sav"
-                )
-                print(f"# Saving model to {save_path}", file=output)
-                model.cpu()
-                torch.save(model, save_path)
-                if use_cuda:
-                    model.cuda()
-
-        output.flush()
-
-    if save_prefix is not None:
-        save_path = save_prefix + "_final.sav"
-        print(f"# Saving final model to {save_path}", file=output)
-        model.cpu()
-        torch.save(model, save_path)
-        if use_cuda:
-            model.cuda()
-
-    output.close()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__)
-    add_args(parser)
-    main(parser.parse_args())
diff --git a/dscript/legacy/utils_legacy.py b/dscript/legacy/utils_legacy.py
deleted file mode 100644
index 67d23a7..0000000
--- a/dscript/legacy/utils_legacy.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import torch
-import torch.utils.data
-
-import numpy as np
-import pandas as pd
-import subprocess as sp
-import sys
-import gzip as gz
-from datetime import datetime
-from .fasta import parse
-
-
-def log(msg, file=sys.stderr):
-    """
-    Log datetime-stamped message to file
-
-    :param msg: Message to log
-    :param f: Writable file object to log message to
-    """
-    timestr = datetime.utcnow().isoformat(sep="-", timespec="milliseconds")
-    file.write(f"[{timestr}] {msg}\n")
-    file.flush()
-
-
-def plot_PR_curve(y, phat, saveFile=None):
-    """
-    Plot precision-recall curve.
-
-    :param y: Labels
-    :type y: np.ndarray
-    :param phat: Predicted probabilities
-    :type phat: np.ndarray
-    :param saveFile: File for plot of curve to be saved to
-    :type saveFile: str
-    """
-    import matplotlib.pyplot as plt
-    from sklearn.metrics import precision_recall_curve, average_precision_score
-
-    aupr = average_precision_score(y, phat)
-    precision, recall, _ = precision_recall_curve(y, phat)
-
-    plt.step(recall, precision, color="b", alpha=0.2, where="post")
-    plt.fill_between(recall, precision, step="post", alpha=0.2, color="b")
-    plt.xlabel("Recall")
-    plt.ylabel("Precision")
-    plt.ylim([0.0, 1.05])
-    plt.xlim([0.0, 1.0])
-    plt.title("Precision-Recall (AUPR: {:.3})".format(aupr))
-    if saveFile:
-        plt.savefig(saveFile)
-    else:
-        plt.show()
-
-
-def plot_ROC_curve(y, phat, saveFile=None):
-    """
-    Plot receiver operating characteristic curve.
-
-    :param y: Labels
-    :type y: np.ndarray
-    :param phat: Predicted probabilities
-    :type phat: np.ndarray
-    :param saveFile: File for plot of curve to be saved to
-    :type saveFile: str
-    """
-    import matplotlib.pyplot as plt
-    from sklearn.metrics import roc_curve, roc_auc_score
-
-    auroc = roc_auc_score(y, phat)
-
-    fpr, tpr, roc_thresh = roc_curve(y, phat)
-    print("AUROC:", auroc)
-
-    plt.step(fpr, tpr, color="b", alpha=0.2, where="post")
-    plt.fill_between(fpr, tpr, step="post", alpha=0.2, color="b")
-    plt.xlabel("FPR")
-    plt.ylabel("TPR")
-    plt.ylim([0.0, 1.05])
-    plt.xlim([0.0, 1.0])
-    plt.title("Receiver Operating Characteristic (AUROC: {:.3})".format(auroc))
-    if saveFile:
-        plt.savefig(saveFile)
-    else:
-        plt.show()
-
-
-def RBF(D, sigma=None):
-    """
-    Convert distance matrix into similarity matrix using Radial Basis Function (RBF) Kernel.
-
-    :math:`RBF(x,x') = \\exp{\\frac{-(x - x')^{2}}{2\\sigma^{2}}}`
-
-    :param D: Distance matrix
-    :type D: np.ndarray
-    :param sigma: Bandwith of RBF Kernel [default: :math:`\\sqrt{\\text{max}(D)}`]
-    :type sigma: float
-    :return: Similarity matrix
-    :rtype: np.ndarray
-    """
-    sigma = sigma or np.sqrt(np.max(D))
-    return np.exp(-1 * (np.square(D) / (2 * sigma ** 2)))
-
-
-def gpu_mem(device):
-    """
-    Get current memory usage for GPU.
-
-    :param device: GPU device number
-    :type device: int
-    :return: memory used, memory total
-    :rtype: int, int
-    """
-    result = sp.check_output(
-        [
-            "nvidia-smi",
-            "--query-gpu=memory.used,memory.total",
-            "--format=csv,nounits,noheader",
-            "--id={}".format(device),
-        ],
-        encoding="utf-8",
-    )
-    gpu_memory = [int(x) for x in result.strip().split(",")]
-    return gpu_memory[0], gpu_memory[1]
-
-
-class PairedDataset(torch.utils.data.Dataset):
-    """
-    Dataset to be used by the PyTorch data loader for pairs of sequences and their labels.
-
-    :param X0: List of first item in the pair
-    :param X1: List of second item in the pair
-    :param Y: List of labels
-    """
-
-    def __init__(self, X0, X1, Y):
-        self.X0 = X0
-        self.X1 = X1
-        self.Y = Y
-        assert len(X0) == len(X1), (
-            "X0: "
-            + str(len(X0))
-            + " X1: "
-            + str(len(X1))
-            + " Y: "
-            + str(len(Y))
-        )
-        assert len(X0) == len(Y), (
-            "X0: "
-            + str(len(X0))
-            + " X1: "
-            + str(len(X1))
-            + " Y: "
-            + str(len(Y))
-        )
-
-    def __len__(self):
-        return len(self.X0)
-
-    def __getitem__(self, i):
-        return self.X0[i], self.X1[i], self.Y[i]
-
-
-def collate_paired_sequences(args):
-    """
-    Collate function for PyTorch data loader.
-    """
-    x0 = [a[0] for a in args]
-    x1 = [a[1] for a in args]
-    y = [a[2] for a in args]
-    return x0, x1, torch.stack(y, 0)

From 23731a6e911749381463acb18cbc5b4ba4d09c68 Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 11:55:09 -0400
Subject: [PATCH 14/15] Update embedding.py

---
 dscript/models/embedding.py | 41 -------------------------------------
 1 file changed, 41 deletions(-)

diff --git a/dscript/models/embedding.py b/dscript/models/embedding.py
index 88537db..e30510d 100644
--- a/dscript/models/embedding.py
+++ b/dscript/models/embedding.py
@@ -57,47 +57,6 @@ class FullyConnectedEmbed(nn.Module):
         t = self.drop(t)
         return t
 
-
-class LSTMEmbed(nn.Module):
-    def __init__(self, nout, activation="ReLU", sparse=False, p=0.5):
-        super(LSTMEmbed, self).__init__()
-        self.activation = activation
-        self.sparse = sparse
-        self.p = p
-
-        self.embedding = SkipLSTM(21, nout, 1024, 3)
-        self.embedding.load_state_dict(torch.load(EMBEDDING_STATE_DICT))
-
-        for param in self.embedding.parameters():
-            param.requires_grad = False
-        torch.nn.init.normal_(self.embedding.proj.weight)
-        torch.nn.init.uniform_(self.embedding.proj.bias, 0, 0)
-        self.embedding.proj.weight.requires_grad = True
-        self.embedding.proj.bias.requires_grad = True
-
-        self.activationDict = nn.ModuleDict(
-            {
-                "None": IdentityEmbed(),
-                "ReLU": nn.ReLU(),
-                "Sigmoid": nn.Sigmoid(),
-            }
-        )
-        self.dropout = nn.Dropout(p=self.p)
-
-    def forward(self, x):
-
-        t = self.embedding(x)
-        if self.activation:
-            t = self.activationDict[self.activation](t)
-        if self.sparse:
-            t = self.dropout(t)
-
-        return t
-
-    def long_embed(self, x):
-        return self.embedding.transform(x)
-
-
 class SkipLSTM(nn.Module):
     """
     Language model from `Bepler & Berger <https://github.com/tbepler/protein-sequence-embedding-iclr2019>`_.

From 11997309de2664e9ad63c91470b7c6b888f9feb4 Mon Sep 17 00:00:00 2001
From: Samuel Sledzieski <samsledje@gmail.com>
Date: Thu, 30 Jun 2022 12:05:15 -0400
Subject: [PATCH 15/15] Create pypi_publish.yml

---
 .github/workflows/pypi_publish.yml | 39 ++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .github/workflows/pypi_publish.yml

diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml
new file mode 100644
index 0000000..2fbdf5b
--- /dev/null
+++ b/.github/workflows/pypi_publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools wheel  # wheel provides the bdist_wheel command used by the build step
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Build package
+      run: python setup.py sdist bdist_wheel
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}