mirror of
https://github.com/samsledje/D-SCRIPT.git
synced 2026-06-04 15:04:24 +08:00
634 lines
24 KiB
Plaintext
634 lines
24 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 80,
|
|
"id": "aaaf2d91-798a-4f07-95e4-40eb86e202b8",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The autoreload extension is already loaded. To reload it, use:\n",
|
|
" %reload_ext autoreload\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"id": "e5d8f711-2b12-4bee-9a43-bc0dd9308388",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import torch\n",
|
|
"import torch.nn as nn\n",
|
|
"import torch.nn.functional as F\n",
|
|
"from tqdm import tqdm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 82,
|
|
"id": "3cb89c82-23ac-4825-9aeb-fe51cbe91e8e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from conv import ProteinConv\n",
|
|
"from pool import ProteinMaxPool\n",
|
|
"from concat import ProteinConcat\n",
|
|
"class ProteinModel(nn.Module):\n",
|
|
" def __init__(self, input_dim, dropout = 0.2, activation = \"sigmoid\", stride = 2):\n",
|
|
" super(ProteinModel, self).__init__()\n",
|
|
" \n",
|
|
" activations = {\n",
|
|
" 'relu': F.relu,\n",
|
|
" 'sigmoid': F.sigmoid,\n",
|
|
" 'tanh': torch.tanh\n",
|
|
" }\n",
|
|
" \n",
|
|
" self.activation = activations[activation]\n",
|
|
" \n",
|
|
" self.l1 = nn.Linear(input_dim, 100)\n",
|
|
" self.pconv1 = ProteinConv(no_filters=50, no_dims=100, no_channels=1, window_size=10, dropout_p = dropout, stride = stride, activation = activation)\n",
|
|
" self.pool1 = ProteinMaxPool(3)\n",
|
|
" \n",
|
|
" self.l2 = nn.Linear(100, 50)\n",
|
|
" self.pconcat = ProteinConcat(no_dims=50, no_channels=50, window_size=5, op_size=20, stride = stride, dropout_p = 0.2, activation = activation)\n",
|
|
" \n",
|
|
" self.outlayer = nn.Linear(20, 2)\n",
|
|
" self.softlayer = nn.Softmax(dim=1)\n",
|
|
" self.input_dim = input_dim\n",
|
|
" \n",
|
|
" \n",
|
|
" def forward(self, p1, p2):\n",
|
|
" \n",
|
|
" N1, H1, C1, D1 = p1.shape\n",
|
|
" N2, H2, C2, D2 = p2.shape\n",
|
|
" \n",
|
|
" assert (N1, C1, D1) == (N2, C2, D2)\n",
|
|
" assert D1 == self.input_dim\n",
|
|
"\n",
|
|
" p1 = self.l1(p1)\n",
|
|
" p2 = self.l1(p2)\n",
|
|
" p1, p2 = self.pconv1(p1, p2)\n",
|
|
" p1 = self.pool1(self.activation(p1))\n",
|
|
" p2 = self.pool1(self.activation(p2))\n",
|
|
" \n",
|
|
" p1 = self.l2(p1)\n",
|
|
" p2 = self.l2(p2)\n",
|
|
" pout = self.pconcat(p1, p2)\n",
|
|
" pout = self.softlayer(self.outlayer(pout))\n",
|
|
" return pout"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 83,
|
|
"id": "58dafbd2-979d-4ddb-9848-d16c2e2de6a9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(2400, 1096)"
|
|
]
|
|
},
|
|
"execution_count": 83,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import h5py\n",
|
|
"from torch.utils.data import Dataset, DataLoader\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"embfile = \"/afs/csail.mit.edu/u/k/kdevko01/coip-vs-y2h-folder/data/networks/dscript-tt/y2h-coip.h5\"\n",
|
|
"f = h5py.File(embfile)\n",
|
|
"\n",
|
|
"trainfile = \"/afs/csail.mit.edu/u/k/kdevko01/coip-vs-y2h-folder/data/networks/dscript-tt/coip_train.tsv\"\n",
|
|
"dtr = pd.read_csv(trainfile, sep = \"\\t\", header = None)\n",
|
|
"\n",
|
|
"dtrain = pd.concat([dtr[dtr[2] == 1].sample(n=200), dtr[dtr[2] == 0].sample(n=1000)])\n",
|
|
"dval = pd.concat([dtr[dtr[2] == 1].sample(n=50), dtr[dtr[2] == 0].sample(n=500)])\n",
|
|
"dval = dval.drop(set(dval.index).intersection(set(dtrain.index)))\n",
|
|
"\n",
|
|
"dtrain = pd.concat([dtrain, dtrain.loc[:, [1, 0, 2]]]).reset_index(drop = True)\n",
|
|
"dval = pd.concat([dval, dval.loc[:, [1, 0, 2]]]).reset_index(drop = True)\n",
|
|
" \n",
|
|
"len(dtrain), len(dval)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 84,
|
|
"id": "8803be38-395b-4576-8e14-a75509784fdf",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"2200"
|
|
]
|
|
},
|
|
"execution_count": 84,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"testfile = \"/afs/csail.mit.edu/u/k/kdevko01/coip-vs-y2h-folder/data/networks/dscript-tt/coip_test.tsv\"\n",
|
|
"dtest = pd.read_csv(testfile, sep = \"\\t\", header = None)\n",
|
|
"dtest = pd.concat([dtest[dtest[2] == 1].sample(n=100), dtest[dtest[2] == 0].sample(n=1000)])\n",
|
|
"dtest = pd.concat([dtest, dtest.loc[:, [1, 0, 2]]]).reset_index(drop = True)\n",
|
|
"len(dtest)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 85,
|
|
"id": "01e929bb-f65d-4bd4-a74c-83cb9b2bc95b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"class ProtDataset(Dataset):\n",
|
|
" def __init__(self, df, h5data, min_seqlen=75):\n",
|
|
" self.df = df\n",
|
|
" self.h5 = h5data\n",
|
|
" self.min_seqlen = min_seqlen\n",
|
|
" \n",
|
|
" def __len__(self):\n",
|
|
" return len(self.df)\n",
|
|
" \n",
|
|
" def __getitem__(self, id):\n",
|
|
" p, q, w = self.df.iloc[id, :].values\n",
|
|
" p1, p2, w = torch.tensor(np.array(self.h5[p]), dtype = torch.float32).squeeze(0).unsqueeze(1), torch.tensor(np.array(self.h5[q]), dtype = torch.float32).squeeze(0).unsqueeze(1), torch.tensor(w, dtype = torch.long)\n",
|
|
" \n",
|
|
" dim = p1.shape[2]\n",
|
|
" \n",
|
|
" p1seqlen = p1.shape[0]\n",
|
|
" p2seqlen = p2.shape[0]\n",
|
|
" if p1seqlen < self.min_seqlen:\n",
|
|
" p1 = torch.cat([p1, torch.zeros(self.min_seqlen - p1seqlen, 1, dim, dtype = torch.float32)], dim = 0)\n",
|
|
" if p2seqlen < self.min_seqlen:\n",
|
|
" p2 = torch.cat([p2, torch.zeros(self.min_seqlen - p2seqlen, 1, dim, dtype = torch.float32)], dim = 0)\n",
|
|
" return p1, p2, w\n",
|
|
" \n",
|
|
"trdata = ProtDataset(dtrain, f)\n",
|
|
"tedata = ProtDataset(dtest, f)\n",
|
|
"valdata = ProtDataset(dval, f)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 86,
|
|
"id": "b2aa61ac-ba1d-4b07-80b1-60b1c367fd6e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"torch.Size([754, 1, 6165])"
|
|
]
|
|
},
|
|
"execution_count": 86,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"p, q, w = trdata[0]\n",
|
|
"p.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 87,
|
|
"id": "cc0cd75e-cbf3-4121-af31-3ffa978f09ba",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"dev = torch.device(\"cuda:7\")\n",
|
|
"lr = 0.1\n",
|
|
"no_ep = 5"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 88,
|
|
"id": "b261ddb5-a720-42d8-b249-c8b9a9060569",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"trloader = DataLoader(trdata, batch_size = 1, shuffle = True)\n",
|
|
"valloader = DataLoader(valdata, batch_size = 1, shuffle = True)\n",
|
|
"model = ProteinModel(6165)\n",
|
|
"model = model.to(dev)\n",
|
|
"opt = torch.optim.Adam(model.parameters(), lr = lr)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 89,
|
|
"id": "eac600fa-525d-4e50-a2c0-0be94b3f36ea",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 0/2400 [00:00<?, ?it/s]/scratch2/kdevko01/conda/.conda/envs/dscript/lib/python3.7/site-packages/torch/nn/functional.py:1960: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\n",
|
|
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
|
|
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [01:18<00:00, 30.75it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1: Training Loss : 0.48005625932166973: AUPR: 0.09124087591240876\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 0/2400 [00:00<?, ?it/s]/scratch2/kdevko01/conda/.conda/envs/dscript/lib/python3.7/site-packages/torch/nn/functional.py:1960: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\n",
|
|
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
|
|
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [01:18<00:00, 30.68it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 2: Training Loss : 0.47992831965287525: AUPR: 0.09124087591240876\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 0/2400 [00:00<?, ?it/s]/scratch2/kdevko01/conda/.conda/envs/dscript/lib/python3.7/site-packages/torch/nn/functional.py:1960: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\n",
|
|
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
|
|
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [01:18<00:00, 30.42it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 3: Training Loss : 0.47992831965287525: AUPR: 0.09124087591240876\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 0/2400 [00:00<?, ?it/s]/scratch2/kdevko01/conda/.conda/envs/dscript/lib/python3.7/site-packages/torch/nn/functional.py:1960: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\n",
|
|
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
|
|
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [01:17<00:00, 30.89it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 4: Training Loss : 0.47992831965287525: AUPR: 0.09124087591240876\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 0/2400 [00:00<?, ?it/s]/scratch2/kdevko01/conda/.conda/envs/dscript/lib/python3.7/site-packages/torch/nn/functional.py:1960: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\n",
|
|
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
|
|
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2400/2400 [01:20<00:00, 29.71it/s]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 5: Training Loss : 0.4803449863071243: AUPR: 0.09124087591240876\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.metrics import average_precision_score\n",
|
|
"\n",
|
|
"lossfn = torch.nn.CrossEntropyLoss()\n",
|
|
"\n",
|
|
"def compute_aupr(op, target):\n",
|
|
" if isinstance(op, torch.Tensor):\n",
|
|
" op = op.numpy()\n",
|
|
" if isinstance(target, torch.Tensor):\n",
|
|
" target = target.numpy()\n",
|
|
" \n",
|
|
" op = np.argmax(op, axis = 1)\n",
|
|
" return average_precision_score(op, target)\n",
|
|
" \n",
|
|
"\n",
|
|
"for ep in range(no_ep):\n",
|
|
" running_loss = 0\n",
|
|
" for i, data in enumerate(tqdm(trloader)):\n",
|
|
" ps, qs, wt = data\n",
|
|
" ps = ps.to(dev)\n",
|
|
" qs = qs.to(dev)\n",
|
|
" wt = wt.to(dev)\n",
|
|
" opt.zero_grad()\n",
|
|
" out = model(ps, qs)\n",
|
|
" loss = lossfn(out, wt)\n",
|
|
" loss.backward()\n",
|
|
" opt.step()\n",
|
|
" \n",
|
|
" if dev.type == \"cuda\":\n",
|
|
" ps = ps.to(\"cpu\")\n",
|
|
" qs = qs.to(\"cpu\")\n",
|
|
" wt = wt.to(\"cpu\")\n",
|
|
" loss = loss.to(\"cpu\")\n",
|
|
" running_loss += loss.item()\n",
|
|
" with torch.no_grad():\n",
|
|
" val_loss = 0\n",
|
|
" results = []\n",
|
|
" targets = []\n",
|
|
" for j, data in enumerate(valloader):\n",
|
|
" ps, qs, wt = data\n",
|
|
" ps = ps.to(dev)\n",
|
|
" qs = qs.to(dev)\n",
|
|
" wt = wt.to(dev)\n",
|
|
" out = model(ps, qs)\n",
|
|
" loss = lossfn(out, wt)\n",
|
|
" \n",
|
|
" if dev.type == \"cuda\":\n",
|
|
" ps = ps.to(\"cpu\")\n",
|
|
" qs = qs.to(\"cpu\")\n",
|
|
" wt = wt.to(\"cpu\")\n",
|
|
" loss = loss.to(\"cpu\")\n",
|
|
" out = out.to(\"cpu\")\n",
|
|
" val_loss += loss.item()\n",
|
|
" results += out.numpy()[:, 1].tolist()\n",
|
|
" targets += wt.numpy().tolist()\n",
|
|
" auprval = average_precision_score(targets, results)\n",
|
|
" print(f\"Epoch {ep+1}: Training Loss : {running_loss / (i+1)}: AUPR: {auprval}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "058c1509-60ca-4cd5-ad1d-dc850bb49140",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"p1, p2, w = next(iter(trloader))\n",
|
|
"p1.shape,p2.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 73,
|
|
"id": "f4b0bde3-7856-4f44-b946-de2e9c2f4c9c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"torch.Size([2, 6, 3, 4])"
|
|
]
|
|
},
|
|
"execution_count": 73,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import torch\n",
|
|
"x = torch.randn(2, 15, 3, dtype = torch.float32)\n",
|
|
"x.unfold(1, 4, 2).shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 83,
|
|
"id": "6e874169-e8b9-4a21-89bd-61c42f8f2dfe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"\n",
|
|
"no_batch, no_height = 2, 20\n",
|
|
"no_dims, no_channels, window_size, op_size = 10, 4, 5, 20\n",
|
|
"\n",
|
|
"p1 = torch.randn(no_batch, no_height, no_channels, no_dims, dtype = torch.float32)\n",
|
|
"p2 = torch.randn(no_batch, no_height, no_channels, no_dims, dtype = torch.float32)\n",
|
|
"\n",
|
|
"pconcat = ProteinConcat(no_dims, no_channels, window_size, op_size, stride = 1, dropout_p = 0.2, activation = \"tanh\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 91,
|
|
"id": "1abb2033-35a0-432f-8dca-b6fad6d5cb23",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from conv import ProteinConv\n",
|
|
"no_filters = 3\n",
|
|
"pconv = ProteinConv(no_filters, no_dims, no_channels, window_size, stride = 1, dropout_p = 0.2, activation = \"tanh\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 92,
|
|
"id": "c9766858-42eb-4883-97bf-25ae0faf92a9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"torch.Size([2, 16, 1, 4, 10, 5])\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"o1, o2 = pconv(p1, p2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 88,
|
|
"id": "780e07e0-2cb8-49e2-bfcf-6611bb1194dd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(torch.Size([2, 16, 3, 10]), torch.Size([2, 16, 3, 10]))"
|
|
]
|
|
},
|
|
"execution_count": 88,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"o1.shape, o2.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 94,
|
|
"id": "ee7ba055-801e-4299-be74-5d7e07a71fea",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"torch.Size([2, 5, 4, 10])"
|
|
]
|
|
},
|
|
"execution_count": 94,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from pool import ProteinMaxPool\n",
|
|
"pool = ProteinMaxPool(4)\n",
|
|
"o = pool(p1)\n",
|
|
"o.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 91,
|
|
"id": "19104c5a-a4f4-43ad-a452-860094d0ed8b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from Bio import SeqIO"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 92,
|
|
"id": "1cce5dd2-544b-4671-9a7f-186bd21a0319",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#!cd ..; ln -s /afs/csail.mit.edu/u/r/rsingh/work/corals/data-scratch1/STRING_foldseek_embeddings foldseek_emb"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 110,
|
|
"id": "abad1c03-06e0-4d92-8ffe-a4c735d1da91",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"fasta = SeqIO.parse(\"../foldseek_emb/r1_foldseekrep_seq.fa\", \"fasta\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 108,
|
|
"id": "a65b05cc-47bb-4cff-8166-6e013aa331f6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"ID: 9606.ENSP00000386340\n",
|
|
"Name: 9606.ENSP00000386340\n",
|
|
"Description: 9606.ENSP00000386340 AF2:AF-P63255-F1-model_v2.pdb.gz 9606.ENSP00000386340\n",
|
|
"Number of features: 0\n",
|
|
"Seq('DFAQLQPRDDDDPVQWQQAPNGTHGQCQQAAPPPRHRDDRHQWDDDNSHTHGPP...DDD')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab = {}\n",
|
|
"i = 0\n",
|
|
"for rec in fasta:\n",
|
|
" print(rec)\n",
|
|
" break"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 105,
|
|
"id": "1b43ac2a-f3c3-47ae-97d1-638a7595add4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import json\n",
|
|
"# json.dump(vocab, open(\"../foldseek_vocab.json\", \"w\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 109,
|
|
"id": "2fc3d947-f7f7-44a4-b753-03a48209aa89",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "NotImplementedError",
|
|
"evalue": "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest.",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
|
|
"\u001b[0;32m/tmp/ipykernel_3218513/2270489430.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m\"9606.ENSP00000386340\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfasta\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
"\u001b[0;32m/scratch2/kdevko01/conda/.conda/envs/dscript/lib/python3.7/site-packages/Bio/SeqRecord.py\u001b[0m in \u001b[0;36m__eq__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 792\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__eq__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 793\u001b[0m \u001b[0;34m\"\"\"Define the equal-to operand (not implemented).\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 794\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_NO_SEQRECORD_COMPARISON\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 795\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 796\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__ne__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
"\u001b[0;31mNotImplementedError\u001b[0m: SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest."
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"\"9606.ENSP00000386340\" in fasta"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dac1e71f-4967-49da-9f7a-4e9d16853d38",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "dscript",
|
|
"language": "python",
|
|
"name": "dscript"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|