Various fixes

- Fixed paths
- Uniclust 2020 is the default sequence database
- A few fixes to allow parallel processing of DSSP/MSAs.
This commit is contained in:
Jérôme Tubiana
2021-10-31 10:37:01 +02:00
parent 8cb616a6ee
commit 162d696777
7 changed files with 22 additions and 10 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -34,9 +34,19 @@ Then:
```
pip install -r requirements.txt
```
Next, change the paths to library & structures folders in utilities/paths.py
No other external software is necessary to run ScanNet.
Paths to the library, MSA & structures folders are defined in utilities/paths.py
Docker images for CPU and GPU execution are also available on Docker Hub:
```
docker pull scannet
```
```
docker pull scannet-gpu
```
## Predicting binding sites without using evolutionary information

Binary file not shown.

View File

@@ -11,7 +11,7 @@ from utilities.paths import structures_folder,path_to_dssp,path_to_msms
from preprocessing.protein_chemistry import list_atoms,list_atoms_types,VanDerWaalsRadii,atom_mass,atom_type_to_index,atom_to_index,index_to_type,atom_type_mass
from preprocessing.protein_chemistry import residue_dictionary,hetresidue_field
from preprocessing import PDBio
from datetime import datetime
#%% Functions for parsing PDB files.
def is_residue(residue):
@@ -268,9 +268,9 @@ def apply_DSSP(chain_obj, pdbparser=None, io=None, path_to_dssp=path_to_dssp):
for atom in residue:
atom.disordered_flag = 0
hash = str(datetime.now())[-6:]
name = 'tmp_' + pdb_id + \
'_model_%s_chain_%s' % (model, chain) + '.pdb'
'_model_%s_chain_%s' % (model, chain) + '_'+hash + '.pdb'
io.save(name)

View File

@@ -331,7 +331,7 @@ def load_chains(pdb_id=None,
if (file is None) & (pdb_id is not None):
file = getPDB(pdb_id, biounit=biounit, structures_folder=structures_folder)[0]
else:
pdb_id = 'abcd'
pdb_id = file.split('/')[-1].split('.')[0][-4:]
if file[-4:] == '.cif':
parser = mmcifparser

View File

@@ -47,7 +47,8 @@ elif mode == 'tau':
initial_values_folder = model_folder + 'initial_values/' # Where initial values of the parameters for the gaussian kernels and residue-residue graph edges are stored.
homology_folder = library_folder + 'baselines/homology/' # Where files are stored for homology baseline.
path2hhblits = '/specific/netapp5_2/iscb/wolfson/sequence_database/hh-suite/build/bin/hhblits' # Path to hhblits binary. Not required if using ScanNet_noMSA networks.
path2sequence_database = '/specific/netapp5_2/iscb/wolfson/sequence_database/uniclust30_2018_08/uniclust30_2018_08' # Path to sequence database Not required if using ScanNet_noMSA networks.
# path2sequence_database = '/specific/netapp5_2/iscb/wolfson/sequence_database/uniclust30_2018_08/uniclust30_2018_08' # Path to sequence database Not required if using ScanNet_noMSA networks.
path2sequence_database = '/specific/netapp5_2/iscb/wolfson/sequence_database/uniclust30_2020_06/uniclust30_2020_06' # Path to sequence database. Not required if using ScanNet_noMSA networks.
path_to_dssp = '/specific/a/home/cc/students/cs/jeromet/Drive/Scripts/3D_Proteins/xssp-3.0.9/mkdssp' # Path to dssp binary. Only for reproducing baseline performance.
path_to_msms = '/specific/a/home/cc/students/cs/jeromet/Drive/Scripts/3D_Proteins/msms/msms.x86_64Linux2.2.6.1' # Path to msms binary. Only for reproducing baseline performance.
path_to_multiprot = '/home/iscb/wolfson/jeromet/MultiProt/multiprot.Linux' # Path to multiprot executable. Only relevant for homology baseline
@@ -63,7 +64,8 @@ elif mode == 'tau_webserver':
initial_values_folder = model_folder + 'initial_values/' # Where initial values of the parameters for the gaussian kernels and residue-residue graph edges are stored.
homology_folder = library_folder + 'baselines/homology/' # Where files are stored for homology baseline.
path2hhblits = '/specific/netapp5_2/iscb/wolfson/sequence_database/hh-suite/build/bin/hhblits' # Path to hhblits binary. Not required if using ScanNet_noMSA networks.
path2sequence_database = '/specific/netapp5_2/iscb/wolfson/sequence_database/uniclust30_2018_08/uniclust30_2018_08' # Path to sequence database Not required if using ScanNet_noMSA networks.
# path2sequence_database = '/specific/netapp5_2/iscb/wolfson/sequence_database/uniclust30_2018_08/uniclust30_2018_08' # Path to sequence database Not required if using ScanNet_noMSA networks.
path2sequence_database = '/specific/netapp5_2/iscb/wolfson/sequence_database/uniclust30_2020_06/uniclust30_2020_06' # Path to sequence database. Not required if using ScanNet_noMSA networks.
path_to_dssp = '/specific/a/home/cc/students/cs/jeromet/Drive/Scripts/3D_Proteins/xssp-3.0.9/mkdssp' # Path to dssp binary. Only for reproducing baseline performance.
path_to_msms = '/specific/a/home/cc/students/cs/jeromet/Drive/Scripts/3D_Proteins/msms/msms.x86_64Linux2.2.6.1' # Path to msms binary. Only for reproducing baseline performance.
path_to_multiprot = None # Path to multiprot executable. Only relevant for homology baseline

View File

@@ -1,8 +1,8 @@
# List of paths to folders and binaries. All folder paths should finish with slash (/)
# Paths required for prediction.
library_folder = '/path/to/ScanNet/' # Where the Github Repo is located.
structures_folder = '/path/to/PDB/' # Where pdb/mmCIF structures files are stored.
library_folder = '' # Where the Github Repo is located.
structures_folder = library_folder + 'PDB/' # Where pdb/mmCIF structures files are stored.
predictions_folder = library_folder + 'predictions/' # Output folder.
model_folder = library_folder + 'models/' # Where the networks are stored as pairs of files (.h5,.data).