mirror of
https://github.com/RomeroLab/alphafast.git
synced 2026-06-04 13:30:25 +08:00
661 lines
22 KiB
Bash
Executable File
661 lines
22 KiB
Bash
Executable File
#!/bin/bash
|
|
# Copyright 2026 Romero Lab, Duke University
|
|
#
|
|
# Licensed under CC-BY-NC-SA 4.0. This file is part of AlphaFast,
|
|
# a derivative work of AlphaFold 3 by DeepMind Technologies Limited.
|
|
# https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
#
|
|
# Download AlphaFold 3 databases for AlphaFast.
|
|
#
|
|
# Default mode downloads protein and RNA databases from HuggingFace.
|
|
# Use --protein-only or --rna-only to narrow the scope.
|
|
# Use --from-source to download raw databases from Google Cloud Storage and
|
|
# build MMseqs databases locally.
|
|
#
|
|
# Usage:
|
|
# ./scripts/setup_databases.sh <target_dir> [OPTIONS]
|
|
#
|
|
# Arguments:
|
|
# target_dir: Directory where databases will be stored
|
|
#
|
|
# Options:
|
|
# --protein-only Download/build only protein databases and mmCIF structures
|
|
# --rna-only Download/build only RNA databases
|
|
# --include-nhmmer Download/build RNA FASTA fallback files
|
|
# --from-prebuilt Download pre-built databases from HuggingFace (default)
# --from-source Download raw databases from Google Cloud Storage and build locally
|
|
# --keep-fasta Keep raw FASTA files after conversion (--from-source only)
|
|
# --no-keep-fasta Remove raw FASTA files after conversion (--from-source only)
|
|
#
|
|
# Environment:
|
|
# HF_ENDPOINT HuggingFace-compatible endpoint for pre-built downloads,
|
|
# e.g. https://hf-mirror.com.
|
|
#
|
|
# Requirements (default / HuggingFace mode):
|
|
# - hf CLI (HuggingFace): curl -LsSf https://hf.co/cli/install.sh | bash -s
|
|
# - zstd, tar in PATH (for mmCIF extraction)
|
|
# - ~800 GB free disk space
|
|
#
|
|
# Requirements (--from-source mode):
|
|
# - wget, zstd, tar in PATH
|
|
# - mmseqs (GPU version) in PATH
|
|
# - ~800 GB free disk space (250 GB download + 540 GB MMseqs2 padded)
|
|
#
|
|
# Requirements (--from-prebuilt mode):
|
|
# - hf CLI in PATH (curl -LsSf https://hf.co/cli/install.sh | bash)
|
|
# - zstd, tar in PATH
|
|
# - ~569 GB free disk space
|
|
#
|
|
# Output directory structure:
|
|
# <target_dir>/
|
|
# mmcif_files/ # PDB structures for template retrieval
|
|
# mmseqs/
|
|
# uniref90_padded* # MMseqs2 GPU-ready databases (protein)
|
|
# mgnify_padded*
|
|
# small_bfd_padded*
|
|
# uniprot_padded*
|
|
# pdb_seqres_padded*
|
|
# mmseqs_rna/
|
|
# rfam* # MMseqs2 nucleotide databases (default RNA search)
|
|
# rnacentral*
|
|
# nt_rna*
|
|
# rnacentral_active_seq_id_90_cov_80_linclust.fasta # RNA FASTA (optional nhmmer fallback)
|
|
# rfam_14_9_clust_seq_id_90_cov_80_rep_seq.fasta # RNA FASTA (optional nhmmer fallback)
|
|
# nt_rna_2023_02_23_clust_seq_id_90_cov_80_rep_seq.fasta # RNA FASTA (optional nhmmer fallback)
|
|
|
|
# Strict mode: abort on any unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (pipefail).
set -euo pipefail
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parse arguments
|
|
# ---------------------------------------------------------------------------
|
|
#######################################
# Print the command-line help text and terminate the script.
# The text goes to stdout when the requested exit status is 0 (explicit
# -h/--help) and to stderr otherwise, so usage errors never end up in
# captured or piped output.
# Arguments: $1 - exit status to terminate with (default: 1)
# Outputs:   help text on stdout (status 0) or stderr (any other status)
# Returns:   does not return; exits with the requested status
#######################################
usage() {
  local exit_code="${1:-1}"
  local out_fd=1
  if [ "$exit_code" != "0" ]; then
    out_fd=2
  fi

  {
    echo "Usage: $0 <target_dir> [OPTIONS]"
    echo ""
    echo "Downloads AlphaFold 3 databases for AlphaFast."
    echo ""
    echo "Default mode downloads protein and RNA databases from HuggingFace."
    echo "Use --protein-only or --rna-only to narrow the scope."
    echo "Use --from-source to download raw databases from Google Cloud Storage"
    echo "and build MMseqs databases locally."
    echo ""
    echo "Arguments:"
    echo " target_dir Directory where databases will be stored"
    echo ""
    echo "Options:"
    echo " -h, --help Show this help message and exit."
    echo " --protein-only Download/build only protein databases and mmCIF structures."
    echo " --rna-only Download/build only RNA databases."
    echo " --include-nhmmer Download/build RNA FASTA fallback files."
    # --from-prebuilt is accepted by the argument parser, so document it.
    echo " --from-prebuilt Download pre-built databases from HuggingFace (default)."
    echo " --from-source Build from FASTA files (Google Cloud Storage)."
    echo " Requires wget, zstd, tar, mmseqs in PATH."
    echo " --keep-fasta Keep raw FASTA files after conversion (default, --from-source only)"
    echo " --no-keep-fasta Remove raw FASTA files after conversion (--from-source only)"
    echo ""
    echo "Environment:"
    echo " HF_ENDPOINT HuggingFace-compatible endpoint for pre-built downloads"
    echo " (for example: https://hf-mirror.com)."
  } >&"$out_fd"

  exit "$exit_code"
}
|
|
|
|
# A target directory is mandatory: with no arguments, print usage and fail.
if [ "$#" -eq 0 ]; then
  usage
fi

# The first positional argument must be <target_dir>. Reject an empty string
# or an option in that position up front; otherwise the option would be taken
# as the directory name and only fail later with a confusing mkdir error.
case "$1" in
  -h | --help)
    usage 0
    ;;
  "" | -*)
    echo "ERROR: the first argument must be <target_dir>, got: '$1'" >&2
    usage
    ;;
esac

TARGET_DIR="$1"
shift

# Defaults; the option loop below overrides them.
KEEP_FASTA=true
FROM_PREBUILT=true # HuggingFace is the default
INSTALL_MODE="all" # "all", "protein-only", or "rna-only"
INCLUDE_NHMMER=false
DOWNLOAD_RNA_MMSEQS=true
DOWNLOAD_RNA_FASTA=false

while [ "$#" -gt 0 ]; do
  case "$1" in
    -h | --help)
      usage 0
      ;;
    --keep-fasta)
      KEEP_FASTA=true
      shift
      ;;
    --no-keep-fasta)
      KEEP_FASTA=false
      shift
      ;;
    --from-prebuilt)
      FROM_PREBUILT=true
      shift
      ;;
    --from-source)
      FROM_PREBUILT=false
      shift
      ;;
    --protein-only)
      INSTALL_MODE="protein-only"
      shift
      ;;
    --rna-only)
      INSTALL_MODE="rna-only"
      shift
      ;;
    --include-nhmmer)
      INCLUDE_NHMMER=true
      shift
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage
      ;;
  esac
done

# Checked after the loop so the result does not depend on option order.
if [ "$FROM_PREBUILT" = true ] && [ "$KEEP_FASTA" = false ]; then
  echo "ERROR: --no-keep-fasta can only be used with --from-source." >&2
  usage
fi

# Translate the install mode into the per-dataset switches used below.
case "$INSTALL_MODE" in
  all)
    DOWNLOAD_PROTEIN=true
    DOWNLOAD_RNA_MMSEQS=true
    DOWNLOAD_RNA_FASTA="$INCLUDE_NHMMER"
    ;;
  protein-only)
    DOWNLOAD_PROTEIN=true
    DOWNLOAD_RNA_MMSEQS=false
    DOWNLOAD_RNA_FASTA=false
    ;;
  rna-only)
    DOWNLOAD_PROTEIN=false
    DOWNLOAD_RNA_MMSEQS=true
    DOWNLOAD_RNA_FASTA="$INCLUDE_NHMMER"
    ;;
esac
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Check prerequisites
|
|
# ---------------------------------------------------------------------------
|
|
# Verify that every external tool needed by the selected mode is in PATH.
# All problems are reported (on stderr) before exiting, so the user can
# install everything in one go.
MISSING=0
if [ "$FROM_PREBUILT" = true ]; then
  required_cmds=(hf tar zstd)
else
  required_cmds=(wget tar zstd mmseqs)
fi

for cmd in "${required_cmds[@]}"; do
  if ! command -v "$cmd" >/dev/null 2>&1; then
    if [ "$cmd" = "hf" ]; then
      echo "ERROR: $cmd is not installed. Install with: curl -LsSf https://hf.co/cli/install.sh | bash -s" >&2
    else
      echo "ERROR: $cmd is not installed or not in PATH." >&2
    fi
    MISSING=1
  fi
done

if [ "$MISSING" -ne 0 ]; then
  {
    echo ""
    echo "Install missing dependencies before running this script."
    if [ "$FROM_PREBUILT" = true ]; then
      echo "For HuggingFace CLI:"
      echo " curl -LsSf https://hf.co/cli/install.sh | bash -s"
    else
      echo "For mmseqs with GPU support:"
      echo " wget https://mmseqs.com/latest/mmseqs-linux-gpu.tar.gz"
      echo " tar xzf mmseqs-linux-gpu.tar.gz"
      echo " sudo cp mmseqs/bin/mmseqs /usr/local/bin/"
    fi
  } >&2
  exit 1
fi

# mmseqs is only required (and therefore only queried) in build mode.
if [ "$FROM_PREBUILT" != true ]; then
  echo "Using MMseqs2 version: $(mmseqs version)"
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration
|
|
# ---------------------------------------------------------------------------
|
|
# Remote locations: raw AlphaFold 3 databases (build mode) and the
# HuggingFace dataset holding the pre-built MMseqs2 databases.
readonly SOURCE="https://storage.googleapis.com/alphafold-databases/v3.0"
readonly HF_REPO="RomeroLab-Duke/af3-mmseqs-db"

# Output locations below the user-supplied target directory.
MMSEQS_DIR="${TARGET_DIR}/mmseqs"
RNA_MMSEQS_DIR="${TARGET_DIR}/mmseqs_rna"

# RNA FASTA files stored directly in the target directory.
RNA_FASTAS=(
  "rnacentral_active_seq_id_90_cov_80_linclust.fasta"
  "nt_rna_2023_02_23_clust_seq_id_90_cov_80_rep_seq.fasta"
  "rfam_14_9_clust_seq_id_90_cov_80_rep_seq.fasta"
)

# Create only the directories the selected install mode will populate.
mkdir -p -- "$TARGET_DIR"
if [ "$DOWNLOAD_PROTEIN" = true ]; then
  mkdir -p -- "$MMSEQS_DIR"
fi
if [ "$DOWNLOAD_RNA_MMSEQS" = true ]; then
  mkdir -p -- "$RNA_MMSEQS_DIR"
fi
|
|
|
|
#######################################
# Join split downloads back into single files.
# For every "<name>.part00" in the directory, all "<name>.part*" files are
# concatenated in glob (lexicographic) order into "<name>" and the parts are
# then removed. The result is assembled under a temporary name and renamed
# into place, so "<name>" never exists in a half-written state.
# Arguments: $1 - directory to scan (not recursive)
# Outputs:   one progress line per reassembled file on stdout
# Returns:   0 on success, 1 if concatenation or the final rename fails
#######################################
reassemble_part_files() {
  local target_dir="$1"
  local first_part base tmp_output
  local -a parts
  local status=0

  # Remember the caller's nullglob setting so it can be restored afterwards.
  local had_nullglob=false
  if shopt -q nullglob; then
    had_nullglob=true
  fi
  shopt -s nullglob

  for first_part in "${target_dir}"/*.part00; do
    [ -f "$first_part" ] || continue
    base="${first_part%.part00}"
    parts=("${base}.part"*)
    echo "Reassembling $(basename "$base")..."

    # The temporary suffix must not match the ".part*" glob above.
    tmp_output="${base}.reassembling"
    if ! cat -- "${parts[@]}" >"$tmp_output"; then
      rm -f -- "$tmp_output"
      echo "ERROR: failed to reassemble $(basename "$base")" >&2
      status=1
      break
    fi
    if ! mv -f -- "$tmp_output" "$base"; then
      status=1
      break
    fi
    rm -f -- "${parts[@]}"
  done

  if [ "$had_nullglob" = false ]; then
    shopt -u nullglob
  fi
  return "$status"
}
|
|
|
|
#######################################
# Decompress every "*.zst" file in a directory and remove the archive.
# If the decompressed file already exists, the archive is just deleted.
# Decompression writes to a temporary name that is renamed into place only
# after zstd succeeds, and the archive is removed only after that rename.
# An existing output is therefore always complete, which is what makes the
# "already decompressed" shortcut safe after an interrupted run.
# Arguments: $1 - directory to scan (not recursive)
# Outputs:   one progress line per file on stdout; errors on stderr
# Returns:   0 on success, 1 if zstd or the final rename fails
#######################################
decompress_zst_files() {
  local target_dir="$1"
  local compressed output tmp_output
  local status=0

  # Remember the caller's nullglob setting so it can be restored afterwards.
  local had_nullglob=false
  if shopt -q nullglob; then
    had_nullglob=true
  fi
  shopt -s nullglob

  for compressed in "${target_dir}"/*.zst; do
    [ -f "$compressed" ] || continue
    output="${compressed%.zst}"

    if [ -f "$output" ]; then
      echo "SKIP: $(basename "$output") already decompressed"
      rm -f -- "$compressed"
      continue
    fi

    echo "Decompressing $(basename "$compressed")..."
    tmp_output="${output}.decompressing"
    # --force overwrites a stale temporary file left by an earlier crash.
    if ! zstd --decompress --force -o "$tmp_output" "$compressed"; then
      rm -f -- "$tmp_output"
      echo "ERROR: failed to decompress $(basename "$compressed")" >&2
      status=1
      break
    fi
    if ! mv -f -- "$tmp_output" "$output"; then
      status=1
      break
    fi
    rm -f -- "$compressed"
  done

  if [ "$had_nullglob" = false ]; then
    shopt -u nullglob
  fi
  return "$status"
}
|
|
|
|
# Print the effective configuration before any long-running work starts.
if [ "$FROM_PREBUILT" = true ]; then
  mode_label='pre-built (HuggingFace)'
else
  mode_label='download raw + build locally'
fi

echo "=========================================="
echo "AlphaFast Database Setup"
echo "=========================================="
echo "Target directory: $TARGET_DIR"
echo "MMseqs2 directory: $MMSEQS_DIR"
echo "Mode: $mode_label"
echo "Install mode: $INSTALL_MODE"
echo "RNA MMseqs2: $DOWNLOAD_RNA_MMSEQS"
echo "RNA FASTA: $DOWNLOAD_RNA_FASTA"
echo "Keep FASTA files: $KEEP_FASTA"
echo "Start time: $(date)"
echo "=========================================="
echo ""
|
|
|
|
# ===========================================================================
|
|
# Pre-built mode: download from HuggingFace
|
|
# ===========================================================================
|
|
if $FROM_PREBUILT; then
|
|
echo "=== Downloading pre-built databases from HuggingFace ==="
echo "Repository: $HF_REPO"
echo ""

# mmCIF structures belong to the protein data set, so they are skipped in
# --rna-only mode.
if [ "$DOWNLOAD_PROTEIN" = true ]; then
  MMCIF_DIR="${TARGET_DIR}/mmcif_files"
  # A non-empty mmcif_files directory is treated as an existing install.
  if [ -d "$MMCIF_DIR" ] && [ -n "$(ls -A "$MMCIF_DIR" 2>/dev/null)" ]; then
    echo "SKIP: mmcif_files already exists"
  else
    echo "Downloading mmCIF structures..."
    hf download "$HF_REPO" --repo-type dataset \
      --include "mmcif_files.tar.zst.*" --local-dir "$TARGET_DIR"

    # The archive is published as split parts; fail clearly if none arrived
    # instead of letting cat choke on an unexpanded glob pattern.
    mmcif_parts=("${TARGET_DIR}/mmcif_files.tar.zst.part"*)
    if [ ! -f "${mmcif_parts[0]}" ]; then
      echo "ERROR: no mmcif_files.tar.zst.part* files found in $TARGET_DIR after download" >&2
      exit 1
    fi

    # Stream the concatenated parts straight into tar; parts sort
    # lexicographically, which is the order they must be joined in.
    cat -- "${mmcif_parts[@]}" |
      tar --use-compress-program=zstd -xf - --directory="$TARGET_DIR"
    rm -f -- "${mmcif_parts[@]}"
    echo "Done: mmCIF structures"
  fi
  echo ""
else
  echo "SKIP: Protein databases (--rna-only mode)"
  echo ""
fi
|
|
|
|
# Download protein MMseqs2 padded databases
|
|
if [ "$DOWNLOAD_PROTEIN" = true ]; then
  # Skip only when every padded protein database is present. Checking a
  # single database would make an interrupted install look complete and
  # leave the remaining databases missing on every later run.
  protein_dbs_ready=true
  for protein_db in uniref90 mgnify small_bfd uniprot pdb_seqres; do
    if [ ! -f "${MMSEQS_DIR}/${protein_db}_padded.dbtype" ]; then
      protein_dbs_ready=false
      break
    fi
  done

  if [ "$protein_dbs_ready" = true ]; then
    echo "SKIP: Protein MMseqs2 databases already exist"
  else
    echo "Downloading protein MMseqs2 padded databases..."
    hf download "$HF_REPO" --repo-type dataset \
      --include "mmseqs/*.zst" --include "mmseqs/*.zst.*" \
      --local-dir "$TARGET_DIR"
    # Reassemble any split .partNN files and decompress .zst payloads.
    reassemble_part_files "$MMSEQS_DIR"
    decompress_zst_files "$MMSEQS_DIR"
    echo "Done: Protein MMseqs2 databases"
  fi
  echo ""
fi
|
|
|
|
if [ "$DOWNLOAD_RNA_MMSEQS" = true ]; then
  RNA_MMSEQS_DIR="${TARGET_DIR}/mmseqs_rna"

  # Skip only when all three RNA databases are present, so an interrupted
  # install is completed on the next run instead of being mistaken for a
  # finished one.
  rna_dbs_ready=true
  for rna_db in rfam rnacentral nt_rna; do
    if [ ! -f "${RNA_MMSEQS_DIR}/${rna_db}.dbtype" ]; then
      rna_dbs_ready=false
      break
    fi
  done

  if [ "$rna_dbs_ready" = true ]; then
    echo "SKIP: RNA MMseqs2 databases already exist"
  else
    # Download everything including pre-built indices (default)
    echo "Downloading RNA MMseqs2 nucleotide databases (with pre-built indices)..."
    hf download "$HF_REPO" --repo-type dataset \
      --include "mmseqs_rna/*.zst" --include "mmseqs_rna/*.zst.*" \
      --local-dir "$TARGET_DIR"
    reassemble_part_files "$RNA_MMSEQS_DIR"
    decompress_zst_files "$RNA_MMSEQS_DIR"
    echo "Done: RNA MMseqs2 databases"
  fi
  echo ""
else
  echo "SKIP: RNA MMseqs2 databases (--protein-only mode)"
  echo ""
fi
|
|
|
|
if [ "$DOWNLOAD_RNA_FASTA" = true ]; then
  # Download RNA FASTA databases (for nhmmer fallback)
  echo "Downloading RNA FASTA databases (for nhmmer fallback)..."
  hf download "$HF_REPO" --repo-type dataset \
    --include "*.fasta" --include "*.fasta.*" --local-dir "$TARGET_DIR"

  # Reassemble any split files. Each file is built under a temporary name
  # and renamed into place, so an interrupted run cannot leave a truncated
  # FASTA under its final name.
  for part_prefix in "${TARGET_DIR}"/*.fasta.part00; do
    # Without nullglob an unmatched pattern stays literal; -f filters it out.
    if [ -f "$part_prefix" ]; then
      base="${part_prefix%.part00}"
      echo "Reassembling $(basename "$base")..."
      # The temporary suffix must not match the ".part*" glob.
      cat -- "${base}.part"* >"${base}.reassembling"
      mv -f -- "${base}.reassembling" "$base"
      rm -f -- "${base}.part"*
    fi
  done
  echo "Done: RNA FASTA databases"
  echo ""
elif [ "$INSTALL_MODE" = "rna-only" ] || [ "$INSTALL_MODE" = "all" ]; then
  echo "SKIP: RNA FASTA databases (use --include-nhmmer to download them)"
  echo ""
else
  echo "SKIP: RNA databases (--protein-only mode)"
  echo ""
fi

echo "=== Pre-built download complete ==="
echo ""
|
|
|
|
# ===========================================================================
|
|
# Build mode: download FASTA and convert
|
|
# ===========================================================================
|
|
else
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 1: Download databases
|
|
# ---------------------------------------------------------------------------
|
|
echo "=== Step 1: Download databases ==="
echo ""

# PDB mmCIF structures (for template retrieval). They are part of the
# protein data set, so --rna-only skips them here exactly as the pre-built
# download path does.
MMCIF_DIR="${TARGET_DIR}/mmcif_files"
if [ "$DOWNLOAD_PROTEIN" != true ]; then
  echo "SKIP: mmCIF structures (--rna-only mode)"
elif [ -d "$MMCIF_DIR" ] && [ -n "$(ls -A "$MMCIF_DIR" 2>/dev/null)" ]; then
  echo "SKIP: mmcif_files already exists at $MMCIF_DIR"
else
  echo "Downloading PDB mmCIF structures..."
  # Stream the archive straight into tar; nothing is staged on disk.
  wget --progress=bar:force:noscroll -O - \
    "${SOURCE}/pdb_2022_09_28_mmcif_files.tar.zst" |
    tar --no-same-owner --no-same-permissions \
      --use-compress-program=zstd -xf - --directory="$TARGET_DIR"
  echo "Done: mmCIF structures"
fi
echo ""
|
|
|
|
# Protein FASTA databases (for MSA search)
|
|
# FASTA file name -> human-readable label. Declared unconditionally because
# the raw-FASTA clean-up step iterates over it as well.
declare -A PROTEIN_FASTAS=(
  ["uniref90_2022_05.fa"]="UniRef90"
  ["mgy_clusters_2022_05.fa"]="MGnify"
  ["bfd-first_non_consensus_sequences.fasta"]="Small BFD"
  ["uniprot_all_2021_04.fa"]="UniProt"
  ["pdb_seqres_2022_09_28.fasta"]="PDB SeqRes"
)

if [ "$DOWNLOAD_PROTEIN" = true ]; then
  for fasta_file in "${!PROTEIN_FASTAS[@]}"; do
    db_label="${PROTEIN_FASTAS[$fasta_file]}"
    target_path="${TARGET_DIR}/${fasta_file}"

    if [ -f "$target_path" ]; then
      echo "SKIP: $db_label already exists at $target_path"
    else
      echo "Downloading $db_label ($fasta_file)..."
      # Download into a temporary name and rename only after the whole
      # pipeline succeeded. Otherwise an aborted transfer would leave a
      # truncated file that the existence check above accepts next time.
      partial_path="${target_path}.partial"
      wget --progress=bar:force:noscroll -O - \
        "${SOURCE}/${fasta_file}.zst" |
        zstd --decompress >"$partial_path"
      mv -f -- "$partial_path" "$target_path"
      echo "Done: $db_label"
    fi
  done
else
  echo "SKIP: Protein FASTA downloads (--rna-only mode)"
fi
echo ""
|
|
|
|
# RNA databases (for RNA MSA search via nhmmer).
|
|
# RNA FASTA files are the input of the RNA MMseqs2 build and the optional
# nhmmer fallback. The file list is the RNA_FASTAS array from the
# Configuration section; it is not repeated here.
if [ "$DOWNLOAD_RNA_MMSEQS" = true ] || [ "$DOWNLOAD_RNA_FASTA" = true ]; then
  for fasta_file in "${RNA_FASTAS[@]}"; do
    target_path="${TARGET_DIR}/${fasta_file}"
    if [ -f "$target_path" ]; then
      echo "SKIP: RNA database already exists at $target_path"
    else
      echo "Downloading RNA database ($fasta_file)..."
      # Download into a temporary name and rename only after the whole
      # pipeline succeeded, so an aborted transfer never leaves a truncated
      # file that the existence check above would accept on the next run.
      partial_path="${target_path}.partial"
      wget --progress=bar:force:noscroll -O - \
        "${SOURCE}/${fasta_file}.zst" |
        zstd --decompress >"$partial_path"
      mv -f -- "$partial_path" "$target_path"
      echo "Done: $fasta_file"
    fi
  done
else
  # Both RNA switches are off only in --protein-only mode: the other two
  # install modes always enable the RNA MMseqs2 databases.
  echo "SKIP: RNA FASTA downloads (--protein-only mode)"
fi
echo ""
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 1b: Build MMseqs2 databases from RNA FASTA (for nucleotide search)
|
|
# ---------------------------------------------------------------------------
|
|
if [ "$DOWNLOAD_RNA_MMSEQS" = true ]; then
  echo "=== Step 1b: Build MMseqs2 RNA databases (optional) ==="
  echo ""

  RNA_MMSEQS_DIR="${TARGET_DIR}/mmseqs_rna"
  mkdir -p -- "$RNA_MMSEQS_DIR"

  # MMseqs2 database name -> source FASTA file name.
  declare -A RNA_MMSEQS_DATABASES=(
    ["rfam"]="rfam_14_9_clust_seq_id_90_cov_80_rep_seq.fasta"
    ["rnacentral"]="rnacentral_active_seq_id_90_cov_80_linclust.fasta"
    ["nt_rna"]="nt_rna_2023_02_23_clust_seq_id_90_cov_80_rep_seq.fasta"
  )

  for db_name in "${!RNA_MMSEQS_DATABASES[@]}"; do
    source_fasta="${TARGET_DIR}/${RNA_MMSEQS_DATABASES[$db_name]}"
    target_db="${RNA_MMSEQS_DIR}/${db_name}"

    # Done only when both the database and its index exist. Testing the
    # database alone would skip index creation forever after a run that was
    # interrupted between createdb and createindex.
    if [ -f "${target_db}.dbtype" ] && [ -f "${target_db}.idx" ]; then
      echo "SKIP: MMseqs2 RNA database ${db_name} already exists"
      continue
    fi

    if [ ! -f "${target_db}.dbtype" ]; then
      if [ ! -f "$source_fasta" ]; then
        echo "SKIP: Source FASTA not found: $source_fasta"
        continue
      fi
      echo "Creating MMseqs2 nucleotide database: $db_name"
      time mmseqs createdb "$source_fasta" "$target_db"
    fi

    # Build k-mer index for fast search.
    if [ ! -f "${target_db}.idx" ]; then
      echo "Creating search index for $db_name..."

      # Size the input in whole GiB from the FASTA; if the FASTA is gone
      # (database built by an earlier run) fall back to the database file.
      size_probe="$source_fasta"
      if [ ! -f "$size_probe" ]; then
        size_probe="$target_db"
      fi
      # du -B1G is GNU-specific; treat a failure as "small" rather than
      # aborting the whole script on the size probe.
      source_size_gb="$(du -B1G "$size_probe" 2>/dev/null | cut -f1)" || source_size_gb=0

      idx_tmp="$(mktemp -d)"
      index_cmd=(mmseqs createindex "$target_db" "$idx_tmp" --search-type 3)
      if [ "${source_size_gb:-0}" -gt 10 ]; then
        index_cmd+=(--split 4)
        echo " Using --split 4 for large database (${source_size_gb}G)"
      fi

      # Capture the status so the scratch directory is removed on failure too.
      index_status=0
      time "${index_cmd[@]}" || index_status=$?
      rm -rf -- "$idx_tmp"
      if [ "$index_status" -ne 0 ]; then
        echo "ERROR: mmseqs createindex failed for $db_name" >&2
        exit "$index_status"
      fi
    fi
    echo "Done: $db_name"
  done
  echo ""
else
  echo "SKIP: RNA MMseqs2 database build (--protein-only mode)"
  echo ""
  RNA_MMSEQS_DIR="${TARGET_DIR}/mmseqs_rna"
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 2: Convert protein FASTA to MMseqs2 padded format
|
|
# ---------------------------------------------------------------------------
|
|
if [ "$DOWNLOAD_PROTEIN" = true ]; then
  echo "=== Step 2: Convert to MMseqs2 GPU format ==="
  echo ""

  # Database mapping: mmseqs_name -> source_fasta_filename
  declare -A MMSEQS_DATABASES=(
    ["uniref90"]="uniref90_2022_05.fa"
    ["mgnify"]="mgy_clusters_2022_05.fa"
    ["small_bfd"]="bfd-first_non_consensus_sequences.fasta"
    ["uniprot"]="uniprot_all_2021_04.fa"
    ["pdb_seqres"]="pdb_seqres_2022_09_28.fasta"
  )

  total_dbs=${#MMSEQS_DATABASES[@]}
  current_db=0

  for db_name in "${!MMSEQS_DATABASES[@]}"; do
    current_db=$((current_db + 1))
    source_fasta="${TARGET_DIR}/${MMSEQS_DATABASES[$db_name]}"
    target_base="${MMSEQS_DIR}/${db_name}"
    target_padded="${target_base}_padded"

    echo "[$current_db/$total_dbs] Converting: $db_name"

    # Check if padded database already exists
    if [ -f "${target_padded}.dbtype" ]; then
      echo " SKIP: Padded database already exists"
      echo ""
      continue
    fi

    if [ ! -f "$source_fasta" ]; then
      echo " WARNING: Source FASTA not found: $source_fasta" >&2
      echo " Skipping." >&2
      echo ""
      continue
    fi

    # Step 1: createdb (reuses an intermediate database left by an earlier run)
    if [ -f "${target_base}.dbtype" ]; then
      echo " Found intermediate database, skipping createdb..."
    else
      echo " Creating MMseqs2 database..."
      time mmseqs createdb "$source_fasta" "$target_base"
    fi

    # Step 2: makepaddedseqdb
    echo " Creating padded database for GPU..."
    time mmseqs makepaddedseqdb "$target_base" "$target_padded"

    # Verify the result before touching the intermediate files, and stop
    # with a failure status instead of carrying on to "Setup Complete!"
    # with a database missing. The intermediate database is kept so a
    # retry can skip createdb.
    if [ ! -f "${target_padded}.dbtype" ]; then
      echo " ERROR: Failed to create padded database" >&2
      exit 1
    fi

    # Clean up intermediate (non-padded) database files
    echo " Cleaning up intermediate database files..."
    rm -f -- "${target_base}" "${target_base}".dbtype "${target_base}".index \
      "${target_base}".lookup "${target_base}".source \
      "${target_base}_h" "${target_base}_h".dbtype "${target_base}_h".index

    echo " SUCCESS: Created ${target_padded}"
    echo ""
  done
else
  echo "=== Step 2: Convert to MMseqs2 GPU format ==="
  echo "SKIP: Protein MMseqs2 conversion (--rna-only mode)"
  echo ""
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 3: Optionally clean up raw FASTA files
|
|
# ---------------------------------------------------------------------------
|
|
# Runs only with --no-keep-fasta: delete the raw FASTA inputs once the
# MMseqs2 databases have been built from them.
if [ "$KEEP_FASTA" = false ]; then
  echo "=== Step 3: Removing raw FASTA files ==="

  for fasta_file in "${!PROTEIN_FASTAS[@]}"; do
    target_path="${TARGET_DIR}/${fasta_file}"
    if [ -f "$target_path" ]; then
      echo "Removing: $target_path"
      rm -f -- "$target_path"
    fi
  done

  # RNA FASTA files are kept when they were explicitly requested as the
  # nhmmer fallback (--include-nhmmer).
  if [ "$DOWNLOAD_RNA_FASTA" = false ]; then
    for fasta_file in "${RNA_FASTAS[@]}"; do
      target_path="${TARGET_DIR}/${fasta_file}"
      if [ -f "$target_path" ]; then
        echo "Removing: $target_path"
        rm -f -- "$target_path"
      fi
    done
  fi
  echo ""
fi
|
|
|
|
fi # end of: if $FROM_PREBUILT; then ... else (build mode)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Summary
|
|
# ---------------------------------------------------------------------------
|
|
# Final report: where everything was written and which databases are
# actually present on disk.
echo "=========================================="
echo "Setup Complete!"
echo "=========================================="
echo "End time: $(date)"
echo ""
echo "Database directory: $TARGET_DIR"
echo "MMseqs2 directory: $MMSEQS_DIR"
echo "mmCIF directory: ${TARGET_DIR}/mmcif_files"
echo ""

# Report protein databases the same way as the RNA ones, so a conversion
# that was skipped for lack of a source FASTA is visible in the summary.
echo "Protein MMseqs2 databases:"
if [ "$DOWNLOAD_PROTEIN" = true ]; then
  for protein_db in uniref90 mgnify small_bfd uniprot pdb_seqres; do
    if [ -f "${MMSEQS_DIR}/${protein_db}_padded.dbtype" ]; then
      echo " OK: ${protein_db}_padded"
    else
      echo " NOT BUILT: ${protein_db}_padded"
    fi
  done
else
  echo " SKIPPED: --rna-only mode"
fi
echo ""

echo "RNA MMseqs2 databases (default RNA search):"
if [ "$DOWNLOAD_RNA_MMSEQS" = true ]; then
  for rna_db in rfam rnacentral nt_rna; do
    if [ -f "${RNA_MMSEQS_DIR}/${rna_db}.dbtype" ]; then
      echo " OK: ${rna_db}"
    else
      echo " NOT BUILT: ${rna_db}"
    fi
  done
else
  echo " SKIPPED: --protein-only mode"
fi
echo ""

echo "RNA FASTA databases (nhmmer fallback, --use_nhmmer):"
if [ "$DOWNLOAD_RNA_FASTA" = true ]; then
  for rna_f in "${RNA_FASTAS[@]}"; do
    if [ -f "${TARGET_DIR}/${rna_f}" ]; then
      echo " OK: ${rna_f}"
    else
      echo " MISSING: ${rna_f}"
    fi
  done
elif [ "$INSTALL_MODE" = "rna-only" ] || [ "$INSTALL_MODE" = "all" ]; then
  echo " SKIPPED: use --include-nhmmer to download them"
else
  echo " SKIPPED: --protein-only mode"
fi
echo ""

echo "Use these paths with run_alphafast.sh:"
echo " ./scripts/run_alphafast.sh \\"
echo " --db_dir $TARGET_DIR \\"
echo " --weights_dir /path/to/weights \\"
echo " --input_dir /path/to/inputs \\"
echo " --output_dir /path/to/outputs"
echo "=========================================="
|