mirror of
https://github.com/RosettaCommons/foundry.git
synced 2026-06-04 13:24:22 +08:00
Fix/apptainer (#629)
* feat: enable jupyter notebooks; cleanup
* fix: apptainer
* fix: more apptainer shenanigans
This commit is contained in:
@@ -27,6 +27,7 @@ IncludeCmd: yes
|
||||
/etc/hosts
|
||||
pyproject.toml /opt/core_pyproject.toml
|
||||
models/rf3/pyproject.toml /opt/rf3_pyproject.toml
|
||||
lib/atomworks/pyproject.toml /opt/atomworks_pyproject.toml
|
||||
|
||||
%post
|
||||
## GENERAL SETUP
|
||||
@@ -38,8 +39,23 @@ IncludeCmd: yes
|
||||
ln -s /projects /mnt/projects
|
||||
ln -s /net /mnt/net
|
||||
|
||||
# Update system and install essential packages
|
||||
apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
git \
|
||||
libxrender1 \
|
||||
libxrender-dev \
|
||||
libx11-6 \
|
||||
libx11-dev \
|
||||
libxext6 \
|
||||
libxext-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
## PYTHON DEPENDENCY INSTALLATION
|
||||
|
||||
# Upgrade pip
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
# Install uv for fast dependency resolution
|
||||
pip install uv
|
||||
|
||||
@@ -54,12 +70,18 @@ IncludeCmd: yes
|
||||
uv pip compile /opt/pyproject.toml --output-file /opt/rf3_requirements.txt --all-extras
|
||||
rm /opt/pyproject.toml
|
||||
|
||||
# (AtomWorks)
|
||||
mv /opt/atomworks_pyproject.toml /opt/pyproject.toml
|
||||
uv pip compile /opt/pyproject.toml --output-file /opt/atomworks_requirements.txt --all-extras
|
||||
rm /opt/pyproject.toml
|
||||
|
||||
# Merge and dedupe requirements, excluding packages we don't want
|
||||
# (atomworks is mounted from host; torch/numpy/nvidia-* provided by base image)
|
||||
# (pynvml/packaging/pandas/markdown-it-py from NGC container to avoid conflicts)
|
||||
cat /opt/core_requirements.txt /opt/rf3_requirements.txt | \
|
||||
grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py)==" | \
|
||||
awk '!seen[$0]++' > /opt/combined_requirements.txt
|
||||
# (pynvml/packaging/pandas/markdown-it-py/triton from NGC container to avoid conflicts)
|
||||
# Deduplicate by package name (keeping first occurrence) to handle version conflicts
|
||||
cat /opt/core_requirements.txt /opt/rf3_requirements.txt /opt/atomworks_requirements.txt | \
|
||||
grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py|triton)==" | \
|
||||
awk -F'==' '!seen[$1]++' > /opt/combined_requirements.txt
|
||||
|
||||
# Print combined requirements for debugging
|
||||
echo "=== Combined requirements to install ==="
|
||||
|
||||
@@ -1 +1 @@
|
||||
/net/software/containers/versions/modelhub/rf3-dev_2025_10_08.sif
|
||||
/net/software/containers/versions/modelhub/rf3-dev_2025_11_07.sif
|
||||
@@ -30,6 +30,9 @@ IncludeCmd: yes
|
||||
--exclude='outputs' \
|
||||
--exclude='logs' \
|
||||
--exclude='*.sif' \
|
||||
--exclude='distillation' \
|
||||
--exclude='benchmarks' \
|
||||
--exclude='**/slurm_logs' \
|
||||
./ ${APPTAINER_ROOTFS}/opt/modelhub/
|
||||
|
||||
echo "Repository copied successfully."
|
||||
|
||||
Submodule lib/atomworks updated: 11b5d0d762...4d45b107e3
@@ -14,7 +14,7 @@ defaults:
|
||||
|
||||
ckpt_config:
|
||||
_target_: modelhub.utils.weights.CheckpointConfig
|
||||
path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep903-remapped.ckpt
|
||||
path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt
|
||||
reset_optimizer: true
|
||||
|
||||
model:
|
||||
|
||||
@@ -7,6 +7,7 @@ defaults:
|
||||
ckpt_path: ???
|
||||
num_nodes: 1
|
||||
devices_per_node: 1
|
||||
compress_outputs: true
|
||||
|
||||
# Parameters for RF3InferenceEngine.run()
|
||||
inputs: ???
|
||||
@@ -19,3 +20,4 @@ sharding_pattern: null
|
||||
skip_existing: false
|
||||
template_selection: null
|
||||
ground_truth_conformer_selection: null
|
||||
cyclic_chains: []
|
||||
|
||||
@@ -6,7 +6,7 @@ defaults:
|
||||
|
||||
_target_: rf3.inference_engines.rf3.RF3InferenceEngine
|
||||
|
||||
ckpt_path: /projects/ml/modelhub/apptainer/rf3-w-conf-run10-ep903-remapped.ckpt
|
||||
ckpt_path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt
|
||||
|
||||
# Transform arguments
|
||||
n_recycles: 10
|
||||
@@ -21,7 +21,6 @@ early_stopping_plddt_threshold: 0.5
|
||||
seed: null
|
||||
print_config: true
|
||||
raise_if_missing_msa_for_protein_of_length_n: null
|
||||
cyclic_chains: []
|
||||
|
||||
# Metrics
|
||||
metrics_cfg:
|
||||
|
||||
@@ -35,7 +35,8 @@ dependencies = [
|
||||
"cuequivariance_ops_torch_cu12>=0.6.1; sys_platform == 'linux'",
|
||||
"cuequivariance_torch>=0.6.1; sys_platform == 'linux'",
|
||||
# ... dataloading
|
||||
"atomworks==1.0.2",
|
||||
# (Commenting out for development; we should re-add before release)
|
||||
# "atomworks==1.0.2",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
@@ -51,6 +52,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.version]
|
||||
source = "vcs"
|
||||
fallback-version = "0.0.0"
|
||||
|
||||
[tool.hatch.version.raw-options]
|
||||
root = "../.."
|
||||
|
||||
@@ -21,6 +21,7 @@ class DumpValidationStructuresCallback(BaseCallback):
|
||||
dump_predictions: bool = False,
|
||||
one_model_per_file: bool = False,
|
||||
dump_trajectories: bool = False,
|
||||
compress_outputs: bool = True,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
@@ -28,12 +29,14 @@ class DumpValidationStructuresCallback(BaseCallback):
|
||||
one_model_per_file: If True, write each structure within a diffusion batch to its own CIF files. If False,
|
||||
include each structure within a diffusion batch as a separate model within one CIF file.
|
||||
dump_trajectories: Whether to dump denoising trajectories after validation batches.
|
||||
compress_outputs: Whether to gzip output files. Defaults to ``True``.
|
||||
"""
|
||||
super().__init__()
|
||||
self.save_dir = Path(save_dir)
|
||||
self.dump_predictions = dump_predictions
|
||||
self.dump_trajectories = dump_trajectories
|
||||
self.one_model_per_file = one_model_per_file
|
||||
self.compress_outputs = compress_outputs
|
||||
|
||||
def on_validation_batch_end(
|
||||
self,
|
||||
@@ -68,6 +71,9 @@ class DumpValidationStructuresCallback(BaseCallback):
|
||||
|
||||
return path / f"{identifier}{extra}"
|
||||
|
||||
# Determine file type based on compression setting
|
||||
file_type = "cif.gz" if self.compress_outputs else "cif"
|
||||
|
||||
if self.dump_predictions:
|
||||
atom_array_stack = build_stack_from_atom_array_and_batched_coords(
|
||||
network_output["X_L"], example["atom_array"]
|
||||
@@ -76,6 +82,7 @@ class DumpValidationStructuresCallback(BaseCallback):
|
||||
atom_arrays=atom_array_stack,
|
||||
base_path=_build_path_from_example_id("predictions"),
|
||||
one_model_per_file=self.one_model_per_file,
|
||||
file_type=file_type,
|
||||
)
|
||||
|
||||
if self.dump_trajectories:
|
||||
@@ -83,9 +90,11 @@ class DumpValidationStructuresCallback(BaseCallback):
|
||||
trajectory_list=network_output["X_denoised_L_traj"],
|
||||
atom_array=example["atom_array"],
|
||||
base_path=_build_path_from_example_id("trajectories", "_denoised"),
|
||||
file_type=file_type,
|
||||
)
|
||||
dump_trajectories(
|
||||
trajectory_list=network_output["X_noisy_L_traj"],
|
||||
atom_array=example["atom_array"],
|
||||
base_path=_build_path_from_example_id("trajectories", "_noisy"),
|
||||
file_type=file_type,
|
||||
)
|
||||
|
||||
@@ -3,7 +3,7 @@ from pathlib import Path
|
||||
import typer
|
||||
from hydra import compose, initialize_config_dir
|
||||
|
||||
app = typer.Typer()
|
||||
app = typer.Typer(pretty_exceptions_enable=False)
|
||||
|
||||
|
||||
@app.command(
|
||||
|
||||
@@ -45,6 +45,7 @@ def run_inference(cfg: DictConfig) -> None:
|
||||
"ground_truth_conformer_selection": cfg.get(
|
||||
"ground_truth_conformer_selection", None
|
||||
),
|
||||
"cyclic_chains": cfg.get("cyclic_chains", []),
|
||||
}
|
||||
|
||||
# Create init config with only __init__ params
|
||||
|
||||
@@ -11,10 +11,12 @@ from atomworks.ml.preprocessing.msa.finding import (
|
||||
get_msa_dirs_from_env,
|
||||
)
|
||||
from atomworks.ml.samplers import LoadBalancedDistributedSampler
|
||||
from biotite.structure import AtomArray
|
||||
from lightning.fabric import seed_everything
|
||||
from omegaconf import OmegaConf
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from modelhub.metrics.metric import MetricManager
|
||||
from modelhub.utils.ddp import RankedLogger, set_accelerator_based_on_availability
|
||||
from modelhub.utils.logging import print_config_tree
|
||||
from rf3.model.RF3 import ShouldEarlyStopFn
|
||||
@@ -33,7 +35,6 @@ from rf3.utils.predicted_error import (
|
||||
compile_af3_confidence_outputs,
|
||||
get_mean_atomwise_plddt,
|
||||
)
|
||||
from modelhub.metrics.metric import MetricManager
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -94,7 +95,8 @@ class RF3InferenceEngine:
|
||||
metrics_cfg: dict | OmegaConf | MetricManager | None = None,
|
||||
num_nodes: int = 1,
|
||||
devices_per_node: int = 1,
|
||||
cyclic_chains: list[str] = [],
|
||||
# Output control
|
||||
compress_outputs: bool = True,
|
||||
# Debug
|
||||
print_config: bool = False,
|
||||
raise_if_missing_msa_for_protein_of_length_n: int | None = None,
|
||||
@@ -118,6 +120,7 @@ class RF3InferenceEngine:
|
||||
Defaults to ``None``.
|
||||
num_nodes: Number of nodes for distributed inference. Defaults to ``1``.
|
||||
devices_per_node: Number of devices per node. Defaults to ``1``.
|
||||
compress_outputs: Whether to gzip output files. Defaults to ``True``.
|
||||
print_config: Whether to print config trees. Defaults to ``False``.
|
||||
raise_if_missing_msa_for_protein_of_length_n: Debug flag for MSA checking. Defaults to ``None``.
|
||||
"""
|
||||
@@ -187,10 +190,9 @@ class RF3InferenceEngine:
|
||||
"p_give_polymer_ref_conf": 0.0,
|
||||
"p_give_non_polymer_ref_conf": 0.0,
|
||||
"p_dropout_ref_conf": 0.0,
|
||||
"use_element_for_atom_names_of_atomized_tokens": True,
|
||||
}
|
||||
|
||||
self.cyclic_chains = cyclic_chains
|
||||
|
||||
self.print_config = print_config
|
||||
|
||||
# Set random seed (only if seed is not None)
|
||||
@@ -220,6 +222,7 @@ class RF3InferenceEngine:
|
||||
|
||||
self.ckpt_path = ckpt_path
|
||||
self.early_stopping_plddt_threshold = early_stopping_plddt_threshold
|
||||
self.compress_outputs = compress_outputs
|
||||
|
||||
# Setup model
|
||||
ranked_logger.info("Setting up model...")
|
||||
@@ -269,7 +272,14 @@ class RF3InferenceEngine:
|
||||
|
||||
def run(
|
||||
self,
|
||||
inputs: InferenceInput | list[InferenceInput] | PathLike | list[PathLike],
|
||||
inputs: (
|
||||
InferenceInput
|
||||
| list[InferenceInput]
|
||||
| AtomArray
|
||||
| list[AtomArray]
|
||||
| PathLike
|
||||
| list[PathLike]
|
||||
),
|
||||
# Output control
|
||||
out_dir: PathLike | None = None,
|
||||
dump_predictions: bool = True,
|
||||
@@ -281,22 +291,24 @@ class RF3InferenceEngine:
|
||||
# Selection overrides (applied to all input types)
|
||||
template_selection: list[str] | str | None = None,
|
||||
ground_truth_conformer_selection: list[str] | str | None = None,
|
||||
cyclic_chains: list[str] = [],
|
||||
) -> dict[str, dict] | None:
|
||||
"""Run inference on inputs.
|
||||
|
||||
Requires a pre-initialized inference engine.
|
||||
|
||||
Args:
|
||||
inputs: Single/list of InferenceInput objects, or file paths, or directory.
|
||||
inputs: Single/list of InferenceInput objects, AtomArray objects, file paths, or directory.
|
||||
out_dir: Output directory. If None, returns results as an AtomArray and dictionaries of metrics. Defaults to ``None``.
|
||||
dump_predictions: Whether to save predicted structures. Defaults to ``True``.
|
||||
dump_trajectories: Whether to save diffusion trajectories. Defaults to ``False``.
|
||||
one_model_per_file: Save each model in separate file. Defaults to ``False``.
|
||||
annotate_b_factor_with_plddt: Write pLDDT to B-factor column. Defaults to ``False``.
|
||||
sharding_pattern: Sharding pattern for output organization. Defaults to ``None``.
|
||||
skip_existing: Skip inputs with existing outputs. Defaults to ``False``.
|
||||
skip_existing: Skip inputs with existing outputs. Requires ``out_dir`` to be set. If ``True`` when ``out_dir=None``, a warning is logged and skipping is disabled. Defaults to ``False``.
|
||||
template_selection: Template selection override. Defaults to ``None``.
|
||||
ground_truth_conformer_selection: Conformer selection override. Defaults to ``None``.
|
||||
cyclic_chains: List of chain IDs to cyclize. Defaults to ``[]``.
|
||||
|
||||
Returns:
|
||||
If ``out_dir`` is None: Dict mapping example_id to results dict.
|
||||
@@ -307,6 +319,21 @@ class RF3InferenceEngine:
|
||||
if out_dir:
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
ranked_logger.info(f"Outputs will be written to {out_dir.resolve()}.")
|
||||
if not out_dir:
|
||||
ranked_logger.warning(
|
||||
"out_dir is None - results will be returned in memory! If you want to save to disk, please provide an out_dir."
|
||||
)
|
||||
|
||||
# Validate skip_existing configuration
|
||||
if skip_existing and out_dir is None:
|
||||
ranked_logger.warning(
|
||||
"skip_existing=True requires out_dir to be set. "
|
||||
"Disabling skip_existing for in-memory inference mode."
|
||||
)
|
||||
skip_existing = False
|
||||
|
||||
# Determine file type based on compression setting
|
||||
file_type = "cif.gz" if self.compress_outputs else "cif"
|
||||
|
||||
# Convert inputs to InferenceInput objects
|
||||
if isinstance(inputs, InferenceInput):
|
||||
@@ -315,6 +342,26 @@ class RF3InferenceEngine:
|
||||
isinstance(i, InferenceInput) for i in inputs
|
||||
):
|
||||
inference_inputs = inputs
|
||||
elif isinstance(inputs, AtomArray):
|
||||
# Single AtomArray - convert to InferenceInput
|
||||
inference_inputs = [
|
||||
InferenceInput.from_atom_array(
|
||||
inputs,
|
||||
template_selection=template_selection,
|
||||
ground_truth_conformer_selection=ground_truth_conformer_selection,
|
||||
)
|
||||
]
|
||||
elif isinstance(inputs, list) and all(isinstance(i, AtomArray) for i in inputs):
|
||||
# List of AtomArrays - convert each to InferenceInput
|
||||
inference_inputs = [
|
||||
InferenceInput.from_atom_array(
|
||||
arr,
|
||||
example_id=f"inference_{i}",
|
||||
template_selection=template_selection,
|
||||
ground_truth_conformer_selection=ground_truth_conformer_selection,
|
||||
)
|
||||
for i, arr in enumerate(inputs)
|
||||
]
|
||||
elif isinstance(inputs, (str, Path)) or (
|
||||
isinstance(inputs, list) and isinstance(inputs[0], (str, Path))
|
||||
):
|
||||
@@ -329,9 +376,9 @@ class RF3InferenceEngine:
|
||||
raise ValueError(f"Unsupported inputs type: {type(inputs)}")
|
||||
|
||||
# Flag chains for cyclization if specified
|
||||
if self.cyclic_chains:
|
||||
if cyclic_chains:
|
||||
for input_spec in inference_inputs:
|
||||
input_spec.cyclic_chains = self.cyclic_chains
|
||||
input_spec.cyclic_chains = cyclic_chains
|
||||
|
||||
# make InferenceInputDataset
|
||||
inference_dataset = InferenceInputDataset(inference_inputs)
|
||||
@@ -495,6 +542,7 @@ class RF3InferenceEngine:
|
||||
atom_arrays=atom_array_list or atom_array_stack,
|
||||
base_path=example_out_dir / input_spec.example_id,
|
||||
one_model_per_file=one_model_per_file,
|
||||
file_type=file_type,
|
||||
)
|
||||
|
||||
if dump_trajectories:
|
||||
@@ -502,11 +550,13 @@ class RF3InferenceEngine:
|
||||
trajectory_list=network_output["X_denoised_L_traj"],
|
||||
atom_array=pipeline_output["atom_array"],
|
||||
base_path=example_out_dir / "denoised",
|
||||
file_type=file_type,
|
||||
)
|
||||
dump_trajectories(
|
||||
trajectory_list=network_output["X_noisy_L_traj"],
|
||||
atom_array=pipeline_output["atom_array"],
|
||||
base_path=example_out_dir / "noisy",
|
||||
file_type=file_type,
|
||||
)
|
||||
|
||||
ranked_logger.info(
|
||||
|
||||
@@ -143,6 +143,7 @@ def dump_trajectories(
|
||||
atom_array: AtomArray,
|
||||
base_path: Path,
|
||||
align_structures: bool = True,
|
||||
file_type: str = "cif.gz",
|
||||
) -> None:
|
||||
"""Write denoising trajectories to CIF files.
|
||||
|
||||
@@ -153,6 +154,7 @@ def dump_trajectories(
|
||||
base_path (Path): Base path where the output files will be saved.
|
||||
align_structures (bool): Flag to determine if the structures should be aligned on the final prediction.
|
||||
If False, each step may have a different alignment.
|
||||
file_type (str): File type for output (e.g., "cif", "cif.gz", "pdb"). Defaults to ``"cif.gz"``.
|
||||
"""
|
||||
n_steps = len(trajectory_list)
|
||||
|
||||
@@ -192,5 +194,5 @@ def dump_trajectories(
|
||||
|
||||
path = f"{base_path}_model_{i}"
|
||||
to_cif_file(
|
||||
atom_array_stack, path, file_type="cif.gz", include_entity_poly=False
|
||||
atom_array_stack, path, file_type=file_type, include_entity_poly=False
|
||||
)
|
||||
|
||||
@@ -65,6 +65,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.version]
|
||||
source = "vcs"
|
||||
fallback-version = "0.0.0"
|
||||
|
||||
[tool.hatch.build.hooks.vcs]
|
||||
version-file = "src/modelhub/version.py"
|
||||
|
||||
@@ -32,13 +32,19 @@ SHOULD_USE_CUEQUIVARIANCE = False
|
||||
|
||||
try:
|
||||
if torch.cuda.is_available():
|
||||
import cuequivariance_torch as cuet # noqa: I001, F401
|
||||
if _env.bool("DISABLE_CUEQUIVARIANCE", default=False):
|
||||
logger.info("cuEquivariance usage disabled via DISABLE_CUEQUIVARIANCE")
|
||||
else:
|
||||
import cuequivariance_torch as cuet # noqa: I001, F401
|
||||
|
||||
SHOULD_USE_CUEQUIVARIANCE = True
|
||||
os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str(
|
||||
"CUEQ_DISABLE_AOT_TUNING", default="1"
|
||||
)
|
||||
os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str("CUEQ_DEFAULT_CONFIG", default="1")
|
||||
SHOULD_USE_CUEQUIVARIANCE = True
|
||||
os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str(
|
||||
"CUEQ_DISABLE_AOT_TUNING", default="1"
|
||||
)
|
||||
os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str(
|
||||
"CUEQ_DEFAULT_CONFIG", default="1"
|
||||
)
|
||||
logger.info("cuEquivariance is available and will be used.")
|
||||
|
||||
except ImportError:
|
||||
logger.debug("cuEquivariance unavailable: import failed")
|
||||
|
||||
@@ -40,6 +40,15 @@ from modelhub.utils.weights import (
|
||||
ranked_logger = RankedLogger(__name__, rank_zero_only=True)
|
||||
|
||||
|
||||
def is_interactive_environment() -> bool:
    """Report whether the code is running inside an active IPython session.

    Returns:
        ``True`` when IPython is importable and ``get_ipython()`` returns a
        non-``None`` object (presumably a Jupyter notebook or IPython console);
        ``False`` when IPython is not installed or no IPython shell is active.
    """
    try:
        # Imported lazily so that IPython remains an optional dependency.
        from IPython import get_ipython
    except ImportError:
        return False

    # get_ipython() yields None when called outside of an IPython session.
    return get_ipython() is not None
|
||||
|
||||
|
||||
class FabricTrainer(ABC):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -110,11 +119,13 @@ class FabricTrainer(ABC):
|
||||
(4) Efficient Gradient Accumulation (https://lightning.ai/docs/fabric/2.4.0/advanced/gradient_accumulation.html)
|
||||
"""
|
||||
# DDP strategy requires a manual timeout higher than the default
|
||||
if strategy == "ddp":
|
||||
if strategy == "ddp" and not is_interactive_environment():
|
||||
strategy = DDPStrategy(
|
||||
timeout=timedelta(seconds=nccl_timeout),
|
||||
find_unused_parameters=find_unused_parameters,
|
||||
)
|
||||
else:
|
||||
strategy = "auto" # type: ignore
|
||||
|
||||
# See (1) for initialization arguments for Fabric()
|
||||
self.fabric = L.Fabric(
|
||||
|
||||
Reference in New Issue
Block a user