Fix/apptainer (#629)

* feat: enable jupyter notebooks; cleanup

* fix: apptainer

* fix: more apptainer shenanigans
This commit is contained in:
Nathaniel Corley
2025-11-07 16:08:06 -08:00
committed by GitHub
parent 8cbf11ea23
commit ae2bb80d02
16 changed files with 136 additions and 28 deletions

View File

@@ -27,6 +27,7 @@ IncludeCmd: yes
/etc/hosts
pyproject.toml /opt/core_pyproject.toml
models/rf3/pyproject.toml /opt/rf3_pyproject.toml
lib/atomworks/pyproject.toml /opt/atomworks_pyproject.toml
%post
## GENERAL SETUP
@@ -38,8 +39,23 @@ IncludeCmd: yes
ln -s /projects /mnt/projects
ln -s /net /mnt/net
# Update system and install essential packages
apt-get update && apt-get install -y \
build-essential \
git \
libxrender1 \
libxrender-dev \
libx11-6 \
libx11-dev \
libxext6 \
libxext-dev \
&& rm -rf /var/lib/apt/lists/*
## PYTHON DEPENDENCY INSTALLATION
# Upgrade pip
python -m pip install --upgrade pip
# Install uv for fast dependency resolution
pip install uv
@@ -54,12 +70,18 @@ IncludeCmd: yes
uv pip compile /opt/pyproject.toml --output-file /opt/rf3_requirements.txt --all-extras
rm /opt/pyproject.toml
# (AtomWorks)
mv /opt/atomworks_pyproject.toml /opt/pyproject.toml
uv pip compile /opt/pyproject.toml --output-file /opt/atomworks_requirements.txt --all-extras
rm /opt/pyproject.toml
# Merge and dedupe requirements, excluding packages we don't want
# (atomworks is mounted from host; torch/numpy/nvidia-* provided by base image)
# (pynvml/packaging/pandas/markdown-it-py from NGC container to avoid conflicts)
cat /opt/core_requirements.txt /opt/rf3_requirements.txt | \
grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py)==" | \
awk '!seen[$0]++' > /opt/combined_requirements.txt
# (pynvml/packaging/pandas/markdown-it-py/triton from NGC container to avoid conflicts)
# Deduplicate by package name (keeping first occurrence) to handle version conflicts
cat /opt/core_requirements.txt /opt/rf3_requirements.txt /opt/atomworks_requirements.txt | \
grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py|triton)==" | \
awk -F'==' '!seen[$1]++' > /opt/combined_requirements.txt
# Print combined requirements for debugging
echo "=== Combined requirements to install ==="

View File

@@ -1 +1 @@
/net/software/containers/versions/modelhub/rf3-dev_2025_10_08.sif
/net/software/containers/versions/modelhub/rf3-dev_2025_11_07.sif

View File

@@ -30,6 +30,9 @@ IncludeCmd: yes
--exclude='outputs' \
--exclude='logs' \
--exclude='*.sif' \
--exclude='distillation' \
--exclude='benchmarks' \
--exclude='**/slurm_logs' \
./ ${APPTAINER_ROOTFS}/opt/modelhub/
echo "Repository copied successfully."

View File

@@ -14,7 +14,7 @@ defaults:
ckpt_config:
_target_: modelhub.utils.weights.CheckpointConfig
path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep903-remapped.ckpt
path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt
reset_optimizer: true
model:

View File

@@ -7,6 +7,7 @@ defaults:
ckpt_path: ???
num_nodes: 1
devices_per_node: 1
compress_outputs: true
# Parameters for RF3InferenceEngine.run()
inputs: ???
@@ -19,3 +20,4 @@ sharding_pattern: null
skip_existing: false
template_selection: null
ground_truth_conformer_selection: null
cyclic_chains: []

View File

@@ -6,7 +6,7 @@ defaults:
_target_: rf3.inference_engines.rf3.RF3InferenceEngine
ckpt_path: /projects/ml/modelhub/apptainer/rf3-w-conf-run10-ep903-remapped.ckpt
ckpt_path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt
# Transform arguments
n_recycles: 10
@@ -21,7 +21,6 @@ early_stopping_plddt_threshold: 0.5
seed: null
print_config: true
raise_if_missing_msa_for_protein_of_length_n: null
cyclic_chains: []
# Metrics
metrics_cfg:

View File

@@ -35,7 +35,8 @@ dependencies = [
"cuequivariance_ops_torch_cu12>=0.6.1; sys_platform == 'linux'",
"cuequivariance_torch>=0.6.1; sys_platform == 'linux'",
# ... dataloading
"atomworks==1.0.2",
# (Commenting out for development; we should re-add before release)
# "atomworks==1.0.2",
]
[project.scripts]
@@ -51,6 +52,7 @@ build-backend = "hatchling.build"
[tool.hatch.version]
source = "vcs"
fallback-version = "0.0.0"
[tool.hatch.version.raw-options]
root = "../.."

View File

@@ -21,6 +21,7 @@ class DumpValidationStructuresCallback(BaseCallback):
dump_predictions: bool = False,
one_model_per_file: bool = False,
dump_trajectories: bool = False,
compress_outputs: bool = True,
):
"""
Args:
@@ -28,12 +29,14 @@ class DumpValidationStructuresCallback(BaseCallback):
one_model_per_file: If True, write each structure within a diffusion batch to its own CIF files. If False,
include each structure within a diffusion batch as a separate model within one CIF file.
dump_trajectories: Whether to dump denoising trajectories after validation batches.
compress_outputs: Whether to gzip output files. Defaults to ``True``.
"""
super().__init__()
self.save_dir = Path(save_dir)
self.dump_predictions = dump_predictions
self.dump_trajectories = dump_trajectories
self.one_model_per_file = one_model_per_file
self.compress_outputs = compress_outputs
def on_validation_batch_end(
self,
@@ -68,6 +71,9 @@ class DumpValidationStructuresCallback(BaseCallback):
return path / f"{identifier}{extra}"
# Determine file type based on compression setting
file_type = "cif.gz" if self.compress_outputs else "cif"
if self.dump_predictions:
atom_array_stack = build_stack_from_atom_array_and_batched_coords(
network_output["X_L"], example["atom_array"]
@@ -76,6 +82,7 @@ class DumpValidationStructuresCallback(BaseCallback):
atom_arrays=atom_array_stack,
base_path=_build_path_from_example_id("predictions"),
one_model_per_file=self.one_model_per_file,
file_type=file_type,
)
if self.dump_trajectories:
@@ -83,9 +90,11 @@ class DumpValidationStructuresCallback(BaseCallback):
trajectory_list=network_output["X_denoised_L_traj"],
atom_array=example["atom_array"],
base_path=_build_path_from_example_id("trajectories", "_denoised"),
file_type=file_type,
)
dump_trajectories(
trajectory_list=network_output["X_noisy_L_traj"],
atom_array=example["atom_array"],
base_path=_build_path_from_example_id("trajectories", "_noisy"),
file_type=file_type,
)

View File

@@ -3,7 +3,7 @@ from pathlib import Path
import typer
from hydra import compose, initialize_config_dir
app = typer.Typer()
app = typer.Typer(pretty_exceptions_enable=False)
@app.command(

View File

@@ -45,6 +45,7 @@ def run_inference(cfg: DictConfig) -> None:
"ground_truth_conformer_selection": cfg.get(
"ground_truth_conformer_selection", None
),
"cyclic_chains": cfg.get("cyclic_chains", []),
}
# Create init config with only __init__ params

View File

@@ -11,10 +11,12 @@ from atomworks.ml.preprocessing.msa.finding import (
get_msa_dirs_from_env,
)
from atomworks.ml.samplers import LoadBalancedDistributedSampler
from biotite.structure import AtomArray
from lightning.fabric import seed_everything
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
from modelhub.metrics.metric import MetricManager
from modelhub.utils.ddp import RankedLogger, set_accelerator_based_on_availability
from modelhub.utils.logging import print_config_tree
from rf3.model.RF3 import ShouldEarlyStopFn
@@ -33,7 +35,6 @@ from rf3.utils.predicted_error import (
compile_af3_confidence_outputs,
get_mean_atomwise_plddt,
)
from modelhub.metrics.metric import MetricManager
logging.basicConfig(
level=logging.INFO,
@@ -94,7 +95,8 @@ class RF3InferenceEngine:
metrics_cfg: dict | OmegaConf | MetricManager | None = None,
num_nodes: int = 1,
devices_per_node: int = 1,
cyclic_chains: list[str] = [],
# Output control
compress_outputs: bool = True,
# Debug
print_config: bool = False,
raise_if_missing_msa_for_protein_of_length_n: int | None = None,
@@ -118,6 +120,7 @@ class RF3InferenceEngine:
Defaults to ``None``.
num_nodes: Number of nodes for distributed inference. Defaults to ``1``.
devices_per_node: Number of devices per node. Defaults to ``1``.
compress_outputs: Whether to gzip output files. Defaults to ``True``.
print_config: Whether to print config trees. Defaults to ``False``.
raise_if_missing_msa_for_protein_of_length_n: Debug flag for MSA checking. Defaults to ``None``.
"""
@@ -187,10 +190,9 @@ class RF3InferenceEngine:
"p_give_polymer_ref_conf": 0.0,
"p_give_non_polymer_ref_conf": 0.0,
"p_dropout_ref_conf": 0.0,
"use_element_for_atom_names_of_atomized_tokens": True,
}
self.cyclic_chains = cyclic_chains
self.print_config = print_config
# Set random seed (only if seed is not None)
@@ -220,6 +222,7 @@ class RF3InferenceEngine:
self.ckpt_path = ckpt_path
self.early_stopping_plddt_threshold = early_stopping_plddt_threshold
self.compress_outputs = compress_outputs
# Setup model
ranked_logger.info("Setting up model...")
@@ -269,7 +272,14 @@ class RF3InferenceEngine:
def run(
self,
inputs: InferenceInput | list[InferenceInput] | PathLike | list[PathLike],
inputs: (
InferenceInput
| list[InferenceInput]
| AtomArray
| list[AtomArray]
| PathLike
| list[PathLike]
),
# Output control
out_dir: PathLike | None = None,
dump_predictions: bool = True,
@@ -281,22 +291,24 @@ class RF3InferenceEngine:
# Selection overrides (applied to all input types)
template_selection: list[str] | str | None = None,
ground_truth_conformer_selection: list[str] | str | None = None,
cyclic_chains: list[str] = [],
) -> dict[str, dict] | None:
"""Run inference on inputs.
Requires a pre-initialized inference engine.
Args:
inputs: Single/list of InferenceInput objects, or file paths, or directory.
inputs: Single/list of InferenceInput objects, AtomArray objects, file paths, or directory.
out_dir: Output directory. If None, returns results as an AtomArray and dictionaries of metrics. Defaults to ``None``.
dump_predictions: Whether to save predicted structures. Defaults to ``True``.
dump_trajectories: Whether to save diffusion trajectories. Defaults to ``False``.
one_model_per_file: Save each model in separate file. Defaults to ``False``.
annotate_b_factor_with_plddt: Write pLDDT to B-factor column. Defaults to ``False``.
sharding_pattern: Sharding pattern for output organization. Defaults to ``None``.
skip_existing: Skip inputs with existing outputs. Defaults to ``False``.
skip_existing: Skip inputs with existing outputs. Requires ``out_dir`` to be set. If ``True`` when ``out_dir=None``, a warning is logged and skipping is disabled. Defaults to ``False``.
template_selection: Template selection override. Defaults to ``None``.
ground_truth_conformer_selection: Conformer selection override. Defaults to ``None``.
cyclic_chains: List of chain IDs to cyclize. Defaults to ``[]``.
Returns:
If ``out_dir`` is None: Dict mapping example_id to results dict.
@@ -307,6 +319,21 @@ class RF3InferenceEngine:
if out_dir:
out_dir.mkdir(parents=True, exist_ok=True)
ranked_logger.info(f"Outputs will be written to {out_dir.resolve()}.")
if not out_dir:
ranked_logger.warning(
"out_dir is None - results will be returned in memory! If you want to save to disk, please provide an out_dir."
)
# Validate skip_existing configuration
if skip_existing and out_dir is None:
ranked_logger.warning(
"skip_existing=True requires out_dir to be set. "
"Disabling skip_existing for in-memory inference mode."
)
skip_existing = False
# Determine file type based on compression setting
file_type = "cif.gz" if self.compress_outputs else "cif"
# Convert inputs to InferenceInput objects
if isinstance(inputs, InferenceInput):
@@ -315,6 +342,26 @@ class RF3InferenceEngine:
isinstance(i, InferenceInput) for i in inputs
):
inference_inputs = inputs
elif isinstance(inputs, AtomArray):
# Single AtomArray - convert to InferenceInput
inference_inputs = [
InferenceInput.from_atom_array(
inputs,
template_selection=template_selection,
ground_truth_conformer_selection=ground_truth_conformer_selection,
)
]
elif isinstance(inputs, list) and all(isinstance(i, AtomArray) for i in inputs):
# List of AtomArrays - convert each to InferenceInput
inference_inputs = [
InferenceInput.from_atom_array(
arr,
example_id=f"inference_{i}",
template_selection=template_selection,
ground_truth_conformer_selection=ground_truth_conformer_selection,
)
for i, arr in enumerate(inputs)
]
elif isinstance(inputs, (str, Path)) or (
isinstance(inputs, list) and isinstance(inputs[0], (str, Path))
):
@@ -329,9 +376,9 @@ class RF3InferenceEngine:
raise ValueError(f"Unsupported inputs type: {type(inputs)}")
# Flag chains for cyclization if specified
if self.cyclic_chains:
if cyclic_chains:
for input_spec in inference_inputs:
input_spec.cyclic_chains = self.cyclic_chains
input_spec.cyclic_chains = cyclic_chains
# make InferenceInputDataset
inference_dataset = InferenceInputDataset(inference_inputs)
@@ -495,6 +542,7 @@ class RF3InferenceEngine:
atom_arrays=atom_array_list or atom_array_stack,
base_path=example_out_dir / input_spec.example_id,
one_model_per_file=one_model_per_file,
file_type=file_type,
)
if dump_trajectories:
@@ -502,11 +550,13 @@ class RF3InferenceEngine:
trajectory_list=network_output["X_denoised_L_traj"],
atom_array=pipeline_output["atom_array"],
base_path=example_out_dir / "denoised",
file_type=file_type,
)
dump_trajectories(
trajectory_list=network_output["X_noisy_L_traj"],
atom_array=pipeline_output["atom_array"],
base_path=example_out_dir / "noisy",
file_type=file_type,
)
ranked_logger.info(

View File

@@ -143,6 +143,7 @@ def dump_trajectories(
atom_array: AtomArray,
base_path: Path,
align_structures: bool = True,
file_type: str = "cif.gz",
) -> None:
"""Write denoising trajectories to CIF files.
@@ -153,6 +154,7 @@ def dump_trajectories(
base_path (Path): Base path where the output files will be saved.
align_structures (bool): Flag to determine if the structures should be aligned on the final prediction.
If False, each step may have a different alignment.
file_type (str): File type for output (e.g., "cif", "cif.gz", "pdb"). Defaults to ``"cif.gz"``.
"""
n_steps = len(trajectory_list)
@@ -192,5 +194,5 @@ def dump_trajectories(
path = f"{base_path}_model_{i}"
to_cif_file(
atom_array_stack, path, file_type="cif.gz", include_entity_poly=False
atom_array_stack, path, file_type=file_type, include_entity_poly=False
)

View File

@@ -65,6 +65,7 @@ build-backend = "hatchling.build"
[tool.hatch.version]
source = "vcs"
fallback-version = "0.0.0"
[tool.hatch.build.hooks.vcs]
version-file = "src/modelhub/version.py"

View File

@@ -32,13 +32,19 @@ SHOULD_USE_CUEQUIVARIANCE = False
try:
if torch.cuda.is_available():
import cuequivariance_torch as cuet # noqa: I001, F401
if _env.bool("DISABLE_CUEQUIVARIANCE", default=False):
logger.info("cuEquivariance usage disabled via DISABLE_CUEQUIVARIANCE")
else:
import cuequivariance_torch as cuet # noqa: I001, F401
SHOULD_USE_CUEQUIVARIANCE = True
os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str(
"CUEQ_DISABLE_AOT_TUNING", default="1"
)
os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str("CUEQ_DEFAULT_CONFIG", default="1")
SHOULD_USE_CUEQUIVARIANCE = True
os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str(
"CUEQ_DISABLE_AOT_TUNING", default="1"
)
os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str(
"CUEQ_DEFAULT_CONFIG", default="1"
)
logger.info("cuEquivariance is available and will be used.")
except ImportError:
logger.debug("cuEquivariance unavailable: import failed")

View File

@@ -40,6 +40,15 @@ from modelhub.utils.weights import (
ranked_logger = RankedLogger(__name__, rank_zero_only=True)
def is_interactive_environment() -> bool:
    """Return whether this process is running inside a live IPython shell.

    A live shell exists when code is executed from an IPython-backed
    front end (e.g. a Jupyter notebook kernel or an ``ipython`` console).

    Returns:
        ``True`` if IPython is loaded in this process and its
        ``get_ipython()`` returns a shell instance; ``False`` otherwise,
        including when IPython is not installed.
    """
    import sys

    # A shell instance can only exist if IPython has already been imported,
    # so consult ``sys.modules`` instead of importing the package here. This
    # keeps plain (non-notebook) runs from paying for an IPython import just
    # to learn that no shell is active.
    ipython = sys.modules.get("IPython")
    if ipython is None:
        return False

    # ``getattr`` guards against a partially initialised module object.
    get_ipython = getattr(ipython, "get_ipython", None)
    return get_ipython is not None and get_ipython() is not None
class FabricTrainer(ABC):
def __init__(
self,
@@ -110,11 +119,13 @@ class FabricTrainer(ABC):
(4) Efficient Gradient Accumulation (https://lightning.ai/docs/fabric/2.4.0/advanced/gradient_accumulation.html)
"""
# DDP strategy requires a manual timeout higher than the default
if strategy == "ddp":
if strategy == "ddp" and not is_interactive_environment():
strategy = DDPStrategy(
timeout=timedelta(seconds=nccl_timeout),
find_unused_parameters=find_unused_parameters,
)
else:
strategy = "auto" # type: ignore
# See (1) for initialization arguments for Fabric()
self.fabric = L.Fabric(