Fix/apptainer (#629)

* feat: enable jupyter notebooks; cleanup
* fix: apptainer
* fix: more apptainer shenanigans
2026-06-04 13:24:22 +08:00 · 2025-11-07 16:08:06 -08:00
parent 8cbf11ea23
commit ae2bb80d02
16 changed files with 136 additions and 28 deletions
--- a/.ipd/apptainer/rf3-dev.def
+++ b/.ipd/apptainer/rf3-dev.def
@@ -27,6 +27,7 @@ IncludeCmd: yes
    /etc/hosts
    pyproject.toml /opt/core_pyproject.toml
    models/rf3/pyproject.toml /opt/rf3_pyproject.toml
+    lib/atomworks/pyproject.toml /opt/atomworks_pyproject.toml

 %post
    ## GENERAL SETUP
@@ -38,8 +39,23 @@ IncludeCmd: yes
    ln -s /projects /mnt/projects
    ln -s /net /mnt/net

+    # Update system and install essential packages
+    apt-get update && apt-get install -y \
+        build-essential \
+        git \
+        libxrender1 \
+        libxrender-dev \
+        libx11-6 \
+        libx11-dev \
+        libxext6 \
+        libxext-dev \
+        && rm -rf /var/lib/apt/lists/*
+
    ## PYTHON DEPENDENCY INSTALLATION

+    # Upgrade pip
+    python -m pip install --upgrade pip
+
    # Install uv for fast dependency resolution
    pip install uv

@@ -54,12 +70,18 @@ IncludeCmd: yes
    uv pip compile /opt/pyproject.toml --output-file /opt/rf3_requirements.txt --all-extras
    rm /opt/pyproject.toml

+    # (AtomWorks)
+    mv /opt/atomworks_pyproject.toml /opt/pyproject.toml
+    uv pip compile /opt/pyproject.toml --output-file /opt/atomworks_requirements.txt --all-extras
+    rm /opt/pyproject.toml
+
    # Merge and dedupe requirements, excluding packages we don't want
    # (atomworks is mounted from host; torch/numpy/nvidia-* provided by base image)
-    # (pynvml/packaging/pandas/markdown-it-py from NGC container to avoid conflicts)
-    cat /opt/core_requirements.txt /opt/rf3_requirements.txt | \
-        grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py)==" | \
-        awk '!seen[$0]++' > /opt/combined_requirements.txt
+    # (pynvml/packaging/pandas/markdown-it-py/triton from NGC container to avoid conflicts)
+    # Deduplicate by package name (keeping first occurrence) to handle version conflicts
+    cat /opt/core_requirements.txt /opt/rf3_requirements.txt /opt/atomworks_requirements.txt | \
+        grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py|triton)==" | \
+        awk -F'==' '!seen[$1]++' > /opt/combined_requirements.txt

    # Print combined requirements for debugging
    echo "=== Combined requirements to install ==="
--- a/.ipd/apptainer/rf3-dev.sif
+++ b/.ipd/apptainer/rf3-dev.sif
@@ -1 +1 @@
-/net/software/containers/versions/modelhub/rf3-dev_2025_10_08.sif
+/net/software/containers/versions/modelhub/rf3-dev_2025_11_07.sif
--- a/.ipd/apptainer/rf3-full.def
+++ b/.ipd/apptainer/rf3-full.def
@@ -30,6 +30,9 @@ IncludeCmd: yes
        --exclude='outputs' \
        --exclude='logs' \
        --exclude='*.sif' \
+        --exclude='distillation' \
+        --exclude='benchmarks' \
+        --exclude='**/slurm_logs' \
        ./ ${APPTAINER_ROOTFS}/opt/modelhub/

    echo "Repository copied successfully."
--- a/lib/atomworks
+++ b/lib/atomworks
--- a/models/rf3/configs/experiment/pretrained/rf3.yaml
+++ b/models/rf3/configs/experiment/pretrained/rf3.yaml
@@ -14,7 +14,7 @@ defaults:

 ckpt_config:
  _target_: modelhub.utils.weights.CheckpointConfig
-  path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep903-remapped.ckpt
+  path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt
  reset_optimizer: true

 model:
--- a/models/rf3/configs/inference_engine/base.yaml
+++ b/models/rf3/configs/inference_engine/base.yaml
@@ -7,6 +7,7 @@ defaults:
 ckpt_path: ???
 num_nodes: 1
 devices_per_node: 1
+compress_outputs: true

 # Parameters for RF3InferenceEngine.run()
 inputs: ???
@@ -19,3 +20,4 @@ sharding_pattern: null
 skip_existing: false
 template_selection: null
 ground_truth_conformer_selection: null
+cyclic_chains: []
--- a/models/rf3/configs/inference_engine/rf3.yaml
+++ b/models/rf3/configs/inference_engine/rf3.yaml
@@ -6,7 +6,7 @@ defaults:

 _target_: rf3.inference_engines.rf3.RF3InferenceEngine

-ckpt_path: /projects/ml/modelhub/apptainer/rf3-w-conf-run10-ep903-remapped.ckpt
+ckpt_path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt

 # Transform arguments
 n_recycles: 10
@@ -21,7 +21,6 @@ early_stopping_plddt_threshold: 0.5
 seed: null
 print_config: true
 raise_if_missing_msa_for_protein_of_length_n: null
-cyclic_chains: []

 # Metrics
 metrics_cfg:
--- a/models/rf3/pyproject.toml
+++ b/models/rf3/pyproject.toml
@@ -35,7 +35,8 @@ dependencies = [
    "cuequivariance_ops_torch_cu12>=0.6.1; sys_platform == 'linux'",
    "cuequivariance_torch>=0.6.1; sys_platform == 'linux'",
    # ... dataloading
-    "atomworks==1.0.2",
+    # (Commenting out for development; we should re-add before release)
+    # "atomworks==1.0.2",
 ]

 [project.scripts]
@@ -51,6 +52,7 @@ build-backend = "hatchling.build"

 [tool.hatch.version]
 source = "vcs"
+fallback-version = "0.0.0"

 [tool.hatch.version.raw-options]
 root = "../.."
--- a/models/rf3/src/rf3/callbacks/dump_validation_structures.py
+++ b/models/rf3/src/rf3/callbacks/dump_validation_structures.py
@@ -21,6 +21,7 @@ class DumpValidationStructuresCallback(BaseCallback):
        dump_predictions: bool = False,
        one_model_per_file: bool = False,
        dump_trajectories: bool = False,
+        compress_outputs: bool = True,
    ):
        """
        Args:
@@ -28,12 +29,14 @@ class DumpValidationStructuresCallback(BaseCallback):
            one_model_per_file: If True, write each structure within a diffusion batch to its own CIF files. If False,
                include each structure within a diffusion batch as a separate model within one CIF file.
            dump_trajectories: Whether to dump denoising trajectories after validation batches.
+            compress_outputs: Whether to gzip output files. Defaults to ``True``.
        """
        super().__init__()
        self.save_dir = Path(save_dir)
        self.dump_predictions = dump_predictions
        self.dump_trajectories = dump_trajectories
        self.one_model_per_file = one_model_per_file
+        self.compress_outputs = compress_outputs

    def on_validation_batch_end(
        self,
@@ -68,6 +71,9 @@ class DumpValidationStructuresCallback(BaseCallback):

            return path / f"{identifier}{extra}"

+        # Determine file type based on compression setting
+        file_type = "cif.gz" if self.compress_outputs else "cif"
+
        if self.dump_predictions:
            atom_array_stack = build_stack_from_atom_array_and_batched_coords(
                network_output["X_L"], example["atom_array"]
@@ -76,6 +82,7 @@ class DumpValidationStructuresCallback(BaseCallback):
                atom_arrays=atom_array_stack,
                base_path=_build_path_from_example_id("predictions"),
                one_model_per_file=self.one_model_per_file,
+                file_type=file_type,
            )

        if self.dump_trajectories:
@@ -83,9 +90,11 @@ class DumpValidationStructuresCallback(BaseCallback):
                trajectory_list=network_output["X_denoised_L_traj"],
                atom_array=example["atom_array"],
                base_path=_build_path_from_example_id("trajectories", "_denoised"),
+                file_type=file_type,
            )
            dump_trajectories(
                trajectory_list=network_output["X_noisy_L_traj"],
                atom_array=example["atom_array"],
                base_path=_build_path_from_example_id("trajectories", "_noisy"),
+                file_type=file_type,
            )
--- a/models/rf3/src/rf3/cli.py
+++ b/models/rf3/src/rf3/cli.py
@@ -3,7 +3,7 @@ from pathlib import Path
 import typer
 from hydra import compose, initialize_config_dir

-app = typer.Typer()
+app = typer.Typer(pretty_exceptions_enable=False)


@app.command(
--- a/models/rf3/src/rf3/inference.py
+++ b/models/rf3/src/rf3/inference.py
@@ -45,6 +45,7 @@ def run_inference(cfg: DictConfig) -> None:
        "ground_truth_conformer_selection": cfg.get(
            "ground_truth_conformer_selection", None
        ),
+        "cyclic_chains": cfg.get("cyclic_chains", []),
    }

    # Create init config with only __init__ params
--- a/models/rf3/src/rf3/inference_engines/rf3.py
+++ b/models/rf3/src/rf3/inference_engines/rf3.py
@@ -11,10 +11,12 @@ from atomworks.ml.preprocessing.msa.finding import (
    get_msa_dirs_from_env,
 )
 from atomworks.ml.samplers import LoadBalancedDistributedSampler
+from biotite.structure import AtomArray
 from lightning.fabric import seed_everything
 from omegaconf import OmegaConf
 from torch.utils.data import DataLoader

+from modelhub.metrics.metric import MetricManager
 from modelhub.utils.ddp import RankedLogger, set_accelerator_based_on_availability
 from modelhub.utils.logging import print_config_tree
 from rf3.model.RF3 import ShouldEarlyStopFn
@@ -33,7 +35,6 @@ from rf3.utils.predicted_error import (
    compile_af3_confidence_outputs,
    get_mean_atomwise_plddt,
 )
-from modelhub.metrics.metric import MetricManager

 logging.basicConfig(
    level=logging.INFO,
@@ -94,7 +95,8 @@ class RF3InferenceEngine:
        metrics_cfg: dict | OmegaConf | MetricManager | None = None,
        num_nodes: int = 1,
        devices_per_node: int = 1,
-        cyclic_chains: list[str] = [],
+        # Output control
+        compress_outputs: bool = True,
        # Debug
        print_config: bool = False,
        raise_if_missing_msa_for_protein_of_length_n: int | None = None,
@@ -118,6 +120,7 @@ class RF3InferenceEngine:
              Defaults to ``None``.
          num_nodes: Number of nodes for distributed inference. Defaults to ``1``.
          devices_per_node: Number of devices per node. Defaults to ``1``.
+          compress_outputs: Whether to gzip output files. Defaults to ``True``.
          print_config: Whether to print config trees. Defaults to ``False``.
          raise_if_missing_msa_for_protein_of_length_n: Debug flag for MSA checking. Defaults to ``None``.
        """
@@ -187,10 +190,9 @@ class RF3InferenceEngine:
            "p_give_polymer_ref_conf": 0.0,
            "p_give_non_polymer_ref_conf": 0.0,
            "p_dropout_ref_conf": 0.0,
+            "use_element_for_atom_names_of_atomized_tokens": True,
        }

-        self.cyclic_chains = cyclic_chains
-
        self.print_config = print_config

        # Set random seed (only if seed is not None)
@@ -220,6 +222,7 @@ class RF3InferenceEngine:

        self.ckpt_path = ckpt_path
        self.early_stopping_plddt_threshold = early_stopping_plddt_threshold
+        self.compress_outputs = compress_outputs

        # Setup model
        ranked_logger.info("Setting up model...")
@@ -269,7 +272,14 @@ class RF3InferenceEngine:

    def run(
        self,
-        inputs: InferenceInput | list[InferenceInput] | PathLike | list[PathLike],
+        inputs: (
+            InferenceInput
+            | list[InferenceInput]
+            | AtomArray
+            | list[AtomArray]
+            | PathLike
+            | list[PathLike]
+        ),
        # Output control
        out_dir: PathLike | None = None,
        dump_predictions: bool = True,
@@ -281,22 +291,24 @@ class RF3InferenceEngine:
        # Selection overrides (applied to all input types)
        template_selection: list[str] | str | None = None,
        ground_truth_conformer_selection: list[str] | str | None = None,
+        cyclic_chains: list[str] = [],
    ) -> dict[str, dict] | None:
        """Run inference on inputs.

        Requires a pre-initialized inference engine.

        Args:
-          inputs: Single/list of InferenceInput objects, or file paths, or directory.
+          inputs: Single/list of InferenceInput objects, AtomArray objects, file paths, or directory.
          out_dir: Output directory. If None, returns results as an AtomArray and dictionaries of metrics. Defaults to ``None``.
          dump_predictions: Whether to save predicted structures. Defaults to ``True``.
          dump_trajectories: Whether to save diffusion trajectories. Defaults to ``False``.
          one_model_per_file: Save each model in separate file. Defaults to ``False``.
          annotate_b_factor_with_plddt: Write pLDDT to B-factor column. Defaults to ``False``.
          sharding_pattern: Sharding pattern for output organization. Defaults to ``None``.
-          skip_existing: Skip inputs with existing outputs. Defaults to ``False``.
+          skip_existing: Skip inputs with existing outputs. Requires ``out_dir`` to be set. If ``True`` when ``out_dir=None``, a warning is logged and skipping is disabled. Defaults to ``False``.
          template_selection: Template selection override. Defaults to ``None``.
          ground_truth_conformer_selection: Conformer selection override. Defaults to ``None``.
+          cyclic_chains: List of chain IDs to cyclize. Defaults to ``[]``.

        Returns:
          If ``out_dir`` is None: Dict mapping example_id to results dict.
@@ -307,6 +319,21 @@ class RF3InferenceEngine:
        if out_dir:
            out_dir.mkdir(parents=True, exist_ok=True)
            ranked_logger.info(f"Outputs will be written to {out_dir.resolve()}.")
+        if not out_dir:
+            ranked_logger.warning(
+                "out_dir is None - results will be returned in memory! If you want to save to disk, please provide an out_dir."
+            )
+
+        # Validate skip_existing configuration
+        if skip_existing and out_dir is None:
+            ranked_logger.warning(
+                "skip_existing=True requires out_dir to be set. "
+                "Disabling skip_existing for in-memory inference mode."
+            )
+            skip_existing = False
+
+        # Determine file type based on compression setting
+        file_type = "cif.gz" if self.compress_outputs else "cif"

        # Convert inputs to InferenceInput objects
        if isinstance(inputs, InferenceInput):
@@ -315,6 +342,26 @@ class RF3InferenceEngine:
            isinstance(i, InferenceInput) for i in inputs
        ):
            inference_inputs = inputs
+        elif isinstance(inputs, AtomArray):
+            # Single AtomArray - convert to InferenceInput
+            inference_inputs = [
+                InferenceInput.from_atom_array(
+                    inputs,
+                    template_selection=template_selection,
+                    ground_truth_conformer_selection=ground_truth_conformer_selection,
+                )
+            ]
+        elif isinstance(inputs, list) and all(isinstance(i, AtomArray) for i in inputs):
+            # List of AtomArrays - convert each to InferenceInput
+            inference_inputs = [
+                InferenceInput.from_atom_array(
+                    arr,
+                    example_id=f"inference_{i}",
+                    template_selection=template_selection,
+                    ground_truth_conformer_selection=ground_truth_conformer_selection,
+                )
+                for i, arr in enumerate(inputs)
+            ]
        elif isinstance(inputs, (str, Path)) or (
            isinstance(inputs, list) and isinstance(inputs[0], (str, Path))
        ):
@@ -329,9 +376,9 @@ class RF3InferenceEngine:
            raise ValueError(f"Unsupported inputs type: {type(inputs)}")

        # Flag chains for cyclization if specified
-        if self.cyclic_chains:
+        if cyclic_chains:
            for input_spec in inference_inputs:
-                input_spec.cyclic_chains = self.cyclic_chains
+                input_spec.cyclic_chains = cyclic_chains

        # make InferenceInputDataset
        inference_dataset = InferenceInputDataset(inference_inputs)
@@ -495,6 +542,7 @@ class RF3InferenceEngine:
                        atom_arrays=atom_array_list or atom_array_stack,
                        base_path=example_out_dir / input_spec.example_id,
                        one_model_per_file=one_model_per_file,
+                        file_type=file_type,
                    )

                if dump_trajectories:
@@ -502,11 +550,13 @@ class RF3InferenceEngine:
                        trajectory_list=network_output["X_denoised_L_traj"],
                        atom_array=pipeline_output["atom_array"],
                        base_path=example_out_dir / "denoised",
+                        file_type=file_type,
                    )
                    dump_trajectories(
                        trajectory_list=network_output["X_noisy_L_traj"],
                        atom_array=pipeline_output["atom_array"],
                        base_path=example_out_dir / "noisy",
+                        file_type=file_type,
                    )

                ranked_logger.info(
--- a/models/rf3/src/rf3/utils/io.py
+++ b/models/rf3/src/rf3/utils/io.py
@@ -143,6 +143,7 @@ def dump_trajectories(
    atom_array: AtomArray,
    base_path: Path,
    align_structures: bool = True,
+    file_type: str = "cif.gz",
 ) -> None:
    """Write denoising trajectories to CIF files.

@@ -153,6 +154,7 @@ def dump_trajectories(
        base_path (Path): Base path where the output files will be saved.
        align_structures (bool): Flag to determine if the structures should be aligned on the final prediction.
            If False, each step may have a different alignment.
+        file_type (str): File type for output (e.g., "cif", "cif.gz", "pdb"). Defaults to ``"cif.gz"``.
    """
    n_steps = len(trajectory_list)

@@ -192,5 +194,5 @@ def dump_trajectories(

        path = f"{base_path}_model_{i}"
        to_cif_file(
-            atom_array_stack, path, file_type="cif.gz", include_entity_poly=False
+            atom_array_stack, path, file_type=file_type, include_entity_poly=False
        )
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ build-backend = "hatchling.build"

 [tool.hatch.version]
 source = "vcs"
+fallback-version = "0.0.0"

 [tool.hatch.build.hooks.vcs]
 version-file = "src/modelhub/version.py"
--- a/src/modelhub/init.py
+++ b/src/modelhub/init.py
@@ -32,13 +32,19 @@ SHOULD_USE_CUEQUIVARIANCE = False

 try:
    if torch.cuda.is_available():
-        import cuequivariance_torch as cuet  # noqa: I001, F401
+        if _env.bool("DISABLE_CUEQUIVARIANCE", default=False):
+            logger.info("cuEquivariance usage disabled via DISABLE_CUEQUIVARIANCE")
+        else:
+            import cuequivariance_torch as cuet  # noqa: I001, F401

-        SHOULD_USE_CUEQUIVARIANCE = True
-        os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str(
-            "CUEQ_DISABLE_AOT_TUNING", default="1"
-        )
-        os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str("CUEQ_DEFAULT_CONFIG", default="1")
+            SHOULD_USE_CUEQUIVARIANCE = True
+            os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str(
+                "CUEQ_DISABLE_AOT_TUNING", default="1"
+            )
+            os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str(
+                "CUEQ_DEFAULT_CONFIG", default="1"
+            )
+            logger.info("cuEquivariance is available and will be used.")

 except ImportError:
    logger.debug("cuEquivariance unavailable: import failed")
--- a/src/modelhub/trainers/fabric.py
+++ b/src/modelhub/trainers/fabric.py
@@ -40,6 +40,15 @@ from modelhub.utils.weights import (
 ranked_logger = RankedLogger(__name__, rank_zero_only=True)


+def is_interactive_environment() -> bool:  # True when an IPython shell (e.g. a Jupyter kernel) is active in this process
+    try:
+        from IPython import get_ipython  # imported lazily; a missing IPython is handled by the except below
+
+        return get_ipython() is not None  # NOTE(review): relies on get_ipython() returning None outside an IPython session - per IPython docs
+    except ImportError:
+        return False  # IPython not importable: treat the environment as non-interactive
+
+
 class FabricTrainer(ABC):
    def __init__(
        self,
@@ -110,11 +119,13 @@ class FabricTrainer(ABC):
            (4) Efficient Gradient Accumulation (https://lightning.ai/docs/fabric/2.4.0/advanced/gradient_accumulation.html)
        """
        # DDP strategy requires a manual timeout higher than the default
-        if strategy == "ddp":
+        if strategy == "ddp" and not is_interactive_environment():
            strategy = DDPStrategy(
                timeout=timedelta(seconds=nccl_timeout),
                find_unused_parameters=find_unused_parameters,
            )
+        else:
+            strategy = "auto"  # type: ignore

        # See (1) for initialization arguments for Fabric()
        self.fabric = L.Fabric(