From ae2bb80d0267edd6518311b91a20d877572d304f Mon Sep 17 00:00:00 2001 From: Nathaniel Corley Date: Fri, 7 Nov 2025 16:08:06 -0800 Subject: [PATCH] Fix/apptainer (#629) * feat: enable jupyter notebooks; cleanup * fix: apptainer * fix: more apptainer shenanigans --- .ipd/apptainer/rf3-dev.def | 30 ++++++-- .ipd/apptainer/rf3-dev.sif | 2 +- .ipd/apptainer/rf3-full.def | 3 + lib/atomworks | 2 +- .../configs/experiment/pretrained/rf3.yaml | 2 +- models/rf3/configs/inference_engine/base.yaml | 2 + models/rf3/configs/inference_engine/rf3.yaml | 3 +- models/rf3/pyproject.toml | 4 +- .../callbacks/dump_validation_structures.py | 9 +++ models/rf3/src/rf3/cli.py | 2 +- models/rf3/src/rf3/inference.py | 1 + models/rf3/src/rf3/inference_engines/rf3.py | 68 ++++++++++++++++--- models/rf3/src/rf3/utils/io.py | 4 +- pyproject.toml | 1 + src/modelhub/__init__.py | 18 +++-- src/modelhub/trainers/fabric.py | 13 +++- 16 files changed, 136 insertions(+), 28 deletions(-) diff --git a/.ipd/apptainer/rf3-dev.def b/.ipd/apptainer/rf3-dev.def index 381f488..b739785 100644 --- a/.ipd/apptainer/rf3-dev.def +++ b/.ipd/apptainer/rf3-dev.def @@ -27,6 +27,7 @@ IncludeCmd: yes /etc/hosts pyproject.toml /opt/core_pyproject.toml models/rf3/pyproject.toml /opt/rf3_pyproject.toml + lib/atomworks/pyproject.toml /opt/atomworks_pyproject.toml %post ## GENERAL SETUP @@ -38,8 +39,23 @@ IncludeCmd: yes ln -s /projects /mnt/projects ln -s /net /mnt/net + # Update system and install essential packages + apt-get update && apt-get install -y \ + build-essential \ + git \ + libxrender1 \ + libxrender-dev \ + libx11-6 \ + libx11-dev \ + libxext6 \ + libxext-dev \ + && rm -rf /var/lib/apt/lists/* + ## PYTHON DEPENDENCY INSTALLATION + # Upgrade pip + python -m pip install --upgrade pip + # Install uv for fast dependency resolution pip install uv @@ -54,12 +70,18 @@ IncludeCmd: yes uv pip compile /opt/pyproject.toml --output-file /opt/rf3_requirements.txt --all-extras rm /opt/pyproject.toml + # (AtomWorks) + mv /opt/atomworks_pyproject.toml /opt/pyproject.toml + uv pip compile /opt/pyproject.toml --output-file /opt/atomworks_requirements.txt --all-extras + rm /opt/pyproject.toml + # Merge and dedupe requirements, excluding packages we don't want # (atomworks is mounted from host; torch/numpy/nvidia-* provided by base image) - # (pynvml/packaging/pandas/markdown-it-py from NGC container to avoid conflicts) - cat /opt/core_requirements.txt /opt/rf3_requirements.txt | \ - grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py)==" | \ - awk '!seen[$0]++' > /opt/combined_requirements.txt + # (pynvml/packaging/pandas/markdown-it-py/triton from NGC container to avoid conflicts) + # Deduplicate by package name (keeping first occurrence) to handle version conflicts + cat /opt/core_requirements.txt /opt/rf3_requirements.txt /opt/atomworks_requirements.txt | \ + grep -vE "^(atomworks|torch(|vision|audio)|numpy|nvidia-.*|pynvml|packaging|pandas|markdown-it-py|triton)==" | \ + awk -F'==' '!seen[$1]++' > /opt/combined_requirements.txt # Print combined requirements for debugging echo "=== Combined requirements to install ===" diff --git a/.ipd/apptainer/rf3-dev.sif b/.ipd/apptainer/rf3-dev.sif index 2e74ecc..ee8c233 120000 --- a/.ipd/apptainer/rf3-dev.sif +++ b/.ipd/apptainer/rf3-dev.sif @@ -1 +1 @@ -/net/software/containers/versions/modelhub/rf3-dev_2025_10_08.sif \ No newline at end of file +/net/software/containers/versions/modelhub/rf3-dev_2025_11_07.sif \ No newline at end of file diff --git a/.ipd/apptainer/rf3-full.def b/.ipd/apptainer/rf3-full.def index e6441b1..46f1f3c 100644 --- a/.ipd/apptainer/rf3-full.def +++ b/.ipd/apptainer/rf3-full.def @@ -30,6 +30,9 @@ IncludeCmd: yes --exclude='outputs' \ --exclude='logs' \ --exclude='*.sif' \ + --exclude='distillation' \ + --exclude='benchmarks' \ + --exclude='**/slurm_logs' \ ./ ${APPTAINER_ROOTFS}/opt/modelhub/ echo "Repository copied successfully." diff --git a/lib/atomworks b/lib/atomworks index 11b5d0d..4d45b10 160000 --- a/lib/atomworks +++ b/lib/atomworks @@ -1 +1 @@ -Subproject commit 11b5d0d76285b837f843bc2cd60867164637b0d8 +Subproject commit 4d45b107e3d78c87f6c37e59fcfda78f44e949de diff --git a/models/rf3/configs/experiment/pretrained/rf3.yaml b/models/rf3/configs/experiment/pretrained/rf3.yaml index df334f6..4e80cc9 100644 --- a/models/rf3/configs/experiment/pretrained/rf3.yaml +++ b/models/rf3/configs/experiment/pretrained/rf3.yaml @@ -14,7 +14,7 @@ defaults: ckpt_config: _target_: modelhub.utils.weights.CheckpointConfig - path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep903-remapped.ckpt + path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt reset_optimizer: true model: diff --git a/models/rf3/configs/inference_engine/base.yaml b/models/rf3/configs/inference_engine/base.yaml index 628d1ac..797d9b6 100644 --- a/models/rf3/configs/inference_engine/base.yaml +++ b/models/rf3/configs/inference_engine/base.yaml @@ -7,6 +7,7 @@ defaults: ckpt_path: ??? num_nodes: 1 devices_per_node: 1 +compress_outputs: true # Parameters for RF3InferenceEngine.run() inputs: ??? @@ -19,3 +20,4 @@ sharding_pattern: null skip_existing: false template_selection: null ground_truth_conformer_selection: null +cyclic_chains: [] diff --git a/models/rf3/configs/inference_engine/rf3.yaml b/models/rf3/configs/inference_engine/rf3.yaml index 2f2aa03..999e630 100644 --- a/models/rf3/configs/inference_engine/rf3.yaml +++ b/models/rf3/configs/inference_engine/rf3.yaml @@ -6,7 +6,7 @@ defaults: _target_: rf3.inference_engines.rf3.RF3InferenceEngine -ckpt_path: /projects/ml/modelhub/apptainer/rf3-w-conf-run10-ep903-remapped.ckpt +ckpt_path: /net/software/containers/versions/modelhub_inference/ckpts/rf3-w-conf-run10-ep922-remapped.ckpt # Transform arguments n_recycles: 10 @@ -21,7 +21,6 @@ early_stopping_plddt_threshold: 0.5 seed: null print_config: true raise_if_missing_msa_for_protein_of_length_n: null -cyclic_chains: [] # Metrics metrics_cfg: diff --git a/models/rf3/pyproject.toml b/models/rf3/pyproject.toml index 9bc44f8..458c6c6 100644 --- a/models/rf3/pyproject.toml +++ b/models/rf3/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "cuequivariance_ops_torch_cu12>=0.6.1; sys_platform == 'linux'", "cuequivariance_torch>=0.6.1; sys_platform == 'linux'", # ... dataloading - "atomworks==1.0.2", + # (Commenting out for development; we should re-add before release) + # "atomworks==1.0.2", ] [project.scripts] @@ -51,6 +52,7 @@ build-backend = "hatchling.build" [tool.hatch.version] source = "vcs" +fallback-version = "0.0.0" [tool.hatch.version.raw-options] root = "../.." diff --git a/models/rf3/src/rf3/callbacks/dump_validation_structures.py b/models/rf3/src/rf3/callbacks/dump_validation_structures.py index b488072..8b33efe 100644 --- a/models/rf3/src/rf3/callbacks/dump_validation_structures.py +++ b/models/rf3/src/rf3/callbacks/dump_validation_structures.py @@ -21,6 +21,7 @@ class DumpValidationStructuresCallback(BaseCallback): dump_predictions: bool = False, one_model_per_file: bool = False, dump_trajectories: bool = False, + compress_outputs: bool = True, ): """ Args: @@ -28,12 +29,14 @@ class DumpValidationStructuresCallback(BaseCallback): one_model_per_file: If True, write each structure within a diffusion batch to its own CIF files. If False, include each structure within a diffusion batch as a separate model within one CIF file. dump_trajectories: Whether to dump denoising trajectories after validation batches. + compress_outputs: Whether to gzip output files. Defaults to ``True``. """ super().__init__() self.save_dir = Path(save_dir) self.dump_predictions = dump_predictions self.dump_trajectories = dump_trajectories self.one_model_per_file = one_model_per_file + self.compress_outputs = compress_outputs def on_validation_batch_end( self, @@ -68,6 +71,9 @@ class DumpValidationStructuresCallback(BaseCallback): return path / f"{identifier}{extra}" + # Determine file type based on compression setting + file_type = "cif.gz" if self.compress_outputs else "cif" + if self.dump_predictions: atom_array_stack = build_stack_from_atom_array_and_batched_coords( network_output["X_L"], example["atom_array"] @@ -76,6 +82,7 @@ class DumpValidationStructuresCallback(BaseCallback): atom_arrays=atom_array_stack, base_path=_build_path_from_example_id("predictions"), one_model_per_file=self.one_model_per_file, + file_type=file_type, ) if self.dump_trajectories: @@ -83,9 +90,11 @@ class DumpValidationStructuresCallback(BaseCallback): trajectory_list=network_output["X_denoised_L_traj"], atom_array=example["atom_array"], base_path=_build_path_from_example_id("trajectories", "_denoised"), + file_type=file_type, ) dump_trajectories( trajectory_list=network_output["X_noisy_L_traj"], atom_array=example["atom_array"], base_path=_build_path_from_example_id("trajectories", "_noisy"), + file_type=file_type, ) diff --git a/models/rf3/src/rf3/cli.py b/models/rf3/src/rf3/cli.py index df585ea..4041a1d 100644 --- a/models/rf3/src/rf3/cli.py +++ b/models/rf3/src/rf3/cli.py @@ -3,7 +3,7 @@ from pathlib import Path import typer from hydra import compose, initialize_config_dir -app = typer.Typer() +app = typer.Typer(pretty_exceptions_enable=False) @app.command( diff --git a/models/rf3/src/rf3/inference.py b/models/rf3/src/rf3/inference.py index 0911f2e..af5f1d9 100755 --- a/models/rf3/src/rf3/inference.py +++ b/models/rf3/src/rf3/inference.py @@ -45,6 +45,7 @@ def run_inference(cfg: DictConfig) -> None: "ground_truth_conformer_selection": cfg.get( "ground_truth_conformer_selection", None ), + "cyclic_chains": cfg.get("cyclic_chains", []), } # Create init config with only __init__ params diff --git a/models/rf3/src/rf3/inference_engines/rf3.py b/models/rf3/src/rf3/inference_engines/rf3.py index bc4abe7..eb9af8b 100644 --- a/models/rf3/src/rf3/inference_engines/rf3.py +++ b/models/rf3/src/rf3/inference_engines/rf3.py @@ -11,10 +11,12 @@ from atomworks.ml.preprocessing.msa.finding import ( get_msa_dirs_from_env, ) from atomworks.ml.samplers import LoadBalancedDistributedSampler +from biotite.structure import AtomArray from lightning.fabric import seed_everything from omegaconf import OmegaConf from torch.utils.data import DataLoader +from modelhub.metrics.metric import MetricManager from modelhub.utils.ddp import RankedLogger, set_accelerator_based_on_availability from modelhub.utils.logging import print_config_tree from rf3.model.RF3 import ShouldEarlyStopFn @@ -33,7 +35,6 @@ from rf3.utils.predicted_error import ( compile_af3_confidence_outputs, get_mean_atomwise_plddt, ) -from modelhub.metrics.metric import MetricManager logging.basicConfig( level=logging.INFO, @@ -94,7 +95,8 @@ class RF3InferenceEngine: metrics_cfg: dict | OmegaConf | MetricManager | None = None, num_nodes: int = 1, devices_per_node: int = 1, - cyclic_chains: list[str] = [], + # Output control + compress_outputs: bool = True, # Debug print_config: bool = False, raise_if_missing_msa_for_protein_of_length_n: int | None = None, @@ -118,6 +120,7 @@ class RF3InferenceEngine: Defaults to ``None``. num_nodes: Number of nodes for distributed inference. Defaults to ``1``. devices_per_node: Number of devices per node. Defaults to ``1``. + compress_outputs: Whether to gzip output files. Defaults to ``True``. print_config: Whether to print config trees. Defaults to ``False``. raise_if_missing_msa_for_protein_of_length_n: Debug flag for MSA checking. Defaults to ``None``. """ @@ -187,10 +190,9 @@ class RF3InferenceEngine: "p_give_polymer_ref_conf": 0.0, "p_give_non_polymer_ref_conf": 0.0, "p_dropout_ref_conf": 0.0, + "use_element_for_atom_names_of_atomized_tokens": True, } - self.cyclic_chains = cyclic_chains - self.print_config = print_config # Set random seed (only if seed is not None) @@ -220,6 +222,7 @@ class RF3InferenceEngine: self.ckpt_path = ckpt_path self.early_stopping_plddt_threshold = early_stopping_plddt_threshold + self.compress_outputs = compress_outputs # Setup model ranked_logger.info("Setting up model...") @@ -269,7 +272,14 @@ class RF3InferenceEngine: def run( self, - inputs: InferenceInput | list[InferenceInput] | PathLike | list[PathLike], + inputs: ( + InferenceInput + | list[InferenceInput] + | AtomArray + | list[AtomArray] + | PathLike + | list[PathLike] + ), # Output control out_dir: PathLike | None = None, dump_predictions: bool = True, @@ -281,22 +291,24 @@ class RF3InferenceEngine: # Selection overrides (applied to all input types) template_selection: list[str] | str | None = None, ground_truth_conformer_selection: list[str] | str | None = None, + cyclic_chains: list[str] = [], ) -> dict[str, dict] | None: """Run inference on inputs. Requires a pre-initialized inference engine. Args: - inputs: Single/list of InferenceInput objects, or file paths, or directory. + inputs: Single/list of InferenceInput objects, AtomArray objects, file paths, or directory. out_dir: Output directory. If None, returns results as an AtomArray and dictionaries of metrics. Defaults to ``None``. dump_predictions: Whether to save predicted structures. Defaults to ``True``. dump_trajectories: Whether to save diffusion trajectories. Defaults to ``False``. one_model_per_file: Save each model in separate file. Defaults to ``False``. annotate_b_factor_with_plddt: Write pLDDT to B-factor column. Defaults to ``False``. sharding_pattern: Sharding pattern for output organization. Defaults to ``None``. - skip_existing: Skip inputs with existing outputs. Defaults to ``False``. + skip_existing: Skip inputs with existing outputs. Requires ``out_dir`` to be set. If ``True`` when ``out_dir=None``, a warning is logged and skipping is disabled. Defaults to ``False``. template_selection: Template selection override. Defaults to ``None``. ground_truth_conformer_selection: Conformer selection override. Defaults to ``None``. + cyclic_chains: List of chain IDs to cyclize. Defaults to ``[]``. Returns: If ``out_dir`` is None: Dict mapping example_id to results dict. @@ -307,6 +319,21 @@ class RF3InferenceEngine: if out_dir: out_dir.mkdir(parents=True, exist_ok=True) ranked_logger.info(f"Outputs will be written to {out_dir.resolve()}.") + if not out_dir: + ranked_logger.warning( + "out_dir is None - results will be returned in memory! If you want to save to disk, please provide an out_dir." + ) + + # Validate skip_existing configuration + if skip_existing and out_dir is None: + ranked_logger.warning( + "skip_existing=True requires out_dir to be set. " + "Disabling skip_existing for in-memory inference mode." + ) + skip_existing = False + + # Determine file type based on compression setting + file_type = "cif.gz" if self.compress_outputs else "cif" # Convert inputs to InferenceInput objects if isinstance(inputs, InferenceInput): @@ -315,6 +342,26 @@ class RF3InferenceEngine: isinstance(i, InferenceInput) for i in inputs ): inference_inputs = inputs + elif isinstance(inputs, AtomArray): + # Single AtomArray - convert to InferenceInput + inference_inputs = [ + InferenceInput.from_atom_array( + inputs, + template_selection=template_selection, + ground_truth_conformer_selection=ground_truth_conformer_selection, + ) + ] + elif isinstance(inputs, list) and all(isinstance(i, AtomArray) for i in inputs): + # List of AtomArrays - convert each to InferenceInput + inference_inputs = [ + InferenceInput.from_atom_array( + arr, + example_id=f"inference_{i}", + template_selection=template_selection, + ground_truth_conformer_selection=ground_truth_conformer_selection, + ) + for i, arr in enumerate(inputs) + ] elif isinstance(inputs, (str, Path)) or ( isinstance(inputs, list) and isinstance(inputs[0], (str, Path)) ): @@ -329,9 +376,9 @@ class RF3InferenceEngine: raise ValueError(f"Unsupported inputs type: {type(inputs)}") # Flag chains for cyclization if specified - if self.cyclic_chains: + if cyclic_chains: for input_spec in inference_inputs: - input_spec.cyclic_chains = self.cyclic_chains + input_spec.cyclic_chains = cyclic_chains # make InferenceInputDataset inference_dataset = InferenceInputDataset(inference_inputs) @@ -495,6 +542,7 @@ class RF3InferenceEngine: atom_arrays=atom_array_list or atom_array_stack, base_path=example_out_dir / input_spec.example_id, one_model_per_file=one_model_per_file, + file_type=file_type, ) if dump_trajectories: @@ -502,11 +550,13 @@ class RF3InferenceEngine: trajectory_list=network_output["X_denoised_L_traj"], atom_array=pipeline_output["atom_array"], base_path=example_out_dir / "denoised", + file_type=file_type, ) dump_trajectories( trajectory_list=network_output["X_noisy_L_traj"], atom_array=pipeline_output["atom_array"], base_path=example_out_dir / "noisy", + file_type=file_type, ) ranked_logger.info( diff --git a/models/rf3/src/rf3/utils/io.py b/models/rf3/src/rf3/utils/io.py index 8026813..6fb6949 100644 --- a/models/rf3/src/rf3/utils/io.py +++ b/models/rf3/src/rf3/utils/io.py @@ -143,6 +143,7 @@ def dump_trajectories( atom_array: AtomArray, base_path: Path, align_structures: bool = True, + file_type: str = "cif.gz", ) -> None: """Write denoising trajectories to CIF files. @@ -153,6 +154,7 @@ def dump_trajectories( base_path (Path): Base path where the output files will be saved. align_structures (bool): Flag to determine if the structures should be aligned on the final prediction. If False, each step may have a different alignment. + file_type (str): File type for output (e.g., "cif", "cif.gz", "pdb"). Defaults to ``"cif.gz"``. """ n_steps = len(trajectory_list) @@ -192,5 +194,5 @@ def dump_trajectories( path = f"{base_path}_model_{i}" to_cif_file( - atom_array_stack, path, file_type="cif.gz", include_entity_poly=False + atom_array_stack, path, file_type=file_type, include_entity_poly=False ) diff --git a/pyproject.toml b/pyproject.toml index 95b9850..5ce0a87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ build-backend = "hatchling.build" [tool.hatch.version] source = "vcs" +fallback-version = "0.0.0" [tool.hatch.build.hooks.vcs] version-file = "src/modelhub/version.py" diff --git a/src/modelhub/__init__.py b/src/modelhub/__init__.py index fe8ddf1..e0d7b10 100644 --- a/src/modelhub/__init__.py +++ b/src/modelhub/__init__.py @@ -32,13 +32,19 @@ SHOULD_USE_CUEQUIVARIANCE = False try: if torch.cuda.is_available(): - import cuequivariance_torch as cuet # noqa: I001, F401 + if _env.bool("DISABLE_CUEQUIVARIANCE", default=False): + logger.info("cuEquivariance usage disabled via DISABLE_CUEQUIVARIANCE") + else: + import cuequivariance_torch as cuet # noqa: I001, F401 - SHOULD_USE_CUEQUIVARIANCE = True - os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str( - "CUEQ_DISABLE_AOT_TUNING", default="1" - ) - os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str("CUEQ_DEFAULT_CONFIG", default="1") + SHOULD_USE_CUEQUIVARIANCE = True + os.environ["CUEQ_DISABLE_AOT_TUNING"] = _env.str( + "CUEQ_DISABLE_AOT_TUNING", default="1" + ) + os.environ["CUEQ_DEFAULT_CONFIG"] = _env.str( + "CUEQ_DEFAULT_CONFIG", default="1" + ) + logger.info("cuEquivariance is available and will be used.") except ImportError: logger.debug("cuEquivariance unavailable: import failed") diff --git a/src/modelhub/trainers/fabric.py b/src/modelhub/trainers/fabric.py index 37d053d..2fafa47 100755 --- a/src/modelhub/trainers/fabric.py +++ b/src/modelhub/trainers/fabric.py @@ -40,6 +40,15 @@ from modelhub.utils.weights import ( ranked_logger = RankedLogger(__name__, rank_zero_only=True) +def is_interactive_environment() -> bool: + try: + from IPython import get_ipython + + return get_ipython() is not None + except ImportError: + return False + + class FabricTrainer(ABC): def __init__( self, @@ -110,11 +119,13 @@ class FabricTrainer(ABC): (4) Efficient Gradient Accumulation (https://lightning.ai/docs/fabric/2.4.0/advanced/gradient_accumulation.html) """ # DDP strategy requires a manual timeout higher than the default - if strategy == "ddp": + if strategy == "ddp" and not is_interactive_environment(): strategy = DDPStrategy( timeout=timedelta(seconds=nccl_timeout), find_unused_parameters=find_unused_parameters, ) + else: + strategy = "auto" # type: ignore # See (1) for initialization arguments for Fabric() self.fabric = L.Fabric(