# Source code for orbax.checkpoint.options
# Copyright 2026 The Orbax Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configuration options for APIs like CheckpointManager and Checkpointer."""
import dataclasses
from typing import Callable, Optional, Set
from orbax.checkpoint._src.multihost import multihost
# [docs]  (Sphinx viewcode link marker; not part of the module source)
@dataclasses.dataclass
class AsyncOptions:
  """Settings that control asynchronous checkpointing behavior.

  Refer to :py:class:`.AsyncCheckpointer` for details on how these values
  are consumed.

  Attributes:
    timeout_secs: Timeout in seconds. Defaults to 1200 (20 minutes), the same
      default used by `AsyncCheckpointer`.
    barrier_sync_fn: Optional `multihost.BarrierSyncFn`. Defaults to None.
    post_finalization_callback: Optional callable taking no arguments and
      returning None. Defaults to None. Presumably invoked after
      finalization completes -- confirm against `AsyncCheckpointer`.
    create_directories_asynchronously: Boolean flag, True by default.
      Presumably selects whether directories are created asynchronously --
      confirm against `AsyncCheckpointer`.
  """

  # 20 minutes. Same as default in `AsyncCheckpointer`.
  timeout_secs: int = 1200
  barrier_sync_fn: Optional[multihost.BarrierSyncFn] = None
  post_finalization_callback: Optional[Callable[[], None]] = None
  create_directories_asynchronously: bool = True
# [docs]  (Sphinx viewcode link marker; not part of the module source)
@dataclasses.dataclass
class MultiprocessingOptions:
  """Settings that control behavior across multiple processes.

  Attributes:
    primary_host: Host id of the primary host; 0 by default. When set to
      None, every host is considered primary, which is useful when all hosts
      work only with local storage.
    active_processes: Set of process indices (corresponding to
      `multihost.process_index()`) over which `CheckpointManager` is expected
      to be called. This allows a `CheckpointManager` instance to run over a
      subset of processes instead of all processes, as is normally expected.
      When specified, `primary_host` must be a member of `active_processes`
      (this dataclass does not itself check that).
    barrier_sync_key_prefix: String prepended to the barrier sync key used to
      synchronize processes. Helps avoid collisions with other barrier syncs
      when another CheckpointManager is being used concurrently.
  """

  primary_host: Optional[int] = 0
  active_processes: Optional[Set[int]] = None
  barrier_sync_key_prefix: Optional[str] = None
# [docs]  (Sphinx viewcode link marker; not part of the module source)
@dataclasses.dataclass(frozen=True)
class FileOptions:
"""Options used to configure checkpoint directories and files.
Attributes:
path_permission_mode: Path permission mode for step directories, user
metadata files. e.g. 0o750. Please check
https://github.com/google/etils/blob/main/etils/epath/backend.py if your
"""
path_permission_mode: int | None = None
@dataclasses.dataclass
class MemoryLimitOptions:
"""Options for configuring memory limits for save.
Can help to reduce the possibility of OOM's when large checkpoints are saved.
Attributes:
max_transfer_concurrent_gb: The max memory limit in GB allowed for.
Required if `save_device_host_concurrent_gb` is set to `"auto"`.
"""
max_transfer_concurrent_gb: int | None = None