ml.launchers.slurm
Defines a launcher for Slurm jobs.
Steps:

1. Stages the environment to a new working directory
2. Writes an sbatch.sh file
3. Schedules the sbatch.sh file

This allows for repeatability by just scheduling the same sbatch.sh file.
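Because the full job description lives in the written sbatch.sh file, a run can be resubmitted without re-staging anything. A minimal sketch of that resubmission step (the staging path below is hypothetical):

```python
import subprocess
from pathlib import Path

# Hypothetical staging directory created by a previous launch.
sbatch_file = Path("/checkpoints/my_run/sbatch.sh")

# Re-schedule the exact same job script through the standard Slurm CLI.
subprocess.run(["sbatch", str(sbatch_file)], check=True)
```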
- class ml.launchers.slurm.SlurmConfItem(key: str = '???', partition: str = '???', gpus_per_node: int = '???', cpus_per_gpu: int = '???', num_nodes: int = '???', gpu_type: str | None = None, exclusive: bool = False)[source]
Bases: object
- key: str = '???'
- partition: str = '???'
- gpus_per_node: int = '???'
- cpus_per_gpu: int = '???'
- num_nodes: int = '???'
- gpu_type: str | None = None
- exclusive: bool = False
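The '???' defaults mark required fields (OmegaConf's missing-value sentinel), so each item must be filled in before it can be resolved. A sketch of one configuration entry; the partition and GPU names are made-up values:

```python
from ml.launchers.slurm import SlurmConfItem

# Illustrative cluster values; every "???" field must be supplied.
item = SlurmConfItem(
    key="a100-8x",      # name used to look this configuration up
    partition="gpu",    # Slurm partition to submit to
    gpus_per_node=8,
    cpus_per_gpu=12,
    num_nodes=1,
    gpu_type="a100",    # optional; leave as None for homogeneous partitions
    exclusive=True,     # request whole nodes
)
```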
- class ml.launchers.slurm.SlurmConf(configurations: list[ml.launchers.slurm.SlurmConfItem] = <factory>)[source]
Bases: object
- configurations: list[ml.launchers.slurm.SlurmConfItem]
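SlurmConf simply wraps a list of such items, which a launcher config can then select from by key. A short sketch, assuming the dataclasses are instantiated directly:

```python
from ml.launchers.slurm import SlurmConf, SlurmConfItem

# Hypothetical set of named cluster configurations.
conf = SlurmConf(
    configurations=[
        SlurmConfItem(key="debug", partition="debug", gpus_per_node=1,
                      cpus_per_gpu=8, num_nodes=1),
        SlurmConfItem(key="a100-8x", partition="gpu", gpus_per_node=8,
                      cpus_per_gpu=12, num_nodes=4, gpu_type="a100"),
    ]
)
```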
- class ml.launchers.slurm.SlurmLauncherConfig(name: str = '???', conf_key: str = '${oc.env:SLURM_DEFAULT_KEY,missing}', time_limit: str = '${oc.env:SLURM_TIME_LIMIT,3-00:00:00}', num_jobs: int = 1, comment: str | None = None, master_port: int = '${ml.get_random_slurm_port:1337}', model_parallelism: int = 1, pipeline_parallelism: int = 1, backend: str | None = None, model_parallel_backend: str | None = None, pipeline_parallel_backend: str | None = None, data_parallel_backend: str | None = None, account: str | None = None)[source]
Bases: BaseLauncherConfig
- conf_key: str = '${oc.env:SLURM_DEFAULT_KEY,missing}'
- time_limit: str = '${oc.env:SLURM_TIME_LIMIT,3-00:00:00}'
- num_jobs: int = 1
- comment: str | None = None
- master_port: int = '${ml.get_random_slurm_port:1337}'
- model_parallelism: int = 1
- pipeline_parallelism: int = 1
- backend: str | None = None
- model_parallel_backend: str | None = None
- pipeline_parallel_backend: str | None = None
- data_parallel_backend: str | None = None
- account: str | None = None
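Several defaults are OmegaConf interpolations: conf_key and time_limit read the SLURM_DEFAULT_KEY and SLURM_TIME_LIMIT environment variables, and master_port is resolved through the ml.get_random_slurm_port resolver. A sketch of setting these fields explicitly instead of through the environment; the values are illustrative only:

```python
from ml.launchers.slurm import SlurmLauncherConfig

# Illustrative overrides; by default conf_key and time_limit come from
# SLURM_DEFAULT_KEY / SLURM_TIME_LIMIT, and master_port is picked by a
# custom OmegaConf resolver.
config = SlurmLauncherConfig(
    name="slurm",
    conf_key="a100-8x",       # selects a SlurmConfItem by its key
    time_limit="1-00:00:00",  # D-HH:MM:SS
    num_jobs=1,
    master_port=29500,
    account="my-account",     # optional billing account
)
```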
- class ml.launchers.slurm.SlurmLauncher(config: BaseConfigT)[source]
Bases: BaseLauncher[SlurmLauncherConfig]
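A hedged end-to-end sketch, assuming the launcher is constructed directly from its config as the signature suggests; the framework's actual entry point may differ:

```python
from ml.launchers.slurm import SlurmLauncher, SlurmLauncherConfig

# Assumed usage pattern: build the config, then the launcher.
config = SlurmLauncherConfig(name="slurm", conf_key="a100-8x", num_jobs=1)
launcher = SlurmLauncher(config)
# Launching stages the environment, writes sbatch.sh, and schedules it;
# the exact method used to kick this off is framework-specific and not
# shown here.
```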