ml.launchers.slurm

Defines a launcher for Slurm jobs.

Steps

  1. Stages the environment to a new working directory

  2. Writes an sbatch.sh file

  3. Schedules the sbatch.sh file

This makes launches repeatable: a job can be re-run by scheduling the same sbatch.sh file again.

class ml.launchers.slurm.SlurmConfItem(key: str = '???', partition: str = '???', gpus_per_node: int = '???', cpus_per_gpu: int = '???', num_nodes: int = '???', gpu_type: str | None = None, exclusive: bool = False)[source]

Bases: object

key: str = '???'
partition: str = '???'
gpus_per_node: int = '???'
cpus_per_gpu: int = '???'
num_nodes: int = '???'
gpu_type: str | None = None
exclusive: bool = False
class ml.launchers.slurm.SlurmConf(configurations: list[ml.launchers.slurm.SlurmConfItem] = <factory>)[source]

Bases: object

configurations: list[ml.launchers.slurm.SlurmConfItem]
classmethod load() → SlurmConf[source]
save() → None[source]
ml.launchers.slurm.set_slurm_rank_and_world_size() → tuple[int, int][source]
ml.launchers.slurm.set_slurm_master_addr_and_port() → str[source]
ml.launchers.slurm.requeue_job() → None[source]
class ml.launchers.slurm.SlurmLauncherConfig(name: str = '???', conf_key: str = '${oc.env:SLURM_DEFAULT_KEY,missing}', time_limit: str = '${oc.env:SLURM_TIME_LIMIT,3-00:00:00}', num_jobs: int = 1, comment: str | None = None, master_port: int = '${ml.get_random_slurm_port:1337}', model_parallelism: int = 1, pipeline_parallelism: int = 1, backend: str | None = None, model_parallel_backend: str | None = None, pipeline_parallel_backend: str | None = None, data_parallel_backend: str | None = None, account: str | None = None)[source]

Bases: BaseLauncherConfig

conf_key: str = '${oc.env:SLURM_DEFAULT_KEY,missing}'
time_limit: str = '${oc.env:SLURM_TIME_LIMIT,3-00:00:00}'
num_jobs: int = 1
comment: str | None = None
master_port: int = '${ml.get_random_slurm_port:1337}'
model_parallelism: int = 1
pipeline_parallelism: int = 1
backend: str | None = None
model_parallel_backend: str | None = None
pipeline_parallel_backend: str | None = None
data_parallel_backend: str | None = None
account: str | None = None
class ml.launchers.slurm.SlurmLauncher(config: BaseConfigT)[source]

Bases: BaseLauncher[SlurmLauncherConfig]

launch() → None[source]

Launches the training process.

ml.launchers.slurm.slurm_main() → None[source]