ml.trainers.sl

Defines a trainer to use for supervised learning.

This trainer is akin to PyTorch Lightning or Keras in that it handles the training loop, logging, and checkpointing. It gets a dataset and dataloader from the task, then trains the model on that dataset.
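
Schematically, the loop this trainer runs looks like the sketch below. This is an illustration only: the method names on the task are hypothetical, and the real loop also handles logging, checkpointing, validation scheduling, mixed precision, and gradient clipping as configured further down.

    # Schematic sketch, not the library's implementation. `get_dataloader`
    # and `compute_loss` are hypothetical names for whatever the task
    # exposes, and the optimizer/scheduler are treated as if they had the
    # usual PyTorch interface.
    def training_loop_sketch(model, task, optimizer, lr_scheduler):
        dataloader = task.get_dataloader()  # hypothetical accessor
        for step, batch in enumerate(dataloader):
            loss = task.compute_loss(model, batch)  # hypothetical loss hook
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()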

class ml.trainers.sl.ValidationConfig(valid_every_n_steps: int | None = None, valid_every_n_seconds: float | None = 600.0, valid_first_n_seconds: float | None = 60.0)

Bases: object

valid_every_n_steps: int | None = None
valid_every_n_seconds: float | None = 600.0
valid_first_n_seconds: float | None = 60.0
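
Read together, the defaults mean: no step-based trigger, a validation pass at least every 600 seconds, and an early first pass after roughly 60 seconds rather than waiting a full interval. A rough sketch of that predicate, assuming this reading (not the library's actual should_validate implementation):

    import time

    def should_validate_sketch(cfg: ValidationConfig, step: int,
                               start_time: float, last_valid_time: float) -> bool:
        """Illustrative predicate only; the real logic may differ."""
        now = time.monotonic()
        no_validation_yet = last_valid_time == start_time
        if cfg.valid_every_n_steps is not None and step % cfg.valid_every_n_steps == 0:
            return True
        if cfg.valid_first_n_seconds is not None and no_validation_yet:
            if now - start_time >= cfg.valid_first_n_seconds:
                return True
        if cfg.valid_every_n_seconds is not None:
            if now - last_valid_time >= cfg.valid_every_n_seconds:
                return True
        return False
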
class ml.trainers.sl.BatchScheduleConfig(num_steps: int = '???', num_batches: int = '???')

Bases: object

num_steps: int = '???'
num_batches: int = '???'
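
The '???' defaults are OmegaConf's MISSING marker, so both fields must be supplied before the config is used. One plausible reading of a schedule entry (not verified against the implementation) is "use num_batches batches per optimizer step for the next num_steps steps":

    # Illustrative schedule: single batches while warming up, then
    # accumulate two batches per optimizer step.
    schedule = [
        BatchScheduleConfig(num_steps=1_000, num_batches=1),
        BatchScheduleConfig(num_steps=10_000, num_batches=2),
    ]
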
class ml.trainers.sl.SupervisedLearningTrainerConfig(name: str = '???', exp_name: str = '${ml.exp_name:null}', exp_dir: str = '???', log_dir_name: str = 'logs', use_double_weight_precision: bool = False, checkpoint: ml.trainers.base.CheckpointConfig = <factory>, parallel: ml.trainers.mixins.data_parallel.ParallelConfig = <factory>, compiler: ml.trainers.mixins.compile.TorchCompileConfig = <factory>, cpu_stats_ping_interval: int = 1, cpu_stats_only_log_once: bool = False, gpu_stats_ping_interval: int = 10, gpu_stats_only_log_once: bool = False, mixed_precision: ml.trainers.mixins.mixed_precision.MixedPrecisionConfig = <factory>, clip_grad_norm: float = 10.0, clip_grad_norm_type: Any = 2, balance_grad_norms: bool = False, profiler: ml.trainers.mixins.profiler.Profiler = <factory>, set_to_none: bool = True, deterministic: bool = False, use_tf32: bool = True, detect_anomaly: bool = False, validation: ml.trainers.sl.ValidationConfig = <factory>, batches_per_step: int = 1, batches_per_step_schedule: list[ml.trainers.sl.BatchScheduleConfig] | None = None, batch_chunks_per_step_schedule: list[ml.trainers.sl.BatchScheduleConfig] | None = None, batch_dim: int = 0)

Bases: BaseLearningTrainerConfig

validation: ValidationConfig
batches_per_step: int = 1
batches_per_step_schedule: list[ml.trainers.sl.BatchScheduleConfig] | None = None
batch_chunks_per_step_schedule: list[ml.trainers.sl.BatchScheduleConfig] | None = None
batch_dim: int = 0
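
The '${ml.exp_name:null}' interpolation in the signature suggests these dataclasses are populated through OmegaConf. A minimal sketch under that assumption (values are illustrative; name and exp_dir must be set because their defaults are the MISSING marker '???'):

    from omegaconf import OmegaConf

    config = OmegaConf.structured(SupervisedLearningTrainerConfig)
    config.name = "sl"               # required: default is '???'
    config.exp_dir = "runs/example"  # required: default is '???'
    config.batches_per_step = 2
    config.validation.valid_every_n_steps = 500
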
class ml.trainers.sl.SupervisedLearningTrainer(config: SupervisedLearningTrainerConfigT)

Bases: BaseLearningTrainer[SupervisedLearningTrainerConfigT, ModelT, SupervisedLearningTaskT], Generic[SupervisedLearningTrainerConfigT, ModelT, SupervisedLearningTaskT]

batches_per_step_schedule() → list[tuple[int, int]] | None
get_batches_per_step(state: State) → int
batch_chunks_schedule() → list[tuple[int, int]] | None
get_batch_chunks(state: State) → int
should_validate(state: State) → bool
train(model: ModelT, task: SupervisedLearningTaskT, optimizer: BaseOptimizer, lr_scheduler: BaseLRScheduler) → None

Runs the training loop.

Parameters:
  • model – The current model

  • task – The current task

  • optimizer – The current optimizer

  • lr_scheduler – The current learning rate scheduler

Raises:
  • ValueError – If the task is not a supervised learning task
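
An end-to-end call, assuming the model, task, optimizer, and scheduler have been constructed elsewhere; only the argument names and types come from the signature above, everything else is illustrative:

    # `model`, `task`, `optimizer`, and `lr_scheduler` are assumed to be
    # instances of ModelT, SupervisedLearningTaskT, BaseOptimizer, and
    # BaseLRScheduler. Passing a task that is not a supervised learning
    # task raises ValueError.
    trainer = SupervisedLearningTrainer(config)
    trainer.train(model, task, optimizer, lr_scheduler)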