ml.trainers.mixins.gpu_stats

A trainer mixin for logging GPU statistics.

This logs GPU memory and utilization in a background process using nvidia-smi, if a GPU is available in the system.

class ml.trainers.mixins.gpu_stats.GPUStatsConfig(name: str = '???', exp_name: str = '${ml.exp_name:null}', exp_dir: str = '???', log_dir_name: str = 'logs', use_double_weight_precision: bool = False, checkpoint: ml.trainers.base.CheckpointConfig = <factory>, gpu_stats_ping_interval: int = 10, gpu_stats_only_log_once: bool = False)[source]

Bases: MonitorProcessConfig

gpu_stats_ping_interval: int = 10
gpu_stats_only_log_once: bool = False
class ml.trainers.mixins.gpu_stats.GPUStats[source]

Bases: Structure

index

Structure/Union member

memory_used

Structure/Union member

temperature

Structure/Union member

utilization

Structure/Union member

class ml.trainers.mixins.gpu_stats.GPUStatsInfo(index: int, memory_used: float, temperature: float, utilization: float)[source]

Bases: object

index: int
memory_used: float
temperature: float
utilization: float
classmethod from_stats(stats: GPUStats) → GPUStatsInfo [source]
ml.trainers.mixins.gpu_stats.get_num_gpus() → int [source]
ml.trainers.mixins.gpu_stats.parse_number(s: str) → float [source]
ml.trainers.mixins.gpu_stats.parse_gpu_stats(row: str) → GPUStats [source]
ml.trainers.mixins.gpu_stats.gen_gpu_stats(loop_secs: int = 5) → Iterable[GPUStats] [source]
ml.trainers.mixins.gpu_stats.worker(ping_interval: int, smems: list[multiprocessing.managers.ValueProxy[ml.trainers.mixins.gpu_stats.GPUStats]], main_event: Event, events: list[multiprocessing.synchronize.Event], start_event: Event) → None [source]
class ml.trainers.mixins.gpu_stats.GPUStatsMonitor(ping_interval: float, manager: SyncManager)[source]

Bases: object

get_if_set() → dict[int, ml.trainers.mixins.gpu_stats.GPUStatsInfo] [source]
get() → dict[int, ml.trainers.mixins.gpu_stats.GPUStatsInfo] [source]
start(wait: bool = False) → None [source]
stop() → None [source]
class ml.trainers.mixins.gpu_stats.GPUStatsMixin(config: GPUStatsConfigT)[source]

Bases: MonitorProcessMixin[GPUStatsConfigT, ModelT, TaskT]

Defines a trainer mixin for getting GPU statistics.

on_training_start(state: State, task: TaskT, model: ModelT, optim: Optimizer | dict[str, torch.optim.optimizer.Optimizer], lr_sched: SchedulerAdapter | dict[str, ml.lr_schedulers.base.SchedulerAdapter]) → None [source]
on_training_end(state: State, task: TaskT, model: ModelT, optim: Optimizer | dict[str, torch.optim.optimizer.Optimizer], lr_sched: SchedulerAdapter | dict[str, ml.lr_schedulers.base.SchedulerAdapter]) → None [source]
on_step_start(state: State, task: TaskT, model: ModelT, optim: Optimizer | dict[str, torch.optim.optimizer.Optimizer], lr_sched: SchedulerAdapter | dict[str, ml.lr_schedulers.base.SchedulerAdapter]) → None [source]