ml.trainers.mixins.cpu_stats

A trainer mixin for logging CPU statistics.

This mixin logs memory and CPU utilization from a background process, which periodically sends the collected statistics to the logging process. This is useful for detecting memory leaks in your dataloader, among other issues.

class ml.trainers.mixins.cpu_stats.CPUStatsConfig(name: str = '???', exp_name: str = '${ml.exp_name:null}', exp_dir: str = '???', log_dir_name: str = 'logs', use_double_weight_precision: bool = False, checkpoint: ml.trainers.base.CheckpointConfig = <factory>, cpu_stats_ping_interval: int = 1, cpu_stats_only_log_once: bool = False)[source]

Bases: MonitorProcessConfig

cpu_stats_ping_interval: int = 1
cpu_stats_only_log_once: bool = False
class ml.trainers.mixins.cpu_stats.CPUStats[source]

Bases: Structure

child_cpu_percent

Structure/Union member

child_mem_percent

Structure/Union member

cpu_percent

Structure/Union member

mem_percent

Structure/Union member

mem_rss

Structure/Union member

mem_rss_total

Structure/Union member

mem_shared

Structure/Union member

mem_vms

Structure/Union member

mem_vms_total

Structure/Union member

num_child_procs

Structure/Union member

class ml.trainers.mixins.cpu_stats.CPUStatsInfo(cpu_percent: float, mem_percent: float, mem_rss: int, mem_vms: int, mem_shared: int, mem_rss_total: int, mem_vms_total: int, child_cpu_percent: float, child_mem_percent: float, num_child_procs: int)[source]

Bases: object

cpu_percent: float
mem_percent: float
mem_rss: int
mem_vms: int
mem_shared: int
mem_rss_total: int
mem_vms_total: int
child_cpu_percent: float
child_mem_percent: float
num_child_procs: int
classmethod from_stats(stats: CPUStats) → CPUStatsInfo[source]
ml.trainers.mixins.cpu_stats.worker(ping_interval: float, stats: ValueProxy[CPUStats], monitor_event: Event, start_event: Event, pid: int) → None[source]
class ml.trainers.mixins.cpu_stats.CPUStatsMonitor(ping_interval: float, manager: SyncManager)[source]

Bases: object

get_if_set() → CPUStatsInfo | None[source]
get() → CPUStatsInfo | None[source]
start(wait: bool = False) → None[source]
stop() → None[source]
class ml.trainers.mixins.cpu_stats.CPUStatsMixin(config: CPUStatsConfigT)[source]

Bases: MonitorProcessMixin[CPUStatsConfigT, ModelT, TaskT]

Defines a trainer mixin for getting CPU statistics.

on_training_start(state: State, task: TaskT, model: ModelT, optim: Optimizer | dict[str, torch.optim.optimizer.Optimizer], lr_sched: SchedulerAdapter | dict[str, ml.lr_schedulers.base.SchedulerAdapter]) → None[source]
on_training_end(state: State, task: TaskT, model: ModelT, optim: Optimizer | dict[str, torch.optim.optimizer.Optimizer], lr_sched: SchedulerAdapter | dict[str, ml.lr_schedulers.base.SchedulerAdapter]) → None[source]
on_step_start(state: State, task: TaskT, model: ModelT, optim: Optimizer | dict[str, torch.optim.optimizer.Optimizer], lr_sched: SchedulerAdapter | dict[str, ml.lr_schedulers.base.SchedulerAdapter]) → None[source]