mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-13 15:49:53 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 73780046b2 | |||
| 093a85f946 | |||
| a669049da2 |
@@ -81,10 +81,14 @@ class AdamWConfig(OptimizerConfig):
|
|||||||
eps: float = 1e-8
|
eps: float = 1e-8
|
||||||
weight_decay: float = 1e-2
|
weight_decay: float = 1e-2
|
||||||
grad_clip_norm: float = 10.0
|
grad_clip_norm: float = 10.0
|
||||||
|
fused: bool = False
|
||||||
|
|
||||||
def build(self, params: dict) -> torch.optim.Optimizer:
|
def build(self, params: dict) -> torch.optim.Optimizer:
|
||||||
kwargs = asdict(self)
|
kwargs = asdict(self)
|
||||||
kwargs.pop("grad_clip_norm")
|
kwargs.pop("grad_clip_norm")
|
||||||
|
# Fused optimizer only works on CUDA
|
||||||
|
if kwargs.get("fused") and not torch.cuda.is_available():
|
||||||
|
kwargs["fused"] = False
|
||||||
return torch.optim.AdamW(params, **kwargs)
|
return torch.optim.AdamW(params, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -136,6 +136,7 @@ class ACTConfig(PreTrainedConfig):
|
|||||||
optimizer_lr: float = 1e-5
|
optimizer_lr: float = 1e-5
|
||||||
optimizer_weight_decay: float = 1e-4
|
optimizer_weight_decay: float = 1e-4
|
||||||
optimizer_lr_backbone: float = 1e-5
|
optimizer_lr_backbone: float = 1e-5
|
||||||
|
optimizer_fused: bool = False # Use CUDA fused AdamW kernel
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
super().__post_init__()
|
super().__post_init__()
|
||||||
@@ -164,6 +165,7 @@ class ACTConfig(PreTrainedConfig):
|
|||||||
return AdamWConfig(
|
return AdamWConfig(
|
||||||
lr=self.optimizer_lr,
|
lr=self.optimizer_lr,
|
||||||
weight_decay=self.optimizer_weight_decay,
|
weight_decay=self.optimizer_weight_decay,
|
||||||
|
fused=self.optimizer_fused,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_scheduler_preset(self) -> None:
|
def get_scheduler_preset(self) -> None:
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ class GrootConfig(PreTrainedConfig):
|
|||||||
optimizer_betas: tuple[float, float] = (0.95, 0.999)
|
optimizer_betas: tuple[float, float] = (0.95, 0.999)
|
||||||
optimizer_eps: float = 1e-8
|
optimizer_eps: float = 1e-8
|
||||||
optimizer_weight_decay: float = 1e-5
|
optimizer_weight_decay: float = 1e-5
|
||||||
|
optimizer_fused: bool = False # Use CUDA fused AdamW kernel
|
||||||
warmup_ratio: float = 0.05
|
warmup_ratio: float = 0.05
|
||||||
use_bf16: bool = True
|
use_bf16: bool = True
|
||||||
|
|
||||||
@@ -174,6 +175,7 @@ class GrootConfig(PreTrainedConfig):
|
|||||||
betas=self.optimizer_betas,
|
betas=self.optimizer_betas,
|
||||||
eps=self.optimizer_eps,
|
eps=self.optimizer_eps,
|
||||||
weight_decay=self.optimizer_weight_decay,
|
weight_decay=self.optimizer_weight_decay,
|
||||||
|
fused=self.optimizer_fused,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_scheduler_preset(self) -> CosineDecayWithWarmupSchedulerConfig:
|
def get_scheduler_preset(self) -> CosineDecayWithWarmupSchedulerConfig:
|
||||||
|
|||||||
@@ -74,6 +74,7 @@ class PI0Config(PreTrainedConfig):
|
|||||||
gradient_checkpointing: bool = False # Enable gradient checkpointing for memory optimization
|
gradient_checkpointing: bool = False # Enable gradient checkpointing for memory optimization
|
||||||
compile_model: bool = False # Whether to use torch.compile for model optimization
|
compile_model: bool = False # Whether to use torch.compile for model optimization
|
||||||
compile_mode: str = "max-autotune" # Torch compile mode
|
compile_mode: str = "max-autotune" # Torch compile mode
|
||||||
|
optimizer_fused: bool = False # Use CUDA fused AdamW kernel
|
||||||
device: str | None = None # Device to use for the model (None = auto-detect)
|
device: str | None = None # Device to use for the model (None = auto-detect)
|
||||||
|
|
||||||
# Optimizer settings: see openpi `AdamW`
|
# Optimizer settings: see openpi `AdamW`
|
||||||
@@ -141,6 +142,7 @@ class PI0Config(PreTrainedConfig):
|
|||||||
eps=self.optimizer_eps,
|
eps=self.optimizer_eps,
|
||||||
weight_decay=self.optimizer_weight_decay,
|
weight_decay=self.optimizer_weight_decay,
|
||||||
grad_clip_norm=self.optimizer_grad_clip_norm,
|
grad_clip_norm=self.optimizer_grad_clip_norm,
|
||||||
|
fused=self.optimizer_fused,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_scheduler_preset(self):
|
def get_scheduler_preset(self):
|
||||||
|
|||||||
@@ -74,6 +74,7 @@ class PI05Config(PreTrainedConfig):
|
|||||||
gradient_checkpointing: bool = False # Enable gradient checkpointing for memory optimization
|
gradient_checkpointing: bool = False # Enable gradient checkpointing for memory optimization
|
||||||
compile_model: bool = False # Whether to use torch.compile for model optimization
|
compile_model: bool = False # Whether to use torch.compile for model optimization
|
||||||
compile_mode: str = "max-autotune" # Torch compile mode
|
compile_mode: str = "max-autotune" # Torch compile mode
|
||||||
|
optimizer_fused: bool = False # Use CUDA fused AdamW kernel
|
||||||
device: str | None = None # Device to use for the model (None = auto-detect)
|
device: str | None = None # Device to use for the model (None = auto-detect)
|
||||||
|
|
||||||
# Optimizer settings: see openpi `AdamW`
|
# Optimizer settings: see openpi `AdamW`
|
||||||
@@ -141,6 +142,7 @@ class PI05Config(PreTrainedConfig):
|
|||||||
eps=self.optimizer_eps,
|
eps=self.optimizer_eps,
|
||||||
weight_decay=self.optimizer_weight_decay,
|
weight_decay=self.optimizer_weight_decay,
|
||||||
grad_clip_norm=self.optimizer_grad_clip_norm,
|
grad_clip_norm=self.optimizer_grad_clip_norm,
|
||||||
|
fused=self.optimizer_fused,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_scheduler_preset(self):
|
def get_scheduler_preset(self):
|
||||||
|
|||||||
@@ -79,6 +79,7 @@ class SmolVLAConfig(PreTrainedConfig):
|
|||||||
optimizer_eps: float = 1e-8
|
optimizer_eps: float = 1e-8
|
||||||
optimizer_weight_decay: float = 1e-10
|
optimizer_weight_decay: float = 1e-10
|
||||||
optimizer_grad_clip_norm: float = 10
|
optimizer_grad_clip_norm: float = 10
|
||||||
|
optimizer_fused: bool = False
|
||||||
|
|
||||||
scheduler_warmup_steps: int = 1_000
|
scheduler_warmup_steps: int = 1_000
|
||||||
scheduler_decay_steps: int = 30_000
|
scheduler_decay_steps: int = 30_000
|
||||||
@@ -136,6 +137,7 @@ class SmolVLAConfig(PreTrainedConfig):
|
|||||||
eps=self.optimizer_eps,
|
eps=self.optimizer_eps,
|
||||||
weight_decay=self.optimizer_weight_decay,
|
weight_decay=self.optimizer_weight_decay,
|
||||||
grad_clip_norm=self.optimizer_grad_clip_norm,
|
grad_clip_norm=self.optimizer_grad_clip_norm,
|
||||||
|
fused=self.optimizer_fused,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_scheduler_preset(self):
|
def get_scheduler_preset(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user