From f42cdcf1373412971ee2f024e47c02e990a3f79e Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Thu, 2 Jul 2026 01:04:23 +0200
Subject: [PATCH] fix(groot): align N1.7 fine-tuning
 optimizer/scheduler/precision with Isaac-GR00T

Evidence from the LeRobot-vs-OSS checkpoint comparison: the LeRobot/HF 8k
checkpoint's DiT moved only ~19% as far from base as the OSS-trained one
(0.0547 vs 0.285 relative L2) - undertrained because the scheduler decayed over
a hardcoded 10k steps regardless of --steps, on top of beta1/clip mismatches.

- AdamW betas (0.95, 0.999) -> (0.9, 0.999) and grad_clip_norm 10.0 -> 1.0
  (Isaac defaults)
- scheduler: hardcoded CosineDecayWithWarmup(10k decay, floor 10% peak) ->
  DiffuserSchedulerConfig HF cosine with ceil(max_steps * warmup_ratio) warmup,
  deriving num_training_steps from the outer --steps at runtime
- model_params_fp32 (default true): keep master weights in FP32 and compute
  under BF16 autocast like the native N1.7 recipe (fixes optimizer-update
  numerics vs pure-BF16 params)
- weight-decay grouping via transformers get_parameter_names: biases and norm
  parameters excluded from decay
- restore the TF4 lm_head/embedding weight tie so the unused Qwen LM head stays
  frozen and deduplicated in checkpoints
- action_mask kept in native dtype for the masked flow-matching loss
- drop_n_last_frames: exclude episode tails that cannot supply a complete
  action chunk (Isaac sampler behavior)

Verification: tests/policies/groot/test_groot_training_optim_contract.py
(7 passed) + remaining groot suite 11 passed/5 skipped on RTX PRO 6000 /
CUDA 13.3. Note: tests/policies/groot/test_groot_n1_7.py does not collect on
the base branch (pre-existing ImportError, fixed in PR #37).
---
 .../policies/groot/configuration_groot.py     |  32 +++--
 src/lerobot/policies/groot/groot_n1_7.py      |  16 ++-
 src/lerobot/policies/groot/modeling_groot.py  |  48 ++++++-
 .../test_groot_training_optim_contract.py     | 121 ++++++++++++++++++
 4 files changed, 203 insertions(+), 14 deletions(-)
 create mode 100644 tests/policies/groot/test_groot_training_optim_contract.py

diff --git a/src/lerobot/policies/groot/configuration_groot.py b/src/lerobot/policies/groot/configuration_groot.py
index bf9e96321..97e08bb76 100644
--- a/src/lerobot/policies/groot/configuration_groot.py
+++ b/src/lerobot/policies/groot/configuration_groot.py
@@ -15,11 +15,12 @@
 # limitations under the License.
 
 import logging
+import math
 from dataclasses import dataclass, field
 from pathlib import Path
 
 from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature, PreTrainedConfig
-from lerobot.optim import AdamWConfig, CosineDecayWithWarmupSchedulerConfig
+from lerobot.optim import AdamWConfig, DiffuserSchedulerConfig
 from lerobot.utils.constants import ACTION, OBS_STATE
 
 from .utils import read_json
@@ -336,11 +337,14 @@ class GrootConfig(PreTrainedConfig):
 
     # Training parameters
     optimizer_lr: float = 1e-4
-    optimizer_betas: tuple[float, float] = (0.95, 0.999)
+    # Isaac-GR00T N1.7 fine-tunes with AdamW betas (0.9, 0.999).
+    optimizer_betas: tuple[float, float] = (0.9, 0.999)
     optimizer_eps: float = 1e-8
     optimizer_weight_decay: float = 1e-5
     warmup_ratio: float = 0.05
     use_bf16: bool = True
+    # The native N1.7 fine-tuning recipe keeps model parameters in FP32 and computes under BF16 autocast.
+    model_params_fp32: bool = True
 
     # TODO(Steven): Remove these deprecated fields in a future release.
     # Deprecated Isaac-GR00T runner / GR00T N1.5 fields, plus the (never-wired) LoRA fields — all
@@ -480,15 +484,20 @@ class GrootConfig(PreTrainedConfig):
             betas=self.optimizer_betas,
             eps=self.optimizer_eps,
             weight_decay=self.optimizer_weight_decay,
+            grad_clip_norm=1.0,
         )
 
-    def get_scheduler_preset(self) -> CosineDecayWithWarmupSchedulerConfig:
-        """Return scheduler configuration."""
-        return CosineDecayWithWarmupSchedulerConfig(
-            num_warmup_steps=int(10000 * self.warmup_ratio),  # 5% warmup by default
-            num_decay_steps=10000,  # Adjust based on training steps
-            peak_lr=self.optimizer_lr,
-            decay_lr=self.optimizer_lr * 0.1,
+    def get_scheduler_preset(self) -> DiffuserSchedulerConfig:
+        """Return scheduler configuration.
+
+        Isaac-GR00T uses the HF Trainer cosine schedule with ~5% warmup over the
+        actual training update count; DiffuserSchedulerConfig wraps the same
+        diffusers/transformers `get_scheduler("cosine")` implementation and
+        derives num_training_steps from the outer --steps value at runtime.
+        """
+        return DiffuserSchedulerConfig(
+            name="cosine",
+            num_warmup_steps=math.ceil(self.max_steps * self.warmup_ratio),
         )
 
     @property
@@ -504,6 +513,11 @@ class GrootConfig(PreTrainedConfig):
         )
         return list(range(min(self.chunk_size, model_action_horizon)))
 
+    @property
+    def drop_n_last_frames(self) -> int:
+        """Exclude episode tails that cannot supply a complete N1.7 action chunk."""
+        return max(0, len(self.action_delta_indices) - 1)
+
     @property
     def reward_delta_indices(self) -> None:
         """Return indices for delta rewards (None for Groot)."""
diff --git a/src/lerobot/policies/groot/groot_n1_7.py b/src/lerobot/policies/groot/groot_n1_7.py
index 5a49ceed2..b72eadb7a 100644
--- a/src/lerobot/policies/groot/groot_n1_7.py
+++ b/src/lerobot/policies/groot/groot_n1_7.py
@@ -60,6 +60,19 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
+def _tie_unused_qwen_lm_head(model: nn.Module) -> None:
+    """Restore the TF4 weight tie so the unused LM head stays frozen and is omitted on save."""
+    lm_head = getattr(model, "lm_head", None)
+    get_input_embeddings = getattr(model, "get_input_embeddings", None)
+    if lm_head is None or not callable(get_input_embeddings):
+        return
+    input_embeddings = get_input_embeddings()
+    embedding_weight = getattr(input_embeddings, "weight", None)
+    if embedding_weight is None:
+        return
+    lm_head.weight = embedding_weight
+
+
 GR00T_N1_7_DEFAULTS: dict[str, Any] = {
     "model_dtype": "bfloat16",
     "dtype": "bfloat16",
@@ -288,6 +301,7 @@ class Qwen3Backbone(nn.Module):
                 config_kwargs=transformers_loading_kwargs,
             ).eval()
 
+        _tie_unused_qwen_lm_head(self.model)
         while len(self.language_model.layers) > select_layer:
             self.language_model.layers.pop(-1)
 
@@ -603,7 +617,7 @@ class GR00TN17ActionHead(nn.Module):
 
         pred = self.action_decoder(model_output, embodiment_id)
         pred_actions = pred[:, -actions.shape[1] :]
-        action_mask = action_input.action_mask.to(dtype=pred_actions.dtype)
+        action_mask = action_input.action_mask
         action_loss = F.mse_loss(pred_actions, velocity, reduction="none") * action_mask
         loss = action_loss.sum() / (action_mask.sum() + 1e-6)
         return BatchFeature(
diff --git a/src/lerobot/policies/groot/modeling_groot.py b/src/lerobot/policies/groot/modeling_groot.py
index f0e1c08d3..a754c1fb1 100644
--- a/src/lerobot/policies/groot/modeling_groot.py
+++ b/src/lerobot/policies/groot/modeling_groot.py
@@ -34,6 +34,7 @@ from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import SAFETENSORS_SINGLE_FILE
 from huggingface_hub.errors import HfHubHTTPError
 from torch import Tensor
+from transformers.trainer_pt_utils import get_parameter_names
 
 from lerobot.configs import FeatureType, PolicyFeature
 from lerobot.utils.constants import ACTION, OBS_IMAGES
@@ -50,7 +51,7 @@ from .configuration_groot import (
     infer_groot_n1_7_action_execution_horizon,
     infer_groot_n1_7_action_horizon,
 )
-from .groot_n1_7 import GR00TN17
+from .groot_n1_7 import GR00TN17, _tie_unused_qwen_lm_head
 
 logger = logging.getLogger(__name__)
 
@@ -96,11 +97,49 @@ class GrootPolicy(PreTrainedPolicy):
         if self.config.rtc_ramp_rate is not None:
             model_kwargs["rtc_ramp_rate"] = self.config.rtc_ramp_rate
 
-        return GR00TN17.from_pretrained(
+        model = GR00TN17.from_pretrained(
             **model_kwargs,
             tune_vlln=self.config.tune_vlln,
             transformers_loading_kwargs={"trust_remote_code": True},
         )
+        backbone = getattr(model, "backbone", None)
+        qwen_model = getattr(backbone, "model", None)
+        if qwen_model is not None:
+            _tie_unused_qwen_lm_head(qwen_model)
+        if self.config.model_params_fp32:
+            self._cast_model_parameters_to_fp32(model)
+        return model
+
+    @staticmethod
+    def _cast_model_parameters_to_fp32(model: torch.nn.Module) -> None:
+        for parameter in model.parameters():
+            if parameter.is_floating_point():
+                parameter.data = parameter.data.to(torch.float32)
+
+    @staticmethod
+    def _build_weight_decay_parameter_groups(model: torch.nn.Module) -> list[dict[str, object]]:
+        forbidden_name_patterns = [
+            r"bias",
+            r"layernorm",
+            r"rmsnorm",
+            r"(?:^|\.)norm(?:$|\.)",
+            r"_norm(?:$|\.)",
+        ]
+        decay_names = set(get_parameter_names(model, [torch.nn.LayerNorm], forbidden_name_patterns))
+        decay_params = [
+            parameter
+            for name, parameter in model.named_parameters()
+            if parameter.requires_grad and name in decay_names
+        ]
+        no_decay_params = [
+            parameter
+            for name, parameter in model.named_parameters()
+            if parameter.requires_grad and name not in decay_names
+        ]
+        return [
+            {"params": decay_params},
+            {"params": no_decay_params, "weight_decay": 0.0},
+        ]
 
     def reset(self):
         """Reset policy state when environment resets."""
@@ -238,8 +277,9 @@ class GrootPolicy(PreTrainedPolicy):
         policy.eval()
         return policy
 
-    def get_optim_params(self) -> dict:
-        return self.parameters()
+    def get_optim_params(self):  # type: ignore[override]
+        """Isaac-GR00T excludes biases and normalization parameters from weight decay."""
+        return self._build_weight_decay_parameter_groups(self)
 
     def _resolve_action_queue_steps(self) -> int:
         n_action_steps = int(self.config.n_action_steps)
diff --git a/tests/policies/groot/test_groot_training_optim_contract.py b/tests/policies/groot/test_groot_training_optim_contract.py
new file mode 100644
index 000000000..75bcdda0a
--- /dev/null
+++ b/tests/policies/groot/test_groot_training_optim_contract.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Isaac-GR00T N1.7 optimizer/scheduler/precision training contract.
+
+Pins the LeRobot GR00T fine-tuning recipe to the native Isaac-GR00T contract:
+AdamW(lr=1e-4, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-5, grad clip 1.0),
+HF cosine schedule with ~5% warmup over the actual update count, FP32 master
+parameters under BF16 autocast, transformers-style weight-decay grouping, the
+frozen LM-head weight tie, and episode-tail exclusion for incomplete chunks.
+"""
+
+import pytest
+import torch
+
+from lerobot.optim.schedulers import DiffuserSchedulerConfig
+from lerobot.policies.groot.configuration_groot import GrootConfig
+from lerobot.policies.groot.groot_n1_7 import _tie_unused_qwen_lm_head
+from lerobot.policies.groot.modeling_groot import GrootPolicy
+
+
+def test_groot_n1_7_optimizer_matches_isaac_training_contract():
+    optimizer = GrootConfig().get_optimizer_preset()
+
+    assert optimizer.lr == pytest.approx(1e-4)
+    assert optimizer.betas == pytest.approx((0.9, 0.999))
+    assert optimizer.eps == pytest.approx(1e-8)
+    assert optimizer.weight_decay == pytest.approx(1e-5)
+    assert optimizer.grad_clip_norm == pytest.approx(1.0)
+
+
+def test_groot_n1_7_sampler_excludes_incomplete_action_tails():
+    config = GrootConfig(chunk_size=16, n_action_steps=16)
+
+    assert len(config.action_delta_indices) == 16
+    assert config.drop_n_last_frames == 15
+
+
+def test_groot_n1_7_scheduler_matches_isaac_hf_cosine_contract():
+    config = GrootConfig(max_steps=20_000)
+    scheduler_config = config.get_scheduler_preset()
+
+    assert isinstance(scheduler_config, DiffuserSchedulerConfig)
+    assert scheduler_config.name == "cosine"
+    assert scheduler_config.num_warmup_steps == 1_000
+
+    parameter = torch.nn.Parameter(torch.ones(()))
+    optimizer = torch.optim.AdamW([parameter], lr=config.optimizer_lr)
+    scheduler = scheduler_config.build(optimizer, num_training_steps=20_000)
+    lr_factor = scheduler.lr_lambdas[0]
+
+    assert lr_factor(0) == pytest.approx(0.0)
+    assert lr_factor(1_000) == pytest.approx(1.0)
+    assert lr_factor(10_500) == pytest.approx(0.5)
+    assert lr_factor(20_000) == pytest.approx(0.0, abs=1e-12)
+
+
+def test_groot_n1_7_scheduler_rounds_fractional_warmup_up_like_transformers():
+    scheduler_config = GrootConfig(max_steps=777).get_scheduler_preset()
+
+    assert scheduler_config.num_warmup_steps == 39
+
+
+def test_groot_n1_7_model_parameters_use_fp32_checkpoint_and_optimizer_precision():
+    module = torch.nn.Module()
+    module.trainable = torch.nn.Parameter(torch.ones(3, dtype=torch.bfloat16))
+    module.frozen = torch.nn.Parameter(torch.ones(3, dtype=torch.bfloat16), requires_grad=False)
+
+    GrootPolicy._cast_model_parameters_to_fp32(module)
+
+    assert module.trainable.dtype == torch.float32
+    assert module.frozen.dtype == torch.float32
+
+
+def test_groot_n1_7_ties_unused_qwen_lm_head_to_frozen_input_embeddings():
+    class DummyQwen(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.embed_tokens = torch.nn.Embedding(7, 3)
+            self.lm_head = torch.nn.Linear(3, 7, bias=False)
+
+        def get_input_embeddings(self):
+            return self.embed_tokens
+
+    model = DummyQwen()
+    _tie_unused_qwen_lm_head(model)
+
+    assert model.lm_head.weight is model.embed_tokens.weight
+    assert len(list(model.parameters())) == 1
+
+
+def test_groot_n1_7_optimizer_groups_match_transformers_weight_decay_rules():
+    module = torch.nn.Module()
+    module.linear = torch.nn.Linear(3, 2)
+    module.norm = torch.nn.LayerNorm(2)
+    module.frozen = torch.nn.Parameter(torch.ones(1), requires_grad=False)
+
+    groups = GrootPolicy._build_weight_decay_parameter_groups(module)
+
+    assert len(groups) == 2
+    assert "weight_decay" not in groups[0]
+    assert groups[1]["weight_decay"] == 0.0
+    assert groups[0]["params"] == [module.linear.weight]
+    assert {id(parameter) for parameter in groups[1]["params"]} == {
+        id(module.linear.bias),
+        id(module.norm.weight),
+        id(module.norm.bias),
+    }