From 162b07512ab665465b6a888a78a5bdeac895a775 Mon Sep 17 00:00:00 2001
From: nv-sachdevkartik <ksachdev@nvidia.com>
Date: Thu, 11 Jun 2026 17:49:12 +0000
Subject: [PATCH] groot: remove dead N1.5 code (eagle2_hg_model,
 flow_matching_action_head, action_encoder)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

N1.7 backbone is nvidia/Cosmos-Reason2-2B via Qwen3VLForConditionalGeneration,
not Eagle2 — eagle2_hg_model/ had zero refs outside its own dir.

GR00TN17ActionHead (groot_n1_7.py) re-implements MultiEmbodimentActionEncoder +
CategorySpecificLinear + swish + SinusoidalPositionalEncoding locally, so
flow_matching_action_head.py (N1.5 FlowmatchingActionHead) and its sole
dependency action_encoder.py are dead. Verified: no src/ or tests/ reference.

Removed (~2037 LOC):
- eagle2_hg_model/ (4 files, ~1575 LOC)
- action_head/flow_matching_action_head.py (408 LOC)
- action_head/action_encoder.py (54 LOC)

cross_attention_dit.py KEPT (DiT/AlternateVLDiT/SelfAttentionTransformer live in N1.7).
---
 .../groot/action_head/action_encoder.py       |  54 --
 .../action_head/flow_matching_action_head.py  | 408 -------------
 .../configuration_eagle2_5_vl.py              | 135 -----
 .../image_processing_eagle2_5_vl_fast.py      | 503 ----------------
 .../eagle2_hg_model/modeling_eagle2_5_vl.py   | 396 -------------
 .../eagle2_hg_model/processing_eagle2_5_vl.py | 541 ------------------
 6 files changed, 2037 deletions(-)
 delete mode 100644 src/lerobot/policies/groot/action_head/action_encoder.py
 delete mode 100644 src/lerobot/policies/groot/action_head/flow_matching_action_head.py
 delete mode 100755 src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py
 delete mode 100644 src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py
 delete mode 100755 src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
 delete mode 100755 src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py

diff --git a/src/lerobot/policies/groot/action_head/action_encoder.py b/src/lerobot/policies/groot/action_head/action_encoder.py
deleted file mode 100644
index c6fa0a779..000000000
--- a/src/lerobot/policies/groot/action_head/action_encoder.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-class SinusoidalPositionalEncoding(nn.Module):
-    """
-    Produces a sinusoidal encoding of shape (B, T, w)
-    given timesteps of shape (B, T).
-    """
-
-    def __init__(self, embedding_dim):
-        super().__init__()
-        self.embedding_dim = embedding_dim
-
-    def forward(self, timesteps):
-        # timesteps: shape (B, T)
-        # We'll compute sin/cos frequencies across dim T
-        timesteps = timesteps.float()  # ensure float
-
-        b, t = timesteps.shape
-        device = timesteps.device
-
-        half_dim = self.embedding_dim // 2
-        # typical log space frequencies for sinusoidal encoding
-        exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * (
-            torch.log(torch.tensor(10000.0)) / half_dim
-        )
-        # Expand timesteps to (B, T, 1) then multiply
-        freqs = timesteps.unsqueeze(-1) * exponent.exp()  # (B, T, half_dim)
-
-        sin = torch.sin(freqs)
-        cos = torch.cos(freqs)
-        enc = torch.cat([sin, cos], dim=-1)  # (B, T, w)
-
-        return enc
diff --git a/src/lerobot/policies/groot/action_head/flow_matching_action_head.py b/src/lerobot/policies/groot/action_head/flow_matching_action_head.py
deleted file mode 100644
index 9c41237d6..000000000
--- a/src/lerobot/policies/groot/action_head/flow_matching_action_head.py
+++ /dev/null
@@ -1,408 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import field
-from typing import TYPE_CHECKING
-
-import torch
-import torch.nn.functional as F  # noqa: N812
-from torch import nn
-from torch.distributions import Beta
-
-from lerobot.utils.import_utils import _transformers_available
-
-# Conditional import for type checking and lazy loading
-if TYPE_CHECKING or _transformers_available:
-    from transformers import PretrainedConfig
-    from transformers.feature_extraction_utils import BatchFeature
-else:
-    PretrainedConfig = object
-    BatchFeature = None
-
-from .action_encoder import (
-    SinusoidalPositionalEncoding,
-    swish,
-)
-from .cross_attention_dit import DiT, SelfAttentionTransformer
-
-
-class CategorySpecificLinear(nn.Module):
-    def __init__(self, num_categories, input_dim, hidden_dim):
-        super().__init__()
-        self.num_categories = num_categories
-        # For each category, we have separate weights and biases.
-        self.W = nn.Parameter(0.02 * torch.randn(num_categories, input_dim, hidden_dim))
-        self.b = nn.Parameter(torch.zeros(num_categories, hidden_dim))
-
-    def forward(self, x, cat_ids):
-        selected_w = self.W[cat_ids]
-        selected_b = self.b[cat_ids]
-        return torch.bmm(x, selected_w) + selected_b.unsqueeze(1)
-
-
-class CategorySpecificMLP(nn.Module):
-    def __init__(self, num_categories, input_dim, hidden_dim, output_dim):
-        super().__init__()
-        self.num_categories = num_categories
-        self.layer1 = CategorySpecificLinear(num_categories, input_dim, hidden_dim)
-        self.layer2 = CategorySpecificLinear(num_categories, hidden_dim, output_dim)
-
-    def forward(self, x, cat_ids):
-        hidden = F.relu(self.layer1(x, cat_ids))
-        return self.layer2(hidden, cat_ids)
-
-
-class MultiEmbodimentActionEncoder(nn.Module):
-    def __init__(self, action_dim, hidden_size, num_embodiments):
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.num_embodiments = num_embodiments
-
-        # W1: R^{w x d}, W2: R^{w x 2w}, W3: R^{w x w}
-        self.W1 = CategorySpecificLinear(num_embodiments, action_dim, hidden_size)  # (d -> w)
-        self.W2 = CategorySpecificLinear(num_embodiments, 2 * hidden_size, hidden_size)  # (2w -> w)
-        self.W3 = CategorySpecificLinear(num_embodiments, hidden_size, hidden_size)  # (w -> w)
-        self.pos_encoding = SinusoidalPositionalEncoding(hidden_size)
-
-    def forward(self, actions, timesteps, cat_ids):
-        """
-        actions:   shape (B, T, action_dim)
-        timesteps: shape (B,)  -- a single scalar per batch item
-        cat_ids:   shape (B,)
-        returns:   shape (B, T, hidden_size)
-        """
-        b, t, _ = actions.shape
-
-        # 1) Expand each batch's single scalar time 'tau' across all T steps
-        #    so that shape => (B, T)
-        #    e.g. if timesteps is (B,), replicate across T
-        if timesteps.dim() == 1 and timesteps.shape[0] == b:
-            # shape (B,) => (B,T)
-            timesteps = timesteps.unsqueeze(1).expand(-1, t)
-        else:
-            raise ValueError("Expected `timesteps` to have shape (B,) so we can replicate across T.")
-
-        # 2) Standard action MLP step for shape => (B, T, w)
-        a_emb = self.W1(actions, cat_ids)
-
-        # 3) Get the sinusoidal encoding (B, T, w)
-        tau_emb = self.pos_encoding(timesteps).to(dtype=a_emb.dtype)
-
-        # 4) Concat along last dim => (B, T, 2w), then W2 => (B, T, w), swish
-        x = torch.cat([a_emb, tau_emb], dim=-1)
-        x = swish(self.W2(x, cat_ids))
-
-        # 5) Finally W3 => (B, T, w)
-        x = self.W3(x, cat_ids)
-        return x
-
-
-class FlowmatchingActionHeadConfig(PretrainedConfig):
-    """Flow-matching action head used by GR00T backbones."""
-
-    add_pos_embed: bool = field(default=True, metadata={"help": "Whether to add positional embedding"})
-    model_dtype: str = field(default="float32", metadata={"help": "Model data type."})
-    diffusion_model_cfg: dict = field(default=None, metadata={"help": "Diffusion model configuration."})
-    input_embedding_dim: int = field(default=1536, metadata={"help": "Input embedding channel dimension."})
-    backbone_embedding_dim: int = field(
-        default=1536, metadata={"help": "Backbone embedding channel dimension."}
-    )
-
-    hidden_size: int = field(default=1024, metadata={"help": "Input embedding dimension."})
-    max_seq_len: int = field(default=1024, metadata={"help": "Maximum Sequence Length"})
-    action_dim: int = field(default=None, metadata={"help": "Action dimension."})
-    action_horizon: int = field(default=None, metadata={"help": "Action horizon."})
-    noise_beta_alpha: float = field(default=1.5, metadata={"help": ""})
-    noise_beta_beta: float = field(default=1.0, metadata={"help": ""})
-    noise_s: float = field(default=0.999, metadata={"help": "Flow matching noise Beta distribution s."})
-    num_timestep_buckets: int = field(
-        default=1000, metadata={"help": "Number of timestep discretization buckets."}
-    )
-    num_inference_timesteps: int = field(
-        default=None,
-        metadata={"help": "Number of inference steps for noise diffusion."},
-    )
-    max_num_embodiments: int = field(default=32, metadata={"help": "Number of embodiments."})
-    tune_projector: bool = field(default=True, metadata={"help": "Whether to tune the projector."})
-    tune_diffusion_model: bool = field(
-        default=True, metadata={"help": "Whether to tune the diffusion model."}
-    )
-    load_pretrained_det_decode_layer_path: str = field(
-        default=None, metadata={"help": "Path to pretrained detection model."}
-    )
-    detection_coeff: float = field(default=1.0, metadata={"help": "Detection coefficient."})
-
-    freeze_decode_layer: bool = field(default=False)
-    expand_batch: int = field(default=None)
-    use_vlln: bool = field(default=True)
-
-    vl_self_attention_cfg: dict = field(default=None)
-    num_target_vision_tokens: int = field(default=32, metadata={"help": "Number of target vision tokens."})
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        for key, value in kwargs.items():
-            setattr(self, key, value)
-
-
-class FlowmatchingActionHead(nn.Module):
-    config_class = FlowmatchingActionHeadConfig
-    supports_gradient_checkpointing = True
-
-    def __init__(
-        self,
-        config: FlowmatchingActionHeadConfig,
-    ):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.input_embedding_dim = config.input_embedding_dim
-
-        self.model = DiT(**config.diffusion_model_cfg)
-        self.action_dim = config.action_dim
-        self.action_horizon = config.action_horizon
-        self.num_inference_timesteps = config.num_inference_timesteps
-
-        self.state_encoder = CategorySpecificMLP(
-            num_categories=config.max_num_embodiments,
-            input_dim=config.max_state_dim,
-            hidden_dim=self.hidden_size,
-            output_dim=self.input_embedding_dim,
-        )
-        self.action_encoder = MultiEmbodimentActionEncoder(
-            action_dim=config.action_dim,
-            hidden_size=self.input_embedding_dim,
-            num_embodiments=config.max_num_embodiments,
-        )
-        self.action_decoder = CategorySpecificMLP(
-            num_categories=config.max_num_embodiments,
-            input_dim=self.hidden_size,
-            hidden_dim=self.hidden_size,
-            output_dim=self.action_dim,
-        )
-        self.future_tokens = nn.Embedding(config.num_target_vision_tokens, self.input_embedding_dim)
-        nn.init.normal_(self.future_tokens.weight, mean=0.0, std=0.02)
-
-        self.vlln = nn.LayerNorm(config.backbone_embedding_dim) if config.use_vlln else nn.Identity()
-        self.vl_self_attention = (
-            SelfAttentionTransformer(**config.vl_self_attention_cfg) if config.use_vlln else nn.Identity()
-        )
-
-        if config.add_pos_embed:
-            self.position_embedding = nn.Embedding(config.max_seq_len, self.input_embedding_dim)
-            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
-
-        self._noise_beta_alpha = config.noise_beta_alpha
-        self._noise_beta_beta = config.noise_beta_beta
-        self._beta_dist = None
-        self.num_timestep_buckets = config.num_timestep_buckets
-        self.config = config
-        self.set_trainable_parameters(config.tune_projector, config.tune_diffusion_model)
-
-    def set_trainable_parameters(self, tune_projector: bool, tune_diffusion_model: bool):
-        self.tune_projector = tune_projector
-        self.tune_diffusion_model = tune_diffusion_model
-        for p in self.parameters():
-            p.requires_grad = True
-        if not tune_projector:
-            self.state_encoder.requires_grad_(False)
-            self.action_encoder.requires_grad_(False)
-            self.action_decoder.requires_grad_(False)
-            if self.config.add_pos_embed:
-                self.position_embedding.requires_grad_(False)
-        if not tune_diffusion_model:
-            self.model.requires_grad_(False)
-        print(f"Tune action head projector: {self.tune_projector}")
-        print(f"Tune action head diffusion model: {self.tune_diffusion_model}")
-        # Check if any parameters are still trainable. If not, print a warning.
-        if not tune_projector and not tune_diffusion_model:
-            for name, p in self.named_parameters():
-                if p.requires_grad:
-                    print(f"Action head trainable parameter: {name}")
-        if not any(p.requires_grad for p in self.parameters()):
-            print("Warning: No action head trainable parameters found.")
-
-    def set_frozen_modules_to_eval_mode(self):
-        """
-        Huggingface will call model.train() at each training_step. To ensure
-        the expected behaviors for modules like dropout, batchnorm, etc., we
-        need to call model.eval() for the frozen modules.
-        """
-        if self.training:
-            if not self.tune_projector:
-                self.state_encoder.eval()
-                self.action_encoder.eval()
-                self.action_decoder.eval()
-                if self.config.add_pos_embed:
-                    self.position_embedding.eval()
-            if not self.tune_diffusion_model:
-                self.model.eval()
-
-    def sample_time(self, batch_size, device, dtype):
-        if self._beta_dist is None:
-            self._beta_dist = Beta(self._noise_beta_alpha, self._noise_beta_beta, validate_args=False)
-        sample = self._beta_dist.sample([batch_size]).to(device, dtype=dtype)
-        return (self.config.noise_s - sample) / self.config.noise_s
-
-    def prepare_input(self, batch: dict) -> BatchFeature:
-        return BatchFeature(data=batch)
-
-    def process_backbone_output(self, backbone_output: BatchFeature) -> BatchFeature:
-        backbone_features = backbone_output["backbone_features"]
-        backbone_features = self.vlln(backbone_features)
-        backbone_features = self.vl_self_attention(backbone_features)
-        backbone_output["backbone_features"] = backbone_features
-        return backbone_output
-
-    def forward(self, backbone_output: BatchFeature, action_input: BatchFeature) -> BatchFeature:
-        # Set frozen modules to eval
-        self.set_frozen_modules_to_eval_mode()
-
-        backbone_output = self.process_backbone_output(backbone_output)
-
-        if self.config.expand_batch is not None:
-            for k, v in backbone_output.items():
-                ndim = len(v.shape)
-                factors = [self.config.expand_batch]
-                while len(factors) < ndim:
-                    factors.append(1)
-                factors = tuple(factors)
-                expanded = v.repeat(*factors)
-                backbone_output[k] = expanded
-
-            for k, v in action_input.items():
-                ndim = len(v.shape)
-                factors = [self.config.expand_batch]
-                while len(factors) < ndim:
-                    factors.append(1)
-                factors = tuple(factors)
-                expanded = v.repeat(*factors)
-                action_input[k] = expanded
-
-        # Get vision and language embeddings.
-        vl_embs = backbone_output.backbone_features
-        device = vl_embs.device
-
-        # Get embodiment ID.
-        embodiment_id = action_input.embodiment_id
-
-        # Embed state.
-        state_features = self.state_encoder(action_input.state, embodiment_id)
-
-        # Embed noised action trajectory.
-        actions = action_input.action
-        noise = torch.randn(actions.shape, device=actions.device, dtype=actions.dtype)
-        t = self.sample_time(actions.shape[0], device=actions.device, dtype=actions.dtype)
-        t = t[:, None, None]  # shape (B,1,1) for broadcast
-
-        noisy_trajectory = (1 - t) * noise + t * actions
-        velocity = actions - noise
-
-        # Convert (continuous) t -> discrete if needed
-        t_discretized = (t[:, 0, 0] * self.num_timestep_buckets).long()
-        action_features = self.action_encoder(noisy_trajectory, t_discretized, embodiment_id)
-
-        # Maybe add position embedding.
-        if self.config.add_pos_embed:
-            pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device)
-            pos_embs = self.position_embedding(pos_ids).unsqueeze(0)
-            action_features = action_features + pos_embs
-
-        # Join vision, language, state and action embedding along sequence dimension.
-        future_tokens = self.future_tokens.weight.unsqueeze(0).expand(vl_embs.shape[0], -1, -1)
-        sa_embs = torch.cat((state_features, future_tokens, action_features), dim=1)
-
-        vl_attn_mask = backbone_output.backbone_attention_mask
-
-        model_output = self.model(
-            hidden_states=sa_embs,
-            encoder_hidden_states=vl_embs,
-            encoder_attention_mask=vl_attn_mask,
-            timestep=t_discretized,
-            return_all_hidden_states=False,  # NOTE (YL): not using flare now
-        )
-        pred = self.action_decoder(model_output, embodiment_id)
-        pred_actions = pred[:, -actions.shape[1] :]
-
-        # Slice out only the action portion of pred and target.
-        action_mask = action_input.action_mask
-        loss = F.mse_loss(pred_actions, velocity, reduction="none") * action_mask
-        loss = loss.sum() / action_mask.sum()
-        output_dict = {
-            "loss": loss,
-        }
-        return BatchFeature(data=output_dict)
-
-    @torch.no_grad()
-    def get_action(self, backbone_output: BatchFeature, action_input: BatchFeature) -> BatchFeature:
-        backbone_output = self.process_backbone_output(backbone_output)
-
-        # Get vision and language embeddings.
-        vl_embs = backbone_output.backbone_features
-        embodiment_id = action_input.embodiment_id
-
-        # Embed state.
-        state_features = self.state_encoder(action_input.state, embodiment_id)
-
-        # Set initial actions as the sampled noise.
-        batch_size = vl_embs.shape[0]
-        device = vl_embs.device
-        actions = torch.randn(
-            size=(batch_size, self.config.action_horizon, self.config.action_dim),
-            dtype=vl_embs.dtype,
-            device=device,
-        )
-
-        num_steps = self.num_inference_timesteps
-        dt = 1.0 / num_steps
-
-        # Run denoising steps.
-        for t in range(num_steps):
-            t_cont = t / float(num_steps)  # e.g. goes 0, 1/N, 2/N, ...
-            t_discretized = int(t_cont * self.num_timestep_buckets)
-
-            # Embed noised action trajectory.
-            timesteps_tensor = torch.full(size=(batch_size,), fill_value=t_discretized, device=device)
-            action_features = self.action_encoder(actions, timesteps_tensor, embodiment_id)
-            # Maybe add position embedding.
-            if self.config.add_pos_embed:
-                pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device)
-                pos_embs = self.position_embedding(pos_ids).unsqueeze(0)
-                action_features = action_features + pos_embs
-
-            # Join vision, language, state and action embedding along sequence dimension.
-            future_tokens = self.future_tokens.weight.unsqueeze(0).expand(vl_embs.shape[0], -1, -1)
-            sa_embs = torch.cat((state_features, future_tokens, action_features), dim=1)
-
-            # Run model forward.
-            model_output = self.model(
-                hidden_states=sa_embs,
-                encoder_hidden_states=vl_embs,
-                timestep=timesteps_tensor,
-            )
-            pred = self.action_decoder(model_output, embodiment_id)
-
-            pred_velocity = pred[:, -self.action_horizon :]
-
-            # Update actions using euler integration.
-            actions = actions + dt * pred_velocity
-        return BatchFeature(data={"action_pred": actions})
-
-    @property
-    def device(self):
-        return next(iter(self.parameters())).device
-
-    @property
-    def dtype(self):
-        return next(iter(self.parameters())).dtype
diff --git a/src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py
deleted file mode 100755
index 526b4f7a2..000000000
--- a/src/lerobot/policies/groot/eagle2_hg_model/configuration_eagle2_5_vl.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import copy
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.models.llama.configuration_llama import LlamaConfig
-from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
-from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
-from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-
-class Eagle25VLConfig(PretrainedConfig):
-    model_type = "eagle_2_5_vl"
-    is_composition = True
-    sub_configs = {"vision_config": SiglipVisionConfig, "text_config": Qwen2Config}
-
-    def __init__(
-        self,
-        vision_config=None,
-        text_config=None,
-        use_backbone_lora=0,
-        use_llm_lora=0,
-        pad2square=False,
-        select_layer=-4,
-        force_image_size=None,
-        downsample_ratio=0.5,
-        template=None,
-        dynamic_image_size=False,
-        use_thumbnail=False,
-        loss_version="v1",
-        min_dynamic_tiles=1,
-        max_dynamic_tiles=6,
-        mlp_checkpoint=False,
-        initializer_range=0.02,
-        _attn_implementation="flash_attention_2",
-        _attn_implementation_autoset=False,
-        llm_config=None,
-        image_token_index=None,
-        use_pixel_shuffle=True,
-        mlp_connector_layers=2,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        if vision_config is None:
-            vision_config = {"model_type": "siglip_vision_model"}
-            logger.info("vision_config is None. Initializing the InternVisionConfig with default values.")
-
-        if text_config is None:
-            text_config = {"architectures": ["Qwen2ForCausalLM"]}
-            logger.info(
-                "text_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
-            )
-
-        if vision_config["model_type"] == "siglip_vision_model":
-            self.vision_config = SiglipVisionConfig(**vision_config)
-        else:
-            raise ValueError("Unsupported model_type: {}".format(vision_config["model_type"]))
-
-        if text_config["architectures"][0] == "LlamaForCausalLM":
-            self.text_config = LlamaConfig(**text_config)
-        elif text_config["architectures"][0] == "Qwen2ForCausalLM":
-            self.text_config = Qwen2Config(**text_config)
-        elif text_config["architectures"][0] == "Qwen3ForCausalLM":
-            self.text_config = Qwen3Config(**text_config)
-        else:
-            raise ValueError("Unsupported architecture: {}".format(text_config["architectures"][0]))
-        self.use_backbone_lora = use_backbone_lora
-        self.use_llm_lora = use_llm_lora
-        self.mlp_checkpoint = mlp_checkpoint
-        self.pad2square = pad2square
-        self.select_layer = select_layer
-        self.force_image_size = force_image_size
-        self.downsample_ratio = downsample_ratio
-        self.template = template
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail = use_thumbnail
-        self.loss_version = loss_version
-        self.initializer_range = initializer_range
-        self.min_dynamic_tiles = min_dynamic_tiles
-        self.max_dynamic_tiles = max_dynamic_tiles
-        self.tie_word_embeddings = self.text_config.tie_word_embeddings
-        self._attn_implementation = _attn_implementation
-        self._attn_implementation_autoset = _attn_implementation_autoset
-        self.image_token_index = image_token_index
-        self.use_pixel_shuffle = use_pixel_shuffle
-        self.mlp_connector_layers = mlp_connector_layers
-        logger.info(f"min_dynamic_tiles: {self.min_dynamic_tiles}")
-        logger.info(f"max_dynamic_tiles: {self.max_dynamic_tiles}")
-
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["vision_config"] = self.vision_config.to_dict()
-        output["text_config"] = self.text_config.to_dict()
-        output["model_type"] = self.__class__.model_type
-        output["use_backbone_lora"] = self.use_backbone_lora
-        output["use_llm_lora"] = self.use_llm_lora
-        output["pad2square"] = self.pad2square
-        output["select_layer"] = self.select_layer
-        output["force_image_size"] = self.force_image_size
-        output["downsample_ratio"] = self.downsample_ratio
-        output["template"] = self.template
-        output["dynamic_image_size"] = self.dynamic_image_size
-        output["use_thumbnail"] = self.use_thumbnail
-        output["min_dynamic_tiles"] = self.min_dynamic_tiles
-        output["max_dynamic_tiles"] = self.max_dynamic_tiles
-        output["tie_word_embeddings"] = self.tie_word_embeddings
-        output["_attn_implementation"] = self._attn_implementation
-        output["_attn_implementation_autoset"] = self._attn_implementation_autoset
-        output["use_pixel_shuffle"] = self.use_pixel_shuffle
-        output["mlp_connector_layers"] = self.mlp_connector_layers
-        return output
diff --git a/src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py b/src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py
deleted file mode 100644
index 90e9dcecc..000000000
--- a/src/lerobot/policies/groot/eagle2_hg_model/image_processing_eagle2_5_vl_fast.py
+++ /dev/null
@@ -1,503 +0,0 @@
-# --------------------------------------------------------
-# NVIDIA
-# Copyright (c) 2025 NVIDIA
-# Licensed under The MIT License [see LICENSE for details]
-# --------------------------------------------------------
-
-from __future__ import annotations
-
-# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
-from transformers.image_processing_utils import (
-    BatchFeature,
-    get_patch_output_size,
-)
-from transformers.image_processing_utils_fast import (
-    BaseImageProcessorFast,
-    ImagesKwargs,
-    group_images_by_shape,
-    reorder_images,
-)
-from transformers.image_utils import (
-    IMAGENET_STANDARD_MEAN,  # 0.5, 0.5, 0.5
-    IMAGENET_STANDARD_STD,  # 0.5, 0.5, 0.5
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    SizeDict,
-    get_image_size,
-    make_flat_list_of_images,
-    validate_kwargs,
-)
-from transformers.processing_utils import Unpack
-from transformers.utils import (
-    TensorType,
-    add_start_docstrings,
-    is_torch_available,
-    is_torchvision_v2_available,
-)
-from transformers.video_utils import VideoInput
-
-if is_torch_available():
-    import torch
-if is_torchvision_v2_available():
-    from torchvision.transforms.v2 import functional as F  # noqa: N812
-    from transformers.image_utils import pil_torch_interpolation_mapping
-else:
-    from torchvision.transforms import functional as F  # noqa: N812
-
-
-def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> torch.Tensor:
-    """Crop the given numpy array.
-
-    Args:
-        img (torch.Tensor): Image to be cropped. Format should be (C, H, W).
-        left (int): The left coordinate of the crop box.
-        top (int): The top coordinate of the crop box.
-        right (int): The right coordinate of the crop box.
-        bottom (int): The bottom coordinate of the crop box.
-
-    Returns:
-        torch.Tensor: Cropped image.
-    """
-    if not isinstance(img, torch.Tensor):
-        raise TypeError(f"img should be torch.Tensor. Got {type(img)}")
-
-    if img.ndim not in [2, 3]:
-        raise ValueError(f"Image should have 2 or 3 dimensions. Got {img.ndim}")
-
-    img_height = img.shape[1]
-    img_width = img.shape[2]
-    if top < 0 or left < 0 or bottom > img_height or right > img_width:
-        raise ValueError("Crop coordinates out of bounds")
-
-    if top >= bottom or left >= right:
-        raise ValueError("Invalid crop coordinates")
-
-    return img[:, top:bottom, left:right]
-
-
-class Eagle25VLFastImageProcessorKwargs(ImagesKwargs):
-    max_dynamic_tiles: int | None
-    min_dynamic_tiles: int | None
-    use_thumbnail: bool | None
-    pad_during_tiling: bool | None
-    do_pad: bool | None
-
-
-@add_start_docstrings(
-    "Constructs a fast ConvNeXT image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.",
-    # BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, TODO: this was depreciated from transformers remove!
-    """
-        image_grid_pinpoints (`List[List[int]]`, *optional*):
-            A list of possible resolutions to use for processing high resolution images. The best resolution is selected
-            based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
-            method. Not used for processing videos.
-        do_pad (`bool`, *optional*):
-            Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
-            number of patches in the batch. Padding will be applied to the bottom and right with zeros.
-    """,
-)
-class Eagle25VLImageProcessorFast(BaseImageProcessorFast):
-    resample = PILImageResampling.BICUBIC
-    image_mean = IMAGENET_STANDARD_MEAN
-    image_std = IMAGENET_STANDARD_STD
-    size = {"height": 448, "width": 448}
-    default_to_square = False
-    crop_size = None
-    do_resize = True
-    do_center_crop = None
-    do_rescale = True
-    do_normalize = True
-    do_convert_rgb = True
-    do_pad = True
-    max_dynamic_tiles = 12
-    min_dynamic_tiles = 1
-    use_thumbnail = True
-    pad_during_tiling = False
-    valid_kwargs = Eagle25VLFastImageProcessorKwargs
-    model_input_names = ["pixel_values_videos"]
-
-    def __init__(self, **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs]):
-        super().__init__(**kwargs)
-
-    @add_start_docstrings(
-        # BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, TODO: this was depreciated from transformers remove!
-        """
-            max_dynamic_tiles (`int`, *optional*):
-                The maximum number of dynamic tiles to use for processing high resolution images.
-            min_dynamic_tiles (`int`, *optional*):
-                The minimum number of dynamic tiles to use for processing high resolution images.
-            use_thumbnail (`bool`, *optional*):
-                Whether to use a thumbnail for processing high resolution images.
-            pad_during_tiling (`bool`, *optional*):
-                Whether to pad the image during tiling.
-            do_pad (`bool`, *optional*):
-                    Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
-                    number of patches in the batch. Padding will be applied to the bottom and right with zeros.
-        """,
-    )
-
-    # NOTE(YL): we will overload the preprocess method to add the image_flags
-    # def preprocess(
-    #     self, images: ImageInput, **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs]
-    # ) -> BatchFeature:
-    #     return super().preprocess(images, **kwargs)
-
-    def _prepare_images_structure(
-        self,
-        images: ImageInput,
-        expected_ndims: int = 3,
-    ) -> ImageInput:
-        """
-        Prepare the images structure for processing.
-
-        Args:
-            images (`ImageInput`):
-                The input images to process.
-            expected_ndims (`int`, *optional*, defaults to 3):
-                Expected number of dimensions for the images (added for transformers >=4.53.0 compatibility).
-
-        Returns:
-            `ImageInput`: The images with a valid nesting.
-        """
-        return make_flat_list_of_images(images)
-
-    def _resize_for_patching(
-        self,
-        image: torch.Tensor,
-        target_resolution: tuple,
-        interpolation: F.InterpolationMode,
-        input_data_format: ChannelDimension,
-    ) -> torch.Tensor:
-        """
-        Resizes an image to a target resolution while maintaining aspect ratio.
-
-        Args:
-            image ("torch.Tensor"):
-                The input image.
-            target_resolution (tuple):
-                The target resolution (height, width) of the image.
-            interpolation (`InterpolationMode`):
-                Resampling filter to use if resizing the image.
-            input_data_format (`ChannelDimension` or `str`):
-                The channel dimension format of the input image.
-
-        Returns:
-            "torch.Tensor": The resized and padded image.
-        """
-        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
-
-        # Resize the image
-        resized_image = F.resize(image, (new_height, new_width), interpolation=interpolation)
-
-        return resized_image
-
-    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
-        """
-        previous version mainly focus on ratio.
-        We also consider area ratio here.
-        """
-        best_factor = float("-inf")
-        best_ratio = (1, 1)
-        area = width * height
-        for ratio in target_ratios:
-            target_aspect_ratio = ratio[0] / ratio[1]
-            # ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-            # area_ratio = (ratio[0] * ratio[1] * image_size * image_size) / area
-            """
-            new area > 60% of original image area is enough.
-            """
-            factor_based_on_area_n_ratio = min(
-                (ratio[0] * ratio[1] * image_size * image_size) / area, 0.6
-            ) * min(target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio)
-
-            if factor_based_on_area_n_ratio > best_factor:
-                best_factor = factor_based_on_area_n_ratio
-                best_ratio = ratio
-
-        return best_ratio
-
-    def _pad_for_patching(
-        self, image: torch.Tensor, target_resolution: tuple, input_data_format: ChannelDimension
-    ) -> torch.Tensor:
-        """
-        Pad an image to a target resolution while maintaining aspect ratio.
-        """
-        target_height, target_width = target_resolution
-        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
-
-        paste_x = (target_width - new_width) // 2
-        paste_y = (target_height - new_height) // 2
-
-        padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x, paste_y])
-
-        return padded_image
-
-    def _get_image_patches(
-        self,
-        image: torch.Tensor,
-        min_num: int,
-        max_num: int,
-        size: tuple,
-        tile_size: int,
-        use_thumbnail: bool,
-        interpolation: F.InterpolationMode,
-        pad_during_tiling: bool,
-    ) -> list[torch.Tensor]:
-        image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
-        orig_height, orig_width = image_size
-        aspect_ratio = orig_width / orig_height
-
-        # calculate the existing image aspect ratio
-        target_ratios = {
-            (i, j)
-            for n in range(min_num, max_num + 1)
-            for i in range(1, n + 1)
-            for j in range(1, n + 1)
-            if i * j <= max_num and i * j >= min_num
-        }
-        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-        # find the closest aspect ratio to the target
-        target_aspect_ratio = self.find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, tile_size
-        )
-
-        # calculate the target width and height
-        target_width = tile_size * target_aspect_ratio[0]
-        target_height = tile_size * target_aspect_ratio[1]
-        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-        if pad_during_tiling:
-            resized_image = self._resize_for_patching(
-                image,
-                (target_height, target_width),
-                interpolation=interpolation,
-                input_data_format=ChannelDimension.FIRST,
-            )
-            padded_image = self._pad_for_patching(
-                resized_image,
-                (target_height, target_width),
-                input_data_format=ChannelDimension.FIRST,
-            )
-            image_used_to_split = padded_image
-        else:
-            image_used_to_split = F.resize(image, (target_height, target_width), interpolation=interpolation)
-
-        processed_tiles = []
-        for i in range(blocks):
-            box = (
-                (i % (target_width // tile_size)) * tile_size,
-                (i // (target_width // tile_size)) * tile_size,
-                ((i % (target_width // tile_size)) + 1) * tile_size,
-                ((i // (target_width // tile_size)) + 1) * tile_size,
-            )
-            # split the image
-            split_img = crop(image_used_to_split, box[0], box[1], box[2], box[3])
-            processed_tiles.append(split_img)
-        assert len(processed_tiles) == blocks
-
-        if use_thumbnail and len(processed_tiles) != 1:
-            thumbnail_img = F.resize(image, (tile_size, tile_size), interpolation=interpolation)
-            processed_tiles.append(thumbnail_img)
-
-        return processed_tiles
-
-    def _pad_for_batching(
-        self,
-        pixel_values: list[torch.Tensor],
-    ) -> list[torch.Tensor]:
-        """
-        Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.
-
-        Args:
-            pixel_values (`List[torch.Tensor]`):
-                An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)
-
-        Returns:
-            List[`torch.Tensor`]: The padded images.
-        """
-        max_patch = max(len(x) for x in pixel_values)
-        pixel_values = [
-            torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])
-            for image in pixel_values
-        ]
-
-        return pixel_values
-
-    def _preprocess(
-        self,
-        images: list[torch.Tensor],
-        do_resize: bool,
-        size: SizeDict,
-        max_dynamic_tiles: int,
-        min_dynamic_tiles: int,
-        use_thumbnail: bool,
-        pad_during_tiling: bool,
-        interpolation: F.InterpolationMode | None,
-        do_center_crop: bool,
-        crop_size: SizeDict,
-        do_rescale: bool,
-        rescale_factor: float,
-        do_normalize: bool,
-        image_mean: float | list[float] | None,
-        image_std: float | list[float] | None,
-        do_pad: bool,
-        return_tensors: str | TensorType | None,
-        pad_size: SizeDict | None = None,  # Added for transformers >=4.53.0 compatibility
-        disable_grouping: bool | None = None,  # Added for transformers >=4.53.0 compatibility
-    ) -> BatchFeature:
-        processed_images = []
-        image_sizes = []
-        # Determine the size tuple
-        if size and size.height and size.width:
-            size_tuple = (size.height, size.width)
-        else:
-            size_tuple = (size.shortest_edge, size.shortest_edge)
-
-        # Determine the patch size
-        if crop_size and crop_size.height:
-            tile_size = crop_size.height
-        elif size and size.height:
-            tile_size = size.height
-        else:
-            tile_size = size.shortest_edge
-
-        for image in images:
-            image_patches = self._get_image_patches(
-                image,
-                min_num=min_dynamic_tiles,
-                max_num=max_dynamic_tiles,
-                size=size_tuple,
-                tile_size=tile_size,
-                use_thumbnail=use_thumbnail,
-                interpolation=interpolation,
-                pad_during_tiling=pad_during_tiling,
-            )
-
-            # Group images by size for batched processing
-            processed_image_patches_grouped = {}
-            # Added for transformers >=4.53.0 compatibility
-            grouped_image_patches, grouped_image_patches_index = group_images_by_shape(
-                image_patches,
-                disable_grouping=disable_grouping,
-            )
-
-            for shape, stacked_image_patches in grouped_image_patches.items():
-                if do_resize:
-                    stacked_image_patches = self.resize(
-                        image=stacked_image_patches,
-                        size=size,
-                        interpolation=interpolation,
-                    )
-                if do_center_crop:
-                    stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)
-                # Fused rescale and normalize
-                stacked_image_patches = self.rescale_and_normalize(
-                    stacked_image_patches,
-                    do_rescale,
-                    rescale_factor,
-                    do_normalize,
-                    image_mean,
-                    image_std,
-                )
-                processed_image_patches_grouped[shape] = stacked_image_patches
-            processed_image_patches = reorder_images(
-                processed_image_patches_grouped, grouped_image_patches_index
-            )
-            processed_image_patches = (
-                torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches
-            )
-            processed_images.append(processed_image_patches)
-            image_sizes.append(get_image_size(image, ChannelDimension.FIRST))
-
-        if do_pad:
-            processed_images = self._pad_for_batching(processed_images)
-
-        # processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
-        processed_images = torch.cat(processed_images, dim=0) if return_tensors else processed_images
-        return BatchFeature(
-            data={"pixel_values": processed_images, "image_sizes": image_sizes},
-            tensor_type=return_tensors,
-        )
-
-    def preprocess(
-        self,
-        images: ImageInput,
-        videos: VideoInput = None,
-        **kwargs: Unpack[Eagle25VLFastImageProcessorKwargs],
-    ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(),
-            valid_processor_keys=self.valid_kwargs.__annotations__.keys(),
-        )
-        # Set default kwargs from self. This ensures that if a kwarg is not provided
-        # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_kwargs.__annotations__:
-            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
-
-        # Extract parameters that are only used for preparing the input images
-        do_convert_rgb = kwargs.pop("do_convert_rgb")
-        input_data_format = kwargs.pop("input_data_format")
-        device = kwargs.pop("device")
-        # Prepare input images
-        # transformers >= 4.53.0: uses _prepare_image_like_inputs instead of _prepare_input_images
-        if images is not None:
-            images = self._prepare_image_like_inputs(
-                images=images,
-                do_convert_rgb=do_convert_rgb,
-                input_data_format=input_data_format,
-                device=device,
-            )
-
-        if videos is not None:
-            videos = self._prepare_image_like_inputs(
-                images=videos,
-                do_convert_rgb=do_convert_rgb,
-                input_data_format=input_data_format,
-                device=device,
-            )
-
-        # Update kwargs that need further processing before being validated
-        kwargs = self._further_process_kwargs(**kwargs)
-
-        # Validate kwargs
-        self._validate_preprocess_kwargs(**kwargs)
-
-        # torch resize uses interpolation instead of resample
-        # Added for transformers >=4.53.0 compatibility
-        resample = kwargs.pop("resample", self.resample)
-        kwargs["interpolation"] = (
-            pil_torch_interpolation_mapping[resample]
-            if isinstance(resample, PILImageResampling | int)
-            else resample
-        )
-
-        # Filter kwargs to only include those accepted by _preprocess
-        valid_preprocess_kwargs = {
-            "do_resize",
-            "size",
-            "max_dynamic_tiles",
-            "min_dynamic_tiles",
-            "use_thumbnail",
-            "pad_during_tiling",
-            "interpolation",
-            "do_center_crop",
-            "crop_size",
-            "do_rescale",
-            "rescale_factor",
-            "do_normalize",
-            "image_mean",
-            "image_std",
-            "do_pad",
-            "return_tensors",
-            "pad_size",
-            "disable_grouping",
-        }
-        filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_preprocess_kwargs}
-        if images is not None:
-            return self._preprocess(images, **filtered_kwargs)
-        elif videos is not None:
-            return self._preprocess(videos, **filtered_kwargs)
-
-
-__all__ = ["Eagle25VLImageProcessorFast"]
diff --git a/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
deleted file mode 100755
index 6e5532ea4..000000000
--- a/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
+++ /dev/null
@@ -1,396 +0,0 @@
-# --------------------------------------------------------
-# NVIDIA
-# Copyright (c) 2025 NVIDIA
-# Licensed under The MIT License [see LICENSE for details]
-# --------------------------------------------------------
-
-import inspect
-
-import torch
-import torch.utils.checkpoint as cp
-from peft import LoraConfig, get_peft_model
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from transformers import GenerationConfig
-from transformers.generation import GenerationMixin
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.llama.modeling_llama import LlamaForCausalLM
-from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
-from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM
-from transformers.models.siglip.modeling_siglip import SiglipVisionModel
-from transformers.utils import add_start_docstrings, logging
-
-from .configuration_eagle2_5_vl import Eagle25VLConfig
-
-logger = logging.get_logger(__name__)
-
-
-# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py#L241C1-L280C1
-EAGLE2_5_VL_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`Eagle25VLConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare Eagle2_5_VL Model outputting raw hidden-states without any specific head on top.",
-    EAGLE2_5_VL_START_DOCSTRING,
-)
-class Eagle25VLPreTrainedModel(PreTrainedModel):
-    config_class = Eagle25VLConfig
-    base_model_prefix = "model"
-    main_input_name = "input_ids"
-    supports_gradient_checkpointing = True
-    _no_split_modules = [
-        "Qwen2DecoderLayer",
-        "LlamaDecoderLayer",
-        "Siglip2EncoderLayer",
-        "SiglipEncoderLayer",
-    ]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn = True
-    _supports_flash_attn_2 = True
-    _supports_cache_class = True
-    _supports_static_cache = True
-    _supports_quantized_cache = True
-    _supports_sdpa = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear | nn.Conv2d):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-class Eagle25VLForConditionalGeneration(Eagle25VLPreTrainedModel, GenerationMixin):
-    config_class = Eagle25VLConfig
-
-    def __init__(self, config: Eagle25VLConfig, vision_model=None, language_model=None):
-        super().__init__(config)
-
-        image_size = config.force_image_size or config.vision_config.image_size
-        patch_size = config.vision_config.patch_size
-        self.patch_size = patch_size
-        if config.use_pixel_shuffle:
-            self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio**2))
-        else:
-            self.num_image_token = int((image_size // patch_size) ** 2)
-
-        self.select_layer = config.select_layer
-        self.downsample_ratio = config.downsample_ratio
-        self.loss_version = config.loss_version
-        self.mlp_checkpoint = config.mlp_checkpoint
-        self.use_pixel_shuffle = config.use_pixel_shuffle
-        self.mlp_connector_layers = config.mlp_connector_layers
-        logger.info(f"num_image_token: {self.num_image_token}")
-        logger.info(f"mlp_checkpoint: {self.mlp_checkpoint}")
-        if vision_model is not None:
-            self.vision_model = vision_model
-        else:
-            if config.vision_config.model_type == "siglip_vision_model":
-                config.vision_config._attn_implementation = "flash_attention_2"
-                self.vision_model = SiglipVisionModel(config.vision_config)
-            else:
-                raise NotImplementedError(f"{config.vision_config.model_type} is not implemented.")
-
-        if language_model is not None:
-            self.language_model = language_model
-        else:
-            if config.text_config.architectures[0] == "LlamaForCausalLM":
-                self.language_model = LlamaForCausalLM(config.text_config)
-            elif config.text_config.architectures[0] == "Phi3ForCausalLM":
-                raise NotImplementedError("Phi3 is not implemented.")
-                # self.language_model = Phi3ForCausalLM(config.text_config)
-            elif config.text_config.architectures[0] == "Qwen2ForCausalLM":
-                assert config.text_config._attn_implementation == "flash_attention_2", (
-                    f"Qwen2 must use flash_attention_2 but got {config.text_config._attn_implementation}"
-                )
-                self.language_model = Qwen2ForCausalLM(config.text_config)
-            elif config.text_config.architectures[0] == "Qwen3ForCausalLM":
-                self.language_model = Qwen3ForCausalLM(config.text_config)
-            else:
-                raise NotImplementedError(f"{config.text_config.architectures[0]} is not implemented.")
-
-        vit_hidden_size = config.vision_config.hidden_size
-        llm_hidden_size = config.text_config.hidden_size
-
-        if config.mlp_connector_layers == 2:
-            self.mlp1 = nn.Sequential(
-                nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
-                nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
-                nn.GELU(),
-                nn.Linear(llm_hidden_size, llm_hidden_size),
-            )
-        elif config.mlp_connector_layers == 1 and config.use_pixel_shuffle:
-            self.mlp1 = nn.Sequential(
-                nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
-            )
-        elif config.mlp_connector_layers == 1 and not config.use_pixel_shuffle:
-            self.mlp1 = nn.Sequential(
-                nn.Linear(vit_hidden_size, llm_hidden_size),
-            )
-        else:
-            raise NotImplementedError(f"{config.mlp_connector_layers} is not implemented.")
-
-        self.image_token_index = config.image_token_index
-        self.neftune_alpha = None
-
-        if config.use_backbone_lora:
-            self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
-
-        self.use_llm_lora = config.use_llm_lora
-        if config.use_llm_lora:
-            self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)
-
-        self.check_forward_kwargs()
-
-    def check_forward_kwargs(self):
-        # We intentionally avoid using **kwargs in forward because Hugging Face Transformers
-        # has special handling for functions with **kwargs parameters that would affect
-        # how our model is processed during training and inference.
-        forward_params = inspect.signature(self.forward).parameters
-        assert not any(k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values())
-
-    def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
-        lora_config = LoraConfig(
-            r=r,
-            target_modules=[
-                "self_attn.q_proj",
-                "self_attn.k_proj",
-                "self_attn.v_proj",
-                "self_attn.out_proj",
-                "mlp.fc1",
-                "mlp.fc2",
-            ],
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
-        )
-        self.vision_model = get_peft_model(self.vision_model, lora_config)
-        self.vision_model.print_trainable_parameters()
-
-    def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
-        lora_config = LoraConfig(
-            r=r,
-            target_modules=[
-                "self_attn.q_proj",
-                "self_attn.k_proj",
-                "self_attn.v_proj",
-                "self_attn.o_proj",
-                "mlp.gate_proj",
-                "mlp.down_proj",
-                "mlp.up_proj",
-            ],
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
-            task_type="CAUSAL_LM",
-        )
-        self.language_model = get_peft_model(self.language_model, lora_config)
-        self.language_model.enable_input_require_grads()
-        self.language_model.print_trainable_parameters()
-        self.use_llm_lora = True
-
-    def forward(
-        self,
-        pixel_values: torch.FloatTensor,
-        input_ids: torch.LongTensor = None,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        image_flags: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | None = None,
-        labels: torch.LongTensor | None = None,
-        use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
-        num_tiles_list: list[torch.Tensor] | None = None,
-    ) -> tuple | CausalLMOutputWithPast:
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        input_embeds = self.language_model.get_input_embeddings()(input_ids)
-
-        vit_embeds = self.extract_feature(pixel_values)
-
-        if image_flags is not None:
-            image_flags = image_flags.view(-1)
-            vit_embeds = vit_embeds[image_flags == 1]
-
-        b, n, c = input_embeds.shape
-        input_embeds = input_embeds.reshape(b * n, c)
-
-        input_ids = input_ids.reshape(b * n)
-        selected = input_ids == self.image_token_index
-        try:
-            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, c)
-        except Exception as e:
-            vit_embeds = vit_embeds.reshape(-1, c)
-            print(
-                f"warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, "
-                f"vit_embeds.shape={vit_embeds.shape}"
-            )
-            n_token = selected.sum()
-            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]
-
-        input_embeds = input_embeds.reshape(b, n, c)
-
-        outputs = self.language_model(
-            inputs_embeds=input_embeds,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-        )
-        logits = outputs.logits
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    def pixel_shuffle(self, x, scale_factor=0.5):
-        n, w, h, c = x.size()
-        # N, W, H, C --> N, W, H * scale, C // scale
-        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
-        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
-        x = x.permute(0, 2, 1, 3).contiguous()
-        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
-        x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor * scale_factor)))
-
-        x = x.permute(0, 2, 1, 3).contiguous()
-        return x
-
-    def extract_feature(self, pixel_values):
-        if self.select_layer == -1:
-            vit_embeds = self.vision_model(
-                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
-            )
-            if hasattr(vit_embeds, "last_hidden_state"):
-                vit_embeds = vit_embeds.last_hidden_state
-
-        else:
-            vit_embeds = self.vision_model(
-                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
-            ).hidden_states[self.select_layer]
-
-        if self.use_pixel_shuffle:
-            h = w = int(vit_embeds.shape[1] ** 0.5)
-            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-            vit_embeds = self.pixel_shuffle(
-                vit_embeds, scale_factor=self.downsample_ratio
-            )  # torch.Size([B, 1024, 1024]) -> torch.Size([B, 16, 16, 4096])
-            vit_embeds = vit_embeds.reshape(
-                vit_embeds.shape[0], -1, vit_embeds.shape[-1]
-            )  # torch.Size([B, 16, 16, 4096]) -> torch.Size([B, 256, 4096])
-
-        if self.mlp_checkpoint and vit_embeds.requires_grad:
-            vit_embeds = cp.checkpoint(self.mlp1, vit_embeds)
-        else:
-            vit_embeds = self.mlp1(vit_embeds)
-
-        return vit_embeds
-
-    @torch.no_grad()
-    def generate(
-        self,
-        pixel_values: torch.FloatTensor | None = None,
-        input_ids: torch.FloatTensor | None = None,
-        attention_mask: torch.LongTensor | None = None,
-        visual_features: torch.FloatTensor | None = None,
-        generation_config: GenerationConfig | None = None,
-        output_hidden_states: bool | None = None,
-        image_sizes: list[tuple[int, int]] | None = None,
-        **generate_kwargs,
-    ) -> torch.LongTensor:
-        if pixel_values is not None:
-            if visual_features is not None:
-                vit_embeds = visual_features
-            else:
-                vit_embeds = self.extract_feature(pixel_values)
-
-            input_embeds = self.language_model.get_input_embeddings()(input_ids)
-            b, n, c = input_embeds.shape
-            input_embeds = input_embeds.reshape(b * n, c)
-
-            input_ids = input_ids.reshape(b * n)
-            selected = input_ids == self.config.image_token_index
-            assert selected.sum() != 0
-            input_embeds[selected] = vit_embeds.reshape(-1, c).to(input_embeds.device)
-
-            input_embeds = input_embeds.reshape(b, n, c)
-        else:
-            input_embeds = self.language_model.get_input_embeddings()(input_ids)
-
-        if "use_cache" not in generate_kwargs:
-            generate_kwargs["use_cache"] = True
-
-        outputs = self.language_model.generate(
-            inputs_embeds=input_embeds,
-            attention_mask=attention_mask,
-            generation_config=generation_config,
-            output_hidden_states=output_hidden_states,
-            **generate_kwargs,
-        )
-
-        return outputs
-
-    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings
-    def get_input_embeddings(self):
-        return self.language_model.get_input_embeddings()
-
-    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_input_embeddings
-    def set_input_embeddings(self, value):
-        self.language_model.set_input_embeddings(value)
-
-    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_output_embeddings
-    def get_output_embeddings(self):
-        return self.language_model.get_output_embeddings()
-
-    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_output_embeddings
-    def set_output_embeddings(self, new_embeddings):
-        self.language_model.set_output_embeddings(new_embeddings)
-
-    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_decoder
-    def set_decoder(self, decoder):
-        self.language_model.set_decoder(decoder)
-
-    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_decoder
-    def get_decoder(self):
-        return self.language_model.get_decoder()
diff --git a/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py b/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
deleted file mode 100755
index b36e70c47..000000000
--- a/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
+++ /dev/null
@@ -1,541 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Processor class for Eagle25VL.
-copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/processing_llava_onevision.py
-"""
-
-import base64
-import os
-import re
-from io import BytesIO
-
-import requests
-import torch
-from PIL import Image
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput
-from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
-from transformers.utils import logging
-from transformers.video_utils import VideoInput
-
-logger = logging.get_logger(__name__)
-
-
-FRAME_FACTOR = 2
-FPS = 2.0
-FPS_MIN_FRAMES = 4
-FPS_MAX_FRAMES = 256
-
-
-def to_rgb(pil_image: Image.Image) -> Image.Image:
-    if pil_image.mode == "RGBA":
-        white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
-        white_background.paste(pil_image, mask=pil_image.split()[3])  # Use alpha channel as mask
-        return white_background
-    else:
-        return pil_image.convert("RGB")
-
-
-def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image:
-    image = ele["image"] if "image" in ele else ele["image_url"]
-    image_obj = None
-    if isinstance(image, Image.Image):
-        image_obj = image
-    elif image.startswith("http://") or image.startswith("https://"):
-        response = requests.get(image, stream=True, timeout=10)
-        image_obj = Image.open(BytesIO(response.content))
-    elif image.startswith("file://"):
-        image_obj = Image.open(image[7:])
-    elif image.startswith("data:image"):
-        if "base64," in image:
-            _, base64_data = image.split("base64,", 1)
-            data = base64.b64decode(base64_data)
-            image_obj = Image.open(BytesIO(data))
-    else:
-        image_obj = Image.open(image)
-    if image_obj is None:
-        raise ValueError(
-            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
-        )
-    image = to_rgb(image_obj)
-    if "scale_factor" in ele:
-        scale_factor = ele["scale_factor"]
-        image = image.resize((image.width * scale_factor, image.height * scale_factor), Image.BILINEAR)
-    return image
-
-
-class Eagle25VLProcessorKwargs(ProcessingKwargs, total=False):
-    # see processing_utils.ProcessingKwargs documentation for usage.
-    _defaults = {
-        "text_kwargs": {
-            "padding": False,
-        },
-        "images_kwargs": {},
-        "videos_kwargs": {"max_dynamic_tiles": 1},
-    }
-
-
-class Eagle25VLProcessor(ProcessorMixin):
-    r"""
-    Constructs a Eagle25VL processor which wraps a Eagle25VL video processor, Eagle25VL image processor and a Eagle25VL tokenizer into a single processor.
-
-    [`Eagle25VLProcessor`] offers all the functionalities of [`Eagle25VLVideoProcessor`], [`Eagle25VLImageProcessor`] and [`Eagle25VLTokenizer`]. See the
-    [`~Eagle25VLVideoProcessor.__call__`], [`~Eagle25VLProcessor.__call__`] and [`~Eagle25VLProcessor.decode`] for more information.
-
-    Args:
-        image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
-            The image processor is a required input.
-        tokenizer ([`LlamaTokenizerFast`], *optional*):
-            The tokenizer is a required input.
-        num_image_tokens (`int`, *optional*):
-            Number of image tokens for one imagethat will be returned by vision tower.
-        vision_feature_select_strategy (`str`, *optional*):
-            The feature selection strategy used to select the vision feature from the vision backbone.
-            Should be same as in model's config
-        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
-            in a chat into a tokenizable string.
-        image_token (`str`, *optional*, defaults to `"<image>"`):
-            Special token used to denote image location.
-        video_token (`str`, *optional*, defaults to `"<video>"`):
-            Special token used to denote video location.
-    """
-
-    attributes = ["image_processor", "tokenizer"]
-    valid_kwargs = [
-        "chat_template",
-        "num_image_tokens",
-        "vision_feature_select_strategy",
-        "image_token",
-        "video_token",
-        "images_kwargs",
-        "videos_kwargs",
-        "text_kwargs",
-    ]
-    tokenizer_class = "AutoTokenizer"
-
-    def __init__(
-        self,
-        image_processor=None,
-        tokenizer=None,
-        vision_feature_select_strategy=None,
-        chat_template=None,
-        image_token="<IMG_CONTEXT>",  # nosec: B107
-        video_token="<IMG_CONTEXT>",  # nosec: B107
-        tokens_per_tile=256,
-        image_placeholder="image",
-        video_placeholder="video",
-        image_start_token="<img>",
-        image_end_token="</img>",
-        **kwargs,
-    ):
-        self.vision_feature_select_strategy = vision_feature_select_strategy
-        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
-        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
-        self.image_token_id = (
-            tokenizer.image_token_id
-            if getattr(tokenizer, "image_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.image_token)
-        )
-        self.video_token_id = (
-            tokenizer.video_token_id
-            if getattr(tokenizer, "video_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.video_token)
-        )
-        self.image_placeholder = image_placeholder
-        self.video_placeholder = video_placeholder
-        self.tokens_per_tile = tokens_per_tile
-        self.image_start_token = image_start_token
-        self.image_end_token = image_end_token
-        if "auto_map" in kwargs:
-            self.auto_map = kwargs["auto_map"]
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
-    def replace_media_placeholder(
-        self, text, image_list, video_list, timestamps_list, fps_list, **output_kwargs
-    ):
-        num_of_images_in_this_sample = 0
-        num_of_videos_in_this_sample = 0
-        # Regular expression pattern to match formats like <image-1> or <video-2>
-        pattern = re.compile(rf"<({self.image_placeholder}|{self.video_placeholder})-(\d+)>")
-        unified_frame_list = []
-
-        # image_min_dynamic_tiles = output_kwargs["images_kwargs"].get(
-        #     "min_dynamic_tiles", self.image_processor.min_dynamic_tiles
-        # )
-        # image_max_dynamic_tiles = output_kwargs["images_kwargs"].get(
-        #     "max_dynamic_tiles", self.image_processor.max_dynamic_tiles
-        # )
-        # image_use_thumbnail = output_kwargs["images_kwargs"].get(
-        #     "use_thumbnail", self.image_processor.use_thumbnail
-        # )
-        video_min_dynamic_tiles = output_kwargs["videos_kwargs"].get(
-            "min_dynamic_tiles", self.image_processor.min_dynamic_tiles
-        )
-        video_max_dynamic_tiles = output_kwargs["videos_kwargs"].get(
-            "max_dynamic_tiles", self.image_processor.max_dynamic_tiles
-        )
-        video_use_thumbnail = output_kwargs["videos_kwargs"].get(
-            "use_thumbnail", self.image_processor.use_thumbnail
-        )
-
-        tile_size = self.image_processor.size.get("height", 448)
-
-        # Function to replace tags in a single text
-        def replace_in_text(text):
-            # repl callback function for each match replacement operation
-            def repl(match):
-                nonlocal unified_frame_list
-                nonlocal num_of_images_in_this_sample
-                nonlocal num_of_videos_in_this_sample
-                media_type = match.group(1)  # 'image' or 'video'
-                idx_in_list = int(match.group(2)) - 1  # Convert to list index (0-based)
-                # Select the corresponding path based on media type
-                idx_mapper = {
-                    0: "first",
-                    1: "second",
-                    2: "third",
-                    3: "fourth",
-                    4: "fifth",
-                    5: "sixth",
-                    6: "seventh",
-                    7: "eighth",
-                    8: "ninth",
-                    9: "tenth",
-                }
-                if media_type == "image":
-                    image_inputs = self.image_processor(
-                        images=[image_list[idx_in_list]],
-                        videos=None,
-                        **output_kwargs["images_kwargs"],
-                    )
-                    if isinstance(image_inputs["pixel_values"], list):
-                        _pv = image_inputs["pixel_values"]
-                        if _pv and isinstance(_pv[0], list):
-                            _pv = [t for sub in _pv for t in sub]
-                        image_inputs["pixel_values"] = torch.stack(
-                            [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in _pv]
-                        )
-                    num_all_tiles = image_inputs["pixel_values"].shape[0]
-                    special_placeholder = f"<image {idx_in_list + 1}>{self.image_start_token}{self.image_token * num_all_tiles * self.tokens_per_tile}{self.image_end_token}"
-                    unified_frame_list.append(image_inputs)
-                    num_of_images_in_this_sample += 1
-
-                elif media_type == "video":
-                    video_inputs = self.image_processor(
-                        images=None,
-                        videos=[video_list[idx_in_list]],
-                        **output_kwargs["videos_kwargs"],
-                    )
-                    if isinstance(video_inputs["pixel_values"], list):
-                        _pv = video_inputs["pixel_values"]
-                        if _pv and isinstance(_pv[0], list):
-                            _pv = [t for sub in _pv for t in sub]
-                        video_inputs["pixel_values"] = torch.stack(
-                            [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in _pv]
-                        )
-                    num_all_tiles = video_inputs["pixel_values"].shape[0]
-                    image_sizes = video_inputs["image_sizes"]
-                    if timestamps_list is not None and -1 not in timestamps_list:
-                        frame_timestamps = timestamps_list[idx_in_list]
-                    else:
-                        frame_timestamps = None
-                    sampled_fps = fps_list[idx_in_list] if fps_list is not None else None
-
-                    num_of_tiles_each_frame = [
-                        self.get_number_tiles_based_on_image_size(
-                            image_size,
-                            video_min_dynamic_tiles,
-                            video_max_dynamic_tiles,
-                            video_use_thumbnail,
-                            tile_size,
-                        )
-                        for image_size in image_sizes
-                    ]
-                    assert sum(num_of_tiles_each_frame) == num_all_tiles, (
-                        f"The number of tiles in each frame is not equal to the total number of tiles: {sum(num_of_tiles_each_frame)} != {num_all_tiles}"
-                    )
-
-                    if frame_timestamps is not None:
-                        assert len(frame_timestamps) == len(num_of_tiles_each_frame), (
-                            f"The number of timestamps is not equal to the number of frames: {len(frame_timestamps)} != {len(num_of_tiles_each_frame)}"
-                        )
-                        special_placeholder = [
-                            f"Frame {i + 1} sample at {frame_timestamps[i]:.2f}s: {self.image_start_token}{self.image_token * num_of_tiles * self.tokens_per_tile}{self.image_end_token}"
-                            for i, num_of_tiles in enumerate(num_of_tiles_each_frame)
-                        ]
-                    else:
-                        special_placeholder = [
-                            f"Frame {i + 1}: {self.image_start_token}{self.image_token * num_of_tiles * self.tokens_per_tile}{self.image_end_token}"
-                            for i, num_of_tiles in enumerate(num_of_tiles_each_frame)
-                        ]
-
-                    if sampled_fps is not None:
-                        special_placeholder = (
-                            f"The {idx_mapper[idx_in_list]} video sampled with {sampled_fps:.2f} fps: "
-                            + "".join(special_placeholder)
-                        )
-                    else:
-                        special_placeholder = f"The {idx_mapper[idx_in_list]} video: " + "".join(
-                            special_placeholder
-                        )
-                    unified_frame_list.append(video_inputs)
-                    num_of_videos_in_this_sample += 1
-                else:
-                    raise ValueError(f"Unknown media type: {media_type}")
-                return special_placeholder
-
-            return pattern.sub(repl, text)
-
-        text = replace_in_text(text)
-        if len(unified_frame_list) > 0:
-
-            def _to_tensor(v):
-                if isinstance(v, torch.Tensor):
-                    return v
-                if isinstance(v, list):
-                    if v and isinstance(v[0], list):
-                        v = [t for sub in v for t in sub]
-                    return torch.stack([t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in v])
-                return torch.as_tensor(v)
-
-            pixel_values = torch.cat([_to_tensor(frame["pixel_values"]) for frame in unified_frame_list])
-            image_sizes = torch.cat([_to_tensor(frame["image_sizes"]) for frame in unified_frame_list])
-        else:
-            pixel_values = None
-            image_sizes = None
-        return (
-            text,
-            pixel_values,
-            image_sizes,
-            num_of_images_in_this_sample,
-            num_of_videos_in_this_sample,
-        )
-
-    def __call__(
-        self,
-        images: ImageInput = None,
-        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
-        audio=None,
-        videos: VideoInput = None,
-        **kwargs: Unpack[Eagle25VLProcessorKwargs],
-    ) -> BatchFeature:
-        """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
-        of the above two methods for more information.
-
-        Args:
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
-
-        Returns:
-            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
-              `None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
-            - **pixel_values_videos** -- Pixel values of a video input to be fed to a model. Returned when `videos` is not `None`.
-            - **image_sizes** -- Size of each image that will be used to unpad an image. Returned when `images` is not `None`.
-        """
-
-        output_kwargs = self._merge_kwargs(
-            Eagle25VLProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-            **kwargs,
-        )
-
-        if isinstance(text, str):
-            text_list = [text]
-        elif not isinstance(text, list) and not isinstance(text[0], str):
-            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
-        elif isinstance(text, list) and isinstance(text[0], str):
-            text_list = text
-
-        if images is None:
-            images = []
-        if videos is None:
-            videos = []
-
-        pixel_values_list = []
-        image_sizes_list = []
-        new_sample_list = []
-        image_start_idx = 0
-        video_start_idx = 0
-        timestamps_batch = output_kwargs["videos_kwargs"].pop("timestamps", None)
-        fps_batch = output_kwargs["videos_kwargs"].pop("fps", None)
-        for sample in text_list:
-            timestamps_list = timestamps_batch[video_start_idx:] if timestamps_batch is not None else None
-            fps_list = fps_batch[video_start_idx:] if fps_batch is not None else None
-            (
-                sample,
-                pixel_values,
-                image_sizes,
-                num_of_images_in_this_sample,
-                num_of_videos_in_this_sample,
-            ) = self.replace_media_placeholder(
-                sample,
-                images[image_start_idx:],
-                videos[video_start_idx:],
-                timestamps_list,
-                fps_list,
-                **output_kwargs,
-            )
-            new_sample_list.append(sample)
-            if pixel_values is not None:
-                pixel_values_list.append(pixel_values)
-                image_sizes_list.append(image_sizes)
-            image_start_idx += num_of_images_in_this_sample
-            video_start_idx += num_of_videos_in_this_sample
-
-        if len(pixel_values_list) > 0:
-            image_inputs = {
-                "pixel_values": torch.cat(pixel_values_list),
-                "image_sizes": torch.cat(image_sizes_list),
-            }
-        else:
-            image_inputs = {}
-        video_inputs = {}
-        text_inputs = self.tokenizer(new_sample_list, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
-
-    def get_number_tiles_based_on_image_size(
-        self, image_size: tuple, min_num: int, max_num: int, use_thumbnail: bool, tile_size: int
-    ) -> int:
-        """
-        Get the number of tiles based on the image size.
-        """
-        orig_height, orig_width = image_size
-        aspect_ratio = orig_width / orig_height
-        # calculate the existing image aspect ratio
-        target_ratios = {
-            (i, j)
-            for n in range(min_num, max_num + 1)
-            for i in range(1, n + 1)
-            for j in range(1, n + 1)
-            if i * j <= max_num and i * j >= min_num
-        }
-        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-        # find the closest aspect ratio to the target
-        target_aspect_ratio = self.image_processor.find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, tile_size
-        )
-        tiles_num = target_aspect_ratio[0] * target_aspect_ratio[1]
-        if use_thumbnail and tiles_num > 1:
-            tiles_num += 1
-        return tiles_num
-
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
-    def batch_decode(self, *args, **kwargs):
-        """
-        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
-        refer to the docstring of this method for more information.
-        """
-        return self.tokenizer.batch_decode(*args, **kwargs)
-
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
-    def decode(self, *args, **kwargs):
-        """
-        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
-        the docstring of this method for more information.
-        """
-        return self.tokenizer.decode(*args, **kwargs)
-
-    @property
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
-    def model_input_names(self):
-        tokenizer_input_names = self.tokenizer.model_input_names
-        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
-
-    # override to save video-config in a separate config file
-    def save_pretrained(self, save_directory, **kwargs):
-        if os.path.isfile(save_directory):
-            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
-        os.makedirs(save_directory, exist_ok=True)
-
-        outputs = super().save_pretrained(save_directory, **kwargs)
-        return outputs
-
-    # override to load video-config from a separate config file
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
-        if isinstance(processor, tuple):
-            processor = processor[0]
-        return processor
-
-    # Copy from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
-    def process_vision_info(
-        self,
-        conversations: list[dict] | list[list[dict]],
-        return_video_kwargs: bool = False,
-    ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, dict | None]:
-        vision_infos = self.extract_vision_info(conversations)
-        ## Read images or videos
-        image_inputs = []
-        video_inputs = []
-        video_sample_fps_list = []
-        video_timestamps_list = []
-        for vision_info in vision_infos:
-            if "image" in vision_info or "image_url" in vision_info:
-                image_inputs.append(fetch_image(vision_info))
-            else:
-                raise ValueError("image, image_url or video should in content.")
-        if len(image_inputs) == 0:
-            image_inputs = None
-        if len(video_inputs) == 0:
-            video_inputs = None
-        if return_video_kwargs:
-            return (
-                image_inputs,
-                video_inputs,
-                {"fps": video_sample_fps_list, "timestamps": video_timestamps_list},
-            )
-        return image_inputs, video_inputs
-
-    def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]:
-        vision_infos = []
-        if isinstance(conversations[0], dict):
-            conversations = [conversations]
-        for conversation in conversations:
-            for message in conversation:
-                if isinstance(message["content"], list):
-                    for ele in message["content"]:
-                        if (
-                            "image" in ele
-                            or "image_url" in ele
-                            or "video" in ele
-                            or ele["type"] in ("image", "image_url", "video")
-                        ):
-                            vision_infos.append(ele)
-        return vision_infos
-
-
-__all__ = ["Eagle25VLProcessor"]