From 0b326053e927c98731dbf7b5c01d16251343e27c Mon Sep 17 00:00:00 2001
From: Jade Choghari
Date: Thu, 27 Nov 2025 13:38:12 +0100
Subject: [PATCH] remove timm dep

---
 pyproject.toml                          |  4 ++--
 .../policies/xvla/modeling_florence2.py | 17 +++++++++++++++-
 src/lerobot/policies/xvla/utils.py      | 20 +++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 72e0b50a2..c71bc45fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -129,7 +129,7 @@ groot = [
     "ninja>=1.11.1,<2.0.0",
     "flash-attn>=2.5.9,<3.0.0 ; sys_platform != 'darwin'"
 ]
-xlva = ["lerobot[transformers-dep]", "timm>=1.0.0,<1.1.0"]
+xvla = ["lerobot[transformers-dep]"]
 hilserl = ["lerobot[transformers-dep]", "gym-hil>=0.1.13,<0.2.0", "lerobot[grpcio-dep]", "lerobot[placo-dep]"]
 
 # Features
@@ -158,7 +158,7 @@ all = [
     "lerobot[pi]",
     "lerobot[smolvla]",
     # "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn
-    # "lerobot[xvla]",
+    "lerobot[xvla]",
     "lerobot[hilserl]",
     "lerobot[async]",
     "lerobot[dev]",
diff --git a/src/lerobot/policies/xvla/modeling_florence2.py b/src/lerobot/policies/xvla/modeling_florence2.py
index e65e15967..49a5e9c84 100644
--- a/src/lerobot/policies/xvla/modeling_florence2.py
+++ b/src/lerobot/policies/xvla/modeling_florence2.py
@@ -23,7 +23,6 @@ import torch.nn.functional as functional
 import torch.utils.checkpoint
 import torch.utils.checkpoint as checkpoint
 from einops import rearrange
-from timm.layers import DropPath
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
@@ -52,6 +51,7 @@ from transformers.utils import (
 )
 
 from .configuration_florence2 import Florence2Config, Florence2LanguageConfig, Florence2VisionConfig
+from .utils import drop_path
 
 if is_flash_attn_2_available():
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -61,6 +61,21 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "Florence2Config"
 
 
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super().__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
+
+
 class LearnedAbsolutePositionEmbedding2D(nn.Module):
     """
     This module learns positional embeddings up to a fixed maximum size.
diff --git a/src/lerobot/policies/xvla/utils.py b/src/lerobot/policies/xvla/utils.py
index 73793981e..bf31ffd82 100644
--- a/src/lerobot/policies/xvla/utils.py
+++ b/src/lerobot/policies/xvla/utils.py
@@ -116,3 +116,23 @@ def mat_to_rotate6d(abs_action):
         return np.concatenate([abs_action[:, :3, 0], abs_action[:, :3, 1]], axis=-1)
     else:
         raise NotImplementedError
+
+
+def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
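
A quick sanity check of the vendored helper (a minimal sketch, not part of the patch; it assumes the patch is applied so that drop_path is importable from lerobot.policies.xvla.utils):

    import torch

    from lerobot.policies.xvla.utils import drop_path

    x = torch.ones(8, 4, 16)  # (batch, tokens, dim)

    # Outside training (or with drop_prob == 0) the tensor passes through unchanged.
    assert torch.equal(drop_path(x, drop_prob=0.2, training=False), x)

    # During training, each sample is either zeroed entirely or rescaled by
    # 1 / keep_prob, so the expected value of the output matches the input.
    out = drop_path(x, drop_prob=0.2, training=True)
    per_sample = out.flatten(1).amax(dim=1)  # one scalar per sample; 0 if dropped
    kept = per_sample > 0
    assert torch.all(per_sample[~kept] == 0)
    assert torch.allclose(per_sample[kept], torch.tensor(1 / 0.8))

The DropPath module added to modeling_florence2.py just wraps this function and picks up training from nn.Module, so it should behave as a drop-in replacement for timm.layers.DropPath, which the vendored code mirrors.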