draft changes

fix: use importlib.metadata for plugin discovery to support PEP 660 (#2687)
Upgrade GitHub Actions for Node 24 compatibility (#2691)
2026-05-12 23:29:52 +00:00 · 2025-12-26 14:06:30 +00:00 · 2025-12-24 15:45:14 +01:00 · 2025-12-24 10:42:29 +01:00 · 2025-12-24 02:03:12 +01:00 · 2025-12-24 00:40:56 +01:00
24 changed files with 3094 additions and 397 deletions
@@ -33,6 +33,9 @@ on:
    paths:
      - "docs/**"

+  release:
+    types: [published]
+
 # Ensures that only the latest commit for a PR or branch is built, canceling older runs.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -43,7 +46,7 @@ jobs:
  build_main_docs:
    name: Build Main Docs
    if: >
-      (github.event_name == 'push' || github.event_name == 'workflow_dispatch') &&
+      (github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'release') &&
      github.repository == 'huggingface/lerobot'
    permissions:
      contents: read
@@ -51,7 +54,7 @@ jobs:
    with:
      commit_sha: ${{ github.sha }}
      package: lerobot
-      additional_args: --not_python_module
+      additional_args: --not_python_module ${{ github.event_name == 'release' && format('--version {0}', github.event.release.tag_name) || '' }}
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
@@ -62,7 +62,7 @@ jobs:
      HF_HOME: /mnt/cache/.cache/huggingface
      HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          persist-credentials: false
          lfs: true
@@ -61,7 +61,7 @@ jobs:
      HF_HOME: /mnt/cache/.cache/huggingface
      HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -85,7 +85,7 @@ jobs:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install lerobot with all extras
-        run: uv sync --all-extras --no-extra groot --no-extra wallx # TODO(Steven): Make flash-attn optional
+        run: uv sync --extra all # TODO(Steven): Make flash-attn optional

      - name: Run pytest (all extras)
        run: uv run pytest tests -vv --maxfail=10
@@ -127,7 +127,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install git-lfs
          git lfs install
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -42,38 +42,26 @@ jobs:

            // Keyword Heuristics

-            // Domain Specific
-            if (matches(/\b(bug|error|issue|fault|crash|exception)\b/i)) labelsToAdd.add('bug');
-            if (matches(/\b(feature|enhancement|improvement|support|implement|proposal)\b/i)) labelsToAdd.add('enhancement');
-            if (matches(/\b(question|help|how to||clarify|explain|unclear)\b/i)) labelsToAdd.add('question');
-            if (matches(/\b(maintenance|documentation|docs|readme|tutorial|guide|wiki)\b/i)) labelsToAdd.add('documentation');
-            if (matches(/\b(example|script|sample|demo|notebook)s?\b/i)) labelsToAdd.add('examples');
+            if (matches(/\b(bug|error|crash|exception)\b/i)) labelsToAdd.add('bug');
+            if (matches(/\b(new feature|enhancement|improvement|proposal|feature request)\b/i)) labelsToAdd.add('enhancement');
+            if (matches(/\b(question|how to|clarify|explain|how do i|help me|question about)\b/i)) labelsToAdd.add('question');
+            if (matches(/\b(documentation|docs?|readme|tutorial|wiki|typo|docstring)\b/i)) labelsToAdd.add('documentation');
+            if (matches(/\b(example|sample|demo|notebook)s?\b/i)) labelsToAdd.add('examples');
            if (matches(/\b(datasets?|data loader|data augmentation|data preprocessing)\b/i)) labelsToAdd.add('dataset');
            if (matches(/\b(mujoco|isaac|simulation|sim)\b/i)) labelsToAdd.add('simulation');
-            if (matches(/\b(train|training|loss|optimizer|backward|gradient|wandb|sac)\b/i)) labelsToAdd.add('training');
-            if (matches(/\b(rerun|plot|video|render|visualiz|gif)/i)) labelsToAdd.add('visualization');
-            if (matches(/\b(camera|realsense|lidar|depth|sensor|imu|microphone|rgbd)\b/i)) labelsToAdd.add('sensors');
-            if (matches(/\b(aloha|koch|so-100|so100|mobile|teleop|manipulator|robots?)\b/i)) labelsToAdd.add('robots');
+            if (matches(/\b(train|training|optimizer|gradient|wandb|sac)\b/i)) labelsToAdd.add('training');
+            if (matches(/\b(rerun|plot|render|rendering|visualizer)/i)) labelsToAdd.add('visualization');
+            if (matches(/\b(cameras?|opencv|realsense|lidars?|sensors?|imus?|microphones?|rgbd|encoders?)\b/i)) labelsToAdd.add('sensors');
+            if (matches(/\b(urdf|actuators?|calibration|end-effector|kinematics)\b/i)) labelsToAdd.add('robots');
            if (matches(/\b(teleop|teleoperator|controller|leader|follower|joystick|gamepad)\b/i)) labelsToAdd.add('teleoperators');
-            if (matches(/\b(policy|policies|p0licy)\b/i)) labelsToAdd.add('policies');
-            if (matches(/\b(processors?|pipeline)\b/i)) labelsToAdd.add('processor');
-            if (matches(/\b(eval|evaluate|evaluation|metrics?|score|benchmark)\b/i)) labelsToAdd.add('evaluation');
-
-            // Infrastructure & Code Quality
+            if (matches(/\b(policy|policies|model?)\b/i)) labelsToAdd.add('policies');
+            if (matches(/\b(processor|pipeline|preprocessor|postprocessor)s?\b/i)) labelsToAdd.add('processor');
+            if (matches(/\b(eval|evaluate|evaluation|metrics?|score|benchmarks?)\b/i)) labelsToAdd.add('evaluation');
            if (matches(/\b(tests?|pytest|unittest|failing test)\b/i)) labelsToAdd.add('tests');
-            if (matches(/\b(ci|github actions|workflow|gha|actions?|pipeline)\b/i)) {
-              labelsToAdd.add('CI');
-              labelsToAdd.add('github_actions');
-            }
-            if (matches(/\b(perf|latency|throughput|fps|speed|performance)\b/i)) labelsToAdd.add('performance');
-            if (matches(/\b(dependency|requirements|pip|conda|install error|importerror|package not found)\b/i)) labelsToAdd.add('dependencies');
-            if (matches(/\b(python|pyproject|requirements(\.txt)?|pip install|typing error)\b/i)) labelsToAdd.add('python');
-
-            // Documentation & Meta
-            if (matches(/\b(doc|documentation|docs|readme|typo|how to)\b/i)) labelsToAdd.add('documentation');
-            if (matches(/\b(refactor|cleanup|restructure|rename|modernize code)\b/i)) labelsToAdd.add('refactor');
-            if (matches(/\b(release|changelog|version bump|cut a release|tag v)\b/i)) labelsToAdd.add('release');
-            if (matches(/\b(breaking change|major change)\b/i)) labelsToAdd.add('breaking change');
+            if (matches(/\b(ci|github actions?|github workflows?|gha|docker|pypi)\b/i)) labelsToAdd.add('CI');
+            if (matches(/\b(perf|latency|throughput|fps|speed|performance|slow|fast|slower|faster|memory usage)\b/i)) labelsToAdd.add('performance');
+            if (matches(/\b(dependency|dependencies|pip|install error|importerror|package not found|pyproject)\b/i)) labelsToAdd.add('dependencies');
+            if (matches(/\b(configuration|config|arguments?|input feature|dracuss)\b/i)) labelsToAdd.add('configuration');

            // Apply Labels
            const labels = Array.from(labelsToAdd).filter(Boolean);
@@ -52,7 +52,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install git-lfs
          git lfs install
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -87,7 +87,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install git-lfs
          git lfs install
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -43,12 +43,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
        with:
          persist-credentials: false

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

@@ -38,12 +38,12 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
        with:
          persist-credentials: false

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

@@ -135,7 +135,7 @@ jobs:
    env:
      MUJOCO_GL: egl
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -177,4 +177,3 @@ jobs:

 # TODO(Steven): Publish draft/pre-release and to test pypi weekly
 # TODO(Steven): Separate build and publish job
-# TODO(Steven): Tag documentation with the same version as the package
@@ -43,7 +43,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4 # zizmor: ignore[unpinned-uses]
+        uses: actions/checkout@v6 # zizmor: ignore[unpinned-uses]
        with:
          fetch-depth: 0
          persist-credentials: false
@@ -49,7 +49,7 @@ jobs:
      HF_HOME: /mnt/cache/.cache/huggingface
      HF_LEROBOT_HOME: /mnt/cache/.cache/huggingface/lerobot
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -78,7 +78,7 @@ jobs:
          echo "Dependencies unbound:" && cat pyproject.toml

      - name: Install lerobot with all extras
-        run: uv sync --all-extras --no-extra groot --no-extra wallx # TODO(Steven): Make flash-attn optional
+        run: uv sync --extra all # TODO(Steven): Make flash-attn optional

      - name: Run pytest (all extras)
        run: uv run pytest tests -vv
@@ -101,7 +101,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install git-lfs
          git lfs install
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          lfs: true
          persist-credentials: false
@@ -1,323 +1,83 @@
-# How to contribute to 🤗 LeRobot?
+# How to contribute to 🤗 LeRobot

-Everyone is welcome to contribute, and we value everybody's contribution. Code
-is thus not the only way to help the community. Answering questions, helping
-others, reaching out and improving the documentations are immensely valuable to
-the community.
+Everyone is welcome to contribute, and we value everybody's contribution. Code is not the only way to help the community. Answering questions, helping others, reaching out, and improving the documentation are immensely valuable.

-It also helps us if you spread the word: reference the library from blog posts
-on the awesome projects it made possible, shout out on Twitter when it has
-helped you, or simply ⭐️ the repo to say "thank you".
+Whichever way you choose to contribute, please be mindful to respect our [code of conduct](./CODE_OF_CONDUCT.md).

-Whichever way you choose to contribute, please be mindful to respect our
-[code of conduct](https://github.com/huggingface/lerobot/blob/main/CODE_OF_CONDUCT.md).
+## Ways to Contribute

-## You can contribute in so many ways!
+You can contribute in many ways:

-Some of the ways you can contribute to 🤗 LeRobot:
+- **Fixing issues:** Resolve bugs or improve existing code.
+- **New features:** Develop new features.
+- **Extend:** Implement new models/policies, robots, or simulation environments and upload datasets to the Hugging Face Hub.
+- **Documentation:** Improve examples, guides, and docstrings.
+- **Feedback:** Submit tickets related to bugs or desired new features.

- Fixing outstanding issues with the existing code.
- Implementing new models, datasets or simulation environments.
- Contributing to the examples or to the documentation.
- Submitting issues related to bugs or desired new features.
+If you are unsure where to start, join our [Discord Channel](https://discord.gg/JkrYNdmw).

-Following the guides below, feel free to open issues and PRs and to coordinate your efforts with the community on our [Discord Channel](https://discord.gg/VjFz58wn3R). For specific inquiries, reach out to [Remi Cadene](mailto:remi.cadene@huggingface.co).
+## Development Setup

-If you are not sure how to contribute or want to know the next features we working on, look on this project page: [LeRobot TODO](https://github.com/orgs/huggingface/projects/46)
+To contribute code, you need to set up a development environment.

-## Submitting a new issue or feature request
+### 1. Fork and Clone

-Do your best to follow these guidelines when submitting an issue or a feature
-request. It will make it easier for us to come back to you quickly and with good
-feedback.
-
-### Did you find a bug?
-
-The 🤗 LeRobot library is robust and reliable thanks to the users who notify us of
-the problems they encounter. So thank you for reporting an issue.
-
-First, we would really appreciate it if you could **make sure the bug was not
-already reported** (use the search bar on Github under Issues).
-
-Did not find it? :( So we can act quickly on it, please follow these steps:
-
- Include your **OS type and version**, the versions of **Python** and **PyTorch**.
- A short, self-contained, code snippet that allows us to reproduce the bug in
-  less than 30s.
- The full traceback if an exception is raised.
- Attach any other additional information, like screenshots, you think may help.
-
-### Do you want a new feature?
-
-A good feature request addresses the following points:
-
-1. Motivation first:
-
- Is it related to a problem/frustration with the library? If so, please explain
-  why. Providing a code snippet that demonstrates the problem is best.
- Is it related to something you would need for a project? We'd love to hear
-  about it!
- Is it something you worked on and think could benefit the community?
-  Awesome! Tell us what problem it solved for you.
-
-2. Write a _paragraph_ describing the feature.
-3. Provide a **code snippet** that demonstrates its future use.
-4. In case this is related to a paper, please attach a link.
-5. Attach any additional information (drawings, screenshots, etc.) you think may help.
-
-If your issue is well written we're already 80% of the way there by the time you
-post it.
-
-## Adding new policies, datasets or environments
-
-Look at our implementations for [datasets](./src/lerobot/datasets/), [policies](./src/lerobot/policies/),
-environments ([aloha](https://github.com/huggingface/gym-aloha),
-[pusht](https://github.com/huggingface/gym-pusht))
-and follow the same api design.
-
-When implementing a new dataset loadable with LeRobotDataset follow these steps:
-
- Update `available_datasets_per_env` in `lerobot/__init__.py`
-
-When implementing a new environment (e.g. `gym_aloha`), follow these steps:
-
- Update `available_tasks_per_env` and `available_datasets_per_env` in `lerobot/__init__.py`
-
-When implementing a new policy class (e.g. `DiffusionPolicy`) follow these steps:
-
- Update `available_policies` and `available_policies_per_env`, in `lerobot/__init__.py`
- Set the required `name` class attribute.
- Update variables in `tests/test_available.py` by importing your new Policy class
-
-## Submitting a pull request (PR)
-
-Before writing code, we strongly advise you to search through the existing PRs or
-issues to make sure that nobody is already working on the same thing. If you are
-unsure, it is always a good idea to open an issue to get some feedback.
-
-You will need basic `git` proficiency to be able to contribute to
-🤗 LeRobot. `git` is not the easiest tool to use but it has the greatest
-manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
-Git](https://git-scm.com/book/en/v2) is a very good reference.
-
-Follow these steps to start contributing:
-
-1. Fork the [repository](https://github.com/huggingface/lerobot) by
-   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
-   under your GitHub user account.
-
-2. Clone your fork to your local disk, and add the base repository as a remote. The following command
-   assumes you have your public SSH key uploaded to GitHub. See the following guide for more
-   [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
-
-   ```bash
-   git clone git@github.com:<your Github handle>/lerobot.git
-   cd lerobot
-   git remote add upstream https://github.com/huggingface/lerobot.git
-   ```
-
-3. Create a new branch to hold your development changes, and do this for every new PR you work on.
-
-   Start by synchronizing your `main` branch with the `upstream/main` branch (more details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)):
-
-   ```bash
-   git checkout main
-   git fetch upstream
-   git rebase upstream/main
-   ```
-
-   Once your `main` branch is synchronized, create a new branch from it:
-
-   ```bash
-   git checkout -b a-descriptive-name-for-my-changes
-   ```
-
-   🚨 **Do not** work on the `main` branch.
-
-4. for development, we advise to use a tool like `poetry` or `uv` instead of just `pip` to easily track our dependencies.
-   Follow the instructions to [install poetry](https://python-poetry.org/docs/#installation) (use a version >=2.1.0) or to [install uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) if you don't have one of them already.
-
-   Set up a development environment with conda:
-
-   ```bash
-   conda create -y -n lerobot-dev python=3.10 && conda activate lerobot-dev
-   ```
-
-   If you're using `uv`, it can manage python versions so you can instead do:
-
-   ```bash
-   uv venv --python 3.10 && source .venv/bin/activate
-   ```
-
-   To develop on 🤗 LeRobot, you will at least need to install the `dev` and `test` extras dependencies along with the core library:
-
-   using `poetry`
-
-   ```bash
-   poetry sync --extras "dev test"
-   ```
-
-   using `uv`
-
-   ```bash
-   uv sync --extra dev --extra test
-   ```
-
-   You can also install the project with all its dependencies (including environments):
-
-   using `poetry`
-
-   ```bash
-   poetry sync --all-extras
-   ```
-
-   using `uv`
-
-   ```bash
-   uv sync --all-extras
-   ```
-
-   > **Note:** If you don't install simulation environments with `--all-extras`, the tests that require them will be skipped when running the pytest suite locally. However, they _will_ be tested in the CI. In general, we advise you to install everything and test locally before pushing.
-
-   Whichever command you chose to install the project (e.g. `poetry sync --all-extras`), you should run it again when pulling code with an updated version of `pyproject.toml` and `poetry.lock` in order to synchronize your virtual environment with the new dependencies.
-
-   The equivalent of `pip install some-package`, would just be:
-
-   using `poetry`
-
-   ```bash
-   poetry add some-package
-   ```
-
-   using `uv`
-
-   ```bash
-   uv add some-package
-   ```
-
-   When making changes to the poetry sections of the `pyproject.toml`, you should run the following command to lock dependencies.
-   using `poetry`
-
-   ```bash
-   poetry lock
-   ```
-
-   using `uv`
-
-   ```bash
-   uv lock
-   ```
-
-5. Develop the features on your branch.
-
-   As you work on the features, you should make sure that the test suite
-   passes. You should run the tests impacted by your changes like this (see
-   below an explanation regarding the environment variable):
-
-   ```bash
-   pytest tests/<TEST_TO_RUN>.py
-   ```
-
-6. Follow our style.
-
-   `lerobot` relies on `ruff` to format its source code
-   consistently. Set up [`pre-commit`](https://pre-commit.com/) to run these checks
-   automatically as Git commit hooks.
-
-   Install `pre-commit` hooks:
-
-   ```bash
-   pre-commit install
-   ```
-
-   You can run these hooks whenever you need on staged files with:
-
-   ```bash
-   pre-commit
-   ```
-
-   Once you're happy with your changes, add changed files using `git add` and
-   make a commit with `git commit` to record your changes locally:
-
-   ```bash
-   git add modified_file.py
-   git commit
-   ```
-
-   Note, if you already committed some changes that have a wrong formatting, you can use:
-
-   ```bash
-   pre-commit run --all-files
-   ```
-
-   Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
-
-   It is a good idea to sync your copy of the code with the original
-   repository regularly. This way you can quickly account for changes:
-
-   ```bash
-   git fetch upstream
-   git rebase upstream/main
-   ```
-
-   Push the changes to your account using:
-
-   ```bash
-   git push -u origin a-descriptive-name-for-my-changes
-   ```
-
-7. Once you are satisfied (**and the checklist below is happy too**), go to the
-   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
-   to the project maintainers for review.
-
-8. It's ok if maintainers ask you for changes. It happens to core contributors
-   too! So everyone can see the changes in the Pull request, work in your local
-   branch and push the changes to your fork. They will automatically appear in
-   the pull request.
-
-### Checklist
-
-1. The title of your pull request should be a summary of its contribution;
-2. If your pull request addresses an issue, please mention the issue number in
-   the pull request description to make sure they are linked (and people
-   consulting the issue know you are working on it);
-3. To indicate a work in progress please prefix the title with `[WIP]`, or preferably mark
-   the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
-   it from PRs ready to be merged;
-4. Make sure existing tests pass;
-
-### Tests
-
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/lerobot/tree/main/tests).
-
-Install [git lfs](https://git-lfs.com/) to retrieve test artifacts (if you don't have it already).
-
-On Mac:
+Fork the repository on GitHub, then clone your fork:

 ```bash
-brew install git-lfs
-git lfs install
+git clone https://github.com/<your-handle>/lerobot.git
+cd lerobot
+git remote add upstream https://github.com/huggingface/lerobot.git
 ```

-On Ubuntu:
+### 2. Environment Installation
+
+Please follow our [Installation Guide](./docs/source/installation.mdx) for the environment setup & installation from source.
+
+## Running Tests & Quality Checks
+
+### Code Style (Pre-commit)
+
+Install `pre-commit` hooks to run checks automatically before you commit:

 ```bash
-sudo apt-get install git-lfs
-git lfs install
+pre-commit install
 ```

-Pull artifacts if they're not in [tests/artifacts](tests/artifacts)
+To run checks manually on all files:

 ```bash
+pre-commit run --all-files
+```
+
+### Running Tests
+
+We use `pytest`. First, ensure you have test artifacts by installing **git-lfs**:
+
+```bash
+git lfs install
 git lfs pull
 ```

-We use `pytest` in order to run the tests. From the root of the
-repository, here's how to run tests with `pytest` for the library:
+Run the full suite (this may require extras installed):

 ```bash
-python -m pytest -sv ./tests
+pytest -sv ./tests
 ```

-You can specify a smaller set of tests in order to test only the feature
-you're working on.
+Or run a specific test file during development:
+
+```bash
+pytest -sv tests/test_specific_feature.py
+```
+
+## Submitting Issues & Pull Requests
+
+Use the templates for required fields and examples.
+
+- **Issues:** Follow the [ticket template](./.github/ISSUE_TEMPLATE/bug-report.yml).
+- **Pull requests:** Rebase on `upstream/main`, use a descriptive branch (don't work on `main`), run `pre-commit` and tests locally, and follow the [PR template](./.github/PULL_REQUEST_TEMPLATE.md).
+
+One member of the LeRobot team will then review your contribution.
+
+Thank you for contributing to LeRobot!
@@ -41,6 +41,8 @@
    title: NVIDIA GR00T N1.5
  - local: xvla
    title: X-VLA
+  - local: walloss
+    title: WALL-OSS
  title: "Policies"
 - sections:
  - local: sarm
@@ -0,0 +1,35 @@
+# WALL-OSS
+
+This repository contains the Hugging Face port of **WALL-OSS**, a Vision-Language-Action model for cross-embodiment robotic control based on Qwen2.5-VL with flow matching/FAST action prediction.
+
+---
+
+## Model Overview
+
+| Feature            | Description                                           |
+| ------------------ | ----------------------------------------------------- |
+| Base Model         | Qwen2.5-VL (Vision-Language Model)                    |
+| Action Prediction  | Flow Matching (diffusion) or FAST (discrete tokens)   |
+| Architecture       | Mixture of Experts (MoE) with action-specific routing |
+| Multi-Modal Inputs | Vision (images/videos), Language, Proprioception      |
+
+---
+
+## Citation
+
+If you use this work, please cite:
+
+```bibtex
+@article{zhai2025igniting,
+    title   = {Igniting VLMs Toward the Embodied Space},
+    author  = {Zhai, Andy and Liu, Brae and Fang, Bruno and Cai, Chalse and Ma, Ellie and Yin, Ethan and Wang, Hao and Zhou, Hugo and Wang, James and Shi, Lights and Liang, Lucy and Wang, Make and Wang, Qian and Gan, Roy and Yu, Ryan and Li, Shalfun and Liu, Starrick and Chen, Sylas and Chen, Vincent and Xu, Zach},
+    journal = {arXiv preprint arXiv:2509.11766},
+    year    = {2025}
+}
+```
+
+---
+
+## License
+
+This port follows the **Apache 2.0 License**.
@@ -0,0 +1,74 @@
+# WALL-OSS
+
+WALL-OSS is an open-source foundation model for embodied intelligence, proposed by the [X Square Robot](https://x2robot.com/en/research/68bc2cde8497d7f238dde690) team in 2025. The LeRobot implementation is adapted from their open-source [WallX](https://github.com/X-Square-Robot/wall-x) repository.
+
+X Square Robot’s WALL-OSS is now integrated into Hugging Face’s LeRobot ecosystem. This is an exciting collaborative project between the LeRobot and X Square Robot teams. You can now post-train, evaluate, and deploy WALL-OSS directly through LeRobot. With this, we’re aiming to make it easier for the open-source robotics community to customize and deploy WALL-OSS foundation models. Read the WALL-OSS [paper](https://arxiv.org/pdf/2509.11766) and explore the [code](https://github.com/X-Square-Robot/wall-x).
+
+## Model Overview
+
+The WALL-OSS team is building the embodied foundation model to capture and compress the world's most valuable data: the continuous, high-fidelity stream of physical interaction. By creating a direct feedback loop between the model's decisions and the body's lived experience, the emergence of a truly generalizable intelligence is enabled—one that understands not just how the world works, but how to act effectively within it.
+
+Technically, WALL-OSS introduces a tightly coupled multimodal architecture (tightly-coupled MoE structure) that integrates both discrete and continuous action modeling strategies. Through a two-stage training pipeline (Inspiration → Integration), the model gradually unifies semantic reasoning and high-frequency action generation. Its core innovations include:
+
+- **Embodied perception–enhanced multimodal pretraining**: Large-scale training on unified vision–language–action data to strengthen spatial, causal, and manipulation understanding.
+- **Unified Cross-Level Chain-of-Thought (Uni-CoT)**: A single differentiable framework that unifies high-level instruction reasoning, sub-task decomposition, and fine-grained action synthesis, forming a continuous chain from “understanding” to “execution.”
+- **Mixture-of-Experts (MoE) action heads**: Dynamically activating experts depending on the task phase and modeling actions in discrete or continuous space to maintain stable VLM priors.
+- **Two-stage training paradigm**:
+  - **Inspiration stage**: Injecting discrete action priors to strengthen spatial understanding and semantic-action alignment.
+  - **Integration stage**: Using flow matching to achieve high-frequency continuous control.
+
+## Installation Requirements
+
+1. Install LeRobot by following our [Installation Guide](./installation).
+2. Install WallX dependencies by running:
+
+   ```bash
+   pip install -e ".[wallx]"
+   ```
+
+## Usage
+
+To use WallX in LeRobot, specify the policy type as:
+
+```python
+policy.type=wall_x
+```
+
+## Training
+
+For training WallX, you can use the standard LeRobot training script with the appropriate configuration:
+
+```bash
+python src/lerobot/scripts/lerobot_train.py \
+    --dataset.repo_id=your_dataset \
+    --policy.type=wall_x \
+    --output_dir=./outputs/wallx_training \
+    --job_name=wallx_training \
+    --policy.repo_id=your_repo_id \
+    --policy.pretrained_name_or_path=x-square-robot/wall-oss-flow \
+    --policy.prediction_mode=diffusion \
+    --policy.attn_implementation=eager \
+    --steps=3000 \
+    --policy.device=cuda \
+    --batch_size=32
+```
+
+### Training Arguments
+
+| Argument                       | Description                                                                                                                                                   |
+| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `--dataset.repo_id`            | The Hugging Face Hub repository ID for your training dataset (e.g., `lerobot/aloha_sim_insertion_human`)                                                      |
+| `--policy.type`                | Specifies using the WallX policy architecture                                                                                                                 |
+| `--output_dir`                 | Local directory where training checkpoints and logs will be saved                                                                                             |
+| `--job_name`                   | A name identifier for this training run (used in logging/tracking)                                                                                            |
+| `--policy.repo_id`             | Your Hugging Face Hub repo ID where the trained model will be pushed                                                                                          |
+| `--policy.pretrained_name_or_path` | Name or path of the pretrained WallX weights to initialize from (the official WALL-OSS checkpoint) |
+| `--policy.prediction_mode`     | The action prediction strategy: `diffusion` or `fast` - `diffusion` uses iterative denoising for action generation, `fast` uses next token prediction instead |
+| `--policy.attn_implementation` | Attention implementation backend - `eager` uses standard PyTorch attention (alternatives include `flash_attention_2` or `sdpa`)                               |
+| `--steps`                      | Total number of training steps to run                                                                                                                         |
+| `--policy.device`              | Device to train on (`cuda` for GPU, `cpu` for CPU)                                                                                                            |
+| `--batch_size`                 | Number of samples per training batch                                                                                                                          |
+
+## License
+
+This model follows the **Apache 2.0 License**, consistent with the original [WallX repository](https://github.com/X-Square-Robot/wall-x).
@@ -168,7 +168,7 @@ all = [
    "lerobot[kinematics]",
    "lerobot[intelrealsense]",
    # "lerobot[wallx]",
-    "lerobot[pi]",
+    # "lerobot[pi]", TODO(Pepijn): Update pi to transformers v5
    "lerobot[smolvla]",
    # "lerobot[groot]", TODO(Steven): Gr00t requires specific installation instructions for flash-attn
    "lerobot[xvla]",
@@ -405,6 +405,10 @@ conflicts = [
        { extra = "wallx" },
        { extra = "xvla" },
    ],
+    [
+        { extra = "wallx" },
+        { extra = "sarm" },
+    ],
    [
        { extra = "wallx" },
        { extra = "hilserl" },
@@ -417,4 +421,37 @@ conflicts = [
        { extra = "wallx" },
        { extra = "all" },
    ],
+    # pi uses custom branch which conflicts with transformers-dep
+    [
+        { extra = "pi" },
+        { extra = "transformers-dep" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "smolvla" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "groot" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "xvla" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "sarm" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "hilserl" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "libero" },
+    ],
+    [
+        { extra = "pi" },
+        { extra = "all" },
+    ],
 ]
@@ -93,10 +93,11 @@ def create_sinusoidal_pos_embedding(  # see openpi `create_sinusoidal_pos_embedd


 def sample_beta(alpha, beta, bsize, device):  # see openpi `sample_beta` (exact copy)
+    """Draw `bsize` samples from Beta(alpha, beta) and return them on `device`.
+
+    The distribution parameters are created without a `device` argument, so the
+    sampling itself does not run on `device`; only the result is moved there.
+    """
-    alpha_t = torch.as_tensor(alpha, dtype=torch.float32, device=device)
-    beta_t = torch.as_tensor(beta, dtype=torch.float32, device=device)
+    # Beta sampling uses _sample_dirichlet which isn't implemented for MPS, so sample on CPU
+    alpha_t = torch.tensor(alpha, dtype=torch.float32)
+    beta_t = torch.tensor(beta, dtype=torch.float32)
     dist = torch.distributions.Beta(alpha_t, beta_t)
-    return dist.sample((bsize,))
+    return dist.sample((bsize,)).to(device)


 def make_att_2d_masks(pad_masks, att_masks):  # see openpi `make_att_2d_masks` (exact copy)
@@ -0,0 +1,49 @@
+# π₀.₅ (pi05)
+
+This repository contains the Hugging Face port of **π₀.₅**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
+It is designed as a **Vision-Language-Action model with open-world generalization**.
+
+---
+
+## Model Overview
+
+| Feature              | π₀                                                     | π₀.₅                                      |
+| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
+| Time Conditioning    | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
+| AdaRMS               | Not used                                               | Used in action expert                     |
+| Tokenizer Length     | 48 tokens                                              | 200 tokens                                |
+| Discrete State Input | False (Uses `state_proj` layer)                        | True                                      |
+| Parameter Count      | Higher (includes state embedding)                      | Lower (no state embedding)                |
+
+---
+
+## Citation
+
+If you use this work, please cite both **OpenPI** and the π₀.₅ paper:
+
+```bibtex
+@misc{openpi2024,
+  author       = {Physical Intelligence Lab},
+  title        = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
+  year         = {2024},
+  publisher    = {GitHub},
+  howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
+  license      = {Apache-2.0}
+}
+
+@misc{intelligence2025pi05visionlanguageactionmodelopenworld,
+  title        = {π₀.₅: a Vision-Language-Action Model with Open-World Generalization},
+  author       = {Physical Intelligence and Kevin Black and Noah Brown and James Darpinian and Karan Dhabalia and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Manuel Y. Galliker and Dibya Ghosh and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Devin LeBlanc and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Allen Z. Ren and Lucy Xiaoyang Shi and Laura Smith and Jost Tobias Springenberg and Kyle Stachowicz and James Tanner and Quan Vuong and Homer Walke and Anna Walling and Haohuan Wang and Lili Yu and Ury Zhilinsky},
+  year         = {2025},
+  eprint       = {2504.16054},
+  archivePrefix= {arXiv},
+  primaryClass = {cs.LG},
+  url          = {https://arxiv.org/abs/2504.16054},
+}
+```
+
+---
+
+## License
+
+This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_pi05 import PI05Config
+from .modeling_pi05 import PI05Policy
+from .processor_pi05 import make_pi05_pre_post_processors
+
+__all__ = ["PI05Config", "PI05Policy", "make_pi05_pre_post_processors"]
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+
+# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.optim.optimizers import AdamWConfig
+from lerobot.optim.schedulers import CosineDecayWithWarmupSchedulerConfig
+from lerobot.policies.rtc.configuration_rtc import RTCConfig
+
+DEFAULT_IMAGE_SIZE = 224
+
+
+@PreTrainedConfig.register_subclass("pi05")
+@dataclass
+class PI05Config(PreTrainedConfig):
+    paligemma_variant: str = "gemma_2b"
+    action_expert_variant: str = "gemma_300m"
+    dtype: str = "float32"  # Options: "bfloat16", "float32"
+
+    n_obs_steps: int = 1
+    chunk_size: int = 50  # Number of action steps to predict, in openpi called "action_horizon"
+    n_action_steps: int = 50  # Number of action steps to execute
+
+    # Shorter state and action vectors will be padded to these dimensions
+    max_state_dim: int = 32
+    max_action_dim: int = 32
+
+    # Flow matching parameters: see openpi `PI0Pytorch`
+    num_inference_steps: int = 10
+    time_sampling_beta_alpha: float = 1.5
+    time_sampling_beta_beta: float = 1.0
+    time_sampling_scale: float = 0.999
+    time_sampling_offset: float = 0.001
+    min_period: float = 4e-3
+    max_period: float = 4.0
+
+    # Real-Time Chunking (RTC) configuration
+    rtc_config: RTCConfig | None = None
+
+    image_resolution: tuple[int, int] = (
+        DEFAULT_IMAGE_SIZE,
+        DEFAULT_IMAGE_SIZE,
+    )  # see openpi `preprocessing_pytorch.py`
+
+    # Add empty images. Used to add empty cameras when no image features are present.
+    empty_cameras: int = 0
+
+    tokenizer_max_length: int = 200  # see openpi `__post_init__`
+
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "STATE": NormalizationMode.QUANTILES,  # Pi0.5 uses quantiles for state
+            "ACTION": NormalizationMode.QUANTILES,  # Pi0.5 uses quantiles for action
+        }
+    )
+
+    # Training settings
+    gradient_checkpointing: bool = False  # Enable gradient checkpointing for memory optimization
+    compile_model: bool = False  # Whether to use torch.compile for model optimization
+    compile_mode: str = "max-autotune"  # Torch compile mode
+    device: str | None = None  # Device to use for the model (None = auto-detect)
+
+    # Optimizer settings: see openpi `AdamW`
+    optimizer_lr: float = 2.5e-5  # see openpi `CosineDecaySchedule: peak_lr`
+    optimizer_betas: tuple[float, float] = (0.9, 0.95)
+    optimizer_eps: float = 1e-8
+    optimizer_weight_decay: float = 0.01
+    optimizer_grad_clip_norm: float = 1.0
+
+    # Scheduler settings: see openpi `CosineDecaySchedule`
+    # Note: These will auto-scale if --steps < scheduler_decay_steps
+    # For example, --steps=3000 will scale warmup to 100 and decay to 3000
+    scheduler_warmup_steps: int = 1_000
+    scheduler_decay_steps: int = 30_000
+    scheduler_decay_lr: float = 2.5e-6
+
+    tokenizer_max_length: int = 200  # see openpi `__post_init__`
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        # Validate configuration
+        if self.n_action_steps > self.chunk_size:
+            raise ValueError(
+                f"n_action_steps ({self.n_action_steps}) cannot be greater than chunk_size ({self.chunk_size})"
+            )
+
+        if self.paligemma_variant not in ["gemma_300m", "gemma_2b"]:
+            raise ValueError(f"Invalid paligemma_variant: {self.paligemma_variant}")
+
+        if self.action_expert_variant not in ["gemma_300m", "gemma_2b"]:
+            raise ValueError(f"Invalid action_expert_variant: {self.action_expert_variant}")
+
+        if self.dtype not in ["bfloat16", "float32"]:
+            raise ValueError(f"Invalid dtype: {self.dtype}")
+
+    def validate_features(self) -> None:
+        """Validate and set up input/output features."""
+        for i in range(self.empty_cameras):
+            key = f"observation.images.empty_camera_{i}"
+            empty_camera = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(3, *self.image_resolution),  # Use configured image resolution
+            )
+            self.input_features[key] = empty_camera
+
+        if "observation.state" not in self.input_features:
+            state_feature = PolicyFeature(
+                type=FeatureType.STATE,
+                shape=(self.max_state_dim,),  # Padded to max_state_dim
+            )
+            self.input_features["observation.state"] = state_feature
+
+        if "action" not in self.output_features:
+            action_feature = PolicyFeature(
+                type=FeatureType.ACTION,
+                shape=(self.max_action_dim,),  # Padded to max_action_dim
+            )
+            self.output_features["action"] = action_feature
+
+    def get_optimizer_preset(self) -> AdamWConfig:
+        return AdamWConfig(
+            lr=self.optimizer_lr,
+            betas=self.optimizer_betas,
+            eps=self.optimizer_eps,
+            weight_decay=self.optimizer_weight_decay,
+            grad_clip_norm=self.optimizer_grad_clip_norm,
+        )
+
+    def get_scheduler_preset(self):
+        return CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.optimizer_lr,
+            decay_lr=self.scheduler_decay_lr,
+            num_warmup_steps=self.scheduler_warmup_steps,
+            num_decay_steps=self.scheduler_decay_steps,
+        )
+
+    @property
+    def observation_delta_indices(self) -> None:
+        return None
+
+    @property
+    def action_delta_indices(self) -> list:
+        return list(range(self.chunk_size))
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
@@ -0,0 +1,995 @@
+#!/usr/bin/env python
+
+# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ONLY AN EXAMPLE FILE, NEVER USED, IT IS OLD CODE
+"""
+π0+FAST: Efficient Action Tokenization for Vision-Language-Action Models
+
+[Paper](https://huggingface.co/papers/2501.09747)
+[Jax code](https://github.com/Physical-Intelligence/openpi)
+
+Designed by Physical Intelligence. Ported from Jax by Hugging Face.
+Disclaimer: It is not expected to perform as well as the original implementation.
+
+Example of finetuning the pi0+FAST pretrained model (`pi0_fast_base` in `openpi`):
+```bash
+lerobot-train \
+--policy.path=lerobot/pi0fast_base \
+--dataset.repo_id=danaaubakirova/koch_test
+```
+
+Example of training the pi0+FAST neural network from scratch:
+```bash
+lerobot-train \
+--policy.type=pi0fast \
+--dataset.repo_id=danaaubakirova/koch_test
+```
+
+Example of using the pi0+FAST pretrained model outside LeRobot training framework:
+```python
+policy = PI0FASTPolicy.from_pretrained("lerobot/pi0fast_base")
+```
+
+"""
+
+from collections import deque
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn.functional as F  # noqa: N812
+from PIL import Image
+from scipy.fft import idct
+from torch import Tensor, nn
+from transformers import AutoProcessor, AutoTokenizer, PaliGemmaForConditionalGeneration
+from transformers.cache_utils import HybridCache, StaticCache
+from transformers.models.auto import CONFIG_MAPPING
+
+from lerobot.constants import ACTION, OBS_STATE
+from lerobot.policies.normalize import Normalize, Unnormalize
+from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig
+from lerobot.policies.pretrained import PreTrainedPolicy
+
+PRECISION = {
+    "float16": torch.float16,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
+
+
+def normalize(x, min_val, max_val):
+    return (x - min_val) / (max_val - min_val)
+
+
+def unnormalize(x, min_val, max_val):
+    return x * (max_val - min_val) + min_val
+
+
+def safe_arcsin(value):
+    # This ensures that the input stays within
+    # [−1,1] to avoid invalid values for arcsin
+    return torch.arcsin(torch.clamp(value, -1.0, 1.0))
+
+
+def aloha_gripper_to_angular(value):
+    # Aloha transforms the gripper positions into a linear space. The following code
+    # reverses this transformation to be consistent with pi0 which is pretrained in
+    # angular space.
+    #
+    # These values are coming from the Aloha code:
+    # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED
+    value = unnormalize(value, min_val=0.01844, max_val=0.05800)
+
+    # This is the inverse of the angular to linear transformation inside the Interbotix code.
+    def linear_to_radian(linear_position, arm_length, horn_radius):
+        value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position)
+        return safe_arcsin(value)
+
+    # The constants are taken from the Interbotix code.
+    value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022)
+
+    # Normalize to [0, 1].
+    # The values 0.4 and 1.5 were measured on an actual Trossen robot.
+    return normalize(value, min_val=0.4, max_val=1.5)
+
+
+def aloha_gripper_from_angular(value):
+    # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha.
+    # Note that the units are still angular but the range is different.
+
+    # The values 0.4 and 1.5 were measured on an actual Trossen robot.
+    value = unnormalize(value, min_val=0.4, max_val=1.5)
+
+    # These values are coming from the Aloha code:
+    # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
+    return normalize(value, min_val=-0.6213, max_val=1.4910)
+
+
+def aloha_gripper_from_angular_inv(value):
+    # Directly inverts the gripper_from_angular function.
+    value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
+    return normalize(value, min_val=0.4, max_val=1.5)
+
+
+class PI0FASTPolicy(PreTrainedPolicy):
+    """Wrapper class around PI0FAST tokenizer and model to train and run inference within LeRobot."""
+
+    config_class = PI0FASTConfig
+    name = "pi0fast"
+
+    def __init__(
+        self,
+        config: PI0FASTConfig,
+        dataset_stats: dict[str, dict[str, Tensor]] | None = None,
+    ):
+        """
+        Args:
+            config: Policy configuration class instance or None, in which case the default instantiation of
+                    the configuration class is used.
+            dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
+                that they will be passed with a call to `load_state_dict` before the policy is used.
+        """
+
+        super().__init__(config)
+        config.validate_features()
+        self.config = config
+
+        # Input normalizer, target normalizer and output un-normalizer all share the same
+        # normalization mapping and dataset statistics.
+        self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
+        self.normalize_targets = Normalize(
+            config.output_features, config.normalization_mapping, dataset_stats
+        )
+        self.unnormalize_outputs = Unnormalize(
+            config.output_features, config.normalization_mapping, dataset_stats
+        )
+
+        self.language_tokenizer = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")
+        self.model = PI0FAST(config)
+
+        # Start with an empty action queue.
+        self.reset()
+
+    def reset(self):
+        """This should be called whenever the environment is reset."""
+        # The queue holds at most `n_action_steps` pending actions.
+        self._action_queue = deque([], maxlen=self.config.n_action_steps)
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        """Override the from_pretrained method to display important disclaimer."""
+        print(
+            "⚠️  DISCLAIMER: The PI0FAST model is ported from JAX by the Hugging Face team. \n"
+            "   It is not expected to perform as well as the original implementation. \n"
+            "   Original implementation: https://github.com/Physical-Intelligence/openpi"
+        )
+        return super().from_pretrained(*args, **kwargs)
+
+    def get_optim_params(self) -> dict:
+        """Return the parameters to optimize: every parameter of this module.
+
+        NOTE(review): the annotation says `dict`, but the value returned is whatever
+        `self.parameters()` yields.
+        """
+        return self.parameters()
+
+    def _pi_aloha_decode_state(self, state):
+        """Convert an Aloha state to the pi convention, modifying `state` in place.
+
+        Columns 1, 2, 8 and 9 are negated; columns 6 and 13 go through
+        `aloha_gripper_to_angular`. The same (mutated) object is returned.
+        """
+        # Flip the joints.
+        for motor_idx in [1, 2, 8, 9]:
+            state[:, motor_idx] *= -1
+        # Reverse the gripper transformation that is being applied by the Aloha runtime.
+        for motor_idx in [6, 13]:
+            state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
+        return state
+
+    def _pi_aloha_encode_actions(self, actions):
+        """Convert predicted actions to the Aloha convention, modifying `actions` in place.
+
+        Indices 1, 2, 8 and 9 of the last axis are negated; indices 6 and 13 go through
+        `aloha_gripper_from_angular`.
+        """
+        # Flip the joints.
+        for motor_idx in [1, 2, 8, 9]:
+            actions[:, :, motor_idx] *= -1
+        # Reverse the gripper transformation that is being applied by the Aloha runtime.
+        for motor_idx in [6, 13]:
+            actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx])
+        return actions
+
+    def _pi_aloha_encode_actions_inv(self, actions):
+        """Inverse of `_pi_aloha_encode_actions`, also applied in place.
+
+        Used in `forward` on the target actions of a batch.
+        """
+        # Flip the joints again.
+        for motor_idx in [1, 2, 8, 9]:
+            actions[:, :, motor_idx] *= -1
+        # Reverse the gripper transformation that is being applied by the Aloha runtime.
+        for motor_idx in [6, 13]:
+            actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx])
+        return actions
+
+    @torch.no_grad()
+    def predict_action_chunk(self, batch: dict[str, Tensor]) -> Tensor:
+        """Predict a chunk of actions given environment observations."""
+        raise NotImplementedError("Currently not implemented for PI0FAST")
+
+    @torch.no_grad()
+    def select_action(self, batch: dict[str, Tensor]) -> Tensor:
+        """Select a single action given environment observations.
+
+        Actions are kept in a queue so that one action is returned per call; the model's
+        `generate_actions` is only called when the queue is empty.
+        """
+        self.eval()
+
+        if self.config.adapt_to_pi_aloha:
+            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+
+        batch = self.normalize_inputs(batch)
+
+        # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
+        # querying the policy.
+        if len(self._action_queue) == 0:
+            actions = self.model.generate_actions(batch)
+
+            # Keep only the first `n_action_steps` steps of the generated chunk.
+            actions = actions[:, : self.config.n_action_steps]
+
+            # Drop any trailing action dimensions beyond the configured action feature size.
+            original_action_dim = self.config.action_feature.shape[
+                0
+            ]  # self.config.max_action_dim  # self.config.action_feature.shape[0]
+            actions = actions[:, :, :original_action_dim]
+
+            actions = self.unnormalize_outputs({"action": actions})["action"]
+
+            if self.config.adapt_to_pi_aloha:
+                actions = self._pi_aloha_encode_actions(actions)
+
+            # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue
+            # effectively has shape (n_action_steps, batch_size, *), hence the transpose.
+            self._action_queue.extend(actions.transpose(0, 1))
+        return self._action_queue.popleft()
+
+    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+        """Compute the training loss for a batch.
+
+        NOTE(review): despite the return annotation, this returns a 2-tuple
+        `(loss_dict["loss"], loss_dict)`.
+        """
+        if self.config.adapt_to_pi_aloha:
+            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+            batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
+        batch = self.normalize_inputs(batch)
+        batch = self.normalize_targets(batch)
+        loss_dict = self.model.forward(batch)
+        return loss_dict["loss"], loss_dict
+
+
+def block_causal_update_causal_mask(
+    attention_mask,
+    token_type_ids=None,
+    past_key_values=None,
+    cache_position=None,
+    input_tensor=None,
+    attn_implementation: str = "eager",
+    dtype: torch.dtype = "float32",
+):
+    """
+    Update the causal mask during training and generation. It can be customized to different attention masks.
+    """
+    if attn_implementation == "flash_attention_2":
+        if attention_mask is not None and 0.0 in attention_mask:
+            return attention_mask
+        return None
+    using_static_cache = isinstance(past_key_values, StaticCache)
+    min_dtype = torch.finfo(dtype).min
+
+    if input_tensor is None:
+        input_tensor = attention_mask
+
+    inputs_lead_dim, sequence_length = input_tensor.shape[:2]
+
+    if using_static_cache or isinstance(past_key_values, HybridCache):
+        target_length = past_key_values.get_max_cache_shape()
+    else:
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else cache_position[0] + sequence_length + 1
+        )
+
+    # Handle precomputed attention masks
+    if attention_mask is not None and attention_mask.dim() == 4:
+        return attention_mask
+
+    # Causal mask initialization
+    causal_mask = torch.full(
+        (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+    )
+
+    # Standard causal masking (triu ensures tokens can only attend to past)
+    if sequence_length != 1:
+        causal_mask = torch.triu(causal_mask, diagonal=1)
+
+        # Apply block causal mask
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.to(causal_mask.device).bool()
+            cumsum = torch.cumsum(token_type_ids, dim=1)
+            block_causal_mask = cumsum[:, None, :] <= cumsum[:, :, None]
+
+            # Combine causal_mask with block-wise attention mask
+            causal_mask = torch.where(block_causal_mask, 0.0, causal_mask)
+            causal_mask = causal_mask[:, None, :, :]
+        else:
+            # Apply past cache position constraint
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
+                -1, 1
+            )
+            causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+    else:
+        # Apply past cache position constraint
+        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
+            -1, 1
+        )
+        causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+
+    if attention_mask is not None:
+        causal_mask = causal_mask.clone()  # Copy to contiguous memory for in-place edits
+        mask_length = attention_mask.shape[-1]
+
+        # Apply padding mask
+        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+            causal_mask.device
+        )
+        padding_mask = padding_mask == 0
+        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+            padding_mask, min_dtype
+        )
+
+    return causal_mask
+
+
+def prepare_inputs_for_generation(
+    # self,
+    input_ids,
+    past_key_values=None,
+    inputs_embeds=None,
+    cache_position=None,
+    position_ids=None,
+    pixel_values=None,
+    attention_mask=None,
+    token_type_ids=None,
+    use_cache=True,
+    num_logits_to_keep=None,
+    labels=None,
+    self=None,
+    **kwargs,
+):
+    """Build the model inputs for one generation step using a block-causal attention mask.
+
+    This is a free function, not a method: `self` is a keyword parameter that must be
+    supplied by the caller (e.g. through `functools.partial`) and is expected to be the
+    model whose `dtype`, `config.text_config`, `language_model` and
+    `_update_causal_mask` are used below.
+
+    Returns:
+        The dict produced by `self.language_model.prepare_inputs_for_generation`, with
+        `position_ids` shifted by one, `pixel_values` added on the first step and, for a
+        `HybridCache` on the first step, `attention_mask` replaced by the causal mask.
+    """
+    # create block causal attention
+    if cache_position[0] > 0 and input_ids.shape[1] > 0:
+        # Past the first step: the mask is built for the last token only, and
+        # `position_ids` is extended with positions that continue after its last entry.
+        input_tensor = input_ids[:, -1:]
+        new_positions = (
+            torch.ones(
+                (position_ids.shape[0], input_ids.shape[1]),
+                dtype=position_ids.dtype,
+                device=position_ids.device,
+            ).cumsum(-1)
+            + position_ids[:, -1:]
+        )
+        position_ids = torch.cat([position_ids, new_positions], dim=-1)
+    else:
+        input_tensor = inputs_embeds
+    attention_mask = block_causal_update_causal_mask(
+        attention_mask=attention_mask,
+        past_key_values=past_key_values,
+        cache_position=cache_position,
+        input_tensor=input_tensor,
+        token_type_ids=token_type_ids,
+        dtype=self.dtype,
+        attn_implementation=self.config.text_config._attn_implementation,
+    )
+    # Overwritten -- custom `position_ids` and `pixel_values` handling
+    model_inputs = self.language_model.prepare_inputs_for_generation(
+        input_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        cache_position=cache_position,
+        use_cache=use_cache,
+        num_logits_to_keep=num_logits_to_keep,
+        token_type_ids=token_type_ids,
+        **kwargs,
+    )
+
+    # Position_ids in Paligemma are 1-indexed
+    if model_inputs.get("position_ids") is not None:
+        model_inputs["position_ids"] += 1
+    # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+    # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
+    if cache_position[0] == 0:
+        model_inputs["pixel_values"] = pixel_values
+    is_training = token_type_ids is not None and labels is not None
+    if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
+        # First step with a HybridCache: let the model build its own causal mask.
+        input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
+        causal_mask = self._update_causal_mask(
+            attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training
+        )
+        model_inputs["attention_mask"] = causal_mask
+
+    return model_inputs
+
+
+class PI0FAST(nn.Module):
+    def __init__(self, config: PI0FASTConfig):
+        """Load the tokenizers/processors and build the PaliGemma model from `config`.
+
+        The PaliGemma model is instantiated from an inline configuration (not from
+        pretrained weights), its `prepare_inputs_for_generation` is replaced by the
+        module-level function of the same name, and selected parameters are cast to the
+        configured precision.
+        """
+        super().__init__()
+        self.config = config
+
+        # TODO: move tokenizers in Policy
+        fast_tokenizer_path = "physical-intelligence/fast"
+        pi0_paligemma_path = "google/paligemma-3b-pt-224"
+        self.paligemma_tokenizer = AutoTokenizer.from_pretrained(pi0_paligemma_path)
+        self.processor = AutoProcessor.from_pretrained(pi0_paligemma_path)
+        # NOTE(review): `trust_remote_code=True` executes code shipped with the hub repo.
+        self.fast_tokenizer = AutoProcessor.from_pretrained(fast_tokenizer_path, trust_remote_code=True)
+        self.fast_skip_tokens = self.config.fast_skip_tokens
+        self.max_input_seq_len = self.config.max_input_seq_len
+        self.action_horizon = self.config.chunk_size
+        self.action_dim = self.config.action_feature.shape[
+            0
+        ]  # self.config.max_action_dim  # self.config.action_feature.shape[0]
+        precision = config.precision
+        # Unknown precision names fall back to float32.
+        torch_precision = PRECISION.get(precision, torch.float32)
+        # Use the tokenizer's pad token id when the attribute exists, else its EOS token id.
+        self.pad_token_id = (
+            self.paligemma_tokenizer.pad_token_id
+            if hasattr(self.paligemma_tokenizer, "pad_token_id")
+            else self.paligemma_tokenizer.eos_token_id
+        )
+
+        paligemma_config = CONFIG_MAPPING["paligemma"](
+            transformers_version="4.48.1",
+            _vocab_size=257152,
+            bos_token_id=2,
+            eos_token_id=1,
+            hidden_size=2048,
+            image_token_index=257152,
+            model_type="paligemma",
+            pad_token_id=0,
+            projection_dim=2048,
+            text_config={
+                "hidden_activation": "gelu_pytorch_tanh",
+                "hidden_size": 2048,
+                "intermediate_size": 16384,
+                "model_type": "gemma",
+                "num_attention_heads": 8,
+                "num_hidden_layers": 18,
+                "num_image_tokens": 256,
+                "num_key_value_heads": 1,
+                "torch_dtype": precision,
+                "vocab_size": 257152,
+                "_attn_implementation": "eager",
+            },
+            vision_config={
+                "hidden_size": 1152,
+                "intermediate_size": 4304,
+                "model_type": "siglip_vision_model",
+                "num_attention_heads": 16,
+                "num_hidden_layers": 27,
+                "num_image_tokens": 256,
+                "patch_size": 14,
+                "projection_dim": 2048,
+                "projector_hidden_act": "gelu_pytorch_tanh",
+                "torch_dtype": precision,
+                "vision_use_head": False,
+            },
+        )
+        self.pi0_paligemma = PaliGemmaForConditionalGeneration(config=paligemma_config)
+
+        # Replace the model's input preparation with the block-causal variant, binding the
+        # model itself as the `self` keyword argument.
+        self.pi0_paligemma.prepare_inputs_for_generation = partial(
+            prepare_inputs_for_generation, self=self.pi0_paligemma
+        )
+        # change important stuff in bf16
+        # (more precisely: cast to `torch_precision` every parameter whose name contains
+        # one of the selectors below)
+        params_to_change_dtype = [
+            "language_model",
+            "vision_tower",
+            "multi_modal",
+        ]
+        for name, param in self.pi0_paligemma.named_parameters():
+            if any(selector in name for selector in params_to_change_dtype):
+                param.data = param.data.to(dtype=torch_precision)
+        self.set_requires_grad()
+        self.image_keys = self.config.image_features.keys()
+        # TODO: Remove this once we bump transformers to >4.52.0 because the attribute will be removed
+        # AttributeError: 'PaliGemmaConfig' object has no attribute 'ignore_index'
+        self.ignore_index = self.pi0_paligemma.config.ignore_index
+        self.padding_side = self.config.padding_side
+
+    def set_requires_grad(self):
+        if self.config.freeze_vision_encoder:
+            self.pi0_paligemma.vision_tower.eval()
+            for params in self.pi0_paligemma.vision_tower.parameters():
+                params.requires_grad = False
+        # To avoid unused params issue with distributed training
+        if self.config.freeze_lm_head:
+            for name, params in self.pi0_paligemma.named_parameters():
+                if "embed_tokens" in name:  # lm heads and embedding layer are tied
+                    params.requires_grad = False
+
+    def embed_tokens(self, tokens: torch.Tensor):
+        return self.pi0_paligemma.language_model.model.embed_tokens(tokens)
+
+    def prepare_inputs_for_generation(self, *args, **kwargs):
+        return self.pi0_paligemma.prepare_inputs_for_generation(*args, **kwargs)
+
+    def prepare_images(self, batch):
+        """Preprocess LeRobot batch into Pi0 inputs"""
+        images = []
+        img_masks = []
+        present_img_keys = [key for key in self.image_keys if key in batch]
+        if len(present_img_keys) == 0:
+            raise ValueError(
+                f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
+            )
+
+        # Preprocess image features present in the batch
+        num_empty_cameras = 0
+        for key in self.image_keys:
+            if key in present_img_keys:
+                img = batch[key]
+
+                if self.config.resize_imgs_with_padding is not None:
+                    img = resize_with_pad(
+                        img,
+                        *self.config.resize_imgs_with_padding,
+                        pad_value=0,
+                        interpolate_like_pi=self.config.interpolate_like_pi,
+                    )
+
+                # Normalize from range [0,1] to [-1,1] as expected by siglip
+                img = img * 2.0 - 1.0
+
+                bsize = img.shape[0]
+                device = img.device
+                mask = torch.ones(bsize, dtype=torch.bool, device=device)
+            else:
+                if num_empty_cameras >= self.config.empty_cameras:
+                    continue
+                img = torch.ones_like(img) * -1
+                bsize = img.shape[0]
+                device = img.device
+                mask = torch.ones(bsize, dtype=torch.bool, device=device)
+                num_empty_cameras += 1
+
+            images.append(img)
+            img_masks.append(mask)
+        return images, img_masks
+
+    def normalize_actions(self, actions: torch.Tensor) -> torch.Tensor:
+        mins = actions.amin(dim=(1, 2), keepdim=True)  # [0]
+        maxs = actions.amax(dim=(1, 2), keepdim=True)  # [0]
+        return 2 * (actions - mins) / (maxs - mins + 1e-8) - 1
+
+    def _act_tokens_to_paligemma_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
+        out = self.paligemma_tokenizer.vocab_size - 1 - self.fast_skip_tokens - tokens
+        return out
+
+    def fast_tokenizer_wrapper(self, actions_norm):
+        """
+        A wrapper for self.fast_tokenizer that ensures batch processing,
+        conversion to PyTorch tensors, and returns a dictionary without padding.
+        """
+        batch_tokens = self.fast_tokenizer(actions_norm)
+        fast_out = self.processor.tokenizer.pad({"input_ids": batch_tokens}, return_tensors="pt")
+
+        return fast_out
+
+    def create_token_type_ids(self, padded_mask: torch.Tensor, prefix_len: int) -> torch.Tensor:
+        token_type_ids = torch.zeros_like(padded_mask, dtype=torch.bool)
+        # Compute cumulative sum mask
+        cumsum_mask = (padded_mask != 0).cumsum(dim=1)
+        # Suffix block (everything after prefix_len)
+        suffix_mask = cumsum_mask > prefix_len
+        token_type_ids = suffix_mask
+        return token_type_ids
+
+    def create_input_tokens(self, state, lang_text, actions=None):
+        bsize = state.shape[0]
+        device = state.device
+        bins = torch.linspace(-1, 1, 256 + 1, device=device)[:-1]
+        discretized = torch.bucketize(state, bins) - 1
+        discretized = discretized[:, :32]
+
+        prefix_texts = []
+        state_text = []
+        for txt, disc in zip(lang_text, discretized, strict=False):
+            cleaned = txt.lower().strip().replace("_", " ")
+            state_str = " ".join(str(val.item()) for val in disc)
+            prefix_texts.append(f"Task: {cleaned}, State: {state_str};\n")
+            state_text.append(f"State: {state_str};\n")
+
+        prefix_out = self.paligemma_tokenizer(
+            prefix_texts, add_special_tokens=True, return_tensors="pt", padding="longest", truncation=False
+        )
+        prefix_ids = prefix_out["input_ids"].to(device)
+        prefix_mask = prefix_out["attention_mask"].to(device)
+        prefix_lens = prefix_mask.sum(dim=1)[:, None].cpu()
+
+        if actions is not None:
+            actions_norm = self.normalize_actions(actions)
+            actions_pad = F.pad(
+                actions_norm, (0, max(0, self.config.max_action_dim - actions_norm.shape[2])), value=0
+            )[:, :, : self.config.max_action_dim]
+            fast_out = self.fast_tokenizer_wrapper(
+                actions_pad.cpu(),
+            )
+            act_ids = fast_out["input_ids"]
+            act_mask = fast_out["attention_mask"].to(device)
+
+            act_ids = self._act_tokens_to_paligemma_tokens(act_ids).to(device)
+            # Replace action with 0 to pad tokens
+            act_ids = torch.where(
+                act_ids == self.paligemma_tokenizer.vocab_size - 1 - self.fast_skip_tokens,
+                self.pad_token_id,
+                act_ids,
+            )
+
+            eos_token = torch.tensor(
+                [self.paligemma_tokenizer.eos_token_id], dtype=torch.long, device=device
+            ).expand(bsize, -1)
+            eos_mask = torch.tensor([1], dtype=torch.long, device=device).expand(bsize, -1)
+            bos = self.paligemma_tokenizer("Action: ", add_special_tokens=False, return_tensors="pt")
+            bos_token = bos["input_ids"].expand(act_ids.shape[0], -1).to(device)
+            bos_mask = bos["attention_mask"].expand(act_ids.shape[0], -1).to(device)
+            act_ids = torch.cat([bos_token, act_ids, eos_token], dim=1)
+            act_mask = torch.cat([bos_mask, act_mask, eos_mask], dim=1)
+            act_mask = act_mask.to(device)
+        else:
+            act_ids = torch.empty(bsize, self.pad_token_id, dtype=torch.long, device=device)
+            act_mask = torch.empty(bsize, 0, dtype=torch.long, device=device)
+        final_ids = torch.cat([prefix_ids, act_ids], dim=1)
+
+        final_mask = torch.cat([prefix_mask, act_mask], dim=1)
+        batch_inputs = {"input_ids": final_ids.tolist(), "attention_mask": final_mask.tolist()}
+
+        # Use tokenizer pad function
+        padded_output = self.paligemma_tokenizer.pad(
+            batch_inputs, padding="longest", max_length=180, return_tensors="pt"
+        )
+        padded_mask = padded_output["attention_mask"]
+
+        # define tensor of padding lengths
+        att_mask = (padded_mask != 0).cumsum(dim=1) > prefix_lens
+
+        token_type_ids = self.create_token_type_ids(padded_mask=padded_mask, prefix_len=prefix_lens)
+
+        padded_output["padded_mask"] = padded_output.pop("attention_mask")
+        padded_output["attention_mask"] = att_mask
+        # loss is computed not on prefix, and not on padding
+        padded_output["loss_mask"] = att_mask & padded_output["padded_mask"]
+        padded_output["token_type_ids"] = token_type_ids
+        return padded_output
+
+    def shift_padding_side(
+        self,
+        tokens: torch.Tensor,
+        ar_mask: torch.Tensor,
+        padding_mask: torch.Tensor,
+        loss_mask: torch.Tensor,
+        targets: torch.Tensor,
+        token_type_ids: torch.Tensor,
+        padding_side: str = "right",
+    ) -> tuple[torch.Tensor, ...]:
+        """Move the padding positions of every sample to one side of the sequence.
+
+        For each row, positions where ``padding_mask`` is 0 are gathered on the requested side and
+        positions where it is 1 on the other; the relative order inside each group is kept. The
+        same permutation is applied to all six tensors.
+
+        Args:
+            tokens: Per-position data; indexed along its first two dimensions (row, position).
+            ar_mask: Per-position mask reordered like ``tokens``.
+            padding_mask: Mask with 1 for real positions and 0 for padding; drives the reordering.
+            loss_mask: Per-position mask reordered like ``tokens``.
+            targets: Per-position targets reordered like ``tokens``.
+            token_type_ids: Per-position type ids reordered like ``tokens``.
+            padding_side: ``"right"`` or ``"left"``. Any other value returns the inputs untouched.
+
+        Returns:
+            ``(tokens, ar_mask, padding_mask, loss_mask, targets, token_type_ids)`` after reordering.
+        """
+        if padding_side not in ["right", "left"]:
+            return tokens, ar_mask, padding_mask, loss_mask, targets, token_type_ids
+
+        new_tokens = torch.empty_like(tokens)
+        new_ar_masks = torch.empty_like(ar_mask)
+        new_padding_mask = torch.empty_like(padding_mask)
+        new_loss_mask = torch.empty_like(loss_mask)
+        new_targets = torch.empty_like(targets)
+        new_token_type_ids = torch.empty_like(token_type_ids)
+        batch_size = tokens.shape[0]
+        for i in range(batch_size):
+            # NOTE(review): a mask value other than 0 or 1 would be in neither index set, making
+            # the permutation shorter than the row — confirm the mask is strictly binary.
+            padding_indices = torch.where(padding_mask[i] == 0)[0]
+            non_padding_indices = torch.where(padding_mask[i] == 1)[0]
+            if padding_side == "left":
+                new_indices = torch.cat((padding_indices, non_padding_indices), dim=0)
+            else:
+                new_indices = torch.cat((non_padding_indices, padding_indices), dim=0)
+            # Apply the same permutation along the position dimension of every tensor.
+            new_tokens[i] = tokens[i].index_select(0, new_indices)
+            new_ar_masks[i] = ar_mask[i].index_select(0, new_indices)
+            new_padding_mask[i] = padding_mask[i].index_select(0, new_indices)
+            new_loss_mask[i] = loss_mask[i].index_select(0, new_indices)
+            new_targets[i] = targets[i].index_select(0, new_indices)
+            new_token_type_ids[i] = token_type_ids[i].index_select(0, new_indices)
+
+        return new_tokens, new_ar_masks, new_padding_mask, new_loss_mask, new_targets, new_token_type_ids
+
+    def forward(self, batch: dict[str, Tensor]):
+        """Compute the masked next-token cross-entropy loss for one training batch.
+
+        Args:
+            batch: Mapping that provides ``OBS_STATE``, ``"task"``, ``ACTION`` and the image
+                features read by ``prepare_images``.
+
+        Returns:
+            A dict with ``"ce_loss"`` (the loss as a Python float) and ``"loss"`` (the loss tensor).
+        """
+        device = batch[OBS_STATE].device
+        # TODO: keep like this or move to the policy .forward
+        images, img_masks = self.prepare_images(batch)
+
+        padded_outs = self.create_input_tokens(
+            state=batch[OBS_STATE],
+            lang_text=batch["task"],
+            actions=batch[ACTION],
+        )
+
+        # The third returned value (the concatenated `ar_mask`) is not needed here.
+        embs, pad_masks, _, targets, loss_mask, token_type_ids = self.embed_inputs(
+            images,
+            img_masks,
+            padded_outs["input_ids"],
+            padded_outs["padded_mask"],
+            padded_outs["attention_mask"],
+            padded_outs["loss_mask"],
+            padded_outs["token_type_ids"],
+            padding_side=self.padding_side,
+        )
+        # Positions count the non-padding tokens seen so far, starting at 0.
+        position_ids = torch.cumsum(pad_masks, dim=1) - 1
+        token_type_ids = token_type_ids.to(dtype=torch.int64)
+        past_seen_tokens = 0
+        cache_position = torch.arange(past_seen_tokens, past_seen_tokens + embs.shape[1], device=embs.device)
+        # From here on `pad_masks` holds the mask returned by `block_causal_update_causal_mask`.
+        pad_masks = block_causal_update_causal_mask(
+            attention_mask=pad_masks,
+            past_key_values=None,
+            cache_position=cache_position,
+            input_tensor=embs,
+            token_type_ids=token_type_ids,
+            dtype=self.pi0_paligemma.dtype,
+            attn_implementation=self.pi0_paligemma.config.text_config._attn_implementation,
+        )
+        # `labels=None`: the loss is computed below with the custom loss mask.
+        outputs = self.pi0_paligemma.forward(
+            input_ids=None,
+            token_type_ids=None,
+            attention_mask=pad_masks,
+            position_ids=position_ids,
+            past_key_values=None,
+            inputs_embeds=embs,
+            use_cache=False,
+            labels=None,
+        )
+
+        logits = outputs.logits
+
+        loss_fct = nn.CrossEntropyLoss(reduction="none")
+
+        # Shift left for next-step prediction
+        logits = logits[:, :-1, :]
+        targets = targets[:, 1:].to(device)  # Shift targets
+        loss_mask = loss_mask[:, 1:].to(device)  # Ensure correct shape
+
+        # Compute per-token loss
+        token_loss = loss_fct(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))
+
+        # Apply loss mask
+        token_loss = token_loss * loss_mask.reshape(-1)
+
+        # Compute final loss (the clamp avoids a division by zero when nothing is unmasked)
+        loss = token_loss.sum() / torch.clamp(loss_mask.sum(), min=1)
+
+        # Return loss dictionary
+        loss_dict = {"ce_loss": loss.item(), "loss": loss}
+        return loss_dict
+
+    def decode_actions_with_fast(
+        self,
+        tokens: list[list[int]],
+        *,
+        time_horizon: int | None = None,
+        action_dim: int | None = None,
+        relaxed_decoding: bool = True,
+    ) -> np.ndarray:
+        """
+        Adapt original decoding in FAST to always return actions instead of zeros.
+
+        Args:
+            tokens: One sequence of BPE token ids per item to decode.
+            time_horizon: Number of rows of every decoded array. Falls back to
+                ``self.fast_tokenizer.time_horizon``, then ``self.fast_tokenizer.called_time_horizon``.
+            action_dim: Number of columns of every decoded array. Falls back to
+                ``self.fast_tokenizer.action_dim``, then ``self.fast_tokenizer.called_action_dim``.
+            relaxed_decoding: When True, the decoded coefficients are truncated or zero-padded on
+                the right to ``time_horizon * action_dim`` values before being reshaped.
+
+        Returns:
+            The stacked per-item results of an inverse DCT (axis 0, ``norm="ortho"``) applied to
+            ``(time_horizon, action_dim)`` coefficients divided by ``self.fast_tokenizer.scale``.
+            An item that fails to decode is reported with ``print`` and uses all-zero coefficients.
+
+        Note:
+            The resolved values are stored on this object (``self.time_horizon``,
+            ``self.action_dim``, ``self.called_time_horizon``, ``self.called_action_dim``), not
+            on ``self.fast_tokenizer``.
+        """
+        self.time_horizon = (
+            time_horizon or self.fast_tokenizer.time_horizon or self.fast_tokenizer.called_time_horizon
+        )
+        self.action_dim = (
+            action_dim or self.fast_tokenizer.action_dim or self.fast_tokenizer.called_action_dim
+        )
+
+        # Cache the time horizon and action dimension for the next call
+        self.called_time_horizon = self.time_horizon
+        self.called_action_dim = self.action_dim
+
+        assert self.time_horizon is not None and self.action_dim is not None, (
+            "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."
+        )
+
+        decoded_actions = []
+        for token in tokens:
+            try:
+                decoded_tokens = self.fast_tokenizer.bpe_tokenizer.decode(token)
+                # Every decoded character encodes one coefficient as its code point, offset by min_token.
+                decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.fast_tokenizer.min_token
+                if relaxed_decoding:
+                    # Expected sequence length
+                    expected_seq_len = self.time_horizon * self.action_dim
+                    diff = expected_seq_len - decoded_dct_coeff.shape[0]
+                    # Apply truncation if too long
+                    if diff < 0:
+                        decoded_dct_coeff = decoded_dct_coeff[:expected_seq_len]  # Truncate on the right
+                    # Apply padding if too short
+                    elif diff > 0:
+                        decoded_dct_coeff = np.pad(
+                            decoded_dct_coeff, (0, diff), mode="constant", constant_values=0
+                        )
+
+                decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
+                assert decoded_dct_coeff.shape == (
+                    self.time_horizon,
+                    self.action_dim,
+                ), (
+                    f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
+                )
+            except Exception as e:
+                # Best effort: any failure (including the shape assertion) yields zero coefficients.
+                print(f"Error decoding tokens: {e}")
+                print(f"Tokens: {token}")
+                decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
+            decoded_actions.append(idct(decoded_dct_coeff / self.fast_tokenizer.scale, axis=0, norm="ortho"))
+        return np.stack(decoded_actions)
+
+    def extract_actions(self, tokens: torch.Tensor, action_horizon: int, action_dim: int) -> torch.Tensor:
+        """
+        Extracts actions from predicted output tokens using the FAST model.
+
+        Args:
+            tokens (torch.Tensor): The input tensor of tokenized outputs.
+            action_horizon (int): The number of timesteps for actions.
+            action_dim (int): The dimensionality of each action.
+
+        Returns:
+            torch.Tensor: The decoded actions of all sequences stacked along a new first
+            dimension; presumably (num_sequences, action_horizon, action_dim) — TODO confirm.
+        """
+        # Decode predicted output tokens
+        decoded_tokens = self.paligemma_tokenizer.batch_decode(tokens, skip_special_tokens=True)
+        # Drop the "Action:" marker and any other ":" and keep only the text before the first "|".
+        cleaned_tokens = [
+            tokens_sequence.replace("Action:", "").replace(":", "").strip().split("|")[0].strip()
+            for tokens_sequence in decoded_tokens
+        ]
+        raw_action_tokens = [
+            self.processor.tokenizer.encode(sample_tokens, return_tensors="pt", padding=False)
+            for sample_tokens in cleaned_tokens
+        ]  # re-encode the cleaned text, one tensor per sequence
+        # The id mapping is a reflection (x -> c - x), so applying it again undoes the mapping
+        # that was applied when the action tokens were created.
+        action_tokens = [
+            self._act_tokens_to_paligemma_tokens(raw_action_token) for raw_action_token in raw_action_tokens
+        ]
+        # returns the tensor of decoded actions per sample in a list
+        decoded_actions = [
+            torch.tensor(
+                self.decode_actions_with_fast(
+                    tok.tolist(),
+                    time_horizon=action_horizon,
+                    action_dim=action_dim,
+                    relaxed_decoding=self.config.relaxed_action_decoding,
+                ),
+                device=tokens.device,
+            ).squeeze(0)
+            for tok in action_tokens
+        ]
+
+        return torch.stack(
+            decoded_actions,
+            dim=0,
+        )
+
+    def generate_actions(self, batch: dict[str, Tensor]):
+        # TODO: keep like this or move to the policy .forward
+        images, img_masks = self.prepare_images(batch)
+
+        padded_outs = self.create_input_tokens(state=batch[OBS_STATE], lang_text=batch["task"], actions=None)
+        embs, pad_masks, att_masks2, targets, loss_mask, token_type_ids = self.embed_inputs(
+            images,
+            img_masks,
+            padded_outs["input_ids"],
+            padded_outs["padded_mask"],
+            padded_outs["attention_mask"],
+            padded_outs["loss_mask"],
+            padded_outs["token_type_ids"],
+            padding_side="left",
+        )
+        token_type_ids = token_type_ids.to(dtype=torch.int64)
+        prefix_position_ids = torch.cumsum(pad_masks, dim=1) - 1
+        output_tokens = self.pi0_paligemma.generate(
+            input_ids=None,
+            attention_mask=pad_masks,
+            position_ids=prefix_position_ids,
+            past_key_values=None,
+            inputs_embeds=embs,
+            use_cache=self.config.use_cache,
+            max_new_tokens=self.config.max_decoding_steps,
+            do_sample=False,
+            num_beams=1,
+            token_type_ids=token_type_ids,
+        )
+        actions = self.extract_actions(output_tokens, self.action_horizon, self.action_dim)
+        return actions
+
+    def embed_image(self, image: torch.Tensor):
+        # Handle different transformers versions
+        if hasattr(self.pi0_paligemma, "get_image_features"):
+            return self.pi0_paligemma.get_image_features(image)
+        else:
+            return self.pi0_paligemma.model.get_image_features(image)
+
+    def embed_inputs(
+        self,
+        images,
+        img_masks,
+        tokens,
+        pad_mask,
+        ar_mask,
+        loss_mask,
+        token_type_ids,
+        padding_side: str = "right",
+    ):
+        """Embed images and tokens into one sequence and build the matching masks and targets.
+
+        Image embeddings come first, followed by the token embeddings. The per-token inputs are
+        prefixed with image entries: zeros for ``ar_mask``, ``loss_mask`` and ``token_type_ids``,
+        and ``self.pad_token_id`` for the targets. Padding is then moved to ``padding_side``.
+
+        Args:
+            images: List of same-sized image tensors, stacked here into (b, n, c, h, w).
+            img_masks: List of per-image masks, one per entry of ``images``.
+            tokens: Token ids; also used as targets.
+            pad_mask: Padding mask of ``tokens``.
+            ar_mask: Mask of ``tokens`` concatenated after zeros for the image positions.
+            loss_mask: Loss mask of ``tokens``.
+            token_type_ids: Token type ids of ``tokens``.
+            padding_side: Forwarded to ``shift_padding_side``.
+
+        Returns:
+            ``(embs, pad_masks, att_masks, targets, loss_masks, token_type_ids)``. In ``targets``
+            every entry equal to ``self.pad_token_id`` is replaced by ``self.ignore_index``.
+        """
+        # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty
+        # images are a list of same size
+        # vectorizing everything!
+        device = images[0].device
+        # NOTE(review): this value is overwritten below by the embedding's last dimension.
+        image_embedding_dim = images[0].shape[-1]  # TODO should be from self.config
+        all_images = torch.stack(images, dim=1).to(device)
+        b, n, c, h, w = all_images.shape
+        # Fold the camera dimension into the batch so all images are embedded in one call.
+        all_images = all_images.view(b * n, c, h, w)
+        embedded = self.embed_image(all_images).to(device)
+        b_n, p, image_embedding_dim = embedded.shape  # Extract current dimensions
+        m = b_n // b  # Compute the number of images per sample dynamically
+
+        # Reshape dynamically
+        embedded = embedded.view(b, m, p, image_embedding_dim)
+        tokens_embs = self.embed_tokens(tokens.to(device))
+
+        # Repeat every per-image mask once per image embedding position.
+        img_masks = torch.stack(img_masks, dim=1).unsqueeze(-1).to(device)
+        num_img_emb = embedded.shape[2]
+        img_pad_masks = img_masks.repeat(1, 1, num_img_emb).view(b, -1)
+        img_att_masks = torch.zeros((b, n, num_img_emb), dtype=torch.long, device=device).reshape(b, -1)
+
+        # Image positions get the pad token as target and a zero loss mask.
+        image_target_tokens = (
+            torch.ones((b, n, num_img_emb), dtype=torch.long, device=device) * self.pad_token_id
+        ).reshape(b, -1)
+        image_loss_mask = torch.zeros((b, n, num_img_emb), dtype=torch.long, device=device).reshape(b, -1)
+
+        embedded = embedded.reshape(b, n * num_img_emb, image_embedding_dim)  # Shape: (B, N*P, D)
+
+        embs = torch.cat([embedded, tokens_embs], dim=1).to(device)
+        pad_masks = torch.cat([img_pad_masks, pad_mask.to(device)], dim=1)
+        att_masks = torch.cat([img_att_masks, ar_mask.to(device)], dim=1)
+        loss_masks = torch.cat([image_loss_mask, loss_mask.to(device)], dim=1)
+        targets = torch.cat([image_target_tokens, tokens.to(device)], dim=1)
+        # Image positions reuse the all-zero image attention mask as their token type.
+        token_type_ids = torch.cat([img_att_masks, token_type_ids.to(device)], dim=1)
+
+        # Shift pad tokens to the left (.generate()) or right (.train())
+        embs, att_masks, pad_masks, loss_masks, targets, token_type_ids = self.shift_padding_side(
+            embs, att_masks, pad_masks, loss_masks, targets, token_type_ids, padding_side=padding_side
+        )
+
+        targets = torch.where(targets == self.pad_token_id, self.ignore_index, targets)
+        return embs, pad_masks, att_masks, targets, loss_masks, token_type_ids
+
+
+def resize_with_pad(img, width, height, pad_value=0, interpolate_like_pi=True):
+    # assume no-op when width height fits already
+    if img.ndim != 4:
+        raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
+
+    cur_height, cur_width = img.shape[2:]
+
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = int(cur_height / ratio)
+    resized_width = int(cur_width / ratio)
+
+    if interpolate_like_pi:
+        img = (img * 255.0).to(dtype=torch.uint8)
+        img = img.permute(0, 2, 3, 1)
+        original_device = img.device
+        img = img.to(device="cpu").numpy()
+        imgs = []
+        for sub_img in img:
+            sub_img = Image.fromarray(sub_img)
+            resized_img = sub_img.resize((resized_width, resized_height), resample=2)
+            resized_img = torch.from_numpy(np.array(resized_img))
+            imgs.append(resized_img)
+        img = torch.stack(imgs, dim=0)
+        img = img.permute(0, 3, 1, 2)
+        resized_img = img.to(device=original_device, dtype=torch.float32) / 255.0
+    else:
+        resized_img = F.interpolate(
+            img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+        )
+
+    pad_height = max(0, int(height - resized_height))
+    pad_width = max(0, int(width - resized_width))
+
+    # pad on left and top of image
+    padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
+    return padded_img
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+
+# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import torch
+
+from lerobot.configs.types import PipelineFeatureType, PolicyFeature
+from lerobot.policies.pi05.configuration_pi05 import PI05Config
+from lerobot.policies.pi05.modeling_pi05 import pad_vector
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    NormalizerProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    RenameObservationsProcessorStep,
+    TokenizerProcessorStep,
+    UnnormalizerProcessorStep,
+)
+from lerobot.processor.converters import policy_action_to_transition, transition_to_policy_action
+from lerobot.processor.core import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_STATE,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+
+
+@ProcessorStepRegistry.register(name="pi05_prepare_state_tokenizer_processor_step")
+@dataclass
+class Pi05PrepareStateTokenizerProcessorStep(ProcessorStep):
+    """
+    Processor step to prepare the state and tokenize the language input.
+    """
+
+    max_state_dim: int = 32
+    task_key: str = "task"
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        transition = transition.copy()
+
+        state = transition.get(TransitionKey.OBSERVATION, {}).get(OBS_STATE)
+        if state is None:
+            raise ValueError("State is required for PI05")
+        tasks = transition.get(TransitionKey.COMPLEMENTARY_DATA, {}).get(self.task_key)
+        if tasks is None:
+            raise ValueError("No task found in complementary data")
+
+        # TODO: check if this necessary
+        state = deepcopy(state)
+
+        # Prepare state (pad to max_state_dim)
+        state = pad_vector(state, self.max_state_dim)
+
+        # State should already be normalized to [-1, 1] by the NormalizerProcessorStep that runs before this step
+        # Discretize into 256 bins (see openpi `PaligemmaTokenizer.tokenize()`)
+        state_np = state.cpu().numpy()
+        discretized_states = np.digitize(state_np, bins=np.linspace(-1, 1, 256 + 1)[:-1]) - 1
+
+        full_prompts = []
+        for i, task in enumerate(tasks):
+            cleaned_text = task.strip().replace("_", " ").replace("\n", " ")
+            state_str = " ".join(map(str, discretized_states[i]))
+            full_prompt = f"Task: {cleaned_text}, State: {state_str};\nAction: "
+            full_prompts.append(full_prompt)
+
+        transition[TransitionKey.COMPLEMENTARY_DATA][self.task_key] = full_prompts
+        # Normalize state to [-1, 1] range if needed (assuming it's already normalized by normalizer processor step!!)
+        # Discretize into 256 bins (see openpi `PaligemmaTokenizer.tokenize()`)
+        return transition
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """
+        This step does not alter the feature definitions.
+        """
+        return features
+
+
+def make_pi05_pre_post_processors(
+    config: PI05Config,
+    dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """
+    Constructs pre-processor and post-processor pipelines for the PI0 policy.
+
+    The pre-processing pipeline prepares input data for the model by:
+    1. Renaming features to match pretrained configurations.
+    2. Normalizing input and output features based on dataset statistics.
+    3. Adding a batch dimension.
+    4. Appending a newline character to the task description for tokenizer compatibility.
+    5. Tokenizing the text prompt using the PaliGemma tokenizer.
+    6. Moving all data to the specified device.
+
+    The post-processing pipeline handles the model's output by:
+    1. Moving data to the CPU.
+    2. Unnormalizing the output features to their original scale.
+
+    Args:
+        config: The configuration object for the PI0 policy.
+        dataset_stats: A dictionary of statistics for normalization.
+        preprocessor_kwargs: Additional arguments for the pre-processor pipeline.
+        postprocessor_kwargs: Additional arguments for the post-processor pipeline.
+
+    Returns:
+        A tuple containing the configured pre-processor and post-processor pipelines.
+    """
+
+    # Add remaining processors
+    input_steps: list[ProcessorStep] = [
+        RenameObservationsProcessorStep(rename_map={}),  # To mimic the same processor as pretrained one
+        AddBatchDimensionProcessorStep(),
+        # NOTE: NormalizerProcessorStep MUST come before Pi05PrepareStateTokenizerProcessorStep
+        # because the tokenizer step expects normalized state in [-1, 1] range for discretization
+        NormalizerProcessorStep(
+            features={**config.input_features, **config.output_features},
+            norm_map=config.normalization_mapping,
+            stats=dataset_stats,
+        ),
+        Pi05PrepareStateTokenizerProcessorStep(max_state_dim=config.max_state_dim),
+        TokenizerProcessorStep(
+            tokenizer_name="google/paligemma-3b-pt-224",
+            max_length=config.tokenizer_max_length,
+            padding_side="right",
+            padding="max_length",
+        ),
+        DeviceProcessorStep(device=config.device),
+    ]
+
+    output_steps: list[ProcessorStep] = [
+        UnnormalizerProcessorStep(
+            features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
+        ),
+        DeviceProcessorStep(device="cpu"),
+    ]
+
+    return (
+        PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+            steps=input_steps,
+            name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+        ),
+        PolicyProcessorPipeline[PolicyAction, PolicyAction](
+            steps=output_steps,
+            name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+            to_transition=policy_action_to_transition,
+            to_output=transition_to_policy_action,
+        ),
+    )
@@ -1,35 +0,0 @@
-# WALL-OSS
-
-This repository contains the Hugging Face port of **WALL-OSS**, a Vision-Language-Action model for cross-embodiment robotic control based on Qwen2.5-VL with flow matching/FAST action prediction.
-
---
-
-## Model Overview
-
-| Feature            | Description                                           |
-| ------------------ | ----------------------------------------------------- | --- |
-| Base Model         | Qwen2.5-VL (Vision-Language Model)                    |
-| Action Prediction  | Flow Matching (diffusion) or FAST (discrete tokens)   |
-| Architecture       | Mixture of Experts (MoE) with action-specific routing |     |
-| Multi-Modal Inputs | Vision (images/videos), Language, Proprioception      |
-
---
-
-## Citation
-
-If you use this work, please cite:
-
-```bibtex
-@article{zhai2025igniting,
-    title   = {Igniting VLMs Toward the Embodied Space},
-    author  = {Zhai, Andy and Liu, Brae and Fang, Bruno and Cai, Chalse and Ma, Ellie and Yin, Ethan and Wang, Hao and Zhou, Hugo and Wang, James and Shi, Lights and Liang, Lucy and Wang, Make and Wang, Qian and Gan, Roy and Yu, Ryan and Li, Shalfun and Liu, Starrick and Chen, Sylas and Chen, Vincent and Xu, Zach},
-    journal = {arXiv preprint arXiv:2509.11766},
-    year    = {2025}
-}
-```
-
---
-
-## License
-
-This port follows the **Apache 2.0 License**.
@@ -0,0 +1 @@
+../../../../docs/source/policy_walloss_README.md
@@ -27,13 +27,14 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any

 import torch
+import torch.nn.functional as F

 from lerobot.configs.types import FeatureType, PipelineFeatureType, PolicyFeature
-from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS
+from lerobot.utils.constants import OBS_LANGUAGE_ATTENTION_MASK, OBS_LANGUAGE_TOKENS, OBS_STATE
 from lerobot.utils.import_utils import _transformers_available

 from .core import EnvTransition, TransitionKey
-from .pipeline import ObservationProcessorStep, ProcessorStepRegistry
+from .pipeline import ObservationProcessorStep, ProcessorStepRegistry, ProcessorStep

 # Conditional import for type checking and lazy loading
 if TYPE_CHECKING or _transformers_available:
@@ -268,3 +269,328 @@ class TokenizerProcessorStep(ObservationProcessorStep):
            )

        return features
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="pi0fast_tokenizer_processor")
+class PI0FASTTokenizerProcessorStep(ProcessorStep):
+    """
+    Processor step to tokenize state, language, and actions for PI0FAST models.
+
+    This step handles the complete tokenization pipeline for PI0FAST:
+    1. Discretizes state observations
+    2. Formats task descriptions with state
+    3. Tokenizes actions using the FAST tokenizer (only when the transition carries an action)
+    4. Combines everything into the proper format with masks
+
+    The results are stored in a copy of the observation under the keys `pi0fast_input_ids`,
+    `pi0fast_attention_mask`, `pi0fast_padded_mask`, `pi0fast_loss_mask` and `pi0fast_token_type_ids`.
+
+    Example usage:
+        ```python
+        from transformers import AutoTokenizer, AutoProcessor
+        from lerobot.processor.tokenizer_processor import PI0FASTTokenizerProcessorStep
+
+        # Initialize tokenizers
+        paligemma_tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")
+        paligemma_processor = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")
+        fast_tokenizer = AutoProcessor.from_pretrained("physical-intelligence/fast", trust_remote_code=True)
+
+        # Create processor step
+        processor = PI0FASTTokenizerProcessorStep(
+            paligemma_tokenizer=paligemma_tokenizer,
+            fast_tokenizer=fast_tokenizer,
+            paligemma_processor=paligemma_processor,
+            max_action_dim=7,
+            fast_skip_tokens=2,
+            max_input_seq_len=180,
+            task_key="task",
+            state_key="observation.state"
+        )
+
+        # Apply to a transition
+        tokenized_transition = processor(transition)
+
+        # Access tokenized data from observation
+        input_ids = tokenized_transition["observation"]["pi0fast_input_ids"]
+        attention_mask = tokenized_transition["observation"]["pi0fast_attention_mask"]
+        padded_mask = tokenized_transition["observation"]["pi0fast_padded_mask"]
+        loss_mask = tokenized_transition["observation"]["pi0fast_loss_mask"]
+        token_type_ids = tokenized_transition["observation"]["pi0fast_token_type_ids"]
+        ```
+
+    Attributes:
+        paligemma_tokenizer: The PaliGemma tokenizer for text (required)
+        fast_tokenizer: The FAST tokenizer for actions (required)
+        paligemma_processor: The PaliGemma processor; its `.tokenizer.pad` is used to batch FAST tokens (required)
+        max_action_dim: Actions are zero-padded or truncated to this many dimensions (default: 7)
+        fast_skip_tokens: Number of tokens to skip in FAST tokenizer mapping (default: 2)
+        max_input_seq_len: Maximum input sequence length (default: 180)
+        padding_side: The side to pad on ('left' or 'right', default: 'right'). Within this class the
+            value is only reported by `get_config`; none of the tokenization methods read it.
+        task_key: The key in complementary_data where the task string is stored (default: 'task')
+        state_key: The key in observation where the state is stored (default: 'observation.state')
+    """
+
+    # Tokenizer/processor objects default to None only so the dataclass can declare them;
+    # __post_init__ rejects an instance where any of the three is still None.
+    paligemma_tokenizer: Any = None
+    fast_tokenizer: Any = None
+    paligemma_processor: Any = None
+    # Plain settings below are the ones returned by get_config().
+    max_action_dim: int = 7
+    fast_skip_tokens: int = 2
+    max_input_seq_len: int = 180
+    padding_side: str = "right"
+    task_key: str = "task"
+    state_key: str = OBS_STATE
+    def __post_init__(self):
+        """Check that `transformers` is available and that every tokenizer object was supplied.
+
+        Raises:
+            ImportError: If the `transformers` library is not installed.
+            ValueError: If any of the three tokenizer/processor attributes is still None.
+        """
+        if not _transformers_available:
+            message = (
+                "The 'transformers' library is not installed. "
+                "Please install it with `pip install 'lerobot[transformers-dep]'` to use PI0FASTTokenizerProcessorStep."
+            )
+            raise ImportError(message)
+
+        required_components = (self.paligemma_tokenizer, self.fast_tokenizer, self.paligemma_processor)
+        if any(component is None for component in required_components):
+            raise ValueError(
+                "paligemma_tokenizer, fast_tokenizer, and paligemma_processor must all be provided. "
+                "These should be initialized tokenizer/processor objects."
+            )
+
+    def normalize_actions(self, actions: torch.Tensor) -> torch.Tensor:
+        """Rescale each batch element of `actions` to the [-1, 1] range.
+
+        The minimum and maximum are taken jointly over dimensions 1 and 2, so the input
+        must have at least three dimensions. A small epsilon keeps the division finite
+        when all values of an element are equal.
+        """
+        reduce_dims = (1, 2)
+        lowest = actions.amin(dim=reduce_dims, keepdim=True)
+        highest = actions.amax(dim=reduce_dims, keepdim=True)
+        doubled_offset = 2 * (actions - lowest)
+        return doubled_offset / (highest - lowest + 1e-8) - 1
+
+    def _act_tokens_to_paligemma_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
+        """Map FAST token ids onto the tail of the PaliGemma vocabulary.
+
+        FAST id 0 lands on `vocab_size - 1 - fast_skip_tokens` and larger ids count downwards
+        from there. When the tokenizer exposes no `vocab_size`, 257152 is used.
+        """
+        paligemma_vocab_size = getattr(self.paligemma_tokenizer, "vocab_size", 257152)
+        highest_action_id = paligemma_vocab_size - 1 - self.fast_skip_tokens
+        return highest_action_id - tokens
+
+    def fast_tokenizer_wrapper(self, actions_norm):
+        """Tokenize a batch of actions with FAST and pad the result into PyTorch tensors.
+
+        The token lists produced by `fast_tokenizer` are handed to the PaliGemma processor's
+        tokenizer `pad` method, whose output (requested with `return_tensors="pt"`) is returned as is.
+        """
+        fast_token_lists = self.fast_tokenizer(actions_norm)
+        padder = self.paligemma_processor.tokenizer
+        return padder.pad({"input_ids": fast_token_lists}, return_tensors="pt")
+
+    def create_token_type_ids(self, padded_mask: torch.Tensor, prefix_len: torch.Tensor) -> torch.Tensor:
+        """Create token type IDs to distinguish prefix from action tokens.
+
+        Args:
+            padded_mask: Mask over the padded sequence; non-zero entries mark real tokens.
+            prefix_len: Number of prefix tokens per row, broadcastable against `padded_mask`
+                (e.g. shape [batch_size, 1]).
+
+        Returns:
+            Boolean tensor shaped like `padded_mask`. A position is True once more than `prefix_len`
+            real tokens have been counted up to and including it. The running count does not
+            decrease on padding, so padding that follows a suffix token is True as well.
+        """
+        # Running count of real (non-padding) tokens along the sequence dimension.
+        real_token_count = (padded_mask != 0).cumsum(dim=1)
+        return real_token_count > prefix_len
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """
+        Tokenize the state, task and optional action of a transition.
+
+        Args:
+            transition: The environment transition to process. Its observation must contain
+                `state_key` and its complementary data must contain `task_key`.
+
+        Returns:
+            A shallow copy of the transition whose observation is a copy of the original
+            observation extended with the five `pi0fast_*` entries.
+
+        Raises:
+            ValueError: If the observation, the state, the complementary data or the task is
+                missing, or if the task is neither a string nor a list of strings.
+        """
+        # The most recent transition is kept on the instance.
+        self.transition = transition
+
+        observation = transition.get(TransitionKey.OBSERVATION)
+        if observation is None:
+            raise ValueError("Observation is None in transition")
+
+        state = observation.get(self.state_key)
+        if state is None:
+            raise ValueError(f"State key '{self.state_key}' not found in observation")
+
+        complementary_data = transition.get(TransitionKey.COMPLEMENTARY_DATA)
+        if complementary_data is None:
+            raise ValueError("Complementary data is None, cannot extract task")
+
+        task_data = complementary_data.get(self.task_key)
+        if task_data is None:
+            raise ValueError(f"Task key '{self.task_key}' not found in complementary data")
+
+        # Normalize the task to a list of strings.
+        if isinstance(task_data, str):
+            lang_text = [task_data]
+        else:
+            is_string_list = isinstance(task_data, list) and all(isinstance(t, str) for t in task_data)
+            if not is_string_list:
+                raise ValueError(f"Task must be string or list of strings, got {type(task_data)}")
+            lang_text = task_data
+
+        # The action may be absent (None); create_input_tokens handles that case.
+        action = transition.get(TransitionKey.ACTION)
+        tokenized = self.create_input_tokens(state, lang_text, action)
+
+        # Copy rather than mutate so the caller's observation and transition stay untouched.
+        enriched_observation = {
+            **observation,
+            "pi0fast_input_ids": tokenized["input_ids"],
+            "pi0fast_attention_mask": tokenized["attention_mask"],
+            "pi0fast_padded_mask": tokenized["padded_mask"],
+            "pi0fast_loss_mask": tokenized["loss_mask"],
+            "pi0fast_token_type_ids": tokenized["token_type_ids"],
+        }
+        return {**transition, TransitionKey.OBSERVATION: enriched_observation}
+
+    def create_input_tokens(self, state, lang_text, actions=None):
+        """
+        Create tokenized input from state, language, and actions.
+
+        This method follows the same logic as the original PI0FAST create_input_tokens method.
+
+        Args:
+            state: State tensor [batch_size, state_dim]
+            lang_text: Task description(s): a list with one string per batch element, a list with a
+                single string (reused for every batch element), or a bare string.
+            actions: Optional action tensor [batch_size, horizon, action_dim]
+
+        Returns:
+            Dictionary containing input_ids, attention_mask, padded_mask, loss_mask, and token_type_ids
+
+        Raises:
+            ValueError: If the number of task descriptions is neither 1 nor the batch size.
+        """
+        bsize = state.shape[0]
+        device = state.device
+
+        # Align the task descriptions with the batch. A single description is shared by all
+        # elements; any other length mismatch would otherwise silently drop batch elements.
+        lang_text = [lang_text] if isinstance(lang_text, str) else list(lang_text)
+        if len(lang_text) == 1 and bsize > 1:
+            lang_text = lang_text * bsize
+        elif len(lang_text) != bsize:
+            raise ValueError(
+                f"Expected 1 or {bsize} task descriptions for a batch of size {bsize}, got {len(lang_text)}"
+            )
+
+        # Discretize state into 256 bins whose left edges span [-1, 1); keep at most the first 32 dims
+        bins = torch.linspace(-1, 1, 256 + 1, device=device)[:-1]
+        discretized = torch.bucketize(state, bins) - 1
+        discretized = discretized[:, :32]
+
+        # Create prefix texts with task and state
+        prefix_texts = []
+        for txt, disc in zip(lang_text, discretized, strict=True):
+            cleaned = txt.lower().strip().replace("_", " ")
+            state_str = " ".join(str(val.item()) for val in disc)
+            prefix_texts.append(f"Task: {cleaned}, State: {state_str};\n")
+
+        # Tokenize prefix
+        prefix_out = self.paligemma_tokenizer(
+            prefix_texts, add_special_tokens=True, return_tensors="pt", padding="longest", truncation=False
+        )
+        prefix_ids = prefix_out["input_ids"].to(device)
+        prefix_mask = prefix_out["attention_mask"].to(device)
+        # Kept on CPU because it is compared against the mask returned by tokenizer.pad below
+        # (presumably a CPU tensor -- confirm if the tokenizer can return device tensors).
+        prefix_lens = prefix_mask.sum(dim=1)[:, None].cpu()
+
+        # Get pad token ID; fall back to EOS when the attribute is missing OR set to None
+        pad_token_id = getattr(self.paligemma_tokenizer, "pad_token_id", None)
+        if pad_token_id is None:
+            pad_token_id = self.paligemma_tokenizer.eos_token_id
+
+        if actions is not None:
+            # Zero-pad or truncate the last dimension to exactly max_action_dim
+            actions_pad = F.pad(
+                actions, (0, max(0, self.max_action_dim - actions.shape[2])), value=0
+            )[:, :, : self.max_action_dim]
+
+            # Tokenize actions with FAST tokenizer
+            fast_out = self.fast_tokenizer_wrapper(actions_pad.cpu())
+            act_ids = fast_out["input_ids"]
+            act_mask = fast_out["attention_mask"].to(device)
+
+            # Convert FAST tokens to PaliGemma token space
+            act_ids = self._act_tokens_to_paligemma_tokens(act_ids).to(device)
+
+            # Replace padding tokens: the id that FAST token 0 maps to is treated as padding
+            vocab_size = getattr(self.paligemma_tokenizer, "vocab_size", 257152)
+            act_ids = torch.where(
+                act_ids == vocab_size - 1 - self.fast_skip_tokens,
+                pad_token_id,
+                act_ids,
+            )
+
+            # Add BOS ("Action: ") and EOS tokens
+            eos_token = torch.tensor(
+                [self.paligemma_tokenizer.eos_token_id], dtype=torch.long, device=device
+            ).expand(bsize, -1)
+            eos_mask = torch.tensor([1], dtype=torch.long, device=device).expand(bsize, -1)
+
+            bos = self.paligemma_tokenizer("Action: ", add_special_tokens=False, return_tensors="pt")
+            bos_token = bos["input_ids"].expand(act_ids.shape[0], -1).to(device)
+            bos_mask = bos["attention_mask"].expand(act_ids.shape[0], -1).to(device)
+
+            # All parts already live on `device`, so no further transfer is needed
+            act_ids = torch.cat([bos_token, act_ids, eos_token], dim=1)
+            act_mask = torch.cat([bos_mask, act_mask, eos_mask], dim=1)
+        else:
+            # No actions provided: the sequence consists of the prefix only
+            act_ids = torch.empty(bsize, 0, dtype=torch.long, device=device)
+            act_mask = torch.empty(bsize, 0, dtype=torch.long, device=device)
+
+        # Concatenate prefix and action tokens
+        final_ids = torch.cat([prefix_ids, act_ids], dim=1)
+        final_mask = torch.cat([prefix_mask, act_mask], dim=1)
+
+        batch_inputs = {"input_ids": final_ids.tolist(), "attention_mask": final_mask.tolist()}
+
+        # Re-pad the batch through the tokenizer
+        padded_output = self.paligemma_tokenizer.pad(
+            batch_inputs, padding="longest", max_length=self.max_input_seq_len, return_tensors="pt"
+        )
+        padded_mask = padded_output["attention_mask"]
+
+        # Create attention mask (excludes prefix)
+        att_mask = (padded_mask != 0).cumsum(dim=1) > prefix_lens
+
+        # Create token type IDs
+        token_type_ids = self.create_token_type_ids(padded_mask=padded_mask, prefix_len=prefix_lens)
+
+        # Return all masks
+        return {
+            "input_ids": padded_output["input_ids"],
+            "attention_mask": att_mask,
+            "padded_mask": padded_mask,
+            "loss_mask": att_mask & padded_mask,  # loss is computed not on prefix, and not on padding
+            "token_type_ids": token_type_ids,
+        }
+
+    def get_config(self) -> dict[str, Any]:
+        """Return the serializable settings of this step; the tokenizer objects are not included."""
+        setting_names = (
+            "max_action_dim",
+            "fast_skip_tokens",
+            "max_input_seq_len",
+            "padding_side",
+            "task_key",
+            "state_key",
+        )
+        return {name: getattr(self, name) for name in setting_names}
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """
+        Declare the tokenized PI0FAST outputs as observation features.
+
+        Each of the five `pi0fast_*` keys is registered as a LANGUAGE feature of shape
+        `(max_input_seq_len,)` unless the observation features already contain it.
+
+        Args:
+            features: The dictionary of existing policy features (updated in place).
+
+        Returns:
+            The same dictionary, with any missing `pi0fast_*` entries added.
+        """
+        observation_features = features[PipelineFeatureType.OBSERVATION]
+        token_feature_keys = (
+            "pi0fast_input_ids",
+            "pi0fast_attention_mask",
+            "pi0fast_padded_mask",
+            "pi0fast_loss_mask",
+            "pi0fast_token_type_ids",
+        )
+
+        for feature_key in token_feature_keys:
+            # Entries that are already declared are left untouched.
+            if feature_key in observation_features:
+                continue
+            observation_features[feature_key] = PolicyFeature(
+                type=FeatureType.LANGUAGE, shape=(self.max_input_seq_len,)
+            )
+
+        return features
@@ -14,8 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
+import importlib.metadata
 import logging
-import pkgutil
 from typing import Any

 from draccus.choice_types import ChoiceRegistry
@@ -132,24 +132,30 @@ def make_device_from_device_class(config: ChoiceRegistry) -> Any:

 def register_third_party_plugins() -> None:
     """
-    Discover and import third-party lerobot_* plugins so they can register themselves.
+    Discover and import third-party LeRobot plugins so they can register themselves.

-    Scans top-level modules on sys.path for packages starting with
-    'lerobot_robot_', 'lerobot_camera_', 'lerobot_teleoperator_' or 'lerobot_policy_' and imports them.
+    This function uses `importlib.metadata` to find distributions installed in the environment
+    (including editable installs) whose name starts with 'lerobot_robot_', 'lerobot_camera_',
+    'lerobot_teleoperator_', or 'lerobot_policy_' and imports the module of the same name.
+
+    Distribution names may be spelled with hyphens (e.g. 'lerobot-robot-foo') while the importable
+    module uses underscores, so hyphens are converted to underscores before matching and importing.
+    Each module is attempted at most once, even if its distribution is reported several times.
+    Exceptions raised while importing a plugin are logged and recorded, not re-raised.
     """
     prefixes = ("lerobot_robot_", "lerobot_camera_", "lerobot_teleoperator_", "lerobot_policy_")
     imported: list[str] = []
     failed: list[str] = []

-    for module_info in pkgutil.iter_modules():
-        name = module_info.name
-        if name.startswith(prefixes):
-            try:
-                importlib.import_module(name)
-                imported.append(name)
-                logging.info("Imported third-party plugin: %s", name)
-            except Exception:
-                logging.exception("Could not import third-party plugin: %s", name)
-                failed.append(name)
+    def attempt_import(module_name: str) -> None:
+        """Import one plugin module and record the outcome in `imported` / `failed`."""
+        try:
+            importlib.import_module(module_name)
+            imported.append(module_name)
+            logging.info("Imported third-party plugin: %s", module_name)
+        except Exception:
+            logging.exception("Could not import third-party plugin: %s", module_name)
+            failed.append(module_name)
+
+    attempted: set[str] = set()
+    for dist in importlib.metadata.distributions():
+        dist_name = dist.metadata.get("Name")
+        if not dist_name:
+            continue
+        # '-' and '_' are interchangeable in distribution names, but only '_' can appear in a module name.
+        module_name = dist_name.replace("-", "_")
+        if module_name.startswith(prefixes) and module_name not in attempted:
+            attempted.add(module_name)
+            attempt_import(module_name)

     logging.debug("Third-party plugin import summary: imported=%s failed=%s", imported, failed)
Author	SHA1	Message	Date
Jade Choghari	cbb380df34	draft changes	2025-12-26 14:06:30 +00:00
Alexis Alva	12043b3b5c	fix: use importlib.metadata for plugin discovery to support PEP 660 (#2687 )	2025-12-24 15:45:14 +01:00
Salman Chishti	a06f4b9140	Upgrade GitHub Actions for Node 24 compatibility (#2691 )	2025-12-24 10:42:29 +01:00
Steven Palma	20c22a2799	chore(ci): make keyword matching more conservative (#2711 )	2025-12-24 02:03:12 +01:00
Steven Palma	2f238fce15	feat(ci): adds release versioning to docs (#2709 ) * feat(ci): adds release versioning to docs * chore(ci): remove TODO	2025-12-24 00:40:56 +01:00
Pepijn	ff271e8b51	pi fixes for dependencies (#2706 ) * pi fixes for dependencies * add walls sarm conflict * also add conflicts for pi * fix(ci): use --extra all instead of --all-extras + --no-extra --------- Co-authored-by: Steven Palma <steven.palma@huggingface.co>	2025-12-23 23:58:34 +01:00
Pepijn	a142c365dd	use syslink for wall-x readme (#2708 ) * use syslink for wall-x readme * remove whitespace	2025-12-23 14:13:32 +01:00
Steven Palma	b2ef6ae720	chore: modernize contributing.md (#2677 )	2025-12-23 12:10:44 +01:00
Tong Wu	a64f2fd322	modify the README file for wallx (#2705 ) * support wallx * fix bugs in flow * incorporate wallx model into lerobot * update the policy methods * reduce to least config and params & pass lerobot basic test * fixed dtype bugs * add wallx dependencies * update * remove flash-attn requirement && fix bug in inference and fast mode * fix bug for inference * add some small modifications * fix pre-commit errors * remove lerobot[wallx] * fix ci * fix precommit issues * fix: exclude wallx extra properly in CI workflows * fix: add uv conflicts for wallx transformers version * fix: peft test import * pre-commit * only export WallXConfig from wall_x package to avoid peft import in CI * remove torch dep * precommit * add import * update doc files * fix minor errors --------- Signed-off-by: Pepijn <138571049+pkooij@users.noreply.github.com> Co-authored-by: vincentchen <chenlufang@x2robot.com> Co-authored-by: Geoffrey19 <sympathischmann35@gmail.com> Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com> Co-authored-by: Pepijn <pepijn@huggingface.co>	2025-12-23 11:35:06 +01:00
				`@@ -0,0 +1 @@`
				`../../../../docs/source/policy_walloss_README.md`