From 02b315ab6a6709c04a4d19b8cacfc1988c247d37 Mon Sep 17 00:00:00 2001 From: Nikodem Bartnik <39432165+NikodemBartnik@users.noreply.github.com> Date: Fri, 12 Jun 2026 13:26:52 +0200 Subject: [PATCH] Docs/model card improvements (#3634) * update policy deployment instruction with rollout * add port and fix formatting * add more base models to generate model card * updated and extended model descriptions * fix bug * improved and extended structure * exclude the templates from config * add images and visualize dataset button * add all policies we have docs for * remove policies without the docs * new fields, improved examples --- .pre-commit-config.yaml | 3 + src/lerobot/policies/pretrained.py | 90 ++++++- .../templates/lerobot_modelcard_template.md | 252 ++++++++++++++---- 3 files changed, 282 insertions(+), 63 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dff7416f4..8ae913e4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -65,6 +65,9 @@ repos: name: Format Markdown with Prettier types_or: [markdown, mdx] args: [--prose-wrap=preserve] + # Jinja2 model-card templates use a .md extension but contain {% ... %} / + # {{ ... }} tags that prettier's Markdown formatter mangles (e.g. table loops). + exclude: ^src/lerobot/templates/.*\.md$ ##### Security ##### - repo: https://github.com/gitleaks/gitleaks diff --git a/src/lerobot/policies/pretrained.py b/src/lerobot/policies/pretrained.py index 724f920f3..a69487f3f 100644 --- a/src/lerobot/policies/pretrained.py +++ b/src/lerobot/policies/pretrained.py @@ -29,6 +29,7 @@ from huggingface_hub.errors import HfHubHTTPError from safetensors.torch import load_model as load_model_as_safetensor, save_model as save_model_as_safetensor from torch import Tensor, nn +from lerobot.__version__ import __version__ from lerobot.configs import PreTrainedConfig from lerobot.configs.train import TrainPipelineConfig from lerobot.utils.hub import HubMixin @@ -38,6 +39,67 @@ from .utils import log_model_loading_keys T = TypeVar("T", bound="PreTrainedPolicy") +def _build_card_context( + cfg: TrainPipelineConfig | None, + dataset_repo_id: str | None, + input_features: dict | None, + output_features: dict | None, +) -> dict: + """Collect optional data for the model-card template. + + Returns plain values only (no Markdown) — the template in + ``lerobot/templates/lerobot_modelcard_template.md`` decides how and whether to show + each one. Everything is best-effort: anything unavailable is left empty/None and the + template simply skips that section, so this never breaks a Hub push. + """ + context = { + "training": None, + "input_features": input_features or {}, + "output_features": output_features or {}, + "dataset": None, + "robot_type": None, + "cameras": [], + } + + if cfg is not None: + optimizer = getattr(cfg, "optimizer", None) + context["training"] = { + "steps": cfg.steps, + "batch_size": cfg.batch_size, + "seed": cfg.seed, + "optimizer": getattr(optimizer, "type", None) if optimizer else None, + "lr": getattr(optimizer, "lr", None) if optimizer else None, + "lerobot_version": __version__, + } + + if dataset_repo_id: + dataset_cfg = getattr(cfg, "dataset", None) + try: + from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata + + meta = LeRobotDatasetMetadata( + dataset_repo_id, + root=getattr(dataset_cfg, "root", None), + revision=getattr(dataset_cfg, "revision", None), + ) + context["dataset"] = { + "repo_id": dataset_repo_id, + "episodes": meta.total_episodes, + "frames": meta.total_frames, + "fps": meta.fps, + "tasks": [str(task) for task in meta.tasks.index], + } + context["robot_type"] = meta.robot_type + context["cameras"] = [key.split(".")[-1] for key in meta.camera_keys] + except Exception as e: # noqa: BLE001 — dataset details are optional, never fail the push + logging.warning( + f"Could not load dataset metadata for '{dataset_repo_id}'; those sections will be " + f"omitted from the model card. ({e})" + ) + + return context + + class ActionSelectKwargs(TypedDict, total=False): noise: Tensor | None @@ -228,7 +290,7 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC): self.save_pretrained(saved_path) # Calls _save_pretrained and stores model tensors card = self.generate_model_card( - cfg.dataset.repo_id, self.config.type, self.config.license, self.config.tags + cfg.dataset.repo_id, self.config.type, self.config.license, self.config.tags, cfg=cfg ) card.save(str(saved_path / "README.md")) @@ -246,9 +308,20 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC): logging.info(f"Model pushed to {commit_info.repo_url.url}") def generate_model_card( - self, dataset_repo_id: str, model_type: str, license: str | None, tags: list[str] | None + self, + dataset_repo_id: str, + model_type: str, + license: str | None, + tags: list[str] | None, + cfg: TrainPipelineConfig | None = None, ) -> ModelCard: - base_model = "lerobot/smolvla_base" if model_type == "smolvla" else None # Set a base model + base_model_mapping = { + "smolvla": "lerobot/smolvla_base", + "pi0": "lerobot/pi0_base", + "pi05": "lerobot/pi05_base", + "pi0_fast": "lerobot/pi0fast-base", + "xvla": "lerobot/xvla-base", + } card_data = ModelCardData( license=license or "apache-2.0", @@ -257,13 +330,20 @@ class PreTrainedPolicy(nn.Module, HubMixin, abc.ABC): tags=list(set(tags or []).union({"robotics", "lerobot", model_type})), model_name=model_type, datasets=dataset_repo_id, - base_model=base_model, + base_model=base_model_mapping.get(model_type), ) + context = _build_card_context( + cfg, dataset_repo_id, self.config.input_features, self.config.output_features + ) + # Used by the template to pre-fill commands and the "Fine-tuned from" line. + context["policy_repo_id"] = getattr(self.config, "repo_id", None) + context["base_model"] = base_model_mapping.get(model_type) + template_card = ( files("lerobot.templates").joinpath("lerobot_modelcard_template.md").read_text(encoding="utf-8") ) - card = ModelCard.from_template(card_data, template_str=template_card) + card = ModelCard.from_template(card_data, template_str=template_card, **context) card.validate() return card diff --git a/src/lerobot/templates/lerobot_modelcard_template.md b/src/lerobot/templates/lerobot_modelcard_template.md index b93e83b6e..6ecda06c9 100644 --- a/src/lerobot/templates/lerobot_modelcard_template.md +++ b/src/lerobot/templates/lerobot_modelcard_template.md @@ -13,77 +13,213 @@ [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware. {% elif model_name == "act" %} [Action Chunking with Transformers (ACT)](https://huggingface.co/papers/2304.13705) is an imitation-learning method that predicts short action chunks instead of single steps. It learns from teleoperated data and often achieves high success rates. -{% elif model_name == "tdmpc" %} -[TD-MPC](https://huggingface.co/papers/2203.04955) combines model-free and model-based approaches to improve sample efficiency and performance in continuous control tasks by using a learned latent dynamics model and terminal value function. {% elif model_name == "diffusion" %} [Diffusion Policy](https://huggingface.co/papers/2303.04137) treats visuomotor control as a generative diffusion process, producing smooth, multi-step action trajectories that excel at contact-rich manipulation. -{% elif model_name == "vqbet" %} -[VQ-BET](https://huggingface.co/papers/2403.03181) combines vector-quantised action tokens with Behaviour Transformers to discretise control and achieve data-efficient imitation across diverse skills. {% elif model_name == "pi0" %} -**π₀ (Pi0)** - -π₀ is a Vision-Language-Action model for general robot control, from Physical Intelligence. The LeRobot implementation is adapted from their open source OpenPI repository. - -**Model Overview** - -π₀ represents a breakthrough in robotics as the first general-purpose robot foundation model developed by Physical Intelligence. Unlike traditional robots that are narrow specialists programmed for repetitive motions, π₀ is designed to be a generalist policy that can understand visual inputs, interpret natural language instructions, and control a variety of different robots across diverse tasks. - -For more details, see the [Physical Intelligence π₀ blog post](https://www.physicalintelligence.company/blog/pi0). +[π₀ (Pi0)](https://www.physicalintelligence.company/blog/pi0) is a general-purpose robot foundation model from Physical Intelligence: a generalist Vision-Language-Action policy that understands visual inputs, interprets natural language instructions, and controls a variety of different robots across diverse tasks. The LeRobot implementation is adapted from their open-source OpenPI repository. {% elif model_name == "pi05" %} -**π₀.₅ (Pi05) Policy** - -π₀.₅ is a Vision-Language-Action model with open-world generalization, from Physical Intelligence. The LeRobot implementation is adapted from their open source OpenPI repository. - -**Model Overview** - -π₀.₅ represents a significant evolution from π₀, developed by Physical Intelligence to address a big challenge in robotics: open-world generalization. While robots can perform impressive tasks in controlled environments, π₀.₅ is designed to generalize to entirely new environments and situations that were never seen during training. - -For more details, see the [Physical Intelligence π₀.₅ blog post](https://www.physicalintelligence.company/blog/pi05). +[π₀.₅ (Pi05)](https://www.physicalintelligence.company/blog/pi05) is a Vision-Language-Action model from Physical Intelligence designed for open-world generalization: it evolves π₀ to generalize to entirely new environments and situations that were never seen during training. The LeRobot implementation is adapted from their open-source OpenPI repository. +{% elif model_name == "molmoact2" %} +[MolmoAct2](https://allenai.org/blog/molmoact2) is an open robotics foundation model from the Allen Institute for AI (Ai2) that maps camera images and language instructions to robot action chunks. The LeRobot implementation supports training and evaluation of the regular MolmoAct2 model. +{% elif model_name == "vla_jepa" %} +[VLA-JEPA](https://arxiv.org/abs/2602.10098) is a Vision-Language-Action model that combines a Qwen3-VL language backbone with a self-supervised video world model (V-JEPA2) and a flow-matching DiT action head. {% elif model_name == "gaussian_actor" %} This is a Gaussian Actor policy (Gaussian policy with a tanh squash) — the policy-side component used by [Soft Actor-Critic (SAC)](https://huggingface.co/papers/1801.01290) and related maximum-entropy continuous-control algorithms. +{% elif model_name == "pi0_fast" %} +[π₀-FAST (Pi0-FAST)](https://www.physicalintelligence.company/research/fast) is a Vision-Language-Action model for general robot control, from Physical Intelligence. It models continuous robot actions with autoregressive next-token prediction using FAST (Frequency-space Action Sequence Tokenization), training up to 5x faster than diffusion-based π₀. +{% elif model_name == "eo1" %} +[EO-1](https://huggingface.co/papers/2508.21112) is a Vision-Language-Action model for general robot control. It pairs a Qwen2.5-VL backbone for vision-language understanding with a continuous flow-matching action head that denoises action chunks. +{% elif model_name == "groot" %} +[GR00T N1.5](https://github.com/NVIDIA/Isaac-GR00T) is an open, cross-embodiment foundation model from NVIDIA for generalized humanoid robot reasoning and skills. It takes language and images as input and uses a flow-matching action transformer to predict actions conditioned on vision, language, and proprioception. +{% elif model_name == "multi_task_dit" %} +[Multi-Task Diffusion Transformer (DiT)](https://huggingface.co/papers/2507.05331) extends Diffusion Policy with a large Diffusion Transformer and text + vision conditioning for multi-task robot learning. It supports both diffusion and flow-matching objectives and reaches high dexterity with only ~450M parameters. +{% elif model_name == "wall_x" %} +[WALL-OSS](https://huggingface.co/papers/2509.11766) is an open-source foundation model for embodied intelligence from XSquare Robot. Built on Qwen2.5-VL, it uses a tightly-coupled multimodal architecture with flow matching to unify semantic reasoning and high-frequency action generation for cross-embodiment control. +{% elif model_name == "xvla" %} +[X-VLA](https://huggingface.co/papers/2510.10274) is a soft-prompted, flow-matching Vision-Language-Action framework that treats each robot or hardware setup as a "task" encoded with a small set of learnable Soft Prompt embeddings, letting a single model reconcile diverse robot morphologies, sensors, and action spaces. {% else %} -_Model type not recognized — please update this template._ +This is a **{{ model_name }}** policy trained with [LeRobot](https://github.com/huggingface/lerobot). +{% endif %} +{% set diagrams = { + "smolvla": "https://cdn-uploads.huggingface.co/production/uploads/640e21ef3c82bd463ee5a76d/aooU0a3DMtYmy_1IWMaIM.png", + "pi0": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-pi0%20(1).png", + "pi0_fast": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-pifast.png", + "eo1": "https://huggingface.co/datasets/HaomingSong/lerobot-documentation-images/resolve/main/lerobot/eo_pipeline.png", + "groot": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/lerobot-groot-paper1%20(1).png", + "wall_x": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/walloss-lerobot-paper.png", + "xvla": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/xvla-architecture.png" +} %} +{% if diagrams.get(model_name) %} +
+
+