From e67ceb213ded322ec9ba01f41a35b6aba72889df Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 14 Apr 2026 21:03:15 +0200
Subject: [PATCH] feat(robotwin): eval 5 diverse tasks per CI run with NL
 descriptions

Widen the smoke eval from a single task (beat_block_hammer) to five:
the original plus click_bell, handover_block, open_laptop, and
stack_blocks_two. Each task gets its own rollout video in
videos/<task>_0/ so the dashboard can surface visually distinct
behaviours.

extract_task_descriptions.py now has a RoboTwin branch that reads
`description/task_instruction/<task>.json` (already shipped in the clone
at /opt/robotwin) and pulls the `full_description` field. CI cds into
the clone before invoking the script so the relative path resolves.

parse_eval_metrics.py is invoked with the same 5-task list so the
metrics.json embeds one entry per task.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark_tests.yml   |  8 ++++++--
 scripts/ci/extract_task_descriptions.py | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml
index 8ac040f67..02b908cd6 100644
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -361,13 +361,17 @@ jobs:
               cd /opt/robotwin && lerobot-eval \
                 --policy.path=pepijn223/smolvla_robotwin \
                 --env.type=robotwin \
-                --env.task=beat_block_hammer \
+                --env.task=beat_block_hammer,click_bell,handover_block,open_laptop,stack_blocks_two \
                 --eval.batch_size=1 \
                 --eval.n_episodes=1 \
                 --eval.use_async_envs=false \
                 --policy.device=cuda \
                 '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
                 --output_dir=/tmp/eval-artifacts
+              python /lerobot/scripts/ci/extract_task_descriptions.py \
+                --env robotwin \
+                --task beat_block_hammer,click_bell,handover_block,open_laptop,stack_blocks_two \
+                --output /tmp/eval-artifacts/task_descriptions.json
             "
 
       - name: Copy RoboTwin artifacts from container
@@ -383,7 +387,7 @@ jobs:
           python3 scripts/ci/parse_eval_metrics.py \
             --artifacts-dir /tmp/robotwin-artifacts \
             --env robotwin \
-            --task beat_block_hammer \
+            --task beat_block_hammer,click_bell,handover_block,open_laptop,stack_blocks_two \
             --policy pepijn223/smolvla_robotwin
 
       - name: Upload RoboTwin rollout video
diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py
index 5fbc1c35a..9035bacb0 100644
--- a/scripts/ci/extract_task_descriptions.py
+++ b/scripts/ci/extract_task_descriptions.py
@@ -57,6 +57,24 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
     return {f"{task_name}_0": label}
 
 
+def _robotwin_descriptions(task_names: str) -> dict[str, str]:
+    """Map `<task>_0` to a description for each comma-separated RoboTwin task, read from
+    `description/task_instruction/<task>.json` relative to cwd (/opt/robotwin in CI).
+    If the file or its `full_description` is missing/empty: task name, "_" -> spaces."""
+    out: dict[str, str] = {}
+    root = Path("description/task_instruction")
+    for name in (t.strip() for t in task_names.split(",") if t.strip()):
+        desc_file = root / f"{name}.json"
+        desc = name.replace("_", " ")
+        if desc_file.is_file():
+            data = json.loads(desc_file.read_text())
+            full = data.get("full_description") or desc
+            # Drop every "<" and ">" character (the text between them is kept) for readability.
+            desc = full.replace("<", "").replace(">", "")
+        out[f"{name}_0"] = desc
+    return out
+
+
 def main() -> int:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
@@ -70,6 +88,8 @@ def main() -> int:
             descriptions = _libero_descriptions(args.task)
         elif args.env == "metaworld":
             descriptions = _metaworld_descriptions(args.task)
+        elif args.env == "robotwin":
+            descriptions = _robotwin_descriptions(args.task)
         else:
             print(
                 f"[extract_task_descriptions] No description extractor for env '{args.env}'.",