From e67ceb213ded322ec9ba01f41a35b6aba72889df Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 14 Apr 2026 21:03:15 +0200 Subject: [PATCH] feat(robotwin): eval 5 diverse tasks per CI run with NL descriptions Widen the smoke eval from a single task (beat_block_hammer) to five: click_bell, handover_block, open_laptop, stack_blocks_two on top of the original. Each gets its own rollout video in videos/{task}_0/ so the dashboard can surface visually distinct behaviours. extract_task_descriptions.py now has a RoboTwin branch that reads `description/task_instruction/{task}.json` (already shipped in the clone at /opt/robotwin) and pulls the `full_description` field. CI cds into the clone before invoking the script so the relative path resolves. parse_eval_metrics.py is invoked with the same 5-task list so the metrics.json embeds one entry per task. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark_tests.yml | 8 ++++++-- scripts/ci/extract_task_descriptions.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 8ac040f67..02b908cd6 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -361,13 +361,17 @@ jobs: cd /opt/robotwin && lerobot-eval \ --policy.path=pepijn223/smolvla_robotwin \ --env.type=robotwin \ - --env.task=beat_block_hammer \ + --env.task=beat_block_hammer,click_bell,handover_block,open_laptop,stack_blocks_two \ --eval.batch_size=1 \ --eval.n_episodes=1 \ --eval.use_async_envs=false \ --policy.device=cuda \ '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \ --output_dir=/tmp/eval-artifacts + python /lerobot/scripts/ci/extract_task_descriptions.py \ + --env robotwin \ + --task 
beat_block_hammer,click_bell,handover_block,open_laptop,stack_blocks_two \ + --output /tmp/eval-artifacts/task_descriptions.json " - name: Copy RoboTwin artifacts from container @@ -383,7 +387,7 @@ jobs: python3 scripts/ci/parse_eval_metrics.py \ --artifacts-dir /tmp/robotwin-artifacts \ --env robotwin \ - --task beat_block_hammer \ + --task beat_block_hammer,click_bell,handover_block,open_laptop,stack_blocks_two \ --policy pepijn223/smolvla_robotwin - name: Upload RoboTwin rollout video diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py index 5fbc1c35a..9035bacb0 100644 --- a/scripts/ci/extract_task_descriptions.py +++ b/scripts/ci/extract_task_descriptions.py @@ -57,6 +57,24 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]: return {f"{task_name}_0": label} +def _robotwin_descriptions(task_names: str) -> dict[str, str]: + """Return descriptions for each requested RoboTwin task. Reads + `description/task_instruction/{task}.json` from the RoboTwin clone + (cwd is /opt/robotwin in CI). Falls back to the task name if missing.""" + out: dict[str, str] = {} + root = Path("description/task_instruction") + for name in (t.strip() for t in task_names.split(",") if t.strip()): + desc_file = root / f"{name}.json" + desc = name.replace("_", " ") + if desc_file.is_file(): + data = json.loads(desc_file.read_text()) + full = data.get("full_description") or desc + # Strip the schema placeholders ({A}, {a}) — keep the sentence readable. 
+ desc = full.replace("<", "").replace(">", "") + out[f"{name}_0"] = desc + return out + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)") @@ -70,6 +88,8 @@ def main() -> int: descriptions = _libero_descriptions(args.task) elif args.env == "metaworld": descriptions = _metaworld_descriptions(args.task) + elif args.env == "robotwin": + descriptions = _robotwin_descriptions(args.task) else: print( f"[extract_task_descriptions] No description extractor for env '{args.env}'.",