From 64773e7b2201748a846f1f44779d33caab384e15 Mon Sep 17 00:00:00 2001 From: Khalil Meftah Date: Sun, 14 Jun 2026 14:19:25 +0200 Subject: [PATCH] refactor(training): rename eval_freq to env_eval_freq - Rename eval_freq to env_eval_freq to distinguish sim environment evaluation from offline loss evaluation. --- .github/workflows/benchmark_tests.yml | 6 +++--- Makefile | 8 ++++---- docs/source/hilserl.mdx | 2 +- docs/source/libero.mdx | 2 +- docs/source/libero_plus.mdx | 2 +- docs/source/metaworld.mdx | 4 ++-- docs/source/molmoact2.mdx | 4 ++-- docs/source/multi_task_dit.mdx | 2 +- docs/source/robocasa.mdx | 2 +- docs/source/vlabench.mdx | 2 +- src/lerobot/configs/train.py | 3 ++- src/lerobot/scripts/lerobot_train.py | 6 +++--- tests/training/test_multi_gpu.py | 4 ++-- 13 files changed, 24 insertions(+), 23 deletions(-) diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index b82c59a8b..3493e5048 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -167,9 +167,9 @@ jobs: # ── LIBERO TRAIN+EVAL SMOKE ────────────────────────────────────────────── # Train SmolVLA for 1 step (batch_size=1, dataset episode 0 only) then - # immediately runs eval inside the training loop (eval_freq=1, 1 episode). + # immediately runs eval inside the training loop (env_eval_freq=1, 1 episode). # Tests the full train→eval-within-training pipeline end-to-end. 
- - name: Run Libero train+eval smoke (1 step, eval_freq=1) + - name: Run Libero train+eval smoke (1 step, env_eval_freq=1) if: env.HF_USER_TOKEN != '' run: | docker run --name libero-train-smoke --gpus all \ @@ -196,7 +196,7 @@ jobs: --output_dir=/tmp/train-smoke \ --steps=1 \ --batch_size=1 \ - --eval_freq=1 \ + --env_eval_freq=1 \ --eval.n_episodes=1 \ --eval.batch_size=1 \ --eval.use_async_envs=false \ diff --git a/Makefile b/Makefile index d3987101f..ea3b6e261 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ test-act-ete-train: --dataset.episodes="[0]" \ --batch_size=2 \ --steps=4 \ - --eval_freq=2 \ + --env_eval_freq=2 \ --eval.n_episodes=1 \ --eval.batch_size=1 \ --save_freq=2 \ @@ -96,7 +96,7 @@ test-diffusion-ete-train: --dataset.episodes="[0]" \ --batch_size=2 \ --steps=2 \ - --eval_freq=2 \ + --env_eval_freq=2 \ --eval.n_episodes=1 \ --eval.batch_size=1 \ --save_checkpoint=true \ @@ -126,7 +126,7 @@ test-tdmpc-ete-train: --dataset.episodes="[0]" \ --batch_size=2 \ --steps=2 \ - --eval_freq=2 \ + --env_eval_freq=2 \ --eval.n_episodes=1 \ --eval.batch_size=1 \ --save_checkpoint=true \ @@ -161,7 +161,7 @@ test-smolvla-ete-train: --dataset.episodes="[0]" \ --batch_size=2 \ --steps=4 \ - --eval_freq=2 \ + --env_eval_freq=2 \ --eval.n_episodes=1 \ --eval.batch_size=1 \ --save_freq=2 \ diff --git a/docs/source/hilserl.mdx b/docs/source/hilserl.mdx index 76e985cfe..09a370f3d 100644 --- a/docs/source/hilserl.mdx +++ b/docs/source/hilserl.mdx @@ -719,7 +719,7 @@ Example configuration for training the [reward classifier](https://huggingface.c "num_workers": 4, "steps": 5000, "log_freq": 10, - "eval_freq": 1000, + "env_eval_freq": 1000, "save_freq": 1000, "save_checkpoint": true, "seed": 2, diff --git a/docs/source/libero.mdx b/docs/source/libero.mdx index 043348690..b95af1d27 100644 --- a/docs/source/libero.mdx +++ b/docs/source/libero.mdx @@ -143,7 +143,7 @@ lerobot-train \ --batch_size=4 \ --eval.batch_size=1 \ --eval.n_episodes=1 \ - --eval_freq=1000 + 
--env_eval_freq=1000 ``` ## Reproducing published results diff --git a/docs/source/libero_plus.mdx b/docs/source/libero_plus.mdx index 4249bf49e..b065649fa 100644 --- a/docs/source/libero_plus.mdx +++ b/docs/source/libero_plus.mdx @@ -173,7 +173,7 @@ lerobot-train \ --batch_size=4 \ --eval.batch_size=1 \ --eval.n_episodes=1 \ - --eval_freq=1000 + --env_eval_freq=1000 ``` ## Relationship to LIBERO diff --git a/docs/source/metaworld.mdx b/docs/source/metaworld.mdx index 8e629dea9..b7accdfa2 100644 --- a/docs/source/metaworld.mdx +++ b/docs/source/metaworld.mdx @@ -120,11 +120,11 @@ lerobot-train \ --batch_size=4 \ --eval.batch_size=1 \ --eval.n_episodes=1 \ - --eval_freq=1000 + --env_eval_freq=1000 ``` ## Practical tips - Use the one-hot task conditioning for multi-task training (MT10/MT50 conventions) so policies have explicit task context. - Inspect the dataset task descriptions and the `info["is_success"]` keys when writing post-processing or logging so your success metrics line up with the benchmark. -- Adjust `batch_size`, `steps`, and `eval_freq` to match your compute budget. +- Adjust `batch_size`, `steps`, and `env_eval_freq` to match your compute budget. 
diff --git a/docs/source/molmoact2.mdx b/docs/source/molmoact2.mdx index c6ae24e9e..9c9a0f1a2 100644 --- a/docs/source/molmoact2.mdx +++ b/docs/source/molmoact2.mdx @@ -103,7 +103,7 @@ accelerate launch \ --batch_size=32 \ --num_workers=4 \ --log_freq=20 \ - --eval_freq=-1 \ + --env_eval_freq=-1 \ --save_checkpoint=true \ --save_freq=2000 ``` @@ -142,7 +142,7 @@ accelerate launch \ --batch_size=32 \ --num_workers=4 \ --log_freq=20 \ - --eval_freq=-1 \ + --env_eval_freq=-1 \ --save_checkpoint=true \ --save_freq=2000 ``` diff --git a/docs/source/multi_task_dit.mdx b/docs/source/multi_task_dit.mdx index 450d8a9f2..ebe46489a 100644 --- a/docs/source/multi_task_dit.mdx +++ b/docs/source/multi_task_dit.mdx @@ -314,7 +314,7 @@ lerobot-train \ --steps=30000 \ --save_freq=1000 \ --log_freq=100 \ - --eval_freq=1000 \ + --env_eval_freq=1000 \ --policy.type=multi_task_dit \ --policy.device=cuda \ --policy.horizon=32 \ diff --git a/docs/source/robocasa.mdx b/docs/source/robocasa.mdx index f6a784e72..5a335a484 100644 --- a/docs/source/robocasa.mdx +++ b/docs/source/robocasa.mdx @@ -166,7 +166,7 @@ lerobot-train \ --output_dir=./outputs/smolvla_robocasa_CloseFridge \ --steps=100000 \ --batch_size=4 \ - --eval_freq=5000 \ + --env_eval_freq=5000 \ --eval.batch_size=1 \ --eval.n_episodes=5 \ --save_freq=10000 diff --git a/docs/source/vlabench.mdx b/docs/source/vlabench.mdx index da579d674..9d45da4ec 100644 --- a/docs/source/vlabench.mdx +++ b/docs/source/vlabench.mdx @@ -165,7 +165,7 @@ lerobot-train \ --output_dir=./outputs/smolvla_vlabench_primitive \ --steps=100000 \ --batch_size=4 \ - --eval_freq=5000 \ + --env_eval_freq=5000 \ --eval.batch_size=1 \ --eval.n_episodes=1 \ --save_freq=10000 diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py index bac1a946b..949ebbae0 100644 --- a/src/lerobot/configs/train.py +++ b/src/lerobot/configs/train.py @@ -100,7 +100,8 @@ class TrainPipelineConfig(HubMixin): prefetch_factor: int = 4 persistent_workers: bool = True 
steps: int = 100_000 - eval_freq: int = 20_000 + # Run the policy in the simulation environment every N steps to measure reward/success (<= 0 disables it). + env_eval_freq: int = 20_000 log_freq: int = 200 tolerance_s: float = 1e-4 save_checkpoint: bool = True diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py index 70a5e9e9d..5bfc3cb86 100644 --- a/src/lerobot/scripts/lerobot_train.py +++ b/src/lerobot/scripts/lerobot_train.py @@ -256,7 +256,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): # On real-world data, no need to create an environment as evaluations are done outside train.py, # using the eval.py instead, with gym_dora environment and dora-rs. eval_env = None - if cfg.eval_freq > 0 and cfg.env is not None and is_main_process: + if cfg.env_eval_freq > 0 and cfg.env is not None and is_main_process: logging.info("Creating env") eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) @@ -534,7 +534,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): train_tracker.step() is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps - is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + is_env_eval_step = cfg.env_eval_freq > 0 and step % cfg.env_eval_freq == 0 if is_log_step: # Collective reduce must run on every rank, before the main-process gate below. 
@@ -579,7 +579,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None): accelerator.wait_for_everyone() - if cfg.env and is_eval_step: + if cfg.env and is_env_eval_step: if is_main_process: step_id = get_step_identifier(step, cfg.steps) logging.info(f"Eval policy at step {step}") diff --git a/tests/training/test_multi_gpu.py b/tests/training/test_multi_gpu.py index 638dc3131..34046463b 100644 --- a/tests/training/test_multi_gpu.py +++ b/tests/training/test_multi_gpu.py @@ -134,7 +134,7 @@ class TestMultiGPUTraining: f"--output_dir={output_dir}", "--batch_size=4", "--steps=10", - "--eval_freq=-1", + "--env_eval_freq=-1", "--log_freq=5", "--save_freq=10", "--seed=42", @@ -177,7 +177,7 @@ class TestMultiGPUTraining: f"--output_dir={output_dir}", "--batch_size=4", "--steps=20", - "--eval_freq=-1", + "--env_eval_freq=-1", "--log_freq=5", "--save_freq=10", "--seed=42",