Compare commits

...

205 Commits

Author SHA1 Message Date
Pepijn 4e671ef080 fix 2025-09-01 15:41:24 +02:00
Pepijn cf9796b2f7 fix eval 2025-09-01 14:57:24 +02:00
Pepijn 88116b11e1 remove full pos embedding 2025-09-01 14:51:33 +02:00
Pepijn cf0c3f0a9a change config 2025-09-01 14:37:15 +02:00
Pepijn ee48a80e4d hls_gaus true 2025-09-01 14:19:07 +02:00
Pepijn cb0fb8ad15 hls_gaus true 2025-09-01 13:56:08 +02:00
Pepijn f79fdf7205 increase stride 2025-09-01 13:53:43 +02:00
Pepijn a305f5f46a hl-gauss 2025-09-01 13:34:55 +02:00
Pepijn 45348d7b69 remove debug log 2025-09-01 13:32:37 +02:00
Pepijn d4c1c123c6 hl-gauss 2025-09-01 13:24:28 +02:00
Pepijn da861139a3 hl-gauss 2025-09-01 13:11:53 +02:00
Pepijn 4f51f7153c hl-gauss 2025-09-01 13:09:00 +02:00
Pepijn 9027c7866f less prefetching 2025-09-01 12:12:36 +02:00
Pepijn c2bf226082 regularizer 2025-09-01 12:07:37 +02:00
Pepijn f84c20d403 huberman loss 2025-09-01 11:59:20 +02:00
Pepijn 4c4462edea huberman loss 2025-09-01 11:56:58 +02:00
Pepijn 0b710932e2 huberman loss 2025-09-01 11:53:30 +02:00
Pepijn 9a19f8f6f4 use cls token 2025-09-01 11:31:28 +02:00
Pepijn 3504d17fef smaller siglip2 2025-09-01 11:18:35 +02:00
Pepijn d35ed3fd83 conversion dest 2025-09-01 11:01:27 +02:00
Pepijn ce5b27d255 siglip again 2025-09-01 10:55:12 +02:00
Pepijn 9dcb407ba7 siglip again 2025-09-01 10:27:58 +02:00
Pepijn 5eb5bf7164 clean 2025-09-01 10:14:43 +02:00
Pepijn 65fb5d3b1a fix 2025-09-01 00:12:30 +02:00
Pepijn d6a24e2882 fix 2025-08-31 21:47:11 +02:00
Pepijn d51bbe9492 fix 2025-08-31 21:38:46 +02:00
Pepijn d8c875e069 use patch tokens 2025-08-31 20:52:00 +02:00
Pepijn eff5b90542 add lower out of bound sampling 2025-08-31 20:38:45 +02:00
Pepijn a1a3fa435d fix dinov3 2025-08-31 20:21:58 +02:00
Pepijn 79c3466f0f fix dinov3 2025-08-31 19:44:27 +02:00
Pepijn e1d433cbfc fix dinov3 2025-08-31 19:41:16 +02:00
Pepijn 16e82fd29f fix stride unique sampling 2025-08-31 19:31:27 +02:00
Pepijn ae57fe2d33 debug frames 2025-08-31 19:20:18 +02:00
Pepijn e3306951c0 debug frames 2025-08-31 19:18:52 +02:00
Pepijn 10e36f2453 dinov3 base 2025-08-31 19:07:46 +02:00
Pepijn 9204a8bccd debug same frame 2025-08-31 19:06:30 +02:00
Pepijn 43eedf62e4 use dinov3 2025-08-31 18:49:06 +02:00
Pepijn c51d40ad56 add vision feature debug 2025-08-31 18:38:50 +02:00
Pepijn 5c1d930a34 add stride 2025-08-31 18:32:47 +02:00
Pepijn 8d20ca1625 extend head 2025-08-31 18:18:03 +02:00
Pepijn e4df9ccb63 fix progress 2025-08-31 18:11:18 +02:00
Pepijn 086815edb7 fix progress 2025-08-31 17:13:49 +02:00
Pepijn c9243c29b0 cleanup 2025-08-31 16:34:46 +02:00
Pepijn e7617076ca cleanup 2025-08-31 16:03:24 +02:00
Pepijn 221e5862ea cleanup 2025-08-31 15:52:15 +02:00
Pepijn 1e1b010257 cleanup 2025-08-31 15:40:00 +02:00
Pepijn def71cc439 change sampling 2025-08-31 15:20:20 +02:00
Pepijn 4557655ab1 simple eval 2025-08-31 14:11:47 +02:00
Pepijn 28298fbe78 simple eval 2025-08-31 14:08:48 +02:00
Pepijn f84affec23 simple eval 2025-08-31 14:00:19 +02:00
Pepijn dad0babbf5 simple eval 2025-08-31 13:54:03 +02:00
Pepijn fc5cd05fb0 simple eval 2025-08-31 13:48:40 +02:00
Pepijn d01b060d24 simple eval 2025-08-31 13:43:09 +02:00
Pepijn 7da15ba069 simple eval 2025-08-31 13:40:13 +02:00
Pepijn b0a5b88c21 simple eval 2025-08-31 13:28:04 +02:00
Pepijn 42fbcc89c5 debugging 2025-08-31 02:10:52 +02:00
Pepijn 9767120eb4 debug sampling 2025-08-31 01:48:35 +02:00
Pepijn 852713dc84 random sample for log 2025-08-31 01:33:58 +02:00
Pepijn 1f38712c95 fix pos enc 2025-08-31 01:22:54 +02:00
Pepijn 0ffc5b4741 add layernorm in head 2025-08-31 01:13:22 +02:00
Pepijn a1b1643ff6 change head init 2025-08-31 01:02:25 +02:00
Pepijn 7739fe12e4 sigmoid head 2025-08-31 00:53:23 +02:00
Pepijn be9bdc242f add pos relative 2025-08-31 00:43:26 +02:00
Pepijn 195cc79c49 add pos info for all frames 2025-08-31 00:29:08 +02:00
Pepijn f8d42cc038 fix 2025-08-30 23:58:58 +02:00
Pepijn 1797dea3d5 fix 2025-08-30 23:40:03 +02:00
Pepijn 825c0666a9 fix 2025-08-30 23:11:26 +02:00
Pepijn 47bc670ad2 less video prefetch 2025-08-30 21:21:27 +02:00
Pepijn aa505d4192 more video prefetch 2025-08-30 16:40:18 +02:00
Pepijn e380653c62 more video prefetch 2025-08-30 16:30:04 +02:00
Pepijn bf5c037959 remove decode logging 2025-08-30 16:28:29 +02:00
Pepijn 1234e71cfb add decode logging 2025-08-30 16:16:08 +02:00
Pepijn b1ff7132c1 add decode logging 2025-08-30 16:08:21 +02:00
Pepijn b357a8c4d8 add decode logging 2025-08-30 16:05:58 +02:00
Pepijn 0be53ef3e1 add decode logging 2025-08-30 16:00:55 +02:00
Pepijn aed90c8042 add decode logging 2025-08-30 15:52:24 +02:00
Pepijn 0b5da92a58 optimize data loading 2025-08-30 15:40:36 +02:00
Pepijn 599218fe9a use rewind 2025-08-30 14:41:15 +02:00
Pepijn 2507341a32 stats every minute 2025-08-30 14:38:28 +02:00
Pepijn bde397e891 use siglip 2 2025-08-30 14:28:55 +02:00
Pepijn 76e260c401 fix 2025-08-30 13:07:51 +02:00
Pepijn 5179515d81 fix 2025-08-30 12:40:55 +02:00
Pepijn 8ad00d1ee7 fix 2025-08-30 12:33:39 +02:00
Pepijn 7440d772ff fix 2025-08-30 12:28:18 +02:00
Pepijn a4fc02a636 fix 2025-08-30 12:05:38 +02:00
Pepijn f5c39d6292 fix 2025-08-30 11:37:16 +02:00
Pepijn 3f616f0ebe add benchmark 2025-08-29 15:33:45 +02:00
Pepijn 9698e74e88 small impr 2025-08-29 09:05:53 +02:00
Pepijn 04d55e4670 small impr 2025-08-28 22:45:23 +02:00
Pepijn 7dce022a05 exactly as rewind code 2025-08-28 21:18:41 +02:00
Pepijn cc05067a76 dino v2 2025-08-28 19:23:17 +02:00
Pepijn bead25a58a smaller model 2025-08-28 17:43:03 +02:00
Pepijn c877e98658 use only rewind loss 2025-08-28 14:22:57 +02:00
Pepijn a4c88d6340 nit 2025-08-28 08:52:48 +02:00
Pepijn 34ca077d78 pad seq 2025-08-27 17:16:31 +02:00
Pepijn 2a901f8134 add multiple timesteps 2025-08-27 16:34:22 +02:00
Pepijn 450be9d7d1 add multiple timesteps 2025-08-27 16:33:53 +02:00
Pepijn 681be962ae initial commit 2025-08-27 14:58:34 +02:00
Adil Zouitine b16e18f978 Fix typo in documentation for adapters in robots/teleop section 2025-08-08 16:36:09 +02:00
Pepijn 652e3cb859 Add phone docs and use pipeline for robots/teleop docs 2025-08-08 16:05:34 +02:00
Michel Aractingi 2a5c757d58 Improved doc implement_your_own_pipeline
- Use normalization processor as default example
- Add section on transform features
- Add section on overrides.
2025-08-08 00:58:59 +02:00
pre-commit-ci[bot] 6d4e983197 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-07 18:13:34 +02:00
Adil Zouitine ecda7482c7 feat(docs): Enhance introduction to processors with additional converter functions
- Updated the introduction to processors documentation to include default batch-to-transition and transition-to-batch converters.
- Added detailed descriptions and examples for new specialized converter functions: `to_transition_teleop_action`, `to_transition_robot_observation`, `to_output_robot_action`, and `to_dataset_frame`.
- Improved clarity on how these converters facilitate integration with existing robotics applications.
2025-08-07 18:13:34 +02:00
pre-commit-ci[bot] 7124d471c1 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-07 18:13:34 +02:00
Adil Zouitine a14af62ee3 Add comprehensive documentation for processors in robotics
- Introduced a detailed guide on processors, covering their role in transforming raw robot data into model-ready inputs and vice versa.
- Explained core concepts such as EnvTransition, ProcessorStep, and RobotProcessor, along with their functionalities.
- Included examples of common processor steps like normalization, device management, batch processing, and text tokenization.
- Provided insights on building complete pipelines, integrating processors into training loops, and saving/loading configurations.
- Emphasized best practices and advanced features for effective usage of processors in robotics applications.
2025-08-07 18:13:34 +02:00
Michel Aractingi ac80f1f081 improved part 2 of processor guide 2025-08-07 18:13:34 +02:00
Michel Aractingi feb3fed5e8 precommit style nit 2025-08-07 18:13:34 +02:00
Michel Aractingi 8d5f519fcb Added script for the second part of the processor doc 2025-08-07 18:13:34 +02:00
Adil Zouitine b9d3c34ae4 chore(docs): initialize doc 2025-08-07 18:13:34 +02:00
Adil Zouitine 5f759b1637 feat(dependencies): Add scipy as a required dependency
- Included `scipy>=1.15.2` in the project dependencies to enhance functionality and support for scientific computing tasks.
2025-08-07 18:09:49 +02:00
Adil Zouitine 6a75b4761a refactor(TokenizerProcessor): improve dependency handling and observation management
- Updated TokenizerProcessor to conditionally import AutoTokenizer based on the availability of the transformers library, enhancing flexibility.
- Modified tokenizer attribute type to Any to accommodate scenarios where transformers may not be installed.
- Improved observation handling by using a more concise approach to manage the transition dictionary, ensuring compatibility with existing data structures.
- Added error handling for missing transformers library, providing clear guidance for users on installation requirements.
2025-08-07 17:07:20 +02:00
Pepijn e5ade5565d Integrate pipeline and add phone teleop (#1681)
* Add normalization processor and related components

- Introduced `NormalizationProcessor` to handle both observation normalization and action unnormalization.
- Added `ObservationNormalizer` and `ActionUnnormalizer` classes for specific normalization tasks.
- Updated `__init__.py` to include the new `NormalizationProcessor` in the module exports.
- Enhanced `ObservationProcessor` with registration in the `ProcessorStepRegistry` for better modularity.
- Created `RenameProcessor` for renaming keys in observations, improving flexibility in data processing.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Enhance processing architecture with new components

- Added `RenameProcessor` to facilitate key renaming in observations, improving data handling flexibility.
- Updated `__init__.py` to include `RenameProcessor` in module exports.
- Refactored `NormalizationProcessor` and `ObservationNormalizer` to use `rsplit` for better key handling.
- Introduced comprehensive tests for `NormalizationProcessor` and `RenameProcessor` to ensure functionality and robustness.

* chore (docs): add docstring for processor

* fix (test): test factory

* fix(test): policies

* Update tests/processor/test_observation_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>

* chore(test): add suggestion made by copilot regarding numpy test

* fix(test): import issue

* Refactor normalization components and update tests

- Renamed `ObservationNormalizer` to `NormalizerProcessor` and `ActionUnnormalizer` to `UnnormalizerProcessor` for clarity.
- Consolidated normalization logic for both observations and actions into `NormalizerProcessor` and `UnnormalizerProcessor`.
- Updated tests to reflect the new class names and ensure proper functionality of normalization and unnormalization processes.
- Enhanced handling of missing statistics in normalization processes.

* chore (docstring): Improve docstring for NormalizerProcessor

* feat (device processor): Implement device processor

* chore (batch handling): Enhance processing components with batch conversion utilities

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix(test): linting issue

* chore (output format): improves output format

* chore (type): add typing for multiprocess envs

* feat (overrides): Implement support for loading processors with parameter overrides

- Added the ability to provide non-serializable objects when loading processors from saved configurations using the `overrides` parameter.
- Enhanced error handling for invalid override keys and instantiation errors.
- Updated documentation and examples to illustrate the usage of overrides for both registered and unregistered steps.
- Added comprehensive tests to validate the new functionality and ensure backward compatibility.

* chore(normalization): addressing comments from copilot

* chore(learner): nit comment from copilot

* feat(pipeline): Enhance step_through method to support both tuple and dict inputs

* refactor(pipeline): Simplify observation and padding data handling in batch transitions

* Apply suggestions from code review

Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(pipeline): Introduce ComplementaryDataProcessor for handling complementary data in transitions

* fix(ci): temporary fix on dataset deps version

* feat(processors): Introduce processors for various policy types

- Added `make_processor` function to create processor instances for different policy types, including `tdmpc`, `diffusion`, `act`, `vqbet`, `pi0`, `pi0fast`, `sac`, and `reward_classifier`.
- Implemented corresponding processor files for each policy type, encapsulating normalization and unnormalization steps.
- Updated existing policies to remove direct normalization dependencies, enhancing modularity and clarity.
- Enhanced test coverage to validate the integration of new processors with existing policy configurations.

* refactor(learner): Remove normalization from cached image features retrieval

- Simplified the retrieval of observation features by removing the normalization step from the `get_cached_image_features` method calls.
- This change enhances clarity and aligns with the recent updates to policy processors.

* refactor(policies): Remove unnormalization step from action predictions

- Eliminated the unnormalization of actions in both `TDMPCPolicy` and `VQBeTPolicy` classes to streamline action prediction.
- This change improves code clarity and aligns with recent updates to policy processors.

* feat(train): Integrate preprocessor into training pipeline

* refactor(train): Update preprocessor initialization to include dataset statistics

* refactor(policies): Enhance processor creation and add NaN detection hook

* refactor(train): Update memory pinning logic for mps compatibility

* feat: initial commit phone teleop

* ugly delta control

* use quaternion

* Refactor observation preprocessing to use a modular pipeline system

- Introduced `RobotPipeline` and `ObservationProcessor` for handling observation transformations.
- Updated `preprocess_observation` to maintain backward compatibility while leveraging the new pipeline.
- Added tests for the new processing components and ensured they match the original functionality.
- Removed hardcoded logic in favor of a more flexible, composable architecture.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Refactor observation processing and improve modularity

- Updated `ObservationProcessor` to enhance the modular design for processing observations.
- Cleaned up imports and improved code readability by removing unnecessary lines and comments.
- Ensured backward compatibility while integrating new processing components.
- Added tests to validate the functionality of the updated processing architecture.

* Remove redundant tests for None observation and serialization methods in `test_observation_processor.py` to streamline the test suite and improve maintainability.

* Refactor processing architecture to use RobotProcessor

- Replaced instances of RobotPipeline with RobotProcessor across the codebase for improved modularity and clarity.
- Introduced ProcessorStepRegistry for better management of processing steps.
- Updated relevant documentation and tests to reflect the new processing structure.
- Enhanced the save/load functionality to support the new processor design.
- Added a model card template for RobotProcessor to facilitate sharing and documentation.

* Add RobotProcessor tutorial to documentation

- Introduced a new tutorial on using RobotProcessor for preprocessing robot data.
- Added a section in the table of contents for easy navigation to the new tutorial.
- The tutorial covers key concepts, real-world scenarios, and practical examples for effective use of the RobotProcessor pipeline.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add normalization processor and related components

- Introduced `NormalizationProcessor` to handle both observation normalization and action unnormalization.
- Added `ObservationNormalizer` and `ActionUnnormalizer` classes for specific normalization tasks.
- Updated `__init__.py` to include the new `NormalizationProcessor` in the module exports.
- Enhanced `ObservationProcessor` with registration in the `ProcessorStepRegistry` for better modularity.
- Created `RenameProcessor` for renaming keys in observations, improving flexibility in data processing.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Enhance processing architecture with new components

- Added `RenameProcessor` to facilitate key renaming in observations, improving data handling flexibility.
- Updated `__init__.py` to include `RenameProcessor` in module exports.
- Refactored `NormalizationProcessor` and `ObservationNormalizer` to use `rsplit` for better key handling.
- Introduced comprehensive tests for `NormalizationProcessor` and `RenameProcessor` to ensure functionality and robustness.

* chore (docs): add docstring for processor

* fix (test): test factory

* fix(test): policies

* Update tests/processor/test_observation_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>

* chore(test): add suggestion made by copilot regarding numpy test

* fix(test): import issue

* Refactor normalization components and update tests

- Renamed `ObservationNormalizer` to `NormalizerProcessor` and `ActionUnnormalizer` to `UnnormalizerProcessor` for clarity.
- Consolidated normalization logic for both observations and actions into `NormalizerProcessor` and `UnnormalizerProcessor`.
- Updated tests to reflect the new class names and ensure proper functionality of normalization and unnormalization processes.
- Enhanced handling of missing statistics in normalization processes.

* chore (docstring): Improve docstring for NormalizerProcessor

* feat (device processor): Implement device processor

* chore (batch handling): Enhance processing components with batch conversion utilities

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix(test): linting issue

* chore (output format): improves output format

* chore (type): add typing for multiprocess envs

* feat (overrides): Implement support for loading processors with parameter overrides

- Added the ability to provide non-serializable objects when loading processors from saved configurations using the `overrides` parameter.
- Enhanced error handling for invalid override keys and instantiation errors.
- Updated documentation and examples to illustrate the usage of overrides for both registered and unregistered steps.
- Added comprehensive tests to validate the new functionality and ensure backward compatibility.

* chore(normalization): addressing comments from copilot

* chore(learner): nit comment from copilot

* feat(pipeline): Enhance step_through method to support both tuple and dict inputs

* refactor(pipeline): Simplify observation and padding data handling in batch transitions

* Apply suggestions from code review

Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(pipeline): Introduce ComplementaryDataProcessor for handling complementary data in transitions

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(pipeline): Transition from tuple to dictionary format for EnvTransition

- Updated the EnvTransition structure to use a dictionary format instead of a tuple, enhancing readability and maintainability.
- Replaced instances of TransitionIndex with TransitionKey for accessing transition components.
- Adjusted related processing functions and tests to accommodate the new dictionary format, ensuring consistent handling of transitions across the codebase.

* refactor(observation_processor): Improve observation processing by using constants and simplifying pixel handling

- Introduced constants for observation keys to enhance readability.
- Streamlined the handling of the "pixels" key by copying observations first and processing images more clearly.
- Updated the environment state and agent position assignments to use the new constants, improving maintainability.

* feat(pipeline): Add hook unregistration functionality and enhance documentation

- Implemented methods to unregister before, after, and reset hooks in the RobotProcessor class, allowing for more flexible hook management.
- Enhanced documentation to clarify hook execution semantics and the implications of modifying transitions within hooks.
- Added comprehensive tests to verify the correct behavior of hook registration and unregistration, including error handling for non-existent hooks.

* refactor(pipeline): Clarify hook behavior and improve documentation

- Updated the RobotProcessor class to ensure hooks are strictly for observation and do not modify transitions, enhancing clarity and maintainability.
- Refactored hook registration methods to reflect the new behavior, ensuring they accept only functions that do not return modified transitions.
- Enhanced documentation to clearly outline the purpose of hooks and their execution semantics.
- Added tests to verify that hooks are not executed during the step_through method while ensuring they function correctly during the __call__ method.

* feat(pipeline): Add __repr__ method to RobotProcessor for improved readability

- Implemented a __repr__ method in the RobotProcessor class to provide a clear string representation of the processor, including step names and optional parameters like name and seed.
- Added comprehensive tests to validate the __repr__ output for various scenarios, including empty processors, single and multiple steps, custom names, and seed values.
- Ensured that the representation handles long lists of steps with truncation for better readability.

* chore(pipeline): Move _CFG_NAME along other class member

* refactor(pipeline): Utilize get_safe_torch_device for device assignment

- Replaced direct torch.device instantiation with get_safe_torch_device to ensure safe device handling.
- This change enhances code readability and maintains consistency in device management across the RobotProcessor class.

* refactor(pipeline): Enhance state filename generation and profiling method

- Updated state filename generation to use the registry name when available, improving clarity in saved files.
- Modified the profile_steps method to include a warmup_runs parameter, allowing for more controlled performance profiling.
- Ensured consistent conditions during profiling by deep copying transitions for each run, enhancing accuracy in timing results.

* chore(doc): address pip install command for lerobot that does not exist yet

* feat(pipeline): Enhance configuration filename handling and state file naming

- Introduced support for custom configuration filenames in the `save_pretrained` method, allowing users to specify a filename instead of the default.
- Improved state file naming to include step indices, preventing conflicts when multiple processors of the same type are saved.
- Added automatic detection for configuration files when loading from a directory, with error handling for multiple files.
- Updated tests to validate new features, including custom filenames and automatic config detection.

* refactor(pipeline): Improve state file naming conventions for clarity and uniqueness

- Enhanced state file naming to include the processor's sanitized name, ensuring uniqueness when multiple processors are saved in the same directory.
- Updated tests to reflect changes in state file naming, verifying that filenames now include the processor name and step indices to prevent conflicts.
- Added a new test to validate state file naming when using multiple processors, ensuring distinct filenames for each processor's state files.

* docs(pipeline): Add clarification for repo name sanitization process

* feat(processors): Introduce processors for various policy types

- Added `make_processor` function to create processor instances for different policy types, including `tdmpc`, `diffusion`, `act`, `vqbet`, `pi0`, `pi0fast`, `sac`, and `reward_classifier`.
- Implemented corresponding processor files for each policy type, encapsulating normalization and unnormalization steps.
- Updated existing policies to remove direct normalization dependencies, enhancing modularity and clarity.
- Enhanced test coverage to validate the integration of new processors with existing policy configurations.

* refactor(learner): Remove normalization from cached image features retrieval

- Simplified the retrieval of observation features by removing the normalization step from the `get_cached_image_features` method calls.
- This change enhances clarity and aligns with the recent updates to policy processors.

* refactor(policies): Remove unnormalization step from action predictions

- Eliminated the unnormalization of actions in both `TDMPCPolicy` and `VQBeTPolicy` classes to streamline action prediction.
- This change improves code clarity and aligns with recent updates to policy processors.

* feat(train): Integrate preprocessor into training pipeline

* refactor(train): Update preprocessor initialization to include dataset statistics

* refactor(policies): Enhance processor creation and add NaN detection hook

* feat(record): Integrate RobotProcessor into recording loop and update policy handling

- Added support for RobotProcessor in the record_loop function to enhance data processing capabilities.
- Updated the logic to reset both policy and processor when provided, ensuring proper state management.
- Modified action prediction to utilize the processor, improving the overall functionality of the recording process.
- Adjusted the save_checkpoint function to include preprocessor state saving, enhancing checkpointing capabilities.

* feat(migration): Add script for migrating policy models with normalization layers

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(migrate): Enhance migration script to create preprocessor and postprocessor for policy models

- Updated the migration script to generate both a preprocessor and a postprocessor, improving the handling of normalization for training and inference.
- Added functionality to convert features to PolicyFeature objects, ensuring compatibility with the new processor architecture.
- Refined the extraction and removal of normalization statistics and layers, streamlining the migration process.
- Improved error handling for missing mandatory configuration fields during model instantiation.

* feat(migrate): Add model card generation and saving to migration script

- Implemented functionality to generate and save a model card for the migrated model, including metadata such as dataset repository ID, license, and tags.
- Enhanced the script to push the model card to the hub if requested, improving model documentation and accessibility.
- Refactored the saving process to ensure the model card is saved locally and uploaded correctly when pushing to the hub.

* feat(processor): Introduce ToBatchProcessor for handling observation batching

- Added ToBatchProcessor to ensure observations have proper batch dimensions for model processing.
- Implemented functionality to add batch dimensions to state and image observations as needed.
- Created comprehensive unit tests to validate the processor's behavior with various tensor dimensions and types.
- Ensured compatibility with existing transition keys and maintained the integrity of non-observation data.

* feat(processors): Add ToBatchProcessor to multiple policy processors

- Integrated ToBatchProcessor into various policy processors to handle observation batching.
- Updated make functions for act, diffusion, pi0, pi0fast, sac, smolvla, tdmpc, and vqbet processors to include the new batching functionality.
- Ensured consistency across all processor implementations for improved data handling.

* refactor(factory): Remove unused imports and NaN detection hook from processor creation

* feat(batch_processor): Enhance ToBatchProcessor to handle action batching

- Updated ToBatchProcessor to add batch dimensions to actions in addition to observations.
- Implemented separate methods for processing observations and actions, improving code readability.
- Added comprehensive unit tests to validate action batching functionality across various tensor dimensions and types.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(factory): Enhance make_processor to support preprocessor and postprocessor configuration

- Introduced ProcessorConfigKwargs TypedDict for better type safety in processor configuration.
- Updated make_processor to accept preprocessor and postprocessor configuration filenames, improving flexibility in processor instantiation.
- Refactored the loading of pretrained processors to utilize the new configuration options.

* refactor(factory): Clean up imports in factory.py

- Removed unused import of IdentityProcessor to streamline the code.

* feat(migrate): Extend load_model_from_hub to include train configuration

- Updated load_model_from_hub to return the train configuration alongside the model state_dict and config.
- Modified main function to handle the additional train configuration when loading models from both the hub and local paths.
- Adjusted dataset_repo_id extraction to utilize the train configuration for improved accuracy.

* refactor(record): Rename processor parameters and update processing logic

- Renamed `processor` to `preprocessor` and added `postprocessor` parameter for clarity.
- Updated the `record_loop` and `predict_action` functions to utilize the new preprocessor and postprocessor, enhancing the processing flow.
- Ensured compatibility with existing functionality while improving code readability.

* feat(batch_processor): Add task field processing to ToBatchProcessor

- Enhanced ToBatchProcessor to wrap string tasks in a list, adding batch dimensions for compatibility with model inference.
- Implemented a new method for processing complementary data, ensuring that task values are correctly handled as either strings or lists of strings.
- Added comprehensive unit tests to validate task processing, including edge cases and in-place mutation of complementary data.

* feat(normalization): Implement IDENTITY mode for normalization and unnormalization

- Enhanced NormalizerProcessor and UnnormalizerProcessor to support IDENTITY mode, allowing features to bypass normalization when specified.
- Updated processing logic to check normalization modes and handle missing statistics gracefully.
- Added comprehensive unit tests to validate IDENTITY mode functionality for both observations and actions, ensuring correct behavior across various scenarios.
- Improved error handling for unsupported normalization modes.

* fix(rebase): remove residual normalization layer:

* refactor(diffusion): remove normalization layer from input processing

* Add debug + calib

* cleanup

* Add pipeline

* fix int

* Add record example

* nit

* Add feature contract to pipelinestep and pipeline

* Add tests

* Add processor tests

* PR feedback

* incorporate PR feedback

* typo in doc

* oops

* cleaned up steps and integrated pipeline with feature_contract

* refactor steps and robot to pipeline

* cleanup pipeline

* cleanup code further

* make it run

* feat(processors): Introduce processors for various policy types

- Added `make_processor` function to create processor instances for different policy types, including `tdmpc`, `diffusion`, `act`, `vqbet`, `pi0`, `pi0fast`, `sac`, and `reward_classifier`.
- Implemented corresponding processor files for each policy type, encapsulating normalization and unnormalization steps.
- Updated existing policies to remove direct normalization dependencies, enhancing modularity and clarity.
- Enhanced test coverage to validate the integration of new processors with existing policy configurations.

* refactor(learner): Remove normalization from cached image features retrieval

- Simplified the retrieval of observation features by removing the normalization step from the `get_cached_image_features` method calls.
- This change enhances clarity and aligns with the recent updates to policy processors.

* refactor(policies): Remove unnormalization step from action predictions

- Eliminated the unnormalization of actions in both `TDMPCPolicy` and `VQBeTPolicy` classes to streamline action prediction.
- This change improves code clarity and aligns with recent updates to policy processors.

* feat(train): Integrate preprocessor into training pipeline

* refactor(train): Update preprocessor initialization to include dataset statistics

* refactor(policies): Enhance processor creation and add NaN detection hook

* feat(record): Integrate RobotProcessor into recording loop and update policy handling

- Added support for RobotProcessor in the record_loop function to enhance data processing capabilities.
- Updated the logic to reset both policy and processor when provided, ensuring proper state management.
- Modified action prediction to utilize the processor, improving the overall functionality of the recording process.
- Adjusted the save_checkpoint function to include preprocessor state saving, enhancing checkpointing capabilities.

* feat(migration): Add script for migrating policy models with normalization layers

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(migrate): Enhance migration script to create preprocessor and postprocessor for policy models

- Updated the migration script to generate both a preprocessor and a postprocessor, improving the handling of normalization for training and inference.
- Added functionality to convert features to PolicyFeature objects, ensuring compatibility with the new processor architecture.
- Refined the extraction and removal of normalization statistics and layers, streamlining the migration process.
- Improved error handling for missing mandatory configuration fields during model instantiation.

* feat(migrate): Add model card generation and saving to migration script

- Implemented functionality to generate and save a model card for the migrated model, including metadata such as dataset repository ID, license, and tags.
- Enhanced the script to push the model card to the hub if requested, improving model documentation and accessibility.
- Refactored the saving process to ensure the model card is saved locally and uploaded correctly when pushing to the hub.

* feat(processor): Introduce ToBatchProcessor for handling observation batching

- Added ToBatchProcessor to ensure observations have proper batch dimensions for model processing.
- Implemented functionality to add batch dimensions to state and image observations as needed.
- Created comprehensive unit tests to validate the processor's behavior with various tensor dimensions and types.
- Ensured compatibility with existing transition keys and maintained the integrity of non-observation data.

* feat(processors): Add ToBatchProcessor to multiple policy processors

- Integrated ToBatchProcessor into various policy processors to handle observation batching.
- Updated make functions for act, diffusion, pi0, pi0fast, sac, smolvla, tdmpc, and vqbet processors to include the new batching functionality.
- Ensured consistency across all processor implementations for improved data handling.

* refactor(factory): Remove unused imports and NaN detection hook from processor creation

* feat(batch_processor): Enhance ToBatchProcessor to handle action batching

- Updated ToBatchProcessor to add batch dimensions to actions in addition to observations.
- Implemented separate methods for processing observations and actions, improving code readability.
- Added comprehensive unit tests to validate action batching functionality across various tensor dimensions and types.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(factory): Enhance make_processor to support preprocessor and postprocessor configuration

- Introduced ProcessorConfigKwargs TypedDict for better type safety in processor configuration.
- Updated make_processor to accept preprocessor and postprocessor configuration filenames, improving flexibility in processor instantiation.
- Refactored the loading of pretrained processors to utilize the new configuration options.

* refactor(factory): Clean up imports in factory.py

- Removed unused import of IdentityProcessor to streamline the code.

* feat(migrate): Extend load_model_from_hub to include train configuration

- Updated load_model_from_hub to return the train configuration alongside the model state_dict and config.
- Modified main function to handle the additional train configuration when loading models from both the hub and local paths.
- Adjusted dataset_repo_id extraction to utilize the train configuration for improved accuracy.

* refactor(record): Rename processor parameters and update processing logic

- Renamed `processor` to `preprocessor` and added `postprocessor` parameter for clarity.
- Updated the `record_loop` and `predict_action` functions to utilize the new preprocessor and postprocessor, enhancing the processing flow.
- Ensured compatibility with existing functionality while improving code readability.

* feat(batch_processor): Add task field processing to ToBatchProcessor

- Enhanced ToBatchProcessor to wrap string tasks in a list, adding batch dimensions for compatibility with model inference.
- Implemented a new method for processing complementary data, ensuring that task values are correctly handled as either strings or lists of strings.
- Added comprehensive unit tests to validate task processing, including edge cases and in-place mutation of complementary data.

* feat(normalization): Implement IDENTITY mode for normalization and unnormalization

- Enhanced NormalizerProcessor and UnnormalizerProcessor to support IDENTITY mode, allowing features to bypass normalization when specified.
- Updated processing logic to check normalization modes and handle missing statistics gracefully.
- Added comprehensive unit tests to validate IDENTITY mode functionality for both observations and actions, ensuring correct behavior across various scenarios.
- Improved error handling for unsupported normalization modes.

* fix(rebase): remove residual normalization layer

* refactor(diffusion): remove normalization layer from input processing

* refactor(normalization): Remove unused state dict transformation methods and streamline imports

- Eliminated the _transform_state_dict_keys and _load_as_safetensor methods from PI0Policy, simplifying the model loading process.
- Cleaned up imports in modeling_pi0.py by removing log_model_loading_keys and init_logging.
- Updated TDMPCPolicy and VQBeTPolicy to handle action removal from batches during offline evaluation.
- Introduced hotswap_stats function in normalize_processor.py to update normalization statistics dynamically, with corresponding tests to ensure functionality.

* refactor(normalization): Clean up imports in normalize_processor.py

* feat(batch_processor): Add feature_contract method to ToBatchProcessor

- Introduced feature_contract method that returns features without modification, maintaining the no-op behavior of the processor.
- This addition enhances the flexibility of the ToBatchProcessor for future feature processing needs.

* fix(dependencies): Update transformers dependency constraint to allow only versions up to 4.52.0

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(tokenizer): Introduce TokenizerProcessor for text tokenization

- Added TokenizerProcessor class to handle tokenization of task strings using Hugging Face's AutoTokenizer.
- Supports both string and list inputs, with customizable parameters for task key, output key, and tokenization settings.
- Implemented comprehensive unit tests to validate functionality, including handling of various input scenarios and integration with RobotProcessor.
- Updated types.py to include LANGUAGE feature type and modified __init__.py to register the new processor.

* feat(language): Enhance language processing in TokenizerProcessor

- Added OBS_LANGUAGE constant to define the observation language key.
- Updated TokenizerProcessor to store tokenized task data in the observation dictionary, ensuring compatibility with the new language feature.
- Introduced Pi0NewLineProcessor to append newlines to tasks for proper tokenization.
- Modified tests to validate the integration of language tokens and attention masks in the observation structure.

* feat(tokenizer): Add padding configuration to TokenizerProcessor

- Introduced `padding_side` parameter to the TokenizerProcessor for customizable padding direction.
- Updated the `make_pi0_processor` function to include the new padding configuration.
- Enhanced unit tests to validate the functionality of the `padding_side` parameter in various scenarios.

* feat(processor): Add state management methods to Pi0NewLineProcessor

* feat(normalization): Track normalization and unnormalization info in complementary data

- Updated NormalizerProcessor and UnnormalizerProcessor to accept additional parameters for tracking normalization modes.
- Enhanced the __call__ methods to store normalization and unnormalization information in the complementary data of transitions.
- Added unit tests to verify the correct tracking of normalization info, including scenarios with missing stats and selective normalization keys.

* feat(factory): Add preprocessor and postprocessor overrides to ProcessorConfigKwargs

- Updated ProcessorConfigKwargs to include optional overrides for preprocessor and postprocessor configurations.
- Enhanced the make_processor function to utilize the new overrides, allowing for more flexible processor initialization.

* feat(processors): Integrate RenameProcessor into various processor configurations

- Added RenameProcessor to the input steps of multiple processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Consolidated normalization features from input and output into a single NormalizerProcessor for improved efficiency.
- Updated the input steps to ensure compatibility with the new RenameProcessor integration.

* Do some todos and cleanup

* change feature_contract to dataset_features

* use one method for conversion pipeline output to add_frame dict and use base processors where possible

* Add back in and use record_loop

* update todo

* rename to_dataset_frame

* feat(smolvla): Refactor language processing and introduce new line processor (#1658)

- Removed the prepare_language method and directly accessed language tokens and masks from the batch using the OBS_LANGUAGE constant.
- Added SmolVLANewLineProcessor to ensure tasks end with a newline, enhancing tokenization compatibility.
- Updated the make_smolvla_processor function to include the new line processor and tokenizer processor for improved input handling.

* feat(processors): Integrate DeviceProcessor into multiple processor configurations

- Added DeviceProcessor to the input and output steps of various processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_pi0fast_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Enhanced the DeviceProcessor class with state management methods and ensured compatibility with existing processor pipelines.
- Introduced unit tests for DeviceProcessor to validate functionality across different scenarios, including CPU and CUDA operations.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix reference frame

* refactor(pipeline): Remove to() method for device management

- Eliminated the to() method from RobotProcessor, which was responsible for moving tensor states to specified devices.
- Removed associated unit tests that validated the functionality of the to() method across various scenarios.
- Streamlined the pipeline code by focusing on other device management strategies.

* feat(processor): Enhance DeviceProcessor with float dtype conversion

- Added support for optional float dtype conversion in DeviceProcessor, allowing tensors to be converted to specified floating-point types while preserving non-float types.
- Implemented validation for float dtype input and updated the processor's configuration methods to include float dtype.
- Refactored tensor processing logic to streamline device movement and dtype conversion.
- Introduced comprehensive unit tests to validate the new float dtype functionality across various scenarios.

* update data visualization

* update teleop example

* fix record bugs

* Add replay

* Not code

* feature(pipeline): port tokenizer pipeline for VLA (#1645)

* feat(tokenizer): Introduce TokenizerProcessor for text tokenization

- Added TokenizerProcessor class to handle tokenization of task strings using Hugging Face's AutoTokenizer.
- Supports both string and list inputs, with customizable parameters for task key, output key, and tokenization settings.
- Implemented comprehensive unit tests to validate functionality, including handling of various input scenarios and integration with RobotProcessor.
- Updated types.py to include LANGUAGE feature type and modified __init__.py to register the new processor.

* feat(language): Enhance language processing in TokenizerProcessor

- Added OBS_LANGUAGE constant to define the observation language key.
- Updated TokenizerProcessor to store tokenized task data in the observation dictionary, ensuring compatibility with the new language feature.
- Introduced Pi0NewLineProcessor to append newlines to tasks for proper tokenization.
- Modified tests to validate the integration of language tokens and attention masks in the observation structure.

* feat(tokenizer): Add padding configuration to TokenizerProcessor

- Introduced `padding_side` parameter to the TokenizerProcessor for customizable padding direction.
- Updated the `make_pi0_processor` function to include the new padding configuration.
- Enhanced unit tests to validate the functionality of the `padding_side` parameter in various scenarios.

* feat(processor): Add state management methods to Pi0NewLineProcessor

* feat(normalization): Track normalization and unnormalization info in complementary data

- Updated NormalizerProcessor and UnnormalizerProcessor to accept additional parameters for tracking normalization modes.
- Enhanced the __call__ methods to store normalization and unnormalization information in the complementary data of transitions.
- Added unit tests to verify the correct tracking of normalization info, including scenarios with missing stats and selective normalization keys.

* feat(factory): Add preprocessor and postprocessor overrides to ProcessorConfigKwargs

- Updated ProcessorConfigKwargs to include optional overrides for preprocessor and postprocessor configurations.
- Enhanced the make_processor function to utilize the new overrides, allowing for more flexible processor initialization.

* feat(processors): Integrate RenameProcessor into various processor configurations

- Added RenameProcessor to the input steps of multiple processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Consolidated normalization features from input and output into a single NormalizerProcessor for improved efficiency.
- Updated the input steps to ensure compatibility with the new RenameProcessor integration.

* feat(smolvla): Refactor language processing and introduce new line processor (#1658)

- Removed the prepare_language method and directly accessed language tokens and masks from the batch using the OBS_LANGUAGE constant.
- Added SmolVLANewLineProcessor to ensure tasks end with a newline, enhancing tokenization compatibility.
- Updated the make_smolvla_processor function to include the new line processor and tokenizer processor for improved input handling.

* feature(policies): add device processor (#1659)

* feat(processors): Integrate DeviceProcessor into multiple processor configurations

- Added DeviceProcessor to the input and output steps of various processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_pi0fast_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Enhanced the DeviceProcessor class with state management methods and ensured compatibility with existing processor pipelines.
- Introduced unit tests for DeviceProcessor to validate functionality across different scenarios, including CPU and CUDA operations.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(pipeline): Remove to() method for device management

- Eliminated the to() method from RobotProcessor, which was responsible for moving tensor states to specified devices.
- Removed associated unit tests that validated the functionality of the to() method across various scenarios.
- Streamlined the pipeline code by focusing on other device management strategies.

* feat(processor): Enhance DeviceProcessor with float dtype conversion

- Added support for optional float dtype conversion in DeviceProcessor, allowing tensors to be converted to specified floating-point types while preserving non-float types.
- Implemented validation for float dtype input and updated the processor's configuration methods to include float dtype.
- Refactored tensor processing logic to streamline device movement and dtype conversion.
- Introduced comprehensive unit tests to validate the new float dtype functionality across various scenarios.

* feat(policies): Add new line processors and update module exports

* feat(processor): Enhance batch and device processors to handle index and task_index fields

- Added logic to ToBatchProcessor for unsqueezing 0D tensors for index and task_index fields, ensuring they are processed as 1D tensors.
- Updated DeviceProcessor to process index and task_index fields in complementary data, preserving their tensor types and ensuring non-tensor fields remain unchanged.
- Enhanced unit tests to validate the correct handling of index and task_index fields across various scenarios, including device compatibility and dtype preservation.

* Add eval script

* fix `q_curr` in InverseKinematicsEEToJoints to the IK solution

* feat(processors): Introduce processors for various policy types

- Added `make_processor` function to create processor instances for different policy types, including `tdmpc`, `diffusion`, `act`, `vqbet`, `pi0`, `pi0fast`, `sac`, and `reward_classifier`.
- Implemented corresponding processor files for each policy type, encapsulating normalization and unnormalization steps.
- Updated existing policies to remove direct normalization dependencies, enhancing modularity and clarity.
- Enhanced test coverage to validate the integration of new processors with existing policy configurations.

* refactor(learner): Remove normalization from cached image features retrieval

- Simplified the retrieval of observation features by removing the normalization step from the `get_cached_image_features` method calls.
- This change enhances clarity and aligns with the recent updates to policy processors.

* refactor(policies): Remove unnormalization step from action predictions

- Eliminated the unnormalization of actions in both `TDMPCPolicy` and `VQBeTPolicy` classes to streamline action prediction.
- This change improves code clarity and aligns with recent updates to policy processors.

* feat(train): Integrate preprocessor into training pipeline

* refactor(train): Update preprocessor initialization to include dataset statistics

* refactor(policies): Enhance processor creation and add NaN detection hook

* feat(record): Integrate RobotProcessor into recording loop and update policy handling

- Added support for RobotProcessor in the record_loop function to enhance data processing capabilities.
- Updated the logic to reset both policy and processor when provided, ensuring proper state management.
- Modified action prediction to utilize the processor, improving the overall functionality of the recording process.
- Adjusted the save_checkpoint function to include preprocessor state saving, enhancing checkpointing capabilities.

* feat(migration): Add script for migrating policy models with normalization layers

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(migrate): Enhance migration script to create preprocessor and postprocessor for policy models

- Updated the migration script to generate both a preprocessor and a postprocessor, improving the handling of normalization for training and inference.
- Added functionality to convert features to PolicyFeature objects, ensuring compatibility with the new processor architecture.
- Refined the extraction and removal of normalization statistics and layers, streamlining the migration process.
- Improved error handling for missing mandatory configuration fields during model instantiation.

* feat(migrate): Add model card generation and saving to migration script

- Implemented functionality to generate and save a model card for the migrated model, including metadata such as dataset repository ID, license, and tags.
- Enhanced the script to push the model card to the hub if requested, improving model documentation and accessibility.
- Refactored the saving process to ensure the model card is saved locally and uploaded correctly when pushing to the hub.

* feat(processor): Introduce ToBatchProcessor for handling observation batching

- Added ToBatchProcessor to ensure observations have proper batch dimensions for model processing.
- Implemented functionality to add batch dimensions to state and image observations as needed.
- Created comprehensive unit tests to validate the processor's behavior with various tensor dimensions and types.
- Ensured compatibility with existing transition keys and maintained the integrity of non-observation data.

* feat(processors): Add ToBatchProcessor to multiple policy processors

- Integrated ToBatchProcessor into various policy processors to handle observation batching.
- Updated make functions for act, diffusion, pi0, pi0fast, sac, smolvla, tdmpc, and vqbet processors to include the new batching functionality.
- Ensured consistency across all processor implementations for improved data handling.

* refactor(factory): Remove unused imports and NaN detection hook from processor creation

* feat(batch_processor): Enhance ToBatchProcessor to handle action batching

- Updated ToBatchProcessor to add batch dimensions to actions in addition to observations.
- Implemented separate methods for processing observations and actions, improving code readability.
- Added comprehensive unit tests to validate action batching functionality across various tensor dimensions and types.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feat(factory): Enhance make_processor to support preprocessor and postprocessor configuration

- Introduced ProcessorConfigKwargs TypedDict for better type safety in processor configuration.
- Updated make_processor to accept preprocessor and postprocessor configuration filenames, improving flexibility in processor instantiation.
- Refactored the loading of pretrained processors to utilize the new configuration options.

* refactor(factory): Clean up imports in factory.py

- Removed unused import of IdentityProcessor to streamline the code.

* feat(migrate): Extend load_model_from_hub to include train configuration

- Updated load_model_from_hub to return the train configuration alongside the model state_dict and config.
- Modified main function to handle the additional train configuration when loading models from both the hub and local paths.
- Adjusted dataset_repo_id extraction to utilize the train configuration for improved accuracy.

* refactor(record): Rename processor parameters and update processing logic

- Renamed `processor` to `preprocessor` and added `postprocessor` parameter for clarity.
- Updated the `record_loop` and `predict_action` functions to utilize the new preprocessor and postprocessor, enhancing the processing flow.
- Ensured compatibility with existing functionality while improving code readability.

* feat(batch_processor): Add task field processing to ToBatchProcessor

- Enhanced ToBatchProcessor to wrap string tasks in a list, adding batch dimensions for compatibility with model inference.
- Implemented a new method for processing complementary data, ensuring that task values are correctly handled as either strings or lists of strings.
- Added comprehensive unit tests to validate task processing, including edge cases and in-place mutation of complementary data.

* feat(normalization): Implement IDENTITY mode for normalization and unnormalization

- Enhanced NormalizerProcessor and UnnormalizerProcessor to support IDENTITY mode, allowing features to bypass normalization when specified.
- Updated processing logic to check normalization modes and handle missing statistics gracefully.
- Added comprehensive unit tests to validate IDENTITY mode functionality for both observations and actions, ensuring correct behavior across various scenarios.
- Improved error handling for unsupported normalization modes.

* fix(rebase): remove residual normalization layer

* refactor(diffusion): remove normalization layer from input processing

* refactor(normalization): Remove unused state dict transformation methods and streamline imports

- Eliminated the _transform_state_dict_keys and _load_as_safetensor methods from PI0Policy, simplifying the model loading process.
- Cleaned up imports in modeling_pi0.py by removing log_model_loading_keys and init_logging.
- Updated TDMPCPolicy and VQBeTPolicy to handle action removal from batches during offline evaluation.
- Introduced hotswap_stats function in normalize_processor.py to update normalization statistics dynamically, with corresponding tests to ensure functionality.

* refactor(normalization): Clean up imports in normalize_processor.py

* feat(batch_processor): Add feature_contract method to ToBatchProcessor

- Introduced feature_contract method that returns features without modification, maintaining the no-op behavior of the processor.
- This addition enhances the flexibility of the ToBatchProcessor for future feature processing needs.

* fix(dependencies): Update transformers dependency constraint to allow only versions up to 4.52.0

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* feature(pipeline): port tokenizer pipeline for VLA (#1645)

* feat(tokenizer): Introduce TokenizerProcessor for text tokenization

- Added TokenizerProcessor class to handle tokenization of task strings using Hugging Face's AutoTokenizer.
- Supports both string and list inputs, with customizable parameters for task key, output key, and tokenization settings.
- Implemented comprehensive unit tests to validate functionality, including handling of various input scenarios and integration with RobotProcessor.
- Updated types.py to include LANGUAGE feature type and modified __init__.py to register the new processor.

* feat(language): Enhance language processing in TokenizerProcessor

- Added OBS_LANGUAGE constant to define the observation language key.
- Updated TokenizerProcessor to store tokenized task data in the observation dictionary, ensuring compatibility with the new language feature.
- Introduced Pi0NewLineProcessor to append newlines to tasks for proper tokenization.
- Modified tests to validate the integration of language tokens and attention masks in the observation structure.

* feat(tokenizer): Add padding configuration to TokenizerProcessor

- Introduced `padding_side` parameter to the TokenizerProcessor for customizable padding direction.
- Updated the `make_pi0_processor` function to include the new padding configuration.
- Enhanced unit tests to validate the functionality of the `padding_side` parameter in various scenarios.

* feat(processor): Add state management methods to Pi0NewLineProcessor

* feat(normalization): Track normalization and unnormalization info in complementary data

- Updated NormalizerProcessor and UnnormalizerProcessor to accept additional parameters for tracking normalization modes.
- Enhanced the __call__ methods to store normalization and unnormalization information in the complementary data of transitions.
- Added unit tests to verify the correct tracking of normalization info, including scenarios with missing stats and selective normalization keys.

* feat(factory): Add preprocessor and postprocessor overrides to ProcessorConfigKwargs

- Updated ProcessorConfigKwargs to include optional overrides for preprocessor and postprocessor configurations.
- Enhanced the make_processor function to utilize the new overrides, allowing for more flexible processor initialization.

* feat(processors): Integrate RenameProcessor into various processor configurations

- Added RenameProcessor to the input steps of multiple processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Consolidated normalization features from input and output into a single NormalizerProcessor for improved efficiency.
- Updated the input steps to ensure compatibility with the new RenameProcessor integration.

* feat(smolvla): Refactor language processing and introduce new line processor (#1658)

- Removed the prepare_language method and directly accessed language tokens and masks from the batch using the OBS_LANGUAGE constant.
- Added SmolVLANewLineProcessor to ensure tasks end with a newline, enhancing tokenization compatibility.
- Updated the make_smolvla_processor function to include the new line processor and tokenizer processor for improved input handling.

* feature(policies): add device processor (#1659)

* feat(processors): Integrate DeviceProcessor into multiple processor configurations

- Added DeviceProcessor to the input and output steps of various processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_pi0fast_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Enhanced the DeviceProcessor class with state management methods and ensured compatibility with existing processor pipelines.
- Introduced unit tests for DeviceProcessor to validate functionality across different scenarios, including CPU and CUDA operations.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(pipeline): Remove to() method for device management

- Eliminated the to() method from RobotProcessor, which was responsible for moving tensor states to specified devices.
- Removed associated unit tests that validated the functionality of the to() method across various scenarios.
- Streamlined the pipeline code by focusing on other device management strategies.

* feat(processor): Enhance DeviceProcessor with float dtype conversion

- Added support for optional float dtype conversion in DeviceProcessor, allowing tensors to be converted to specified floating-point types while preserving non-float types.
- Implemented validation for float dtype input and updated the processor's configuration methods to include float dtype.
- Refactored tensor processing logic to streamline device movement and dtype conversion.
- Introduced comprehensive unit tests to validate the new float dtype functionality across various scenarios.

* feat(policies): Add new line processors and update module exports

* feat(processor): Enhance batch and device processors to handle index and task_index fields

- Added logic to ToBatchProcessor for unsqueezing 0D tensors for index and task_index fields, ensuring they are processed as 1D tensors.
- Updated DeviceProcessor to process index and task_index fields in complementary data, preserving their tensor types and ensuring non-tensor fields remain unchanged.
- Enhanced unit tests to validate the correct handling of index and task_index fields across various scenarios, including device compatibility and dtype preservation.

* refactor(processors): Standardize processor naming conventions

- Updated processor names across various files to use a consistent "robot_preprocessor" and "robot_postprocessor" format.
- Modified the make_processor functions in factory, act, diffusion, pi0, pi0fast, sac, smolvla, tdmpc, and vqbet to reflect the new naming scheme.
- Enhanced the pipeline configuration to align with the updated processor names, improving clarity and maintainability.

* refactor(factory): Update processor configuration and type hints

- Changed return type of get_policy_class to type[PreTrainedPolicy] for improved type safety.
- Enhanced make_processor function to utilize dataset_stats in processor creation for better flexibility.
- Updated ProcessorConfigKwargs to include dataset_stats, allowing for more comprehensive processor configurations.
- Streamlined processor initialization by removing unnecessary kwargs and ensuring clarity in processor type handling.

* Fix eval and android gripper

* add some tests

* refactor(factory, pi0fast): Update processor function names and parameters

- Renamed make_pi0_processor to make_pi0fast_processor for clarity and consistency.
- Updated parameter names in the factory's make_processor function to use pretrained_model_name_or_path instead of source, enhancing readability and alignment with naming conventions.

* fix(train.py) push postprocessor with preprocessor
- Add preprocessor policy overrides for device and rename_map
- Add rename_map to DatasetRecordConfig (record.py)

* Cleanup pr

* fix more git diff pr issues

* add path as type in save_pretrained

* small nit

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* rename test file

* fix: make dataset_features/feature_contract optional

* fix tests

* Incorporate PR feedback

* clean up record.py

* add ascii art, fix normal record

* remove merge issues

* fix merge

* remove features

* Add feedback PR

* fix last 4 tests

* remove features check

* rename to transform_features

* add transform_features

* fix lekiwi eval and update eval api example

---------

Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>
Signed-off-by: Pepijn <138571049+pkooij@users.noreply.github.com>
Co-authored-by: Adil Zouitine <adilzouitinegm@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
2025-08-07 16:13:34 +02:00
Adil Zouitine 0524551f52 refactor(migrate_policy_normalization): Enhance preprocessor and postprocessor structure
- Introduced RenameProcessor in the preprocessor to handle renaming features.
- Combined input and output features in a single NormalizerProcessor for improved efficiency.
- Updated RobotProcessor initialization to clarify step naming for preprocessor and postprocessor.
- Added DeviceProcessor to both preprocessor and postprocessor for better device management.
2025-08-07 11:04:15 +02:00
Steven Palma 862bc7ef85 Merge branch 'main' into user/azouitine/2025-7-4-convert-codebase-with-pipeline 2025-08-06 21:08:32 +02:00
Adil Zouitine d38792d6e5 test(tokenizer_processor): Add require_package decorator for transformers
- Introduced @require_package("transformers") decorator in multiple test functions to ensure the transformers package is available before running tests.
- This change enhances test reliability by preventing failures due to missing dependencies.
2025-08-06 19:22:23 +02:00
pre-commit-ci[bot] db3cf0158c [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-06 16:08:39 +00:00
Adil Zouitine 0535f2a59a refactor(device_processor): Update device handling and improve type hints
- Changed device attribute type from torch.device to str for better clarity.
- Introduced a private _device attribute to store the actual torch.device instance.
- Updated tests to conditionally check for CUDA availability, ensuring compatibility across different environments.
- Refactored device-related assertions in tests to use a consistent approach for device type verification.
2025-08-06 18:08:15 +02:00
Michel Aractingi 2805ae347c fix(train.py) push postprocessor with preprocessor
- Add preprocessor policy overrides for device and rename_map
- Add rename_map to DatasetRecordConfig (record.py)
2025-08-06 17:21:17 +02:00
Adil Zouitine 28ef6fcd14 refactor(factory, pi0fast): Update processor function names and parameters
- Renamed make_pi0_processor to make_pi0fast_processor for clarity and consistency.
- Updated parameter names in the factory's make_processor function to use pretrained_model_name_or_path instead of source, enhancing readability and alignment with naming conventions.
2025-08-06 17:21:16 +02:00
Adil Zouitine 7fc7ec75bb refactor(factory): Update processor configuration and type hints
- Changed return type of get_policy_class to type[PreTrainedPolicy] for improved type safety.
- Enhanced make_processor function to utilize dataset_stats in processor creation for better flexibility.
- Updated ProcessorConfigKwargs to include dataset_stats, allowing for more comprehensive processor configurations.
- Streamlined processor initialization by removing unnecessary kwargs and ensuring clarity in processor type handling.
2025-08-06 17:21:15 +02:00
Adil Zouitine 87890cbf38 refactor(processors): Standardize processor naming conventions
- Updated processor names across various files to use a consistent "robot_preprocessor" and "robot_postprocessor" format.
- Modified the make_processor functions in factory, act, diffusion, pi0, pi0fast, sac, smolvla, tdmpc, and vqbet to reflect the new naming scheme.
- Enhanced the pipeline configuration to align with the updated processor names, improving clarity and maintainability.
2025-08-06 17:21:14 +02:00
Adil Zouitine 5326ffe77e feature(pipeline): port tokenizer pipeline for VLA (#1645)
* feat(tokenizer): Introduce TokenizerProcessor for text tokenization

- Added TokenizerProcessor class to handle tokenization of task strings using Hugging Face's AutoTokenizer.
- Supports both string and list inputs, with customizable parameters for task key, output key, and tokenization settings.
- Implemented comprehensive unit tests to validate functionality, including handling of various input scenarios and integration with RobotProcessor.
- Updated types.py to include LANGUAGE feature type and modified __init__.py to register the new processor.

* feat(language): Enhance language processing in TokenizerProcessor

- Added OBS_LANGUAGE constant to define the observation language key.
- Updated TokenizerProcessor to store tokenized task data in the observation dictionary, ensuring compatibility with the new language feature.
- Introduced Pi0NewLineProcessor to append newlines to tasks for proper tokenization.
- Modified tests to validate the integration of language tokens and attention masks in the observation structure.

* feat(tokenizer): Add padding configuration to TokenizerProcessor

- Introduced `padding_side` parameter to the TokenizerProcessor for customizable padding direction.
- Updated the `make_pi0_processor` function to include the new padding configuration.
- Enhanced unit tests to validate the functionality of the `padding_side` parameter in various scenarios.

* feat(processor): Add state management methods to Pi0NewLineProcessor

* feat(normalization): Track normalization and unnormalization info in complementary data

- Updated NormalizerProcessor and UnnormalizerProcessor to accept additional parameters for tracking normalization modes.
- Enhanced the __call__ methods to store normalization and unnormalization information in the complementary data of transitions.
- Added unit tests to verify the correct tracking of normalization info, including scenarios with missing stats and selective normalization keys.

* feat(factory): Add preprocessor and postprocessor overrides to ProcessorConfigKwargs

- Updated ProcessorConfigKwargs to include optional overrides for preprocessor and postprocessor configurations.
- Enhanced the make_processor function to utilize the new overrides, allowing for more flexible processor initialization.

* feat(processors): Integrate RenameProcessor into various processor configurations

- Added RenameProcessor to the input steps of multiple processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Consolidated normalization features from input and output into a single NormalizerProcessor for improved efficiency.
- Updated the input steps to ensure compatibility with the new RenameProcessor integration.

* feat(smolvla): Refactor language processing and introduce new line processor (#1658)

- Removed the prepare_language method and directly accessed language tokens and masks from the batch using the OBS_LANGUAGE constant.
- Added SmolVLANewLineProcessor to ensure tasks end with a newline, enhancing tokenization compatibility.
- Updated the make_smolvla_processor function to include the new line processor and tokenizer processor for improved input handling.

* feature(policies): add device processor (#1659)

* feat(processors): Integrate DeviceProcessor into multiple processor configurations

- Added DeviceProcessor to the input and output steps of various processor functions, including make_act_processor, make_diffusion_processor, make_pi0_processor, make_pi0fast_processor, make_sac_processor, make_tdmpc_processor, make_vqbet_processor, and make_smolvla_processor.
- Enhanced the DeviceProcessor class with state management methods and ensured compatibility with existing processor pipelines.
- Introduced unit tests for DeviceProcessor to validate functionality across different scenarios, including CPU and CUDA operations.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor(pipeline): Remove to() method for device management

- Eliminated the to() method from RobotProcessor, which was responsible for moving tensor states to specified devices.
- Removed associated unit tests that validated the functionality of the to() method across various scenarios.
- Streamlined the pipeline code by focusing on other device management strategies.

* feat(processor): Enhance DeviceProcessor with float dtype conversion

- Added support for optional float dtype conversion in DeviceProcessor, allowing tensors to be converted to specified floating-point types while preserving non-float types.
- Implemented validation for float dtype input and updated the processor's configuration methods to include float dtype.
- Refactored tensor processing logic to streamline device movement and dtype conversion.
- Introduced comprehensive unit tests to validate the new float dtype functionality across various scenarios.

* feat(policies): Add new line processors and update module exports

* feat(processor): Enhance batch and device processors to handle index and task_index fields

- Added logic to ToBatchProcessor for unsqueezing 0D tensors for index and task_index fields, ensuring they are processed as 1D tensors.
- Updated DeviceProcessor to process index and task_index fields in complementary data, preserving their tensor types and ensuring non-tensor fields remain unchanged.
- Enhanced unit tests to validate the correct handling of index and task_index fields across various scenarios, including device compatibility and dtype preservation.
2025-08-06 17:21:13 +02:00
pre-commit-ci[bot] a1734cf575 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-06 17:21:12 +02:00
Adil Zouitine 82f300e880 fix(dependencies): Update transformers dependency constraint to allow only versions up to 4.52.0 2025-08-06 17:21:11 +02:00
Adil Zouitine 3e7c9d7afc feat(batch_processor): Add feature_contract method to ToBatchProcessor
- Introduced feature_contract method that returns features without modification, maintaining the no-op behavior of the processor.
- This addition enhances the flexibility of the ToBatchProcessor for future feature processing needs.
2025-08-06 17:21:09 +02:00
Adil Zouitine e9cb779eab refactor(normalization): Clean up imports in normalize_processor.py 2025-08-06 17:21:08 +02:00
Adil Zouitine 8ff95be04c refactor(normalization): Remove unused state dict transformation methods and streamline imports
- Eliminated the _transform_state_dict_keys and _load_as_safetensor methods from PI0Policy, simplifying the model loading process.
- Cleaned up imports in modeling_pi0.py by removing log_model_loading_keys and init_logging.
- Updated TDMPCPolicy and VQBeTPolicy to handle action removal from batches during offline evaluation.
- Introduced hotswap_stats function in normalize_processor.py to update normalization statistics dynamically, with corresponding tests to ensure functionality.
2025-08-06 17:21:07 +02:00
Adil Zouitine f02ce69df0 refactor(diffusion): remove normalization layer from input processing 2025-08-06 17:21:07 +02:00
Adil Zouitine 1feb7b5d88 fix(rebase): remove residual normalization layer: 2025-08-06 17:21:06 +02:00
Adil Zouitine fbe9009db2 feat(normalization): Implement IDENTITY mode for normalization and unnormalization
- Enhanced NormalizerProcessor and UnnormalizerProcessor to support IDENTITY mode, allowing features to bypass normalization when specified.
- Updated processing logic to check normalization modes and handle missing statistics gracefully.
- Added comprehensive unit tests to validate IDENTITY mode functionality for both observations and actions, ensuring correct behavior across various scenarios.
- Improved error handling for unsupported normalization modes.
2025-08-06 17:21:05 +02:00
Adil Zouitine c0013b130b feat(batch_processor): Add task field processing to ToBatchProcessor
- Enhanced ToBatchProcessor to wrap string tasks in a list, adding batch dimensions for compatibility with model inference.
- Implemented a new method for processing complementary data, ensuring that task values are correctly handled as either strings or lists of strings.
- Added comprehensive unit tests to validate task processing, including edge cases and in-place mutation of complementary data.
2025-08-06 17:21:04 +02:00
Adil Zouitine c4763f61a1 refactor(record): Rename processor parameters and update processing logic
- Renamed `processor` to `preprocessor` and added `postprocessor` parameter for clarity.
- Updated the `record_loop` and `predict_action` functions to utilize the new preprocessor and postprocessor, enhancing the processing flow.
- Ensured compatibility with existing functionality while improving code readability.
2025-08-06 17:21:03 +02:00
Adil Zouitine b95c219d96 feat(migrate): Extend load_model_from_hub to include train configuration
- Updated load_model_from_hub to return the train configuration alongside the model state_dict and config.
- Modified main function to handle the additional train configuration when loading models from both the hub and local paths.
- Adjusted dataset_repo_id extraction to utilize the train configuration for improved accuracy.
2025-08-06 17:21:02 +02:00
Adil Zouitine 9b1138171e refactor(factory): Clean up imports in factory.py
- Removed unused import of IdentityProcessor to streamline the code.
2025-08-06 17:21:02 +02:00
Adil Zouitine 023b8f3466 feat(factory): Enhance make_processor to support preprocessor and postprocessor configuration
- Introduced ProcessorConfigKwargs TypedDict for better type safety in processor configuration.
- Updated make_processor to accept preprocessor and postprocessor configuration filenames, improving flexibility in processor instantiation.
- Refactored the loading of pretrained processors to utilize the new configuration options.
2025-08-06 17:21:00 +02:00
pre-commit-ci[bot] 1cad87ebd2 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-06 17:21:00 +02:00
Adil Zouitine 99de7567e6 feat(batch_processor): Enhance ToBatchProcessor to handle action batching
- Updated ToBatchProcessor to add batch dimensions to actions in addition to observations.
- Implemented separate methods for processing observations and actions, improving code readability.
- Added comprehensive unit tests to validate action batching functionality across various tensor dimensions and types.
2025-08-06 17:20:58 +02:00
Adil Zouitine 21baa8fa02 refactor(factory): Remove unused imports and NaN detection hook from processor creation 2025-08-06 17:20:53 +02:00
Adil Zouitine 8b4a5368b3 feat(processors): Add ToBatchProcessor to multiple policy processors
- Integrated ToBatchProcessor into various policy processors to handle observation batching.
- Updated make functions for act, diffusion, pi0, pi0fast, sac, smolvla, tdmpc, and vqbet processors to include the new batching functionality.
- Ensured consistency across all processor implementations for improved data handling.
2025-08-06 17:20:52 +02:00
Adil Zouitine f5c6b03b61 feat(processor): Introduce ToBatchProcessor for handling observation batching
- Added ToBatchProcessor to ensure observations have proper batch dimensions for model processing.
- Implemented functionality to add batch dimensions to state and image observations as needed.
- Created comprehensive unit tests to validate the processor's behavior with various tensor dimensions and types.
- Ensured compatibility with existing transition keys and maintained the integrity of non-observation data.
2025-08-06 17:20:51 +02:00
Adil Zouitine e7be2fd113 feat(migrate): Add model card generation and saving to migration script
- Implemented functionality to generate and save a model card for the migrated model, including metadata such as dataset repository ID, license, and tags.
- Enhanced the script to push the model card to the hub if requested, improving model documentation and accessibility.
- Refactored the saving process to ensure the model card is saved locally and uploaded correctly when pushing to the hub.
2025-08-06 17:20:50 +02:00
Adil Zouitine b632490b4b feat(migrate): Enhance migration script to create preprocessor and postprocessor for policy models
- Updated the migration script to generate both a preprocessor and a postprocessor, improving the handling of normalization for training and inference.
- Added functionality to convert features to PolicyFeature objects, ensuring compatibility with the new processor architecture.
- Refined the extraction and removal of normalization statistics and layers, streamlining the migration process.
- Improved error handling for missing mandatory configuration fields during model instantiation.
2025-08-06 17:20:50 +02:00
pre-commit-ci[bot] 9a9c7208d2 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-06 17:20:49 +02:00
pre-commit-ci[bot] 427b97d198 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-06 17:20:48 +02:00
AdilZouitine 2c2bb1e8bf feat(migration): Add script for migrating policy models with normalization layers 2025-08-06 17:20:47 +02:00
AdilZouitine 4b24f94225 feat(record): Integrate RobotProcessor into recording loop and update policy handling
- Added support for RobotProcessor in the record_loop function to enhance data processing capabilities.
- Updated the logic to reset both policy and processor when provided, ensuring proper state management.
- Modified action prediction to utilize the processor, improving the overall functionality of the recording process.
- Adjusted the save_checkpoint function to include preprocessor state saving, enhancing checkpointing capabilities.
2025-08-06 17:20:46 +02:00
AdilZouitine 670a278cbc refactor(policies): Enhance processor creation and add NaN detection hook 2025-08-06 17:20:45 +02:00
AdilZouitine fc74001202 refactor(train): Update preprocessor initialization to include dataset statistics 2025-08-06 17:20:45 +02:00
Adil Zouitine f14ac5d486 feat(train): Integrate preprocessor into training pipeline 2025-08-06 17:20:44 +02:00
Adil Zouitine 7bd0d62ce5 refactor(policies): Remove unnormalization step from action predictions
- Eliminated the unnormalization of actions in both `TDMPCPolicy` and `VQBeTPolicy` classes to streamline action prediction.
- This change improves code clarity and aligns with recent updates to policy processors.
2025-08-06 17:20:43 +02:00
Adil Zouitine 7eccefe235 refactor(learner): Remove normalization from cached image features retrieval
- Simplified the retrieval of observation features by removing the normalization step from the `get_cached_image_features` method calls.
- This change enhances clarity and aligns with the recent updates to policy processors.
2025-08-06 17:20:42 +02:00
Adil Zouitine b72274066e feat(processors): Introduce processors for various policy types
- Added `make_processor` function to create processor instances for different policy types, including `tdmpc`, `diffusion`, `act`, `vqbet`, `pi0`, `pi0fast`, `sac`, and `reward_classifier`.
- Implemented corresponding processor files for each policy type, encapsulating normalization and unnormalization steps.
- Updated existing policies to remove direct normalization dependencies, enhancing modularity and clarity.
- Enhanced test coverage to validate the integration of new processors with existing policy configurations.
2025-08-06 17:20:41 +02:00
Steven Palma 20f2910b63 Merge branch 'main' into user/azouitine/2025-7-2-implement-pipeline 2025-08-06 17:20:39 +02:00
Steven Palma fd4ae3466b refactor(pipeline): minor improvements (#1684)
* chore(pipeline): remove unused features + device torch + envtransition keys

* refactor(pipeline): ImageProcessor & StateProcessor are both implemented directly in VanillaObservationPRocessor

* refactor(pipeline): RenameProcessor now inherits from ObservationProcessor + remove unused code

* test(pipeline): fix broken test after refactors

* docs(pipeline): update docstrings VanillaObservationProcessor

* chore(pipeline): move None check to base pipeline classes
2025-08-06 14:00:13 +02:00
Adil Zouitine 7beb040e8e refactor(pipeline): Rename parameters for clarity and enhance save/load functionality
- Updated parameter names in the save_pretrained and from_pretrained methods for improved readability, changing destination_path to save_directory and source to pretrained_model_name_or_path.
- Enhanced the save_pretrained method to ensure directory creation and file handling is consistent with the new parameter names.
- Streamlined the loading process in from_pretrained to utilize loaded_config for better clarity and maintainability.
2025-08-05 17:44:21 +02:00
Adil Zouitine 05bd18f453 refactor(observation): Streamline observation preprocessing and remove unused processor methods
- Updated the `preprocess_observation` function to enhance image handling and ensure proper tensor formatting.
- Removed the `RobotProcessor` and associated transition handling from the `rollout` function, simplifying the observation processing flow.
- Integrated direct calls to `preprocess_observation` for improved clarity and efficiency in the evaluation script.
2025-08-05 10:32:56 +02:00
Adil Zouitine 8077456c00 refactor(pipeline): Remove model card generation and streamline processor methods
- Eliminated the _generate_model_card method from RobotProcessor, which was responsible for generating README.md files from a template.
- Updated save_pretrained method to remove model card generation, focusing on serialization of processor definitions and parameters.
- Added default implementations for get_config, state_dict, load_state_dict, reset, and feature_contract methods in various processor classes to enhance consistency and usability.
2025-08-05 10:31:09 +02:00
AdilZouitine 5595887fd0 refactor(pipeline): Remove to() method for device management
- Eliminated the to() method from RobotProcessor, which was responsible for moving tensor states to specified devices.
- Removed associated unit tests that validated the functionality of the to() method across various scenarios.
- Streamlined the pipeline code by focusing on other device management strategies.
2025-08-05 10:27:25 +02:00
Adil Zouitine 41959389b6 docs(pipeline): Clarify transition handling and hook behavior
- Updated documentation to specify that hooks always receive transitions in EnvTransition format, ensuring consistent behavior across input formats.
- Refactored the step_through method to yield only EnvTransition objects, regardless of the input format, and updated related tests to reflect this change.
- Enhanced test assertions to verify the structure of results and the correctness of processing steps.
2025-08-02 14:51:52 +02:00
Pepijn 2c4e888c7f Feat/pipeline add feature contract (#1637)
* Add feature contract to pipelinestep and pipeline

* Add tests

* Add processor tests

* PR feedback

* incorporate PR feedback

* typo in doc

* oops
2025-08-01 08:41:54 +02:00
Adil Zouitine 5ced72e6b8 docs(pipeline): Add clarification for repo name sanitization process 2025-08-01 08:41:54 +02:00
Adil Zouitine 907023f9f7 refactor(pipeline): Improve state file naming conventions for clarity and uniqueness
- Enhanced state file naming to include the processor's sanitized name, ensuring uniqueness when multiple processors are saved in the same directory.
- Updated tests to reflect changes in state file naming, verifying that filenames now include the processor name and step indices to prevent conflicts.
- Added a new test to validate state file naming when using multiple processors, ensuring distinct filenames for each processor's state files.
2025-08-01 08:41:54 +02:00
Adil Zouitine 4ba23ea029 feat(pipeline): Enhance configuration filename handling and state file naming
- Introduced support for custom configuration filenames in the `save_pretrained` method, allowing users to specify a filename instead of the default.
- Improved state file naming to include step indices, preventing conflicts when multiple processors of the same type are saved.
- Added automatic detection for configuration files when loading from a directory, with error handling for multiple files.
- Updated tests to validate new features, including custom filenames and automatic config detection.
2025-08-01 08:41:54 +02:00
Adil Zouitine 409ac0baca chore(doc): address pip install command lerobot that does not exist yet 2025-08-01 08:41:54 +02:00
Adil Zouitine 699363f9fc refactor(pipeline): Enhance state filename generation and profiling method
- Updated state filename generation to use the registry name when available, improving clarity in saved files.
- Modified the profile_steps method to include a warmup_runs parameter, allowing for more controlled performance profiling.
- Ensured consistent conditions during profiling by deep copying transitions for each run, enhancing accuracy in timing results.
2025-08-01 08:41:54 +02:00
Adil Zouitine ae7a54de57 refactor(pipeline): Utilize get_safe_torch_device for device assignment
- Replaced direct torch.device instantiation with get_safe_torch_device to ensure safe device handling.
- This change enhances code readability and maintains consistency in device management across the RobotProcessor class.
2025-08-01 08:41:54 +02:00
Adil Zouitine fb9139b882 chore(pipeline): Move _CFG_NAME along other class member 2025-08-01 08:41:54 +02:00
Adil Zouitine 9fe3a3fb17 feat(pipeline): Add __repr__ method to RobotProcessor for improved readability
- Implemented a __repr__ method in the RobotProcessor class to provide a clear string representation of the processor, including step names and optional parameters like name and seed.
- Added comprehensive tests to validate the __repr__ output for various scenarios, including empty processors, single and multiple steps, custom names, and seed values.
- Ensured that the representation handles long lists of steps with truncation for better readability.
2025-08-01 08:41:54 +02:00
Adil Zouitine 26cb9a24c3 refactor(pipeline): Clarify hook behavior and improve documentation
- Updated the RobotProcessor class to ensure hooks are strictly for observation and do not modify transitions, enhancing clarity and maintainability.
- Refactored hook registration methods to reflect the new behavior, ensuring they accept only functions that do not return modified transitions.
- Enhanced documentation to clearly outline the purpose of hooks and their execution semantics.
- Added tests to verify that hooks are not executed during the step_through method while ensuring they function correctly during the __call__ method.
2025-08-01 08:41:54 +02:00
Adil Zouitine 77106697c3 feat(pipeline): Add hook unregistration functionality and enhance documentation
- Implemented methods to unregister before, after, and reset hooks in the RobotProcessor class, allowing for more flexible hook management.
- Enhanced documentation to clarify hook execution semantics and the implications of modifying transitions within hooks.
- Added comprehensive tests to verify the correct behavior of hook registration and unregistration, including error handling for non-existent hooks.
2025-08-01 08:41:54 +02:00
Adil Zouitine 75bc44c166 refactor(observation_processor): Improve observation processing by using constants and simplifying pixel handling
- Introduced constants for observation keys to enhance readability.
- Streamlined the handling of the "pixels" key by copying observations first and processing images more clearly.
- Updated the environment state and agent position assignments to use the new constants, improving maintainability.
2025-08-01 08:41:54 +02:00
Adil Zouitine f2b79656eb refactor(pipeline): Transition from tuple to dictionary format for EnvTransition
- Updated the EnvTransition structure to use a dictionary format instead of a tuple, enhancing readability and maintainability.
- Replaced instances of TransitionIndex with TransitionKey for accessing transition components.
- Adjusted related processing functions and tests to accommodate the new dictionary format, ensuring consistent handling of transitions across the codebase.
2025-08-01 08:41:53 +02:00
pre-commit-ci[bot] 14c2ece004 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-01 08:41:53 +02:00
Adil Zouitine 35612c61e1 refactor(pipeline): Introduce ComplementaryDataProcessor for handling complementary data in transitions 2025-08-01 08:41:53 +02:00
pre-commit-ci[bot] f7bb3e2d90 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-01 08:41:53 +02:00
Adil Zouitine 1e0d667a22 Apply suggestions from code review
Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>
2025-08-01 08:41:53 +02:00
Adil Zouitine 33969a0337 refactor(pipeline): Simplify observation and padding data handling in batch transitions 2025-08-01 08:41:53 +02:00
Adil Zouitine fa26290e8c feat(pipeline): Enhance step_through method to support both tuple and dict inputs 2025-08-01 08:41:53 +02:00
Adil Zouitine e9f7f5127b chore(learner): nit comment from copilot 2025-08-01 08:41:53 +02:00
Adil Zouitine 097842c70f chore(normalization): addressing comments from copilot 2025-08-01 08:41:53 +02:00
Adil Zouitine 3b8a3a32a0 feat (overrides): Implement support for loading processors with parameter overrides
- Added the ability to provide non-serializable objects when loading processors from saved configurations using the `overrides` parameter.
- Enhanced error handling for invalid override keys and instantiation errors.
- Updated documentation and examples to illustrate the usage of overrides for both registered and unregistered steps.
- Added comprehensive tests to validate the new functionality and ensure backward compatibility.
2025-08-01 08:41:53 +02:00
Adil Zouitine 1c56779dd9 chore (type): add typing for multiprocess envs 2025-08-01 08:41:53 +02:00
Adil Zouitine 83a4338f8b chore (output format): improves output format 2025-08-01 08:41:53 +02:00
Adil Zouitine 730c7b2f35 fix(test): linting issue 2025-08-01 08:41:53 +02:00
pre-commit-ci[bot] 116059a43e [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-01 08:41:53 +02:00
Adil Zouitine b08149a113 chore (batch handling): Enhance processing components with batch conversion utilities 2025-08-01 08:41:53 +02:00
Adil Zouitine c227107f60 feat (device processor): Implement device processor 2025-08-01 08:41:53 +02:00
Adil Zouitine 01dc289f3d chore (docstrin):Improve docstring for NormalizerProcessor 2025-08-01 08:41:53 +02:00
Adil Zouitine 6830ca7645 Refactor normalization components and update tests
- Renamed `ObservationNormalizer` to `NormalizerProcessor` and `ActionUnnormalizer` to `UnnormalizerProcessor` for clarity.
- Consolidated normalization logic for both observations and actions into `NormalizerProcessor` and `UnnormalizerProcessor`.
- Updated tests to reflect the new class names and ensure proper functionality of normalization and unnormalization processes.
- Enhanced handling of missing statistics in normalization processes.
2025-08-01 08:41:52 +02:00
Adil Zouitine ed42c71fc3 fix(test): import issue 2025-08-01 08:41:52 +02:00
Adil Zouitine e0139065bd chore(test): add suggestion made by copilot regarding numpy test 2025-08-01 08:41:52 +02:00
Adil Zouitine e509f255af Update tests/processor/test_observation_processor.py
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Adil Zouitine <adilzouitinegm@gmail.com>
2025-08-01 08:41:52 +02:00
Adil Zouitine e2fcd140b0 fix(test): policies 2025-08-01 08:41:52 +02:00
Adil Zouitine 2a7a0e6129 fix (test): test factory 2025-08-01 08:41:52 +02:00
Adil Zouitine 9f33791b19 chore (docs): add docstring for processor 2025-08-01 08:41:52 +02:00
Adil Zouitine 453e0a995f Enhance processing architecture with new components
- Added `RenameProcessor` to facilitate key renaming in observations, improving data handling flexibility.
- Updated `__init__.py` to include `RenameProcessor` in module exports.
- Refactored `NormalizationProcessor` and `ObservationNormalizer` to use `rsplit` for better key handling.
- Introduced comprehensive tests for `NormalizationProcessor` and `RenameProcessor` to ensure functionality and robustness.
2025-08-01 08:41:52 +02:00
pre-commit-ci[bot] 8ebf79c494 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-01 08:41:52 +02:00
Adil Zouitine 8774aec304 Add normalization processor and related components
- Introduced `NormalizationProcessor` to handle both observation normalization and action unnormalization.
- Added `ObservationNormalizer` and `ActionUnnormalizer` classes for specific normalization tasks.
- Updated `__init__.py` to include the new `NormalizationProcessor` in the module exports.
- Enhanced `ObservationProcessor` with registration in the `ProcessorStepRegistry` for better modularity.
- Created `RenameProcessor` for renaming keys in observations, improving flexibility in data processing.
2025-08-01 08:41:52 +02:00
pre-commit-ci[bot] ac742c9f0d [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-01 08:41:52 +02:00
Adil Zouitine cd13f1ecfd Add RobotProcessor tutorial to documentation
- Introduced a new tutorial on using RobotProcessor for preprocessing robot data.
- Added a section in the table of contents for easy navigation to the new tutorial.
- The tutorial covers key concepts, real-world scenarios, and practical examples for effective use of the RobotProcessor pipeline.
2025-08-01 08:41:52 +02:00
Adil Zouitine 9aa632968f Refactor processing architecture to use RobotProcessor
- Replaced instances of RobotPipeline with RobotProcessor across the codebase for improved modularity and clarity.
- Introduced ProcessorStepRegistry for better management of processing steps.
- Updated relevant documentation and tests to reflect the new processing structure.
- Enhanced the save/load functionality to support the new processor design.
- Added a model card template for RobotProcessor to facilitate sharing and documentation.
2025-08-01 08:41:52 +02:00
Adil Zouitine 62caaf07b0 Remove redundant tests for None observation and serialization methods in test_observation_processor.py to streamline the test suite and improve maintainability. 2025-08-01 08:41:52 +02:00
Adil Zouitine 3355f04ca6 Refactor observation processing and improve modularity
- Updated `ObservationProcessor` to enhance the modular design for processing observations.
- Cleaned up imports and improved code readability by removing unnecessary lines and comments.
- Ensured backward compatibility while integrating new processing components.
- Added tests to validate the functionality of the updated processing architecture.
2025-08-01 08:41:52 +02:00
pre-commit-ci[bot] 769f531603 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-08-01 08:41:51 +02:00
Adil Zouitine f6c7287ae7 Refactor observation preprocessing to use a modular pipeline system
- Introduced `RobotPipeline` and `ObservationProcessor` for handling observation transformations.
- Updated `preprocess_observation` to maintain backward compatibility while leveraging the new pipeline.
- Added tests for the new processing components and ensured they match the original functionality.
- Removed hardcoded logic in favor of a more flexible, composable architecture.
2025-08-01 08:41:51 +02:00
90 changed files with 13210 additions and 998 deletions
+15 -2
View File
@@ -24,9 +24,16 @@
- local: smolvla
title: Finetune SmolVLA
title: "Policies"
- sections:
- local: introduction_processors
title: Introduction to Robot Processors
- local: implement_your_own_processor
title: Implement your own processor
- local: processors_robots_teleop
title: Processors for Robots and Teleoperators
title: "Robot Processors"
- sections:
- local: hope_jr
title: Hope Jr
- local: so101
title: SO-101
- local: so100
@@ -35,7 +42,13 @@
title: Koch v1.1
- local: lekiwi
title: LeKiwi
- local: hope_jr
title: Hope Jr
title: "Robots"
- sections:
- local: phone_teleop
title: Phone
title: "Teleoperators"
- sections:
- local: notebooks
title: Notebooks
+13 -2
View File
@@ -519,11 +519,14 @@ from lerobot.utils.control_utils import init_keyboard_listener
from lerobot.utils.utils import log_say
from lerobot.utils.visualization_utils import _init_rerun
from lerobot.record import record_loop
from lerobot.policies.factory import make_processor
NUM_EPISODES = 5
FPS = 30
EPISODE_TIME_SEC = 60
TASK_DESCRIPTION = "My task description"
HF_MODEL_ID = "<hf_username>/<model_repo_id>"
HF_DATASET_ID = "<hf_username>/<eval_dataset_repo_id>"
# Create the robot configuration
camera_config = {"front": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=FPS)}
@@ -535,7 +538,7 @@ robot_config = SO100FollowerConfig(
robot = SO100Follower(robot_config)
# Initialize the policy
policy = ACTPolicy.from_pretrained("<hf_username>/<my_policy_repo_id>")
policy = ACTPolicy.from_pretrained(HF_MODEL_ID)
# Configure the dataset features
action_features = hw_to_dataset_features(robot.action_features, "action")
@@ -544,7 +547,7 @@ dataset_features = {**action_features, **obs_features}
# Create the dataset
dataset = LeRobotDataset.create(
repo_id="<hf_username>/eval_<dataset_repo_id>",
repo_id=HF_DATASET_ID,
fps=FPS,
features=dataset_features,
robot_type=robot.name,
@@ -559,6 +562,12 @@ _init_rerun(session_name="recording")
# Connect the robot
robot.connect()
preprocessor, postprocessor = make_processor(
policy_cfg=policy,
pretrained_path=HF_MODEL_ID,
dataset_stats=dataset.meta.stats,
)
for episode_idx in range(NUM_EPISODES):
log_say(f"Running inference, recording eval episode {episode_idx + 1} of {NUM_EPISODES}")
@@ -568,6 +577,8 @@ for episode_idx in range(NUM_EPISODES):
events=events,
fps=FPS,
policy=policy,
preprocessor=preprocessor,
postprocessor=postprocessor,
dataset=dataset,
control_time_s=EPISODE_TIME_SEC,
single_task=TASK_DESCRIPTION,
@@ -0,0 +1,323 @@
# Implement your own Robot Processor
In this tutorial, you'll learn how to implement your own Robot Processor.
It begins by exploring the need for a custom processor, then uses the Normalization processors as the running example to explain how to implement, configure, and serialize a processor. Finally, it lists all helper processors that ship with LeRobot.
## Why would you need a custom processor?
In most cases, when reading raw data from sensors such as cameras and robot motor encoders,
you will need to process this data to transform it into a format that is compatible with the policies in LeRobot.
For example, raw images are encoded with `uint8` and the values are in the range `[0, 255]`.
To use these images with the policies, you will need to cast them to `float32` and normalize them to the range `[0, 1]`.
For example, in LeRobot's `VanillaObservationProcessor`, raw images come from the environment as numpy arrays with `uint8` values in range `[0, 255]` and in channel-last format `(H, W, C)`. The processor transforms them into PyTorch tensors with `float32` values in range `[0, 1]` and channel-first format `(C, H, W)`:
```python
# Input: numpy array with shape (480, 640, 3) and dtype uint8
raw_image = env_observation["pixels"] # Values in [0, 255]
# After processing: torch tensor with shape (1, 3, 480, 640) and dtype float32
processed_image = processor(transition)["observation"]["observation.image"] # Values in [0, 1]
```
On the other hand, when a model returns an action to be executed on the robot, one often has to post-process this action to make it compatible with the robot.
For example, the model might return joint position values in the range `[-1, 1]`, and one would need to scale them to the robot's minimum and maximum joint angle positions.
In LeRobot, this normalization workflow is handled by the `NormalizerProcessor` (for inputs) and the `UnnormalizerProcessor` (for outputs). These processors are heavily used by policies (e.g., Pi0, SmolVLA) and integrate tightly with the `RobotProcessor`'s `get_config`, `state_dict`, and `load_state_dict` APIs.
For instance, `UnnormalizerProcessor` converts model outputs in `[-1, 1]` back to actual robot joint ranges:
```python
# Input: model action with normalized values in [-1, 1]
normalized_action = torch.tensor([-0.5, 0.8, -1.0, 0.2]) # Model output
# After post-processing: real joint positions in robot's native ranges
# Example: joints range from [-180.0, 180.0]
real_action = unnormalizer(transition)["action"]
# real action after post-processing: [ -90., 144., -180., 36.]
```
The unnormalizer uses the dataset statistics to convert back:
```python
# For MIN_MAX normalization: action = (normalized + 1) * (max - min) / 2 + min
real_action = (normalized_action + 1) * (max_val - min_val) / 2 + min_val
```
All these situations point us towards the need for a mechanism to preprocess the data before it is passed to the policies and then post-process the actions that are returned to be executed on the robot.
To that end, LeRobot provides a pipeline mechanism to implement a sequence of processing steps for the input data and the output action.
## How to implement your own processor?
We'll use the `NormalizerProcessor` as a concrete running example because it is central to most policies and demonstrates configuration and state serialization cleanly.
Prepare the sequence of processing steps necessary for your problem. A processor step is a class that implements the following methods:
- `__call__`: implements the processing step for the input transition.
- `get_config`: gets the configuration of the processor step.
- `state_dict`: gets the state of the processor step.
- `load_state_dict`: loads the state of the processor step.
- `reset`: resets the state of the processor step.
- `feature_contract`: displays the modification to the feature space during the processor step.
### Implement the `__call__` method
The `__call__` method is the core of your processor step. It takes an `EnvTransition` and returns a modified `EnvTransition`. Here's how the `NormalizerProcessor` conceptually works (simplified):
```python
from dataclasses import dataclass
import torch
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.processor.pipeline import EnvTransition, TransitionKey
@dataclass
class NormalizerProcessor:
features: dict[str, PolicyFeature]
norm_map: dict[FeatureType, NormalizationMode]
stats: dict[str, dict[str, torch.Tensor]]
eps: float = 1e-8
def __call__(self, transition: EnvTransition) -> EnvTransition:
normalized_info = {}
obs = transition.get(TransitionKey.OBSERVATION)
act = transition.get(TransitionKey.ACTION)
new_obs = self._normalize_observation(obs, normalized_info)
new_act = self._normalize_action(act, normalized_info)
new_transition = transition.copy()
new_transition[TransitionKey.OBSERVATION] = new_obs
new_transition[TransitionKey.ACTION] = new_act
# Record what was normalized into complementary_data
if normalized_info:
comp = new_transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
comp = dict(comp)
comp["normalized_keys"] = normalized_info
new_transition[TransitionKey.COMPLEMENTARY_DATA] = comp
return new_transition
```
See the full implementation in `src/lerobot/processor/normalize_processor.py` for details on mean/std and min/max modes and key selection.
**Key principles:**
- Always check if required data exists before processing
- Return unchanged transition if no processing is needed
- Use `transition.copy()` to avoid side effects
- Only modify the specific keys your processor handles
**Tip**: For observation-only processors, you can inherit from `ObservationProcessor` to avoid writing `__call__` boilerplate. The normalizer is mixed (observations and actions), so it implements `__call__` directly.
### Configuration and State Management
Processors support serialization through three methods that separate configuration from tensor state. This is especially important for normalization processors, which carry dataset statistics (tensors) in their state, and hyperparameters in their config:
```python
from dataclasses import dataclass, field
from typing import Any
import torch
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
@dataclass
class NormalizerProcessor:
features: dict[str, PolicyFeature]
norm_map: dict[FeatureType, NormalizationMode]
eps: float = 1e-8
_tensor_stats: dict[str, dict[str, torch.Tensor]] = field(default_factory=dict, init=False, repr=False)
def get_config(self) -> dict[str, Any]:
"""JSON-serializable configuration (no tensors)."""
return {
"eps": self.eps,
"features": {k: {"type": v.type.value, "shape": v.shape} for k, v in self.features.items()},
"norm_map": {ft.value: nm.value for ft, nm in self.norm_map.items()},
}
def state_dict(self) -> dict[str, torch.Tensor]:
"""Tensor state only (e.g., dataset statistics)."""
flat: dict[str, torch.Tensor] = {}
for key, sub in self._tensor_stats.items():
for stat_name, tensor in sub.items():
flat[f"{key}.{stat_name}"] = tensor
return flat
def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
"""Restore tensor state at runtime."""
self._tensor_stats.clear()
for flat_key, tensor in state.items():
key, stat_name = flat_key.rsplit(".", 1)
self._tensor_stats.setdefault(key, {})[stat_name] = tensor
```
**Usage:**
```python
# Save (e.g., inside a policy)
config = processor.get_config()
tensors = processor.state_dict()
# Restore (e.g., loading a pretrained policy)
new_processor = NormalizerProcessor(**config)
new_processor.load_state_dict(tensors)
```
### Transform features
The `transform_features` method defines how your processor transforms feature names and shapes. This is crucial for policy configuration and debugging.
Normalization typically preserves the feature keys and shapes, so `NormalizerProcessor.transform_features` returns the input features unchanged. When your processor renames or reshapes, implement this method to reflect the mapping for downstream components. For example, a simple rename processor:
```python
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# Simple renaming
if "pixels" in features:
features["observation.image"] = features.pop("pixels")
# Pattern-based renaming
for key in list(features.keys()):
if key.startswith("env_state."):
suffix = key[len("env_state."):]
features[f"observation.{suffix}"] = features.pop(key)
return features
```
**Key principles:**
- Use `features.pop(old_key)` to remove and get the old feature
- Use `features[new_key] = old_feature` to add the renamed feature
- Always return the modified features dictionary
- Document transformations clearly in the docstring
### Example of usage from the codebase
`transform_features` is used by `RobotProcessor` to derive the dataset/policy feature contract from an initial feature set by applying each step's transformation. You can see concrete examples in the codebase:
- Phone teleoperation record pipeline (`examples/phone_so100_record.py`): processors like `ForwardKinematicsJointsToEE`, `GripperVelocityToJoint`, and `EEBoundsAndSafety` implement `transform_features` to declare which action/observation keys should be materialized in the dataset.
- SO100 follower kinematics (`src/lerobot/robots/so100_follower/robot_kinematic_processor.py`): each processor's `transform_features` method adds or refines feature keys such as `observation.state.ee.{x,y,z,wx,wy,wz}` or `action.gripper.pos`.
- Rename and tokenizer processors (`src/lerobot/processor/rename_processor.py`, `src/lerobot/processor/tokenizer_processor.py`): demonstrate key renaming and adding language token features to the contract.
In practice, you will often aggregate features by running `RobotProcessor.transform_features(...)` with your initial features to compute the final contract before recording or training.
## Helper Classes
LeRobot provides pre-built processor classes for common transformations. Below is a comprehensive list of registered processors in the codebase.
### Core processors (observations, actions, normalization)
- **`VanillaObservationProcessor`** (`observation_processor`): Images and state processing to LeRobot format.
- **`NormalizerProcessor`** (`normalizer_processor`): Normalize observations/actions (mean/std or min/max to [-1, 1]).
- **`UnnormalizerProcessor`** (`unnormalizer_processor`): Inverse of the normalizer for model outputs.
- **`DeviceProcessor`** (`device_processor`): Move tensors to a specific device (CPU/GPU) and optional float dtype.
- **`ToBatchProcessor`** (`to_batch_processor`): Add batch dimension to observations/actions when missing.
- **`RenameProcessor`** (`rename_processor`): Rename observation keys using a mapping dictionary.
- **`TokenizerProcessor`** (`tokenizer_processor`): Tokenize language tasks into `observation.language.*` tensors.
### Teleoperation mapping processors
- **`MapDeltaActionToRobotAction`** (`map_delta_action_to_robot_action`): Map teleop deltas (e.g., gamepad) to `action.target_*` fields.
- **`MapPhoneActionToRobotAction`** (`map_phone_action_to_robot_action`): Map calibrated phone pose/buttons to `action.target_*` and gripper.
### Robot kinematics processors (SO100 follower example)
- **`EEReferenceAndDelta`** (`ee_reference_and_delta`): Compute desired EE pose from target deltas and current pose.
- **`EEBoundsAndSafety`** (`ee_bounds_and_safety`): Clip EE pose to bounds and check for jumps.
- **`InverseKinematicsEEToJoints`** (`inverse_kinematics_ee_to_joints`): Convert EE pose to joint targets via IK.
- **`GripperVelocityToJoint`** (`gripper_velocity_to_joint`): Convert gripper velocity input to joint position command.
- **`ForwardKinematicsJointsToEE`** (`forward_kinematics_joints_to_ee`): Compute EE pose features from joint positions via FK.
- **`AddRobotObservationAsComplimentaryData`** (`add_robot_observation`): Read robot observation and insert `raw_joint_positions` into complementary data.
### Policy-specific utility processors
- **`Pi0NewLineProcessor`** (`pi0_new_line_processor`): Ensure text tasks end with a newline (Pi0 tokenizer compatibility).
- **`SmolVLANewLineProcessor`** (`smolvla_new_line_processor`): Ensure text tasks end with a newline (SmolVLA tokenizer compatibility).
### Usage Example
```python
from lerobot.processor import NormalizerProcessor, DeviceProcessor, RobotProcessor, ToBatchProcessor
# Create a processing pipeline (typical policy preprocessor)
steps = [
NormalizerProcessor(features=features, norm_map=norm_map, stats=stats),
ToBatchProcessor(),
DeviceProcessor(device="cuda"),
]
# Use in RobotProcessor
processor = RobotProcessor(steps=steps)
processed_transition = processor(raw_transition)
```
### Using overrides
You can override step parameters at load-time using `overrides`. This is handy for non-serializable objects or site-specific settings. It works both in policy factories and with `RobotProcessor.from_pretrained(...)`.
Example: during policy evaluation on the robot, override the device and rename map.
Use this to run a policy trained on CUDA on a CPU-only robot, or to remap camera keys when the robot uses different names than the dataset.
```437:445:src/lerobot/record.py
preprocessor, postprocessor = make_processor(
policy_cfg=cfg.policy,
pretrained_path=cfg.policy.pretrained_path,
dataset_stats=rename_stats(dataset.meta.stats, cfg.dataset.rename_map),
preprocessor_overrides={
"device_processor": {"device": cfg.policy.device},
"rename_processor": {"rename_map": cfg.dataset.rename_map},
},
)
```
Direct usage with `from_pretrained`:
```python
from lerobot.processor import RobotProcessor
processor = RobotProcessor.from_pretrained(
"username/my-processor",
overrides={
"device_processor": {"device": "cuda:0"}, # registry name for registered steps
"CustomStep": {"param": 42}, # class name for non-registered steps
},
)
```
## Best Practices
- **Keep processors atomic** - One transformation per processor for reusability and debugging
- **Use dataclasses** - Clean initialization with `@dataclass`
- **Always register processors** - Use `@ProcessorStepRegistry.register("name")` for discoverability
- **Check for None** - Always validate required data exists before processing
- **Use copy() for safety** - Avoid side effects with `transition.copy()`
- **Separate config and state** - JSON-serializable config vs tensor state_dict
- **Use base classes** - Inherit from `ObservationProcessor` for observation-only processing
```python
@ProcessorStepRegistry.register("my_processor")
@dataclass
class MyProcessor(ObservationProcessor):
threshold: float = 0.5
def observation(self, observation):
if observation is None:
return observation
# Your processing logic here
return processed_observation
```
## Conclusion
You now have all the tools to implement custom processors in LeRobot! The key steps are:
1. **Define your processor** as a dataclass with the required methods (`__call__`, `get_config`, `state_dict`, `load_state_dict`, `reset`, `feature_contract`)
2. **Register it** using `@ProcessorStepRegistry.register("name")` for discoverability
3. **Integrate it** into a `RobotProcessor` pipeline with other processing steps
4. **Use base classes** like `ObservationProcessor` when possible to reduce boilerplate
The processor system is designed to be modular and composable, allowing you to build complex data processing pipelines from simple, focused components. Whether you're preprocessing sensor data for training or post-processing model outputs for robot execution, custom processors give you the flexibility to handle any data transformation your robotics application requires. Policies like Pi0 and SmolVLA use the same normalization processors described above, so your understanding here will transfer directly when wiring policy preprocessors and postprocessors.
Start simple, test thoroughly, and leverage the existing helper classes to build robust data processing pipelines for your robot learning workflows.
+991
View File
@@ -0,0 +1,991 @@
# Introduction to Processors
In robotics, there's a fundamental mismatch between the data that robots and humans produce and what machine learning models expect. This creates several translation challenges:
**Raw Robot Data → Model Input:**
- Robots output raw sensor data (camera images, joint positions, force readings) that need normalization, batching, and device placement before models can process them
- Language instructions from humans ("pick up the red cube") must be tokenized into numerical representations
- Different robots use different coordinate systems and units that need standardization
**Model Output → Robot Commands:**
- Models might output end-effector positions, but robots need joint-space commands
- Teleoperators (like gamepads) produce relative movements (delta positions), but robots expect absolute commands
- Model predictions are often normalized and need to be converted back to real-world scales
**Cross-Domain Translation:**
- Training data from one robot setup needs adaptation for deployment on different hardware
- Models trained with specific camera configurations must work with new camera arrangements
- Datasets with different naming conventions need harmonization
**That's where processors come in.** They serve as the universal translators that bridge these gaps, ensuring seamless data flow from sensors to models to actuators.
Processors are the data transformation backbone of LeRobot. They handle all the preprocessing and postprocessing steps needed to convert raw environment data into model-ready inputs and vice versa. This guide will walk you through everything you need to know about processors - from basic concepts to advanced usage patterns.
## What are Processors?
In robotics, data comes in many forms - images from cameras, joint positions from sensors, text instructions from users, and more. Each type of data requires specific transformations before a model can use it effectively. Models need this data to be:
- **Normalized**: Scaled to appropriate ranges for neural network processing
- **Batched**: Organized with proper dimensions for batch processing
- **Tokenized**: Text converted to numerical representations
- **Device-placed**: Moved to the right hardware (CPU/GPU)
- **Type-converted**: Cast to appropriate data types
Processors handle these transformations through composable, reusable steps that can be chained together into pipelines. Think of them as a modular assembly line where each station performs a specific transformation on your data.
## Core Concepts
### EnvTransition: The Universal Data Container
The `EnvTransition` is the fundamental data structure that flows through all processors. It's a typed dictionary that represents a complete robot-environment interaction:
```python
from lerobot.processor.pipeline import TransitionKey, EnvTransition
# Example transition from a robot collecting data
transition: EnvTransition = {
TransitionKey.OBSERVATION: {
"observation.images.camera0": camera0_image_tensor, # Shape: (H, W, C)
"observation.images.camera1": camera1_image_tensor, # Shape: (H, W, C)
"observation.state": joint_positions_tensor, # Shape: (7,) for 7-DOF arm
"observation.environment_state": env_state_tensor # Shape: (3,) for object position
},
TransitionKey.ACTION: action_tensor, # Shape: (7,) for joint velocities
TransitionKey.REWARD: 0.0, # Scalar reward signal
TransitionKey.DONE: False, # Episode termination flag
TransitionKey.TRUNCATED: False, # Episode truncation flag
TransitionKey.INFO: {"success": False}, # Additional metadata
TransitionKey.COMPLEMENTARY_DATA: {
"task": "pick up the red cube", # Language instruction
"task_index": 0, # Task identifier
"index": 42 # Frame index
}
}
```
Each key in the transition has a specific purpose:
- **OBSERVATION**: All sensor data (images, states, proprioception)
- **ACTION**: The action to execute or that was executed
- **REWARD**: Reinforcement learning signal
- **DONE/TRUNCATED**: Episode boundary indicators
- **INFO**: Arbitrary metadata
- **COMPLEMENTARY_DATA**: Task descriptions, indices, padding flags, and inter-step data (e.g., velocities computed in one step and then used by a later step to clip the action)
### ProcessorStep: The Building Block Interface
A `ProcessorStep` is a single transformation unit that processes transitions. It's a protocol (interface) that any processor step must implement:
```python
from lerobot.processor.pipeline import ProcessorStep, EnvTransition
from lerobot.configs.types import PolicyFeature
from typing import Any
import torch
class MyProcessorStep:
"""Example processor step interface - all methods must be implemented."""
def __call__(self, transition: EnvTransition) -> EnvTransition:
"""Transform the transition - this is the main processing logic."""
raise NotImplementedError
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
"""Declare how this step transforms feature shapes/types."""
raise NotImplementedError
def get_config(self) -> dict[str, Any]:
"""Return JSON-serializable configuration for saving/loading."""
raise NotImplementedError
def state_dict(self) -> dict[str, torch.Tensor]:
"""Return any learnable parameters (tensors only)."""
raise NotImplementedError
def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
"""Load learnable parameters from saved state."""
raise NotImplementedError
def reset(self) -> None:
"""Reset any internal state between episodes."""
raise NotImplementedError
```
### RobotProcessor: The Pipeline Orchestrator
The `RobotProcessor` chains multiple `ProcessorStep` instances together, executing them sequentially. It provides automatic format conversion to handle both batch dictionaries (from datasets) and EnvTransition dictionaries:
```python
from lerobot.processor.pipeline import RobotProcessor, _default_batch_to_transition, _default_transition_to_batch
# Create a processing pipeline
processor = RobotProcessor(
steps=[
step1, # First transformation
step2, # Second transformation
step3 # Third transformation
],
name="my_preprocessing_pipeline",
# Optional: Custom converters for input/output formats
to_transition=_default_batch_to_transition, # How to convert batch dict → EnvTransition
to_output=_default_transition_to_batch # How to convert EnvTransition → output format
)
# The processor automatically handles different input formats:
# 1. If input is a batch dict (from dataset), converts to EnvTransition
# 2. Passes through each step sequentially
# 3. Converts back to original format (or custom output format)
# Example with batch dict input (common in training)
batch_dict = {"observation.state": tensor, "action": tensor}
output = processor(batch_dict) # Automatically converted to/from EnvTransition
# Example with EnvTransition input (common in inference)
transition = {TransitionKey.OBSERVATION: {...}, TransitionKey.ACTION: ...}
output = processor(transition) # Stays as EnvTransition throughout
```
The `to_transition` and `to_output` converters enable seamless integration with existing codebases.
By default, they handle the standard LeRobot batch format, but you can customize them for different data structures.
### Additional Converter Functions
LeRobot provides several specialized converter functions for common robotics scenarios:
```python
from lerobot.processor.converters import (
to_transition_teleop_action,
to_transition_robot_observation,
to_output_robot_action,
to_dataset_frame
)
```
**`to_transition_teleop_action`** - Converts teleoperation device actions to EnvTransitions:
```python
# Use case: Phone, gamepad, or other teleop device control
phone_action = {"x": 0.1, "y": -0.2, "gripper": 0.8}
transition = to_transition_teleop_action(phone_action)
# Creates: {ACTION: {"action.x": 0.1, "action.y": -0.2, "action.gripper": 0.8}, ...}
```
**`to_transition_robot_observation`** - Converts robot sensor data to EnvTransitions:
```python
# Use case: Live robot observation during inference
robot_obs = {
"joint_1": 0.5, "joint_2": -0.3, # joint positions
"camera_0": image_array # camera images
}
transition = to_transition_robot_observation(robot_obs)
# Creates: {OBSERVATION: {"observation.state.joint_1": 0.5, "observation.images.camera_0": image, ...}}
```
**`to_output_robot_action`** - Extracts robot-executable actions from EnvTransitions:
```python
# Use case: Converting model outputs back to robot commands
model_transition = {ACTION: {"action.joint_1": 0.2, "action.joint_2": 0.1}}
robot_action = to_output_robot_action(model_transition)
# Returns: {"joint_1": 0.2, "joint_2": 0.1} - ready for robot.send_action()
```
**`to_dataset_frame`** - Converts transitions to dataset-compatible format:
```python
# Use case: Saving processed data or creating training batches
features = {
"action": {"names": ["joint_1", "joint_2"]},
"observation.state": {"names": ["joint_1", "joint_2"]},
"observation.images.camera0": {...}
}
batch = to_dataset_frame(transition, features)
# Returns: {"action": [0.2, 0.1], "observation.state": [0.5, -0.3], ...}
```
These converters are particularly useful when integrating with real robots, as shown in the examples:
```python
# Example from phone_so100_teleop.py - Real robot teleoperation
phone_to_robot_ee_pose = RobotProcessor(
steps=[...],
to_transition=to_transition_teleop_action, # Phone → EnvTransition
to_output=lambda tr: tr # Keep as EnvTransition
)
# Example from phone_so100_eval.py - Robot action execution
robot_ee_to_joints = RobotProcessor(
steps=[...],
to_transition=lambda tr: tr, # Already EnvTransition
to_output=to_output_robot_action # EnvTransition → Robot action
)
# Example from phone_so100_record.py - Dataset recording
robot_joints_to_ee_pose = RobotProcessor(
steps=[...],
to_transition=to_transition_robot_observation, # Robot obs → EnvTransition
to_output=lambda tr: tr # Keep as EnvTransition for dataset
)
```
### Data Format Conversion
Different data sources have different formats, but processors need a unified `EnvTransition` structure internally.
The default converters handle LeRobot datasets, but you can customize them:
```python
# Default: LeRobot batch format
lerobot_batch = {
"observation.state": torch.tensor(...),
"action": torch.tensor(...),
"next.reward": torch.tensor(...),
"task": ["pick cube", ...]
}
# → Converts to EnvTransition → Processes → Converts back
# Custom: Live robot data
robot_data = {
"cameras": {"wrist_cam": np.array(...)},
"joint_positions": np.array(...),
"gripper_state": 0.5
}
def robot_to_transition(data: dict) -> EnvTransition:
return {
TransitionKey.OBSERVATION: {
"observation.images.wrist": torch.from_numpy(data["cameras"]["wrist_cam"]),
"observation.state": torch.from_numpy(data["joint_positions"])
},
TransitionKey.ACTION: None,
# ... other fields with defaults
}
# Use custom converter
processor = RobotProcessor(
steps=[...],
to_transition=robot_to_transition,
to_output=lambda transition: transition # Keep as EnvTransition
)
```
**When to customize:** Live robot data, Gymnasium environments, legacy datasets, or any non-LeRobot format.
## Common Processor Steps
LeRobot provides a rich set of pre-built processor steps for common transformations.
Let's explore each in detail:
### Data Normalization
Normalization is crucial for neural network training and inference.
The `NormalizerProcessor` handles both mean-std normalization and min-max scaling:
```python
from lerobot.processor.normalize_processor import NormalizerProcessor, UnnormalizerProcessor
from lerobot.configs.types import PolicyFeature, FeatureType, NormalizationMode
# Define what features exist in your data
features = {
"observation.images.camera0": PolicyFeature(
type=FeatureType.IMAGE,
shape=(224, 224, 3)
),
"observation.state": PolicyFeature(
type=FeatureType.STATE,
shape=(7,)
),
"action": PolicyFeature(
type=FeatureType.ACTION,
shape=(7,)
)
}
# Define normalization strategy per feature type
norm_map = {
FeatureType.IMAGE: NormalizationMode.MEAN_STD, # Images: (x - mean) / std
FeatureType.STATE: NormalizationMode.MIN_MAX, # States: scale to [-1, 1]
FeatureType.ACTION: NormalizationMode.MIN_MAX # Actions: scale to [-1, 1]
}
# Create normalizer with dataset statistics
normalizer = NormalizerProcessor(
features=features,
norm_map=norm_map,
stats=dataset.meta.stats, # Contains mean, std, min, max per feature
normalize_keys={"observation.state", "action"} # Optional: only normalize specific keys
)
# For postprocessing: inverse transformation
unnormalizer = UnnormalizerProcessor(
features=features,
norm_map=norm_map,
stats=dataset.meta.stats
)
# The normalizer automatically:
# - Detects which normalization to apply based on feature type
# - Handles device placement of statistics tensors
# - Skips keys not in stats or not in normalize_keys
# - Adds metadata about what was normalized
```
### Device Management
The `DeviceProcessor` ensures tensors are on the right device with the right dtype:
```python
from lerobot.processor.device_processor import DeviceProcessor
# Basic GPU placement
gpu_processor = DeviceProcessor(device="cuda:0")
# Advanced: GPU with half-precision for inference
efficient_processor = DeviceProcessor(
device="cuda:0",
float_dtype="float16" # Convert float32 -> float16 for memory efficiency
)
# The processor:
# - Moves all tensors to specified device
# - Preserves non-tensor data unchanged
# - Optionally converts float dtypes while preserving int/bool types
# - Uses non_blocking transfers for CUDA devices
# - Handles nested structures (observations, complementary_data)
# Supported float dtypes:
# "float16" / "half": 16-bit floating point
# "float32" / "float": 32-bit floating point (default)
# "float64" / "double": 64-bit floating point
# "bfloat16": Brain floating point (better for training)
```
### Batch Processing
Models expect batched inputs, but robot interactions often produce unbatched data:
```python
from lerobot.processor.batch_processor import ToBatchProcessor
batch_processor = ToBatchProcessor()
# Automatically adds batch dimensions where needed:
# State: (7,) -> (1, 7)
# Image: (224, 224, 3) -> (1, 224, 224, 3)
# Action: (4,) -> (1, 4)
# Task: "pick_cube" -> ["pick_cube"]
# Already batched: (1, 7) -> (1, 7) [unchanged]
# The processor intelligently:
# - Detects tensor dimensionality
# - Adds batch dim to 1D states/actions
# - Adds batch dim to 3D images
# - Wraps string tasks in lists
# - Preserves already-batched data
# Example usage in inference:
single_observation = robot.get_observation() # Unbatched
batched_input = batch_processor({"observation": single_observation})
model_output = model(batched_input) # Model expects batch dim
```
### Text Tokenization
For language-conditioned policies, text instructions must be tokenized:
```python
from lerobot.processor.tokenizer_processor import TokenizerProcessor
from transformers import AutoTokenizer
# Option 1: Auto-load tokenizer by name
tokenizer_proc = TokenizerProcessor(
tokenizer_name="google/paligemma-3b-pt-224",
max_length=128,
task_key="task", # Where to find text in complementary_data
padding="max_length", # Pad to max_length
padding_side="right",
truncation=True # Truncate if longer than max_length
)
# Option 2: Provide custom tokenizer
custom_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
custom_proc = TokenizerProcessor(
tokenizer=custom_tokenizer,
max_length=256,
padding_side="left" # For autoregressive models
)
# The processor:
# - Extracts task text from complementary_data
# - Tokenizes using HuggingFace tokenizer
# - Adds tokens and attention_mask to observations
# - Handles both single strings and lists of strings
# - Preserves original task in complementary_data
# Output structure:
# observation["observation.language.tokens"] = tensor([101, 2032, ...])
# observation["observation.language.attention_mask"] = tensor([1, 1, 0, ...])
```
### Key Renaming
Different datasets and models may use different naming conventions.
The `RenameProcessor` solves this mismatch:
**Why is this useful?**
- When loading a model trained on a different dataset with different key names
- When using foundation models that expect specific key naming conventions
- When standardizing datasets from different sources
- When adapting legacy code to new naming standards
```python
from lerobot.processor.rename_processor import RenameProcessor
# Example 1: Dataset uses "top"/"wrist", model expects "camera0"/"camera1"
rename_proc = RenameProcessor(
rename_map={
"observation.images.top": "observation.images.camera0",
"observation.images.wrist": "observation.images.camera1",
}
)
# Example 2: Foundation model compatibility
# Your dataset: "observation.state", Foundation model: "proprio"
foundation_rename = RenameProcessor(
rename_map={
"observation.state": "proprio",
"observation.images.main": "rgb",
}
)
# Example 3: Standardizing multiple datasets
standardize_rename = RenameProcessor(
rename_map={
# Different robots might use different names
"observation.joint_positions": "observation.state",
"observation.gripper_state": "observation.end_effector",
"observation.arm_camera": "observation.images.wrist",
}
)
```
## Building Complete Pipelines
Let's build a real-world preprocessing and postprocessing pipeline for a vision-based
manipulation policy:
```python
# Consolidated imports
from lerobot.processor import (
RobotProcessor,
NormalizerProcessor,
UnnormalizerProcessor,
DeviceProcessor,
ToBatchProcessor,
TokenizerProcessor,
RenameProcessor
)
# Step 1: Define the preprocessing pipeline
preprocessor = RobotProcessor(
steps=[
# 1. Standardize naming from dataset
RenameProcessor(
rename_map={
"observation.images.top": "observation.images.camera0",
"observation.images.wrist": "observation.images.camera1"
}
),
# 2. Add batch dimensions for model
ToBatchProcessor(),
# 3. Tokenize language instructions if present
TokenizerProcessor(
tokenizer_name="google/paligemma-3b-pt-224",
max_length=64,
task_key="task"
),
# 4. Normalize numerical data
NormalizerProcessor(
features=policy_features,
norm_map={
FeatureType.IMAGE: NormalizationMode.MEAN_STD,
FeatureType.STATE: NormalizationMode.MIN_MAX,
FeatureType.ACTION: NormalizationMode.MIN_MAX
},
stats=dataset.meta.stats
),
# 5. Move to GPU and convert to half precision
DeviceProcessor(
device="cuda:0",
float_dtype="float16"
)
],
name="robot_preprocessor"
)
# Step 2: Define the postprocessing pipeline
postprocessor = RobotProcessor(
steps=[
# 1. Move back to CPU for robot hardware
DeviceProcessor(device="cpu"),
# 2. Denormalize actions to original scale
UnnormalizerProcessor(
features=policy_features,
norm_map={
FeatureType.ACTION: NormalizationMode.MIN_MAX
},
stats=dataset.meta.stats
)
],
name="robot_postprocessor"
)
```
## Using Processors in Practice
### Training Loop Integration
Here's how processors integrate into a training loop using the policy's forward method:
```python
from torch.utils.data import DataLoader
# Create dataset and dataloader
dataset = LeRobotDataset(repo_id="your_dataset")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Initialize model and processors
model = YourPolicy.from_pretrained("your_model")
preprocessor = RobotProcessor.from_pretrained(
"your_model",
config_filename="robot_preprocessor.json"
)
# Training loop
for epoch in range(num_epochs):
for batch in dataloader:
# Preprocess batch
processed_batch = preprocessor(batch)
# Forward pass - returns loss and optional metrics
loss, metrics = model.forward(processed_batch)
# Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Log metrics if available
if metrics:
wandb.log(metrics)
```
### Inference Pipeline
For deployment, processors ensure consistent data handling with real robots:
```python
# Load model and processors
policy = YourPolicy.from_pretrained("path/to/model")
preprocessor = RobotProcessor.from_pretrained(
"path/to/model",
config_filename="robot_preprocessor.json"
)
postprocessor = RobotProcessor.from_pretrained(
"path/to/model",
config_filename="robot_postprocessor.json"
)
# Connect to robot
robot = make_robot_from_config(robot_config)
robot.connect()
# Inference loop
policy.eval()
# Reset the policy and processors
policy.reset()
preprocessor.reset()
postprocessor.reset()
with torch.no_grad():
while not done:
# Get observation from robot
observation = robot.get_observation()
# Build dataset-compatible frame
observation_frame = build_dataset_frame(
dataset.features,
observation,
prefix="observation"
)
# Add task instruction to complementary data
observation_frame["task"] = "pick up the red cube"
# Preprocess for model
model_input = preprocessor(observation_frame)
# Run policy
raw_action = policy.select_action(model_input)
# Postprocess action
action_transition = {TransitionKey.ACTION: raw_action}
processed = postprocessor(action_transition)
action = processed[TransitionKey.ACTION]
# Convert to robot action format
robot_action = {
key: action[i].item()
for i, key in enumerate(robot.action_features)
}
# Execute on robot
robot.send_action(robot_action)
```
## Saving and Loading Processors
Processors can be persisted and shared just like models, making them portable across different
environments and ensuring reproducibility:
### Local Save/Load
```python
# Save processor configuration and state
preprocessor.save_pretrained(
"./my_robot_processor",
config_filename="preprocessor.json" # Optional custom name
)
# The save creates:
# my_robot_processor/
# ├── preprocessor.json # Configuration
# ├── preprocessor_step_0_normalizer.safetensors # Step 0 state (stats)
# └── preprocessor_step_1_device.safetensors # Step 1 state (if any)
# Load processor
loaded = RobotProcessor.from_pretrained(
"./my_robot_processor",
config_filename="preprocessor.json"
)
```
### HuggingFace Hub Integration
The HuggingFace Hub provides a centralized place to share and version your processors.
This is particularly useful for sharing preprocessing configurations with models,
ensuring that anyone who downloads your model can reproduce your exact preprocessing pipeline.
It also enables versioning and collaboration on preprocessing strategies.
```python
# Save to HuggingFace Hub
preprocessor.save_pretrained("username/my-robot-policy")
# Load from Hub with automatic download
hub_processor = RobotProcessor.from_pretrained(
"username/my-robot-policy",
config_filename="robot_preprocessor.json",
revision="main", # Optional: specific revision
cache_dir="./cache" # Optional: local cache directory
)
# The Hub integration provides:
# - Automatic versioning with git
# - Public or private sharing
# - Download caching for efficiency
# - Integration with model repositories
```
### Loading with Overrides
Sometimes you need to modify loaded processors for new environments or datasets.
The override mechanism allows you to update specific processor configurations without modifying
the saved files:
```python
# Load processor with configuration overrides
processor = RobotProcessor.from_pretrained(
"./saved_processor",
overrides={
# Change device for different hardware
"device_processor": {"device": "cuda:1"},
# Update statistics for new dataset
"normalizer_processor": {"stats": new_dataset.meta.stats},
# Provide non-serializable objects (like tokenizers)
"tokenizer_processor": {"tokenizer": custom_tokenizer}
}
)
# Common override scenarios:
# 1. Adapting to different hardware (GPU availability)
# 2. Fine-tuning on new datasets with different statistics
# 3. Providing runtime dependencies that can't be serialized
# 4. Testing variations without creating new saved configs
```
## Creating Custom Processor Steps
Build your own processor steps for specialized transformations.
The key is implementing the required interface:
### Basic Custom Step with Registration
The registration mechanism allows your custom processors to be saved and loaded by name rather
than by module path.
This makes them more portable and easier to share:
```python
from dataclasses import dataclass
from lerobot.processor.pipeline import ProcessorStepRegistry, ObservationProcessor
# The @register decorator adds your processor to the global registry
# Use a unique name, preferably namespaced to avoid conflicts
@dataclass
@ProcessorStepRegistry.register("my_company/gaussian_noise")
class GaussianNoiseProcessor(ObservationProcessor):
"""Add Gaussian noise to observations for robustness training."""
noise_std: float = 0.01
training_only: bool = True
is_training: bool = True
def observation(self, observation):
"""Add noise to observation tensors."""
if not self.is_training and self.training_only:
return observation
noisy_obs = {}
for key, value in observation.items():
if isinstance(value, torch.Tensor) and "image" not in key:
# Add noise to non-image observations
noise = torch.randn_like(value) * self.noise_std
noisy_obs[key] = value + noise
else:
noisy_obs[key] = value
return noisy_obs
def get_config(self):
return {
"noise_std": self.noise_std,
"training_only": self.training_only,
"is_training": self.is_training
}
# Why register?
# 1. Enables saving by name: config saves "my_company/gaussian_noise" instead of full module path
# 2. More portable: Others can use your processor without your exact module structure
# 3. Version-safe: Module refactoring won't break saved configs
# 4. Cleaner configs: JSON shows readable names instead of long import paths
```
### Using Base Classes for Common Patterns
LeRobot provides base classes like `ObservationProcessor`, `ActionProcessor`, etc., that handle
the boilerplate of extracting and reinserting specific components:
```python
from lerobot.processor import ActionProcessor
@dataclass
@ProcessorStepRegistry.register("my_company/action_clipper")
class ActionClipProcessor(ActionProcessor):
"""Clip actions to safe ranges."""
min_value: float = -1.0
max_value: float = 1.0
def action(self, action):
"""Process only the action component."""
# No need to handle transition dict - base class does it
return torch.clamp(action, self.min_value, self.max_value)
def get_config(self):
return {"min_value": self.min_value, "max_value": self.max_value}
```
For more advanced processor patterns including stateful processors, see [Implement Your Own Processor](implement_your_own_processor.mdx).
## Advanced Features
### Debugging with Hooks
Processors support hooks for monitoring and debugging without modifying the pipeline code:
```python
# Define monitoring hooks
def log_shapes(step_idx: int, transition: EnvTransition):
"""Log tensor shapes after each step."""
obs = transition.get(TransitionKey.OBSERVATION)
if obs:
print(f"Step {step_idx} shapes:")
for key, value in obs.items():
if isinstance(value, torch.Tensor):
print(f" {key}: {value.shape}")
def check_nans(step_idx: int, transition: EnvTransition):
"""Check for NaN values."""
obs = transition.get(TransitionKey.OBSERVATION)
if obs:
for key, value in obs.items():
if isinstance(value, torch.Tensor) and torch.isnan(value).any():
print(f"Warning: NaN detected in {key} at step {step_idx}")
# Register hooks
processor.register_after_step_hook(log_shapes)
processor.register_after_step_hook(check_nans)
# Process data - hooks will be called after each step
output = processor(input_data)
# Remove hooks when done debugging
processor.unregister_after_step_hook(log_shapes)
processor.unregister_after_step_hook(check_nans)
```
### Step-by-Step Inspection
Use `step_through()` for detailed debugging of the transformation pipeline:
```python
# Inspect data at each transformation stage
for i, intermediate in enumerate(processor.step_through(data)):
print(f"\n=== After step {i} ===")
# Check observation shapes
obs = intermediate.get(TransitionKey.OBSERVATION)
if obs:
for key, value in obs.items():
if isinstance(value, torch.Tensor):
print(f"{key}: shape={value.shape}, "
f"dtype={value.dtype}, "
f"device={value.device}, "
f"range=[{value.min():.3f}, {value.max():.3f}]")
# Check action if present
action = intermediate.get(TransitionKey.ACTION)
if action is not None and isinstance(action, torch.Tensor):
print(f"action: shape={action.shape}, range=[{action.min():.3f}, {action.max():.3f}]")
```
### Pipeline Slicing
Extract subsets of a pipeline for testing or creating variations:
```python
# Get specific steps
first_three_steps = processor[:3] # Returns new RobotProcessor
middle_step = processor[2] # Returns single ProcessorStep
# Test individual steps
test_input = {...}
step_output = processor[0](test_input) # Test first step only
# Create variations
variant_processor = RobotProcessor(
steps=processor.steps[:-1] + [new_final_step],
name="variant"
)
```
## Best Practices and Tips
### 1. Order Matters
The sequence of processors is crucial. Follow this general order:
```python
# Preprocessing: Raw → Model-ready
1. Rename (standardize keys)
2. Batch (add dimensions)
3. Tokenize (text → tokens)
4. Normalize (scale values)
5. Device (move to GPU)
# Postprocessing: Model → Robot-ready
1. Device (move to CPU)
2. Unnormalize (restore scale)
3. Unbatch (remove dimensions if needed)
```
### 2. Registration Best Practices
```python
# Always register custom steps for better portability
@ProcessorStepRegistry.register("my_company/special_processor")
class SpecialProcessor:
...
# Use namespaced names to avoid conflicts
# Good: "my_company/augmentation"
# Bad: "augmentation" (too generic)
# Check registered processors
print(ProcessorStepRegistry.list()) # See all registered processors
```
### 3. Common Pitfalls and Solutions
**Tensor Device Mismatch:**
```python
# Problem: RuntimeError: Expected all tensors on same device
# Solution: Ensure DeviceProcessor is in pipeline
preprocessor = RobotProcessor(
steps=[
NormalizerProcessor(...),
DeviceProcessor(device="cuda") # Add this
]
)
```
**Missing Statistics:**
```python
# Problem: NormalizerProcessor has no stats
# Solution 1: Compute stats from dataset
from lerobot.datasets.compute_stats import compute_stats
stats = compute_stats(dataset)
# Solution 2: Load with overrides
processor = RobotProcessor.from_pretrained(
"model_path",
overrides={"normalizer_processor": {"stats": dataset.meta.stats}}
)
```
## Next Steps
Now that you understand processors, explore these topics:
- [**Implement Your Own Processor**](implement_your_own_processor.mdx) - Deep dive into creating custom processors with advanced features like stateful processing
- [**Policy Documentation**](policies.mdx) - Learn how different policies use processors
- [**Dataset Documentation**](datasets.mdx) - Understand the data format that processors transform
- [**Training Guide**](training.mdx) - See processors in action during model training
- [**Evaluation Guide**](evaluation.mdx) - Learn about processor usage during policy evaluation
## Summary
Processors are the unsung heroes of robotics pipelines, handling the critical transformations between raw sensor data and model-ready tensors. By understanding and effectively using processors, you can:
- Build robust, reusable data pipelines
- Share preprocessing configurations across projects
- Debug data transformations systematically
- Ensure consistency between training and deployment
- Create custom transformations for specialized tasks
Remember: good preprocessing is often the difference between a model that works in theory
and one that works in practice!
The modular pipeline approach ensures your transformations are testable, reproducible,
and portable across different robots and environments.
+195
View File
@@ -0,0 +1,195 @@
# Phone
Use your phone (iOS or Android) to control your robot.
**In this guide you'll learn:**
- How to connect an iOS/Android phone
- How phone pose is mapped to robot end-effector (EE) targets
- How to tweak safety limits, gripper control, and IK settings
To use phone to control your robot, install the relevant dependencies with:
```bash
pip install lerobot[phone]
```
## Get started
### Supported platforms
- iOS: Uses the HEBI Mobile I/O app (ARKit pose + buttons). Download the app first, open it and the examples will discover it on your network and stream the phone pose and inputs.
- Android: Uses the `teleop` package (WebXR). When you start the Python process, it prints a local URL. Open the link on your phone, tap Start, then use Move to stream pose.
Links:
- Android WebXR library: [`teleop` on PyPI](https://pypi.org/project/teleop/)
- iOS app: [HEBI Mobile I/O](https://docs.hebi.us/tools.html#mobile-io)
### Phone orientation and controls
- Orientation: hold the phone with the screen facing up and the top edge pointing in the same direction as the robot gripper. This ensures calibration aligns the phone's frame with the robot frame so motion feels natural.
- Enable/disable:
- iOS: Hold `B1` to enable teleoperation, release to stop. The first press captures a reference pose.
- Android: Press and hold the `Move` button, release to stop. The first press captures a reference pose.
- Gripper control:
- iOS: Analog input `A3` controls the gripper as velocity input.
- Android: Buttons `A` and `B` act like increment/decrement (A opens, B closes). You can tune velocity in the `GripperVelocityToJoint` step.
### Step 1: Choose the platform
Modify the examples to use `PhoneOS.IOS` or `PhoneOS.ANDROID` in `PhoneConfig`. The API is identical across platforms, only the input source differs. All examples are under `examples/` and have `phone_so100_*.py` variants.
Teleoperation example:
```36:43:examples/phone_so100_teleop.py
from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS
teleop_config = PhoneConfig(phone_os=PhoneOS.IOS) # or PhoneOS.ANDROID
teleop_device = Phone(teleop_config)
```
### Step 2: Connect and calibrate
When `Phone(teleop_config)` is created and `connect()` is called, calibration is prompted automatically. Hold the phone in the orientation described above, then:
- iOS: press and hold `B1` to capture the reference pose.
- Android: press `Move` button on the WebXR page to capture the reference pose.
Why calibrate? We capture the current pose so subsequent poses are expressed in a robot-aligned frame. When you again press the button to enable control, the position is recaptured to avoid drift in case your phone was repositioned while control was disabled.
### Step 3: Run an example
Run one of the example scripts to teleoperate, record a dataset, replay a dataset, or evaluate a policy.
All scripts assume you configured your robot (e.g., SO-100 follower) and set the correct serial port.
- Android: after starting the script, open the printed local URL on your phone, tap Start, then press and hold Move.
- iOS: open HEBI Mobile I/O first; B1 enables motion. A3 controls the gripper.
You can customize mapping or safety limits by editing the processor steps shown in the examples.
You can also remap inputs (e.g., use a different analog input) or adapt the pipeline to other robots (e.g., LeKiwi) by modifying the input and kinematics steps. More about this in the [Processors for Robots and Teleoperators](./processors_robots_teleop.mdx) guide.
- Run this example to teleoperate:
```bash
python examples/phone_so100_teleop.py
```
- Run this example to record a dataset, which saves absolute end effector observations and actions:
```bash
python examples/phone_so100_record.py
```
- Run this example to replay recorded episodes:
```bash
python examples/phone_so100_replay.py
```
- Run this example to evaluate a pretrained policy:
```bash
python examples/phone_so100_eval.py
```
### Important pipeline steps and options
- Kinematics are used in multiple steps. We use [Placo](https://github.com/Rhoban/placo) which is a wrapper around Pinocchio for handling our kinematics. We construct the kinematics object by passing the robot's URDF and target frame. We set `target_frame_name` to the gripper frame.
```44:49:examples/phone_so100_teleop.py
RobotKinematics(
urdf_path="./src/lerobot/teleoperators/sim/so101_new_calib.urdf",
target_frame_name="gripper_frame_link",
joint_names=list(robot.bus.motors.keys()),
)
```
- The `MapPhoneActionToRobotAction` step converts the calibrated phone pose and inputs into target deltas and gripper commands, below is shown what the step outputs.
```72:83:src/lerobot/teleoperators/phone/phone_processor.py
# Map calibrated phone pose to robot targets (enabled gates the motion)
act.update(
{
"action.enabled": enabled,
"action.target_x": -pos[1] if enabled else 0.0,
"action.target_y": pos[0] if enabled else 0.0,
"action.target_z": pos[2] if enabled else 0.0,
"action.target_wx": rotvec[1] if enabled else 0.0,
"action.target_wy": rotvec[0] if enabled else 0.0,
"action.target_wz": -rotvec[2] if enabled else 0.0,
"action.gripper": gripper,
}
)
```
- The `EEReferenceAndDelta` step converts target deltas to an absolute desired EE pose, storing a reference on enable, the `end_effector_step_sizes` are the step sizes for the EE pose and can be modified to change the motion speed.
```56:65:examples/phone_so100_teleop.py
EEReferenceAndDelta(
kinematics=kinematics_solver,
end_effector_step_sizes={"x": 0.5, "y": 0.5, "z": 0.5},
motor_names=list(robot.bus.motors.keys()),
)
```
- The `EEBoundsAndSafety` step clamps EE motion to a workspace and checks for large EE step jumps to ensure safety. The `end_effector_bounds` are the bounds for the EE pose and can be modified to change the workspace. The `max_ee_step_m` and `max_ee_twist_step_rad` are the step limits for the EE pose and can be modified to change the safety limits.
```61:66:examples/phone_so100_teleop.py
EEBoundsAndSafety(
end_effector_bounds={"min": [-1.0, -1.0, -1.0], "max": [1.0, 1.0, 1.0]},
max_ee_step_m=0.10,
max_ee_twist_step_rad=0.50,
)
```
- The `GripperVelocityToJoint` step turns a velocity-like gripper input into an absolute gripper position using the current measured state. The `speed_factor` is the factor by which the velocity is multiplied.
```78:81:examples/phone_so100_teleop.py
GripperVelocityToJoint(
motor_names=list(robot.bus.motors.keys()),
speed_factor=20.0,
)
```
#### Different IK initial guesses
We use different IK initial guesses in the kinematic steps. As initial guess either the current measured joints or the previous IK solution is used.
- Closed loop (used in record/eval): sets `initial_guess_current_joints=True` so IK starts from the measured joints each frame.
```71:76:examples/phone_so100_eval.py
InverseKinematicsEEToJoints(
kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
initial_guess_current_joints=True, # closed loop
)
```
- Open loop (used in replay): sets `initial_guess_current_joints=False` so IK continues from the previous IK solution rather than the measured state. This preserves action stability when we replay without feedback.
```80:86:examples/phone_so100_replay.py
InverseKinematicsEEToJoints(
kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
initial_guess_current_joints=False, # open loop
)
```
### Pipeline steps explained
- MapPhoneActionToRobotAction: converts calibrated phone pose and inputs into target deltas and a gripper command. Motion is gated by an enable signal (B1 on iOS, Move on Android).
- AddRobotObservationAsComplimentaryData: reads current robot joints and inserts them under `complementary_data.raw_joint_positions` for FK/IK steps to use.
- EEReferenceAndDelta: latches a reference EE pose on enable and combines it with target deltas to produce an absolute desired EE pose each frame. When disabled, it keeps sending the last commanded pose.
- EEBoundsAndSafety: clamps the EE pose to a workspace and rate-limits jumps for safety. Also declares `action.ee.*` features.
- InverseKinematicsEEToJoints: turns an EE pose into joint positions with IK. `initial_guess_current_joints=True` is recommended for closed-loop control; set `False` for open-loop replay for stability.
- GripperVelocityToJoint: integrates a velocity-like gripper input into an absolute gripper position using the current measured state.
- ForwardKinematicsJointsToEE: computes `observation.state.ee.*` from observed joints for logging and training on EE state.
### Troubleshooting
- iOS not discovered: ensure HEBI Mobile I/O is open and your laptop/phone are on the same network.
- Android URL not reachable: check that you used `https` instead of `http`, use the exact IP printed by the script, and allow your browser to proceed past the self-signed certificate warning.
- Motion feels inverted: adjust the sign flips in `MapPhoneActionToRobotAction` or swap axes to match your setup.
+148
View File
@@ -0,0 +1,148 @@
# Processors for Robots and Teleoperators
This guide shows how to build and modify processing pipelines that connect teleoperators (e.g., phone) to robots and datasets. Pipelines standardize conversions between different action/observation spaces so you can swap teleops and robots without rewriting glue code.
We use the Phone to SO100 follower examples for concreteness, but the same patterns apply to other robots.
**What you'll learn**
- Absolute vs. relative EE control: What each means, tradeoffs, and how to choose for your task.
- Three-pipeline pattern: How to map teleop actions → dataset actions → robot commands, and robot observations → dataset observations.
- Adapters (`to_transition` / `to_output`): How these convert raw dicts to `EnvTransition` and back to reduce boilerplate.
- Dataset feature contracts: How steps declare features via `transform_features(...)`, and how to aggregate/merge them for recording.
- Choosing a representation: When to store joints, absolute EE poses, or relative EE deltas—and how that affects training.
- Pipeline customization guidance: How to swap robots/URDFs safely and tune bounds, step sizes, and options like IK initialization.
### Absolute vs relative EE control
The examples in this guide use absolute end effector (EE) poses because they are easy to reason about. In practice, relative EE deltas or joint positions are often preferred as learning features.
You can choose what you save and learn from the teleop and robot action spaces — joints, absolute EE, or relative EE — by using/implementing the right steps (and `transform_features()`) in your pipelines.
## Three pipelines
We often compose three pipelines. Depending on your setup, some can be empty if action and observation spaces already match.
Each of these pipelines handles different conversions between different action and observation spaces. Below is a quick explanation of each pipeline.
1. Pipeline 1: Teleop action space → dataset action space (phone pose → EE targets)
2. Pipeline 2: Dataset action space → robot command space (EE targets → joints)
3. Pipeline 3: Robot observation space → dataset observation space (joints → EE pose)
Below is an example of the three pipelines that we use in the phone to SO-100 follower examples:
```69:90:examples/phone_so100_record.py
phone_to_robot_ee_pose = RobotProcessor( # teleop -> dataset action
steps=[MapPhoneActionToRobotAction(platform=teleop_config.phone_os),
AddRobotObservationAsComplimentaryData(robot=robot),
EEReferenceAndDelta(kinematics=kinematics_solver,
end_effector_step_sizes={"x": 0.5, "y": 0.5, "z": 0.5},
motor_names=list(robot.bus.motors.keys())),
EEBoundsAndSafety(end_effector_bounds={"min": [-1, -1, -1], "max": [1, 1, 1]},
max_ee_step_m=0.20, max_ee_twist_step_rad=0.50)],
to_transition=to_transition_teleop_action,
to_output=lambda tr: tr,
)
robot_ee_to_joints = RobotProcessor( # dataset action -> robot
steps=[InverseKinematicsEEToJoints(kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
initial_guess_current_joints=True),
GripperVelocityToJoint(motor_names=list(robot.bus.motors.keys()), speed_factor=20.0)],
to_transition=lambda tr: tr,
to_output=to_output_robot_action,
)
robot_joints_to_ee_pose = RobotProcessor( # robot obs -> dataset obs
steps=[ForwardKinematicsJointsToEE(kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()))],
to_transition=to_transition_robot_observation,
to_output=lambda tr: tr,
)
```
## Why to_transition / to_output
To convert from robot/teleoperator to pipeline and back, we use the `to_transition` and `to_output` pipeline adapters.
They standardize conversions to reduce boilerplate code, and form the bridge between the robot and teleoperators raw dicts and the pipelines `EnvTransition` format.
In the phone to SO-100 follower examples we use the following adapters:
- `to_transition_teleop_action`: transforms the teleop action dict to a pipeline transition (puts keys under `action.*`, converts scalars/arrays to tensors, keeps objects like `Rotation` intact)
- `to_output_robot_action`: transforms the pipeline transition to a robot action dict (extracts keys ending with `.pos`/`.vel` and strips `action.` prefix)
- `to_transition_robot_observation`: transforms the robot observation dict to a pipeline transition (splits state vs images; stores state under `observation.state.*` and images under `observation.images.*`)
See `src/lerobot/processor/converters.py` for more details.
## Dataset feature contracts
Dataset features are the keys saved in the dataset. Each step can declare what its dataset features are via `transform_features(...)`. We can then aggregate features per pipeline with `aggregate_pipeline_dataset_features()` and merge multiple groups with `merge_features(...)`.
Below is an example of how we declare features with the `transform_features` method in the phone to SO-100 follower examples:
```203:211:src/lerobot/robots/so100_follower/robot_kinematic_processor.py
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# Because this is last step we specify the dataset features of this step that we want to be stored in the dataset
features["action.ee.x"] = float
features["action.ee.y"] = float
features["action.ee.z"] = float
features["action.ee.wx"] = float
features["action.ee.wy"] = float
features["action.ee.wz"] = float
return features
```
Tip: declare features at the last step that produces them (e.g., `EEBoundsAndSafety` declares `action.ee.*`, `ForwardKinematicsJointsToEE` declares `observation.state.ee.*`).
Below is an example of how we aggregate and merge features in the phone to SO-100 follower examples:
```121:145:examples/phone_so100_record.py
action_ee = aggregate_pipeline_dataset_features(
pipeline=phone_to_robot_ee_pose,
initial_features=phone.action_features,
use_videos=True,
patterns=["action.ee"],
)
gripper = aggregate_pipeline_dataset_features(
pipeline=robot_ee_to_joints,
initial_features={},
use_videos=True,
patterns=["action.gripper.pos", "observation.state.gripper.pos"],
)
observation_ee = aggregate_pipeline_dataset_features(
pipeline=robot_joints_to_ee_pose,
initial_features=robot.observation_features,
use_videos=True,
patterns=["observation.state.ee"],
)
dataset_features = merge_features(action_ee, gripper, observation_ee)
```
How it works:
- `aggregate_pipeline_dataset_features(...)`: applies `transform_features` across the pipeline and filters by patterns (images included when `use_videos=True`).
- `merge_features(...)`: combines multiple feature dicts.
- Recording uses `to_dataset_frame(...)` to build frames consistent with `dataset.features` before we call `add_frame(...)` to add the frame to the dataset.
## Guidance when customizing robot pipelines
You can store any of the following features as your action/observation space:
- Joint positions
- Absolute EE poses
- Relative EE deltas
- Other features: joint velocity, etc.
Pick what you want to use for your policy action and observation space and configure/modify the pipelines and steps accordingly.
### Different robots
- Swap `RobotKinematics` URDF and `motor_names`. Ensure `target_frame_name` points to your gripper/wrist.
### Safety first
- When changing pipelines, start with tight bounds, implement safety steps when working with real robots.
- It's advised to start with simulation first and then move to real robots.
Hope this guide helps you get started with customizing your robot pipelines. If you run into any issues at any point, jump into our [Discord community](https://discord.com/invite/s3KuuzsPFb) for support.
+13 -2
View File
@@ -1,6 +1,7 @@
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.utils import hw_to_dataset_features
from lerobot.policies.act.modeling_act import ACTPolicy
from lerobot.policies.factory import make_processor
from lerobot.record import record_loop
from lerobot.robots.lekiwi import LeKiwiClient, LeKiwiClientConfig
from lerobot.utils.control_utils import init_keyboard_listener
@@ -11,12 +12,14 @@ NUM_EPISODES = 2
FPS = 30
EPISODE_TIME_SEC = 60
TASK_DESCRIPTION = "My task description"
HF_MODEL_ID = "<hf_username>/<model_repo_id>"
HF_DATASET_ID = "<hf_username>/<eval_dataset_repo_id>"
# Create the robot and teleoperator configurations
robot_config = LeKiwiClientConfig(remote_ip="172.18.134.136", id="lekiwi")
robot = LeKiwiClient(robot_config)
policy = ACTPolicy.from_pretrained("<hf_username>/<policy_repo_id>")
policy = ACTPolicy.from_pretrained(HF_MODEL_ID)
# Configure the dataset features
action_features = hw_to_dataset_features(robot.action_features, "action")
@@ -25,7 +28,7 @@ dataset_features = {**action_features, **obs_features}
# Create the dataset
dataset = LeRobotDataset.create(
repo_id="<hf_username>/<eval_dataset_repo_id>",
repo_id=HF_DATASET_ID,
fps=FPS,
features=dataset_features,
robot_type=robot.name,
@@ -43,6 +46,12 @@ listener, events = init_keyboard_listener()
if not robot.is_connected:
raise ValueError("Robot is not connected!")
preprocessor, postprocessor = make_processor(
policy_cfg=policy,
pretrained_path=HF_MODEL_ID,
dataset_stats=dataset.meta.stats,
)
recorded_episodes = 0
while recorded_episodes < NUM_EPISODES and not events["stop_recording"]:
log_say(f"Running inference, recording eval episode {recorded_episodes} of {NUM_EPISODES}")
@@ -53,6 +62,8 @@ while recorded_episodes < NUM_EPISODES and not events["stop_recording"]:
events=events,
fps=FPS,
policy=policy,
preprocessor=preprocessor,
postprocessor=postprocessor,
dataset=dataset,
control_time_s=EPISODE_TIME_SEC,
single_task=TASK_DESCRIPTION,
+1 -1
View File
@@ -38,7 +38,7 @@ while True:
keyboard_keys = keyboard.get_action()
base_action = robot._from_keyboard_to_base_action(keyboard_keys)
log_rerun_data(observation, {**arm_action, **base_action})
log_rerun_data(observation=observation, action={**arm_action, **base_action})
action = {**arm_action, **base_action} if len(base_action) > 0 else arm_action
+158
View File
@@ -0,0 +1,158 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.pipeline_features import aggregate_pipeline_dataset_features
from lerobot.datasets.utils import merge_features
from lerobot.model.kinematics import RobotKinematics
from lerobot.policies.act.modeling_act import ACTPolicy
from lerobot.policies.factory import make_processor
from lerobot.processor.converters import (
to_output_robot_action,
to_transition_robot_observation,
)
from lerobot.processor.pipeline import RobotProcessor
from lerobot.record import record_loop
from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig
from lerobot.robots.so100_follower.robot_kinematic_processor import (
AddRobotObservationAsComplimentaryData,
ForwardKinematicsJointsToEE,
InverseKinematicsEEToJoints,
)
from lerobot.robots.so100_follower.so100_follower import SO100Follower
from lerobot.utils.control_utils import init_keyboard_listener
from lerobot.utils.utils import log_say
from lerobot.utils.visualization_utils import _init_rerun
NUM_EPISODES = 5
FPS = 30
EPISODE_TIME_SEC = 60
TASK_DESCRIPTION = "My task description"
HF_MODEL_ID = "<hf_username>/<model_repo_id>"
HF_DATASET_ID = "<hf_username>/<dataset_repo_id>"
# Initialize the robot with degrees
camera_config = {"front": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=FPS)}
robot_config = SO100FollowerConfig(
port="/dev/tty.usbmodem58760434471",
id="my_awesome_follower_arm",
cameras=camera_config,
use_degrees=True,
)
# Initialize the robot
robot = SO100Follower(robot_config)
# NOTE: It is highly recommended to use the urdf in the SO-ARM100 repo: https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf
kinematics_solver = RobotKinematics(
urdf_path="./src/lerobot/teleoperators/sim/so101_new_calib.urdf",
target_frame_name="gripper_frame_link",
joint_names=list(robot.bus.motors.keys()),
)
# Build pipeline to convert ee pose action to joint action
robot_ee_to_joints = RobotProcessor(
steps=[
AddRobotObservationAsComplimentaryData(robot=robot),
InverseKinematicsEEToJoints(
kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
initial_guess_current_joints=True,
),
],
to_transition=lambda tr: tr,
to_output=to_output_robot_action,
)
# Build pipeline to convert joint observation to ee pose observation
robot_joints_to_ee_pose = RobotProcessor(
steps=[
ForwardKinematicsJointsToEE(kinematics=kinematics_solver, motor_names=list(robot.bus.motors.keys()))
],
to_transition=to_transition_robot_observation,
to_output=lambda tr: tr,
)
# Build dataset action and gripper features
action_ee_and_gripper = aggregate_pipeline_dataset_features(
pipeline=robot_ee_to_joints,
initial_features={},
use_videos=True,
patterns=["action.ee", "action.gripper.pos", "observation.state.gripper.pos"],
) # Get all ee action features + gripper pos action features
# Build dataset observation features
obs_ee = aggregate_pipeline_dataset_features(
pipeline=robot_joints_to_ee_pose,
initial_features=robot.observation_features,
use_videos=True,
patterns=["observation.state.ee"],
) # Get all ee observation features
dataset_features = merge_features(obs_ee, action_ee_and_gripper)
print("All dataset features: ", dataset_features)
# Create the dataset
dataset = LeRobotDataset.create(
repo_id=HF_DATASET_ID,
fps=FPS,
features=dataset_features,
robot_type=robot.name,
use_videos=True,
image_writer_threads=4,
)
# Initialize the keyboard listener and rerun visualization
_, events = init_keyboard_listener()
_init_rerun(session_name="recording_phone")
# Connect the robot and teleoperator
robot.connect()
episode_idx = 0
policy = ACTPolicy.from_pretrained(HF_MODEL_ID)
preprocessor, postprocessor = make_processor(
policy_cfg=policy,
pretrained_path=HF_MODEL_ID,
dataset_stats=dataset.meta.stats,
)
for episode_idx in range(NUM_EPISODES):
log_say(f"Running inference, recording eval episode {episode_idx + 1} of {NUM_EPISODES}")
record_loop(
robot=robot,
events=events,
fps=FPS,
policy=policy,
preprocessor=preprocessor,
postprocessor=postprocessor,
dataset=dataset,
control_time_s=EPISODE_TIME_SEC,
single_task=TASK_DESCRIPTION,
display_data=True,
robot_action_processor=robot_ee_to_joints,
robot_observation_processor=robot_joints_to_ee_pose,
)
dataset.save_episode()
# Clean up
log_say("Stop recording")
robot.disconnect()
dataset.push_to_hub()
+215
View File
@@ -0,0 +1,215 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from lerobot.cameras.opencv.configuration_opencv import OpenCVCameraConfig
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.pipeline_features import aggregate_pipeline_dataset_features
from lerobot.datasets.utils import merge_features
from lerobot.model.kinematics import RobotKinematics
from lerobot.processor.converters import (
to_output_robot_action,
to_transition_robot_observation,
to_transition_teleop_action,
)
from lerobot.processor.pipeline import RobotProcessor
from lerobot.record import record_loop
from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig
from lerobot.robots.so100_follower.robot_kinematic_processor import (
AddRobotObservationAsComplimentaryData,
EEBoundsAndSafety,
EEReferenceAndDelta,
ForwardKinematicsJointsToEE,
GripperVelocityToJoint,
InverseKinematicsEEToJoints,
)
from lerobot.robots.so100_follower.so100_follower import SO100Follower
from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS
from lerobot.teleoperators.phone.phone import Phone
from lerobot.teleoperators.phone.phone_processor import MapPhoneActionToRobotAction
from lerobot.utils.control_utils import init_keyboard_listener
from lerobot.utils.utils import log_say
from lerobot.utils.visualization_utils import _init_rerun
NUM_EPISODES = 10
FPS = 30
EPISODE_TIME_SEC = 60
RESET_TIME_SEC = 30
TASK_DESCRIPTION = "My task description"
HF_REPO_ID = "<hf_username>/<dataset_repo_id>"
# Initialize the robot and teleoperator
camera_config = {"front": OpenCVCameraConfig(index_or_path=0, width=640, height=480, fps=FPS)}
robot_config = SO100FollowerConfig(
port="/dev/tty.usbmodem58760434471",
id="my_awesome_follower_arm",
cameras=camera_config,
use_degrees=True,
)
teleop_config = PhoneConfig(phone_os=PhoneOS.IOS) # or PhoneOS.ANDROID
# Initialize the robot and teleoperator
robot = SO100Follower(robot_config)
phone = Phone(teleop_config)
# NOTE: It is highly recommended to use the urdf in the SO-ARM100 repo: https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf
kinematics_solver = RobotKinematics(
urdf_path="./src/lerobot/teleoperators/sim/so101_new_calib.urdf",
target_frame_name="gripper_frame_link",
joint_names=list(robot.bus.motors.keys()),
)
# Build pipeline to convert phone action to ee pose action
phone_to_robot_ee_pose = RobotProcessor(
steps=[
MapPhoneActionToRobotAction(platform=teleop_config.phone_os),
AddRobotObservationAsComplimentaryData(robot=robot),
EEReferenceAndDelta(
kinematics=kinematics_solver,
end_effector_step_sizes={"x": 0.5, "y": 0.5, "z": 0.5},
motor_names=list(robot.bus.motors.keys()),
),
EEBoundsAndSafety(
end_effector_bounds={"min": [-1.0, -1.0, -1.0], "max": [1.0, 1.0, 1.0]},
max_ee_step_m=0.20,
max_ee_twist_step_rad=0.50,
),
],
to_transition=to_transition_teleop_action,
to_output=lambda tr: tr,
)
# Build pipeline to convert ee pose action to joint action
robot_ee_to_joints = RobotProcessor(
steps=[
InverseKinematicsEEToJoints(
kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
initial_guess_current_joints=True,
),
GripperVelocityToJoint(
motor_names=list(robot.bus.motors.keys()),
speed_factor=20.0,
),
],
to_transition=lambda tr: tr,
to_output=to_output_robot_action,
)
# Build pipeline to convert joint observation to ee pose observation
robot_joints_to_ee_pose = RobotProcessor(
steps=[
ForwardKinematicsJointsToEE(kinematics=kinematics_solver, motor_names=list(robot.bus.motors.keys()))
],
to_transition=to_transition_robot_observation,
to_output=lambda tr: tr,
)
# Build dataset ee action features
action_ee = aggregate_pipeline_dataset_features(
pipeline=phone_to_robot_ee_pose,
initial_features=phone.action_features,
use_videos=True,
patterns=["action.ee"],
)
# Get gripper pos action features
gripper = aggregate_pipeline_dataset_features(
pipeline=robot_ee_to_joints,
initial_features={},
use_videos=True,
patterns=["action.gripper.pos", "observation.state.gripper.pos"],
)
# Build dataset ee observation features
observation_ee = aggregate_pipeline_dataset_features(
pipeline=robot_joints_to_ee_pose,
initial_features=robot.observation_features,
use_videos=True,
patterns=["observation.state.ee"],
)
dataset_features = merge_features(action_ee, gripper, observation_ee)
print("All dataset features: ", dataset_features)
# Create the dataset
dataset = LeRobotDataset.create(
repo_id=HF_REPO_ID,
fps=FPS,
features=dataset_features,
robot_type=robot.name,
use_videos=True,
image_writer_threads=4,
)
# Initialize the keyboard listener and rerun visualization
_, events = init_keyboard_listener()
_init_rerun(session_name="recording_phone")
# Connect the robot and teleoperator
robot.connect()
phone.connect()
episode_idx = 0
# Record until we reach NUM_EPISODES or the user requests a stop via the keyboard.
while episode_idx < NUM_EPISODES and not events["stop_recording"]:
    log_say(f"Recording episode {episode_idx + 1} of {NUM_EPISODES}")
    # Teleoperated recording: phone action -> EE pose -> joints, and
    # robot joints -> EE pose observations, all written into `dataset`.
    record_loop(
        robot=robot,
        events=events,
        fps=FPS,
        teleop=phone,
        dataset=dataset,
        control_time_s=EPISODE_TIME_SEC,
        single_task=TASK_DESCRIPTION,
        display_data=True,
        teleop_action_processor=phone_to_robot_ee_pose,
        robot_action_processor=robot_ee_to_joints,
        robot_observation_processor=robot_joints_to_ee_pose,
    )
    # Reset the environment if not stopping or re-recording
    if not events["stop_recording"] and (episode_idx < NUM_EPISODES - 1 or events["rerecord_episode"]):
        log_say("Reset the environment")
        # Same teleop loop but without a dataset: gives the operator time
        # to reset the scene between episodes (nothing is recorded).
        record_loop(
            robot=robot,
            events=events,
            fps=FPS,
            teleop=phone,
            control_time_s=RESET_TIME_SEC,
            single_task=TASK_DESCRIPTION,
            display_data=True,
            teleop_action_processor=phone_to_robot_ee_pose,
            robot_action_processor=robot_ee_to_joints,
            robot_observation_processor=robot_joints_to_ee_pose,
        )
    if events["rerecord_episode"]:
        log_say("Re-recording episode")
        # Clear the flags and drop the buffered episode; `episode_idx` is not
        # incremented, so the same episode index is recorded again.
        events["rerecord_episode"] = False
        events["exit_early"] = False
        dataset.clear_episode_buffer()
        continue
    # Persist the buffered episode and move on to the next one.
    dataset.save_episode()
    episode_idx += 1
# Clean up
log_say("Stop recording")
robot.disconnect()
phone.disconnect()
dataset.push_to_hub()
+106
View File
@@ -0,0 +1,106 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.model.kinematics import RobotKinematics
from lerobot.processor.converters import to_output_robot_action
from lerobot.processor.pipeline import RobotProcessor
from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig
from lerobot.robots.so100_follower.robot_kinematic_processor import (
AddRobotObservationAsComplimentaryData,
InverseKinematicsEEToJoints,
)
from lerobot.robots.so100_follower.so100_follower import SO100Follower
from lerobot.utils.robot_utils import busy_wait
from lerobot.utils.utils import log_say
EPISODE_IDX = 0
HF_REPO_ID = "<hf_username>/<dataset_repo_id>"
robot_config = SO100FollowerConfig(
port="/dev/tty.usbmodem58760434471", id="my_awesome_follower_arm", use_degrees=True
)
robot = SO100Follower(robot_config)
robot.connect()
dataset = LeRobotDataset(HF_REPO_ID, episodes=[EPISODE_IDX])
actions = dataset.hf_dataset.select_columns("action")
# NOTE: It is highly recommended to use the urdf in the SO-ARM100 repo: https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf
kinematics_solver = RobotKinematics(
urdf_path="./src/lerobot/teleoperators/sim/so101_new_calib.urdf",
target_frame_name="gripper_frame_link",
joint_names=list(robot.bus.motors.keys()),
)
# This method converts the action from the dataset to a transition for pipeline
def action_to_transition(action: dict):
    """Wrap a dataset action dict into a pipeline transition dict.

    Copies the known EE pose components and the absolute gripper position
    (when present) under the ``action.*`` namespace, converting each value
    to a plain float. All other transition fields are left empty/default.
    """
    ee_keys = ("ee.x", "ee.y", "ee.z", "ee.wx", "ee.wy", "ee.wz")
    # EE pose
    pipeline_action = {f"action.{key}": float(action[key]) for key in ee_keys if key in action}
    # Gripper: your dataset has absolute position
    if "gripper.pos" in action:
        pipeline_action["action.gripper.pos"] = float(action["gripper.pos"])
    return {
        "observation": None,
        "action": pipeline_action,
        "reward": None,
        "done": False,
        "truncated": False,
        "info": {},
        "complementary_data": {},
    }
# Build pipeline to convert ee pose action to joint action
robot_ee_to_joints = RobotProcessor(
steps=[
AddRobotObservationAsComplimentaryData(robot=robot),
InverseKinematicsEEToJoints(
kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
initial_guess_current_joints=False, # Because replay is open loop
),
],
to_transition=action_to_transition,
to_output=to_output_robot_action,
)
# Clear any latched pipeline state (e.g. previous IK solution) before replay.
robot_ee_to_joints.reset()
log_say(f"Replaying episode {EPISODE_IDX}")
for idx in range(dataset.num_frames):
    t0 = time.perf_counter()
    # Rebuild a named action dict from the flat dataset row, using the feature
    # names declared in dataset.features["action"]["names"].
    ee_action = {
        name: float(actions[idx]["action"][i]) for i, name in enumerate(dataset.features["action"]["names"])
    }
    # EE pose -> joint positions via the open-loop IK pipeline.
    joint_action = robot_ee_to_joints(ee_action)
    action_sent = robot.send_action(joint_action)  # NOTE(review): return value unused — kept for debugging
    # Sleep out the remainder of the frame period to replay at the dataset FPS.
    busy_wait(1.0 / dataset.fps - (time.perf_counter() - t0))
robot.disconnect()
+109
View File
@@ -0,0 +1,109 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specif
import time
from lerobot.model.kinematics import RobotKinematics
from lerobot.processor import RobotProcessor
from lerobot.processor.converters import to_output_robot_action, to_transition_teleop_action
from lerobot.robots.so100_follower.config_so100_follower import SO100FollowerConfig
from lerobot.robots.so100_follower.robot_kinematic_processor import (
AddRobotObservationAsComplimentaryData,
EEBoundsAndSafety,
EEReferenceAndDelta,
GripperVelocityToJoint,
InverseKinematicsEEToJoints,
)
from lerobot.robots.so100_follower.so100_follower import SO100Follower
from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS
from lerobot.teleoperators.phone.phone import Phone
from lerobot.teleoperators.phone.phone_processor import MapPhoneActionToRobotAction
# Initialize the robot and teleoperator
robot_config = SO100FollowerConfig(
port="/dev/tty.usbmodem58760434471", id="my_awesome_follower_arm", use_degrees=True
)
teleop_config = PhoneConfig(phone_os=PhoneOS.IOS) # or PhoneOS.ANDROID
# Initialize the robot and teleoperator
robot = SO100Follower(robot_config)
teleop_device = Phone(teleop_config)
# NOTE: It is highly recommended to use the urdf in the SO-ARM100 repo: https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf
kinematics_solver = RobotKinematics(
urdf_path="./src/lerobot/teleoperators/sim/so101_new_calib.urdf",
target_frame_name="gripper_frame_link",
joint_names=list(robot.bus.motors.keys()),
)
# Build pipeline to convert phone action to ee pose action
phone_to_robot_ee_pose = RobotProcessor(
steps=[
MapPhoneActionToRobotAction(platform=teleop_config.phone_os),
AddRobotObservationAsComplimentaryData(robot=robot),
EEReferenceAndDelta(
kinematics=kinematics_solver,
end_effector_step_sizes={"x": 0.5, "y": 0.5, "z": 0.5},
motor_names=list(robot.bus.motors.keys()),
),
EEBoundsAndSafety(
end_effector_bounds={"min": [-1.0, -1.0, -1.0], "max": [1.0, 1.0, 1.0]},
max_ee_step_m=0.10,
max_ee_twist_step_rad=0.50,
),
],
to_transition=to_transition_teleop_action,
to_output=lambda tr: tr,
)
# Build pipeline to convert ee pose action to joint action
robot_ee_to_joints = RobotProcessor(
steps=[
InverseKinematicsEEToJoints(
kinematics=kinematics_solver,
motor_names=list(robot.bus.motors.keys()),
),
GripperVelocityToJoint(
motor_names=list(robot.bus.motors.keys()),
speed_factor=20.0,
),
],
to_transition=lambda tr: tr,
to_output=to_output_robot_action,
)
robot.connect()
teleop_device.connect()
print("Starting teleop loop. Move your phone to teleoperate the robot.")
while True:
    # Poll the phone once per cycle; skip the cycle if no data is available yet.
    # (Bug fix: the original polled get_action() a second time after the check,
    # discarding the validated sample and using a fresh, unchecked one instead.)
    phone_obs = teleop_device.get_action()
    if not phone_obs:
        time.sleep(0.01)
        continue
    # Phone to EE pose transition
    ee_transition = phone_to_robot_ee_pose(phone_obs)
    # EE pose to Joints transition
    joint_action = robot_ee_to_joints(ee_transition)
    if joint_action:
        robot.send_action(joint_action)
    time.sleep(0.01)
+5 -2
View File
@@ -73,6 +73,7 @@ dependencies = [
"pynput>=1.7.7",
"pyserial>=3.5",
"wandb>=0.20.0",
"scipy>=1.15.2",
"torch>=2.2.1,<2.8.0", # TODO: Bumb dependency
"torchcodec>=0.2.1,<0.6.0; sys_platform != 'win32' and (sys_platform != 'linux' or (platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')) and (sys_platform != 'darwin' or platform_machine != 'x86_64')", # TODO: Bumb dependency
@@ -95,7 +96,7 @@ dependencies = [
# Common
pygame-dep = ["pygame>=2.5.1"]
placo-dep = ["placo>=0.9.6"]
transformers-dep = ["transformers>=4.50.3,<4.52.0"] # TODO: Bumb dependency
transformers-dep = ["transformers<=4.52.0"]
grpcio-dep = ["grpcio==1.73.1", "protobuf==6.31.0"]
# Motors
@@ -111,6 +112,7 @@ intelrealsense = [
"pyrealsense2>=2.55.1.6486 ; sys_platform != 'darwin'",
"pyrealsense2-macosx>=2.54 ; sys_platform == 'darwin'",
]
phone = ["hebi-py>=2.8.0", "teleop>=0.1.0"]
# stretch = [
# "hello-robot-stretch-body>=0.7.27 ; sys_platform == 'linux'",
# "pyrender @ git+https://github.com/mmatl/pyrender.git ; sys_platform == 'linux'",
@@ -152,7 +154,8 @@ all = [
"lerobot[video_benchmark]",
"lerobot[aloha]",
"lerobot[pusht]",
"lerobot[xarm]"
"lerobot[xarm]",
"lerobot[phone]",
]
[project.scripts]
+74
View File
@@ -0,0 +1,74 @@
#!/usr/bin/env python
"""
Convert video dataset to image dataset for faster training.
This pre-extracts all frames from MP4 files to PNG images.
"""
import argparse
from pathlib import Path
import logging
import shutil
def convert_dataset_videos_to_images(repo_id: str, root: str | None = None):
    """Convert all videos in a LeRobot dataset to individual image files.

    For every episode and every video key, decodes all frames of the episode's
    MP4 at the dataset FPS and writes them as PNGs under
    ``<dataset_root>/images/chunk-<chunk>/<video_key>/``.

    Args:
        repo_id: Dataset repo ID (e.g. ``"user/dataset"``).
        root: Optional local root directory for the dataset.
    """
    from lerobot.datasets.lerobot_dataset import LeRobotDataset
    from lerobot.datasets.video_utils import decode_video_frames
    import torchvision.transforms as T

    # Hoisted out of the per-frame loop: one converter instance is enough,
    # and importing torchvision per frame was pure overhead.
    to_pil = T.ToPILImage()

    # Load dataset (videos must be available locally to decode them)
    dataset = LeRobotDataset(repo_id, root=root, download_videos=True)
    total_frames_processed = 0
    for ep_idx in range(dataset.meta.total_episodes):
        logging.info(f"Processing episode {ep_idx}/{dataset.meta.total_episodes}")
        for vid_key in dataset.meta.video_keys:
            video_path = dataset.root / dataset.meta.get_video_file_path(ep_idx, vid_key)
            if not video_path.exists():
                logging.warning(f"Video not found: {video_path}")
                continue
            # Create image directory
            img_dir = dataset.root / f"images/chunk-{dataset.meta.get_episode_chunk(ep_idx)}/{vid_key}"
            img_dir.mkdir(parents=True, exist_ok=True)
            # One timestamp per frame of the episode, at the dataset FPS,
            # so every frame of the video is decoded.
            ep_length = dataset.meta.episodes[ep_idx]["length"]
            timestamps = [i / dataset.fps for i in range(ep_length)]
            try:
                frames = decode_video_frames(video_path, timestamps, dataset.tolerance_s, dataset.video_backend)
                # Drop the leading batch dim once instead of per use.
                episode_frames = frames.squeeze(0)
                # Save each frame as PNG
                for i, frame in enumerate(episode_frames):
                    img_path = img_dir / f"episode_{ep_idx:06d}_{i:06d}.png"
                    to_pil(frame).save(img_path)
                total_frames_processed += len(episode_frames)
                logging.info(f"  Extracted {len(episode_frames)} frames to {img_dir}")
            except Exception as e:
                # Best-effort: log the failure and move on to the next video.
                logging.error(f"Failed to process {video_path}: {e}")
                continue
    logging.info(f"Conversion complete! Processed {total_frames_processed} total frames")
    logging.info("You can now use download_videos=False to use the extracted images")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert LeRobot video dataset to images")
parser.add_argument("repo_id", help="Dataset repo ID (e.g., 'kenmacken/record-test-2')")
parser.add_argument("--root", help="Local root directory", default=None)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
convert_dataset_videos_to_images(args.repo_id, args.root)
+2
View File
@@ -33,6 +33,8 @@ class DatasetConfig:
# Root directory where the dataset will be stored (e.g. 'dataset/path').
root: str | None = None
episodes: list[int] | None = None
# Percentage of dataset to use (0-100). If set, overrides episodes parameter.
percentage: float | None = None
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
revision: str | None = None
use_imagenet_stats: bool = True
+1 -2
View File
@@ -26,7 +26,7 @@ from huggingface_hub import hf_hub_download
from huggingface_hub.constants import CONFIG_NAME
from huggingface_hub.errors import HfHubHTTPError
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.configs.types import FeatureType, PolicyFeature
from lerobot.constants import ACTION, OBS_STATE
from lerobot.optim.optimizers import OptimizerConfig
from lerobot.optim.schedulers import LRSchedulerConfig
@@ -53,7 +53,6 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
"""
n_obs_steps: int = 1
normalization_mapping: dict[str, NormalizationMode] = field(default_factory=dict)
input_features: dict[str, PolicyFeature] = field(default_factory=dict)
output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+1
View File
@@ -24,6 +24,7 @@ class FeatureType(str, Enum):
ENV = "ENV"
ACTION = "ACTION"
REWARD = "REWARD"
LANGUAGE = "LANGUAGE"
class NormalizationMode(str, Enum):
+1
View File
@@ -21,6 +21,7 @@ OBS_ENV_STATE = "observation.environment_state"
OBS_STATE = "observation.state"
OBS_IMAGE = "observation.image"
OBS_IMAGES = "observation.images"
OBS_LANGUAGE = "observation.language"
ACTION = "action"
REWARD = "next.reward"
+15 -2
View File
@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pprint import pformat
import torch
@@ -87,10 +86,24 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas
cfg.dataset.repo_id, root=cfg.dataset.root, revision=cfg.dataset.revision
)
delta_timestamps = resolve_delta_timestamps(cfg.policy, ds_meta)
# Handle percentage parameter
episodes = cfg.dataset.episodes
if cfg.dataset.percentage is not None:
# Calculate episodes based on percentage
total_episodes = ds_meta.total_episodes
num_episodes_to_use = max(1, int(total_episodes * cfg.dataset.percentage / 100))
episodes = list(range(num_episodes_to_use))
import logging
logging.info(
f"Using {cfg.dataset.percentage}% of dataset: {num_episodes_to_use}/{total_episodes} episodes"
)
dataset = LeRobotDataset(
cfg.dataset.repo_id,
root=cfg.dataset.root,
episodes=cfg.dataset.episodes,
episodes=episodes,
delta_timestamps=delta_timestamps,
image_transforms=image_transforms,
revision=cfg.dataset.revision,
+94
View File
@@ -0,0 +1,94 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Sequence
from typing import Any
from lerobot.datasets.utils import hw_to_dataset_features
from lerobot.processor.pipeline import RobotProcessor
def aggregate_pipeline_dataset_features(
    pipeline: RobotProcessor,
    initial_features: dict[str, Any],
    *,
    use_videos: bool = True,
    patterns: Sequence[str] | None = None,
) -> dict[str, dict]:
    """Build a dataset-ready features dict from a pipeline's transformed features.

    The pipeline's output is grouped into "action" and "observation" buckets and
    converted with `hw_to_dataset_features`. Action and state keys are filtered by
    `patterns`; image keys obey only the `use_videos` flag.

    Args:
        pipeline: Processor whose `transform_features` declares the produced keys.
        initial_features: Raw camera specs, e.g. {"front": (h, w, c), ...}.
        use_videos: Whether image features are treated as video streams.
        patterns: Regexes filtering action & state features (None keeps everything).

    Returns:
        A flat features dict suitable for dataset creation.
    """
    import re

    transformed = pipeline.transform_features(initial_features)

    # Pre-compile the filter patterns once; None means "keep every key".
    compiled = None if patterns is None else [re.compile(pat) for pat in patterns]

    def _keep(key: str) -> bool:
        return compiled is None or any(rx.search(key) for rx in compiled)

    grouped: dict[str, dict[str, Any]] = {}

    # Seed the observation group with the hardware cameras when videos are enabled.
    if use_videos:
        camera_specs = {
            cam: spec
            for cam, spec in initial_features.items()
            if isinstance(spec, tuple) and len(spec) == 3
        }
        if camera_specs:
            grouped["observation"] = dict(camera_specs)

    # Merge every pipeline feature into the appropriate bucket.
    for full_key, feature_type in transformed.items():
        if full_key.startswith("action."):
            # action.<feat>: subject to the patterns filter.
            if _keep(full_key):
                grouped.setdefault("action", {})[full_key.removeprefix("action.")] = feature_type
        elif full_key.startswith("observation.state."):
            # observation.state.<feat>: subject to the patterns filter.
            if _keep(full_key):
                name = full_key.removeprefix("observation.state.")
                grouped.setdefault("observation", {})[name] = feature_type
        elif full_key.startswith("observation.images."):
            # observation.images.<cam>: obeys ONLY the use_videos flag, not patterns.
            if use_videos:
                name = full_key.removeprefix("observation.images.")
                grouped.setdefault("observation", {})[name] = feature_type
        # Anything else (e.g. policy-only features) is intentionally ignored here.

    dataset_features: dict[str, dict] = {}
    if "action" in grouped:
        dataset_features.update(hw_to_dataset_features(grouped["action"], "action", use_videos))
    if "observation" in grouped:
        dataset_features.update(hw_to_dataset_features(grouped["observation"], "observation", use_videos))
    return dataset_features
+44
View File
@@ -470,6 +470,50 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea
return policy_features
def merge_features(*dicts: dict) -> dict:
    """
    Merge LeRobot grouped feature dicts.

    - For 1D numeric specs (dtype not image/video/string) with a list of "names":
      names are merged in order (duplicates dropped) and the shape is recomputed.
    - For everything else (observation.images.*, entries with names=None, non-dict
      values): last one wins.

    Args:
        *dicts: Feature dicts to merge, applied left to right.

    Returns:
        A single merged feature dict.

    Raises:
        ValueError: If the same vector key appears with conflicting dtypes.
    """
    out: dict = {}
    for d in dicts:
        for key, value in d.items():
            if not isinstance(value, dict):
                out[key] = value
                continue
            dtype = value.get("dtype")
            shape = value.get("shape")
            names = value.get("names")
            # Accept both tuple and list shapes: features loaded from info.json
            # carry list shapes, and both must take the merge path.
            # Require an actual names sequence: entries with names=None (e.g.
            # timestamps) cannot be merged by name, so they fall through to
            # last-one-wins instead of crashing on iteration.
            is_vector = (
                dtype not in ("image", "video", "string")
                and isinstance(shape, (tuple, list))
                and len(shape) == 1
                and isinstance(names, (list, tuple))
            )
            if is_vector:
                # Initialize or retrieve the accumulating dict for this feature key
                target = out.setdefault(key, {"dtype": dtype, "names": [], "shape": (0,)})
                # Ensure consistent data types across merged entries
                if "dtype" in target and dtype != target["dtype"]:
                    raise ValueError(f"dtype mismatch for '{key}': {target['dtype']} vs {dtype}")
                # Merge feature names: append only new ones to preserve order without duplicates
                seen = set(target["names"])
                for n in names:
                    if n not in seen:
                        target["names"].append(n)
                        seen.add(n)
                # Recompute the shape to reflect the updated number of features
                target["shape"] = (len(target["names"]),)
            else:
                # For images/videos and non-1D entries: override with the latest definition
                out[key] = value
    return out
def create_empty_dataset_info(
codebase_version: str,
fps: int,
@@ -13,20 +13,24 @@
# limitations under the License.
"""
This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.0 to
2.1. It will:
This script converts a LeRobot dataset already pushed to the Hub from codebase version 2.0 to 2.1.
It downloads metadata from a SOURCE dataset repo, computes/validates per-episode stats, updates
the codebase version in `info.json`, and uploads the result to a DESTINATION dataset repo.
It will:
- Generate per-episodes stats and writes them in `episodes_stats.jsonl`
- Check consistency between these new stats and the old ones.
- Remove the deprecated `stats.json`.
- Update codebase_version in `info.json`.
- Push this new version to the hub on the 'main' branch and tags it with "v2.1".
- Push this new version to the destination repo/branch and tag it with the current codebase version.
Usage:
```bash
python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 \
--repo-id=aliberts/koch_tutorial
--source-repo-id=namespace/source_dataset \
--dest-repo-id=namespace/destination_dataset \
--branch=main
```
"""
@@ -54,48 +58,67 @@ class SuppressWarnings:
def convert_dataset(
repo_id: str,
source_repo_id: str,
dest_repo_id: str,
branch: str | None = None,
num_workers: int = 4,
):
# Download metadata from the source repo at v2.0
with SuppressWarnings():
dataset = LeRobotDataset(repo_id, revision=V20, force_cache_sync=True)
dataset = LeRobotDataset(source_repo_id, revision=V20, force_cache_sync=True)
# Ensure we recompute fresh episodes stats
if (dataset.root / EPISODES_STATS_PATH).is_file():
(dataset.root / EPISODES_STATS_PATH).unlink()
# Compute and validate stats
convert_stats(dataset, num_workers=num_workers)
ref_stats = load_stats(dataset.root)
check_aggregate_stats(dataset, ref_stats)
# Update codebase version in info.json
dataset.meta.info["codebase_version"] = CODEBASE_VERSION
write_info(dataset.meta.info, dataset.root)
dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/")
# delete old stats.json file
if (dataset.root / STATS_PATH).is_file:
# Remove deprecated stats.json locally so it won't be uploaded
if (dataset.root / STATS_PATH).is_file():
(dataset.root / STATS_PATH).unlink()
# Push only meta/ to destination repo
hub_api = HfApi()
if hub_api.file_exists(
repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"
):
hub_api.delete_file(
path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
)
hub_api.create_repo(repo_id=dest_repo_id, private=False, repo_type="dataset", exist_ok=True)
if branch:
hub_api.create_branch(repo_id=dest_repo_id, branch=branch, repo_type="dataset", exist_ok=True)
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
hub_api.upload_folder(
repo_id=dest_repo_id,
folder_path=str(dataset.root),
repo_type="dataset",
revision=branch,
allow_patterns="meta/",
)
# Ensure old stats.json is deleted on destination
if hub_api.file_exists(repo_id=dest_repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"):
hub_api.delete_file(path_in_repo=STATS_PATH, repo_id=dest_repo_id, revision=branch, repo_type="dataset")
# Tag destination with current codebase version
hub_api.create_tag(dest_repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo-id",
"--source-repo-id",
type=str,
required=True,
help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset "
"(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
help="Source dataset repo id to download from (must be v2.0).",
)
parser.add_argument(
"--dest-repo-id",
type=str,
required=True,
help="Destination dataset repo id to upload the converted metadata to.",
)
parser.add_argument(
"--branch",
+13
View File
@@ -15,6 +15,19 @@
from .act.configuration_act import ACTConfig as ACTConfig
from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
from .pi0.configuration_pi0 import PI0Config as PI0Config
from .pi0.processor_pi0 import Pi0NewLineProcessor
from .rlearn.configuration_rlearn import RLearNConfig as RLearNConfig
from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
from .smolvla.processor_smolvla import SmolVLANewLineProcessor
from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig
__all__ = [
"ACTConfig",
"DiffusionConfig",
"PI0Config",
"SmolVLAConfig",
"TDMPCConfig",
"VQBeTConfig",
"RLearNConfig",
]
+1 -17
View File
@@ -35,7 +35,6 @@ from torchvision.ops.misc import FrozenBatchNorm2d
from lerobot.constants import ACTION, OBS_IMAGES
from lerobot.policies.act.configuration_act import ACTConfig
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pretrained import PreTrainedPolicy
@@ -51,27 +50,16 @@ class ACTPolicy(PreTrainedPolicy):
def __init__(
self,
config: ACTConfig,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
"""
Args:
config: Policy configuration class instance or None, in which case the default instantiation of
the configuration class is used.
dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
that they will be passed with a call to `load_state_dict` before the policy is used.
"""
super().__init__(config)
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.model = ACT(config)
if config.temporal_ensemble_coeff is not None:
@@ -137,23 +125,19 @@ class ACTPolicy(PreTrainedPolicy):
"""Predict a chunk of actions given environment observations."""
self.eval()
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGES] = [batch[key] for key in self.config.image_features]
actions = self.model(batch)[0]
actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
return actions
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
"""Run the batch through the model and compute the loss for training or validation."""
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGES] = [batch[key] for key in self.config.image_features]
batch = self.normalize_targets(batch)
actions_hat, (mu_hat, log_sigma_x2_hat) = self.model(batch)
l1_loss = (
@@ -303,7 +287,7 @@ class ACT(nn.Module):
└───────────────────────┘
"""
def __init__(self, config: ACTConfig):
def __init__(self, config: ACTConfig, dataset_stats=None):
# BERT style VAE encoder with input tokens [cls, robot_state, *action_sequence].
# The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]).
super().__init__()
+50
View File
@@ -0,0 +1,50 @@
#!/usr/bin/env python
# Copyright 2024 Tony Z. Zhao and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.act.configuration_act import ACTConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
UnnormalizerProcessor,
)
def make_act_processor(
    config: ACTConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the (preprocessor, postprocessor) pipelines for an ACT policy.

    The preprocessor renames, normalizes (inputs and targets), batches, and moves
    data to the policy device; the postprocessor moves outputs back to CPU and
    un-normalizes them.

    Args:
        config: ACT policy configuration providing features, normalization
            mapping, and device.
        dataset_stats: Optional dataset statistics used by the (un)normalizers.

    Returns:
        Tuple of (robot_preprocessor, robot_postprocessor).
    """
    normalizer = NormalizerProcessor(
        features={**config.input_features, **config.output_features},
        norm_map=config.normalization_mapping,
        stats=dataset_stats,
    )
    preprocessor = RobotProcessor(
        steps=[
            RenameProcessor(rename_map={}),
            normalizer,
            ToBatchProcessor(),
            DeviceProcessor(device=config.device),
        ],
        name="robot_preprocessor",
    )
    postprocessor = RobotProcessor(
        steps=[
            DeviceProcessor(device="cpu"),
            UnnormalizerProcessor(
                features=config.output_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
        ],
        name="robot_postprocessor",
    )
    return preprocessor, postprocessor
@@ -35,7 +35,6 @@ from torch import Tensor, nn
from lerobot.constants import ACTION, OBS_ENV_STATE, OBS_IMAGES, OBS_STATE
from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.utils import (
get_device_from_parameters,
@@ -57,7 +56,6 @@ class DiffusionPolicy(PreTrainedPolicy):
def __init__(
self,
config: DiffusionConfig,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
"""
Args:
@@ -70,14 +68,6 @@ class DiffusionPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
# queues are populated during rollout of the policy, they contain the n latest observations and actions
self._queues = None
@@ -106,9 +96,6 @@ class DiffusionPolicy(PreTrainedPolicy):
batch = {k: torch.stack(list(self._queues[k]), dim=1) for k in batch if k in self._queues}
actions = self.diffusion.generate_actions(batch)
# TODO(rcadene): make above methods return output dictionary?
actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
return actions
@torch.no_grad()
@@ -137,7 +124,6 @@ class DiffusionPolicy(PreTrainedPolicy):
if ACTION in batch:
batch.pop(ACTION)
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGES] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
@@ -153,11 +139,9 @@ class DiffusionPolicy(PreTrainedPolicy):
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, None]:
"""Run the batch through the model and compute the loss for training or validation."""
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGES] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
batch = self.normalize_targets(batch)
loss = self.diffusion.compute_loss(batch)
# no output_dict so returning None
return loss, None
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# Copyright 2024 Columbia Artificial Intelligence, Robotics Lab,
# and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
UnnormalizerProcessor,
)
def make_diffusion_processor(
    config: DiffusionConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the (preprocessor, postprocessor) pipelines for a Diffusion policy.

    Inputs are renamed, normalized (over both input and output features so
    targets are normalized too), batched, then moved to the policy device.
    Outputs are moved to CPU and un-normalized.

    Args:
        config: Diffusion policy configuration providing features, the
            normalization mapping, and the target device.
        dataset_stats: Optional dataset statistics used by the (un)normalizers.

    Returns:
        Tuple of (robot_preprocessor, robot_postprocessor).
    """
    all_features = {**config.input_features, **config.output_features}
    pre_steps = [
        RenameProcessor(rename_map={}),
        NormalizerProcessor(features=all_features, norm_map=config.normalization_mapping, stats=dataset_stats),
        ToBatchProcessor(),
        DeviceProcessor(device=config.device),
    ]
    post_steps = [
        DeviceProcessor(device="cpu"),
        UnnormalizerProcessor(
            features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
        ),
    ]
    return (
        RobotProcessor(steps=pre_steps, name="robot_preprocessor"),
        RobotProcessor(steps=post_steps, name="robot_postprocessor"),
    )
+148 -3
View File
@@ -14,9 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from __future__ import annotations
import logging
from typing import Any, TypedDict, cast
import torch
from torch import nn
from typing_extensions import Unpack
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import FeatureType
@@ -29,14 +34,16 @@ from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.policies.pi0.configuration_pi0 import PI0Config
from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.rlearn.configuration_rlearn import RLearNConfig
from lerobot.policies.sac.configuration_sac import SACConfig
from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
from lerobot.processor.pipeline import RobotProcessor
def get_policy_class(name: str) -> PreTrainedPolicy:
def get_policy_class(name: str) -> type[PreTrainedPolicy]:
"""Get the policy's class and config class given a name (matching the policy class' `name` attribute)."""
if name == "tdmpc":
from lerobot.policies.tdmpc.modeling_tdmpc import TDMPCPolicy
@@ -74,6 +81,10 @@ def get_policy_class(name: str) -> PreTrainedPolicy:
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
return SmolVLAPolicy
elif name == "rlearn":
from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
return RLearNPolicy
else:
raise NotImplementedError(f"Policy with name {name} is not implemented.")
@@ -97,14 +108,143 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
return SmolVLAConfig(**kwargs)
elif policy_type == "reward_classifier":
return RewardClassifierConfig(**kwargs)
elif policy_type == "rlearn":
return RLearNConfig(**kwargs)
else:
raise ValueError(f"Policy type '{policy_type}' is not available.")
class ProcessorConfigKwargs(TypedDict, total=False):
    """Keyword arguments for the processor config.

    All keys are optional (total=False); they are read with ``kwargs.get`` by
    ``make_processor``.
    """

    # Config filenames looked up when loading processors from a pretrained path.
    preprocessor_config_filename: str | None
    postprocessor_config_filename: str | None
    # Override dicts forwarded to RobotProcessor.from_pretrained.
    preprocessor_overrides: dict[str, Any] | None
    postprocessor_overrides: dict[str, Any] | None
    # Dataset statistics handed to the per-policy processor factories for (un)normalization.
    dataset_stats: dict[str, dict[str, torch.Tensor]] | None
def make_processor(
    policy_cfg: PreTrainedConfig,
    pretrained_path: str | None = None,
    **kwargs: Unpack[ProcessorConfigKwargs],
) -> tuple[RobotProcessor, RobotProcessor]:
    """Make a processor instance for a given policy type.

    This function creates the appropriate processor configuration based on the policy type.
    Each policy type has its own processor with specific preprocessing steps.

    Args:
        policy_cfg: The config of the policy to create a processor for (e.g., "act", "diffusion", etc.)
        pretrained_path: Optional path to load a pretrained processor from. If provided, loads
            the processor from this path instead of creating a new one.
        **kwargs: Additional keyword arguments passed to the processor creation.

    Returns:
        Tuple of (input_processor, output_processor) for the policy.

    Raises:
        NotImplementedError: If the policy type doesn't have a processor implemented.
    """
    if pretrained_path:
        # Load a pretrained processor pair instead of building a fresh one.
        # TODO(azouitine): Handle this case.
        return (
            RobotProcessor.from_pretrained(
                pretrained_model_name_or_path=pretrained_path,
                config_filename=kwargs.get("preprocessor_config_filename", "robot_preprocessor.json"),
                overrides=kwargs.get("preprocessor_overrides", {}),
            ),
            RobotProcessor.from_pretrained(
                pretrained_model_name_or_path=pretrained_path,
                config_filename=kwargs.get("postprocessor_config_filename", "robot_postprocessor.json"),
                overrides=kwargs.get("postprocessor_overrides", {}),
            ),
        )

    # Every factory below takes the same stats; fetch them once instead of per branch.
    dataset_stats = kwargs.get("dataset_stats")

    # Create a new processor based on policy type. Factory imports are kept local
    # so optional policy dependencies are only imported when actually requested.
    if policy_cfg.type == "tdmpc":
        from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
        from lerobot.policies.tdmpc.processor_tdmpc import make_tdmpc_processor

        processors = make_tdmpc_processor(config=cast(TDMPCConfig, policy_cfg), dataset_stats=dataset_stats)
    elif policy_cfg.type == "diffusion":
        from lerobot.policies.diffusion.processor_diffusion import make_diffusion_processor

        processors = make_diffusion_processor(
            config=cast(DiffusionConfig, policy_cfg), dataset_stats=dataset_stats
        )
    elif policy_cfg.type == "act":
        from lerobot.policies.act.processor_act import make_act_processor

        processors = make_act_processor(config=cast(ACTConfig, policy_cfg), dataset_stats=dataset_stats)
    elif policy_cfg.type == "vqbet":
        from lerobot.policies.vqbet.processor_vqbet import make_vqbet_processor

        processors = make_vqbet_processor(config=cast(VQBeTConfig, policy_cfg), dataset_stats=dataset_stats)
    elif policy_cfg.type == "pi0":
        from lerobot.policies.pi0.processor_pi0 import make_pi0_processor

        processors = make_pi0_processor(config=cast(PI0Config, policy_cfg), dataset_stats=dataset_stats)
    elif policy_cfg.type == "pi0fast":
        from lerobot.policies.pi0fast.processor_pi0fast import make_pi0fast_processor

        # Fix: cast to the pi0fast-specific config (previously mis-cast to PI0Config).
        processors = make_pi0fast_processor(
            config=cast(PI0FASTConfig, policy_cfg), dataset_stats=dataset_stats
        )
    elif policy_cfg.type == "sac":
        from lerobot.policies.sac.processor_sac import make_sac_processor

        processors = make_sac_processor(config=cast(SACConfig, policy_cfg), dataset_stats=dataset_stats)
    elif policy_cfg.type == "reward_classifier":
        from lerobot.policies.sac.reward_model.processor_classifier import make_classifier_processor

        processors = make_classifier_processor(
            config=cast(RewardClassifierConfig, policy_cfg), dataset_stats=dataset_stats
        )
    elif policy_cfg.type == "smolvla":
        from lerobot.policies.smolvla.processor_smolvla import make_smolvla_processor

        processors = make_smolvla_processor(
            config=cast(SmolVLAConfig, policy_cfg), dataset_stats=dataset_stats
        )
    elif policy_cfg.type == "rlearn":
        from lerobot.policies.rlearn.processor_rlearn import make_rlearn_processor

        processors = make_rlearn_processor(
            config=cast(RLearNConfig, policy_cfg), dataset_stats=dataset_stats
        )
    else:
        raise NotImplementedError(f"Processor for policy type '{policy_cfg.type}' is not implemented.")

    return processors
def make_policy(
cfg: PreTrainedConfig,
ds_meta: LeRobotDatasetMetadata | None = None,
env_cfg: EnvConfig | None = None,
episode_data_index: dict | None = None,
) -> PreTrainedPolicy:
"""Make an instance of a policy class.
@@ -147,7 +287,6 @@ def make_policy(
kwargs = {}
if ds_meta is not None:
features = dataset_to_policy_features(ds_meta.features)
kwargs["dataset_stats"] = ds_meta.stats
else:
if not cfg.pretrained_path:
logging.warning(
@@ -155,12 +294,18 @@ def make_policy(
"rather than a dataset. Normalization modules inside the policy will have infinite values "
"by default without stats from a dataset."
)
if env_cfg is None:
raise ValueError("env_cfg cannot be None when ds_meta is not provided")
features = env_to_policy_features(env_cfg)
cfg.output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
cfg.input_features = {key: ft for key, ft in features.items() if key not in cfg.output_features}
kwargs["config"] = cfg
# Pass episode_data_index for RLearN policy to calculate proper progress
if cfg.type == "rlearn" and episode_data_index is not None:
kwargs["episode_data_index"] = episode_data_index
if cfg.pretrained_path:
# Load a pretrained policy and override the config if needed (for example, if there are inference-time
# hyperparameters that we want to vary).
+7 -139
View File
@@ -56,18 +56,15 @@ from collections import deque
import torch
import torch.nn.functional as F # noqa: N812
from torch import Tensor, nn
from transformers import AutoTokenizer
from lerobot.constants import ACTION, OBS_STATE
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.constants import ACTION, OBS_LANGUAGE, OBS_STATE
from lerobot.policies.pi0.configuration_pi0 import PI0Config
from lerobot.policies.pi0.paligemma_with_expert import (
PaliGemmaWithExpertConfig,
PaliGemmaWithExpertModel,
)
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.utils import log_model_loading_keys
from lerobot.utils.utils import get_safe_dtype, init_logging
from lerobot.utils.utils import get_safe_dtype
def create_sinusoidal_pos_embedding(
@@ -223,28 +220,17 @@ class PI0Policy(PreTrainedPolicy):
def __init__(
self,
config: PI0Config,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
"""
Args:
config: Policy configuration class instance or None, in which case the default instantiation of
the configuration class is used.
dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
that they will be passed with a call to `load_state_dict` before the policy is used.
"""
super().__init__(config)
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.language_tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")
self.model = PI0FlowMatching(config)
self.reset()
@@ -253,99 +239,6 @@ class PI0Policy(PreTrainedPolicy):
"""This should be called whenever the environment is reset."""
self._action_queue = deque([], maxlen=self.config.n_action_steps)
    @classmethod
    def _transform_state_dict_keys(cls, state_dict: dict) -> dict:
        """
        Transform state dict keys to match expected model structure.

        Transformations:
        - model.paligemma_with_expert.paligemma.language_model.lm_head ->
          model.paligemma_with_expert.paligemma.lm_head
        - model.paligemma_with_expert.paligemma.language_model.model ->
          model.paligemma_with_expert.paligemma.model.language_model
        - model.paligemma_with_expert.paligemma.vision_tower ->
          model.paligemma_with_expert.paligemma.model.vision_tower
        - model.paligemma_with_expert.paligemma.multi_modal_projector ->
          model.paligemma_with_expert.paligemma.model.multi_modal_projector

        Also handles tied weights between lm_head.weight and
        embed_tokens.weight: if only one of the two is present, the other is
        created pointing at the same tensor.

        Args:
            state_dict: Checkpoint state dict whose keys may use the old layout.

        Returns:
            A new dict with remapped keys; tensor values are shared, not copied.
        """
        import re

        transformed_dict = {}
        # Order matters: the lm_head rule must fire before the generic
        # `.language_model.model` rule so lm_head is not nested under model.
        transformations = [
            (
                re.compile(r"\.paligemma_with_expert\.paligemma\.language_model\.lm_head"),
                ".paligemma_with_expert.paligemma.lm_head",
            ),
            (
                re.compile(r"\.paligemma_with_expert\.paligemma\.language_model\.model"),
                ".paligemma_with_expert.paligemma.model.language_model",
            ),
            (
                re.compile(r"\.paligemma_with_expert\.paligemma\.vision_tower"),
                ".paligemma_with_expert.paligemma.model.vision_tower",
            ),
            (
                re.compile(r"\.paligemma_with_expert\.paligemma\.multi_modal_projector"),
                ".paligemma_with_expert.paligemma.model.multi_modal_projector",
            ),
        ]

        # Apply every substitution rule to every key (rules are cumulative).
        for key, value in state_dict.items():
            new_key = key
            for pattern, replacement in transformations:
                new_key = pattern.sub(replacement, new_key)
            transformed_dict[new_key] = value

        # Handle tied weights: lm_head.weight and embed_tokens.weight share memory
        lm_head_key = None
        embed_tokens_key = None
        for key in transformed_dict:
            if key.endswith(".paligemma_with_expert.paligemma.lm_head.weight"):
                lm_head_key = key
            elif key.endswith(".paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight"):
                embed_tokens_key = key
            # Stop scanning as soon as both tied keys have been found.
            if lm_head_key and embed_tokens_key:
                break

        # If only one side of the tie is present, synthesize the other one so
        # load_state_dict finds both; the entry aliases the same tensor object.
        if lm_head_key and not embed_tokens_key:
            embed_tokens_key = lm_head_key.replace(
                ".lm_head.weight", ".model.language_model.embed_tokens.weight"
            )
            transformed_dict[embed_tokens_key] = transformed_dict[lm_head_key]
        elif embed_tokens_key and not lm_head_key:
            lm_head_key = embed_tokens_key.replace(
                ".model.language_model.embed_tokens.weight", ".lm_head.weight"
            )
            transformed_dict[lm_head_key] = transformed_dict[embed_tokens_key]

        return transformed_dict
@classmethod
def _load_as_safetensor(
    cls, model: "PI0Policy", model_file: str, map_location: str, strict: bool
) -> "PI0Policy":
    """Load a safetensors checkpoint into `model`, remapping legacy keys first.

    Keys are rewritten via `cls._transform_state_dict_keys` so checkpoints saved
    under the old module layout still load; missing/unexpected keys are logged
    instead of being silently dropped.
    """
    from safetensors.torch import load_file
    init_logging()
    # Read the tensors from disk straight onto the requested device.
    raw_state = load_file(model_file, device=map_location)
    # Rewrite legacy key paths (and re-tie lm_head/embed_tokens) before loading.
    remapped_state = cls._transform_state_dict_keys(raw_state)
    load_result = model.load_state_dict(remapped_state, strict=strict)
    log_model_loading_keys(load_result.missing_keys, load_result.unexpected_keys)
    return model
def get_optim_params(self) -> dict:
    """Return the parameters to optimize (all model parameters).

    NOTE(review): the annotation says `dict`, but `self.parameters()` yields an
    iterator of tensors — confirm callers accept any iterable here.
    """
    return self.parameters()
@@ -377,14 +270,13 @@ class PI0Policy(PreTrainedPolicy):
if self.config.adapt_to_pi_aloha:
batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
batch = self.normalize_inputs(batch)
# Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
# querying the policy.
if len(self._action_queue) == 0:
images, img_masks = self.prepare_images(batch)
state = self.prepare_state(batch)
lang_tokens, lang_masks = self.prepare_language(batch)
lang_tokens = batch[f"{OBS_LANGUAGE}.tokens"]
lang_masks = batch[f"{OBS_LANGUAGE}.attention_mask"]
actions = self.model.sample_actions(
images, img_masks, lang_tokens, lang_masks, state, noise=noise
@@ -394,8 +286,6 @@ class PI0Policy(PreTrainedPolicy):
original_action_dim = self.config.action_feature.shape[0]
actions = actions[:, :, :original_action_dim]
actions = self.unnormalize_outputs({"action": actions})["action"]
if self.config.adapt_to_pi_aloha:
actions = self._pi_aloha_encode_actions(actions)
@@ -410,12 +300,10 @@ class PI0Policy(PreTrainedPolicy):
batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
batch = self.normalize_inputs(batch)
batch = self.normalize_targets(batch)
images, img_masks = self.prepare_images(batch)
state = self.prepare_state(batch)
lang_tokens, lang_masks = self.prepare_language(batch)
lang_tokens = batch[f"{OBS_LANGUAGE}.tokens"]
lang_masks = batch[f"{OBS_LANGUAGE}.attention_mask"]
actions = self.prepare_action(batch)
actions_is_pad = batch.get("action_is_pad")
@@ -482,26 +370,6 @@ class PI0Policy(PreTrainedPolicy):
return images, img_masks
def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
    """Tokenize the task strings in `batch` for the language model.

    Args:
        batch: Batch dict; reads `batch["task"]` (list of strings) and uses
            `batch[OBS_STATE].device` as the target device.

    Returns:
        Tuple of (token ids, boolean attention mask), both on the state's device.
    """
    device = batch[OBS_STATE].device
    tasks = batch["task"]
    # PaliGemma prompt has to end with a new line
    tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
    # Idiom fix: call the tokenizer directly rather than invoking __call__ explicitly.
    tokenized_prompt = self.language_tokenizer(
        tasks,
        padding="max_length",
        padding_side="right",
        max_length=self.config.tokenizer_max_length,
        return_tensors="pt",
    )
    lang_tokens = tokenized_prompt["input_ids"].to(device=device)
    lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
    return lang_tokens, lang_masks
def _pi_aloha_decode_state(self, state):
# Flip the joints.
for motor_idx in [1, 2, 8, 9]:
@@ -567,7 +435,7 @@ class PI0FlowMatching(nn.Module):
└──────────────────────────────┘
"""
def __init__(self, config):
def __init__(self, config: PI0Config):
super().__init__()
self.config = config
+120
View File
@@ -0,0 +1,120 @@
#!/usr/bin/env python
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
import torch
from lerobot.configs.types import PolicyFeature
from lerobot.policies.pi0.configuration_pi0 import PI0Config
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RobotProcessor,
ToBatchProcessor,
TokenizerProcessor,
UnnormalizerProcessor,
)
from lerobot.processor.pipeline import (
EnvTransition,
ProcessorStep,
ProcessorStepRegistry,
TransitionKey,
)
from lerobot.processor.rename_processor import RenameProcessor
@ProcessorStepRegistry.register(name="pi0_new_line_processor")
class Pi0NewLineProcessor(ProcessorStep):
    """Ensure every task string ends with a trailing newline.

    The PaliGemma tokenizer expects prompts terminated by a newline; this step
    appends one to `complementary_data["task"]` when missing. Values that are
    neither a string nor a list of strings pass through unchanged.
    """

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        extra = transition.get(TransitionKey.COMPLEMENTARY_DATA)
        if extra is None or "task" not in extra:
            return transition
        task = extra["task"]
        if task is None:
            return transition

        def with_newline(text: str) -> str:
            # Append the newline only when it is not already present.
            return text if text.endswith("\n") else f"{text}\n"

        if isinstance(task, str):
            extra["task"] = with_newline(task)
        elif isinstance(task, list) and all(isinstance(t, str) for t in task):
            extra["task"] = [with_newline(t) for t in task]
        # Any other type is deliberately left untouched.
        return transition

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        """Feature schema is unchanged; this step only edits complementary data."""
        return features

    def state_dict(self) -> dict[str, torch.Tensor]:
        """Stateless step: nothing to serialize."""
        return {}

    def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
        """Stateless step: nothing to restore."""
        pass

    def reset(self) -> None:
        """Stateless step: nothing to reset."""
        pass

    def get_config(self) -> dict[str, Any]:
        """No configurable options."""
        return {}
def make_pi0_processor(
    config: PI0Config, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the PI0 pre- and post-processing pipelines.

    The preprocessor normalizes features, batches, appends the PaliGemma
    newline, tokenizes the prompt, and moves data to the configured device.
    The postprocessor moves outputs back to CPU and unnormalizes them.
    """
    normalizer = NormalizerProcessor(
        features={**config.input_features, **config.output_features},
        norm_map=config.normalization_mapping,
        stats=dataset_stats,
    )
    tokenizer = TokenizerProcessor(
        tokenizer_name="google/paligemma-3b-pt-224",
        max_length=config.tokenizer_max_length,
        padding_side="right",
        padding="max_length",
    )
    input_steps: list[ProcessorStep] = [
        RenameProcessor(rename_map={}),  # mirrors the pretrained processor layout
        normalizer,
        ToBatchProcessor(),
        Pi0NewLineProcessor(),  # PaliGemma prompts need a trailing newline before tokenization
        tokenizer,
        DeviceProcessor(device=config.device),
    ]
    output_steps: list[ProcessorStep] = [
        DeviceProcessor(device="cpu"),
        UnnormalizerProcessor(
            features=config.output_features,
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
    ]
    preprocessor = RobotProcessor(steps=input_steps, name="robot_preprocessor")
    postprocessor = RobotProcessor(steps=output_steps, name="robot_postprocessor")
    return preprocessor, postprocessor
@@ -58,7 +58,6 @@ from transformers.cache_utils import HybridCache, StaticCache
from transformers.models.auto import CONFIG_MAPPING
from lerobot.constants import ACTION, OBS_STATE
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig
from lerobot.policies.pretrained import PreTrainedPolicy
@@ -146,14 +145,6 @@ class PI0FASTPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.language_tokenizer = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")
self.model = PI0FAST(config)
@@ -221,8 +212,6 @@ class PI0FASTPolicy(PreTrainedPolicy):
if self.config.adapt_to_pi_aloha:
batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
batch = self.normalize_inputs(batch)
# Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
# querying the policy.
if len(self._action_queue) == 0:
@@ -235,8 +224,6 @@ class PI0FASTPolicy(PreTrainedPolicy):
] # self.config.max_action_dim # self.config.action_feature.shape[0]
actions = actions[:, :, :original_action_dim]
actions = self.unnormalize_outputs({"action": actions})["action"]
if self.config.adapt_to_pi_aloha:
actions = self._pi_aloha_encode_actions(actions)
@@ -249,8 +236,6 @@ class PI0FASTPolicy(PreTrainedPolicy):
if self.config.adapt_to_pi_aloha:
batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
batch = self.normalize_inputs(batch)
batch = self.normalize_targets(batch)
loss_dict = self.model.forward(batch)
return loss_dict["loss"], loss_dict
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.pi0.configuration_pi0 import PI0Config
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
UnnormalizerProcessor,
)
def make_pi0fast_processor(
    config: PI0Config, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build pre/post processors for PI0-FAST (normalize, batch, device moves).

    NOTE(review): the annotation says `PI0Config`, yet this is the pi0fast
    factory — presumably `PI0FASTConfig` exposes the same attributes; confirm
    whether the annotation should be changed.
    """
    normalizer = NormalizerProcessor(
        features={**config.input_features, **config.output_features},
        norm_map=config.normalization_mapping,
        stats=dataset_stats,
    )
    input_steps = [
        RenameProcessor(rename_map={}),  # mirrors the pretrained processor layout
        normalizer,
        ToBatchProcessor(),
        DeviceProcessor(device=config.device),
    ]
    output_steps = [
        DeviceProcessor(device="cpu"),
        UnnormalizerProcessor(
            features=config.output_features,
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
    ]
    preprocessor = RobotProcessor(steps=input_steps, name="robot_preprocessor")
    postprocessor = RobotProcessor(steps=output_steps, name="robot_postprocessor")
    return preprocessor, postprocessor
@@ -0,0 +1,128 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import NormalizationMode
@PreTrainedConfig.register_subclass("rlearn")
@dataclass
class RLearNConfig(PreTrainedConfig):
    """Configuration for a video-language conditioned reward model (RLearN).

    Inputs:
        - Visual frames (one or multiple cameras). Optionally a short sequence.
        - A language instruction/goal string.
    Output:
        - Per-timestep reward logits or a single-step reward logit.

    Notes:
        - Follows the ReWiND-style recipe: frozen vision/text encoders plus a
          lightweight trainable temporal aggregator and reward head. The
          defaults below use SigLIP2 for both the vision and text towers
          (a single shared checkpoint).
    """

    # Encoders - Use SigLIP2 for both vision and text (shared checkpoint)
    vision_model_name: str = "google/siglip2-base-patch16-224"
    text_model_name: str = "google/siglip2-base-patch16-224"
    freeze_backbones: bool = True
    # Sequence length: number of past frames (including the current one) fed to the temporal model
    max_seq_len: int = 16
    # Temporal sampling stride. Open-X is mostly 10 fps and ReWiND uses seq len 16;
    # our data is 30 fps, so 30/10 = 3 gives the same wall-clock time span per window.
    temporal_sampling_stride: int = 3
    # Model dimensions and transformer
    dim_model: int = 512
    num_layers: int = 4
    num_heads: int = 8
    ff_mult: int = 4  # Feed-forward multiplier, hidden = dim_model * ff_mult
    dropout: float = 0.05
    # --- reward head options ---
    use_categorical_rewards: bool = False  # classification over bins
    num_reward_bins: int = 25
    reward_min_value: float = 0.0  # lower edge of the HL-Gauss value range
    reward_max_value: float = 1.0  # upper edge of the HL-Gauss value range
    use_hl_gauss_loss: bool = True  # if False -> plain regression
    hl_gauss_num_bins: int = 25  # histogram resolution
    # Extra, second downsampling applied in forward after window sampling/rewind.
    # Keep at 1 to disable the extra skipping.
    inference_stride: int = 1
    frame_dropout_p: float = 0.10
    # Training
    learning_rate: float = 5e-4
    weight_decay: float = 0.01
    head_lr_multiplier: float = 5.0
    logit_eps: float = 1e-4
    regularizer_warmup_steps: int = 500
    # Performance optimizations
    use_amp: bool = False
    compile_model: bool = True
    # ReWiND augmentation probabilities (previous experiments used 0.8 / 0.3 / 0.2)
    rewind_prob: float = 0.3
    rewind_last3_prob: float = 0.0
    mismatch_prob: float = 0.0
    # Normalization presets
    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
            "VISUAL": NormalizationMode.MEAN_STD,
        }
    )
    # Required path to episodes.jsonl for episode boundaries
    episodes_jsonl_path: str | None = "meta/episodes.jsonl"

    def validate_features(self) -> None:
        """Raise if no image feature is configured; language may be blank."""
        # Require at least one image feature. Language is recommended but optional (can be blank).
        if not self.image_features:
            raise ValueError(
                "You must provide at least one image feature for RLearN (e.g. 'observation.image')."
            )

    @property
    def observation_delta_indices(self) -> list | None:
        """Frame-delta indices requested from the dataset, ending at t=0."""
        # Request a long enough context so in-window stride sampling can be >1.
        # We ask for (max_seq_len * temporal_sampling_stride) frames ending at t=0.
        # Example: max_seq_len=16, temporal_sampling_stride=3 → 48 deltas.
        total_needed = self.max_seq_len * max(1, int(self.temporal_sampling_stride))
        return list(range(1 - total_needed, 1))

    @property
    def action_delta_indices(self) -> list | None:
        """None: this is not an action-chunking policy."""
        return None

    @property
    def reward_delta_indices(self) -> list | None:
        """None: ReWiND-style progress labels are generated on the fly."""
        return None

    def get_optimizer_preset(self):  # type: ignore[override]
        """Default optimizer: AdamW with the configured lr/weight decay."""
        from lerobot.optim.optimizers import AdamWConfig

        return AdamWConfig(lr=self.learning_rate, weight_decay=self.weight_decay)

    def get_scheduler_preset(self):  # type: ignore[override]
        """No LR scheduler by default."""
        return None
+392
View File
@@ -0,0 +1,392 @@
#!/usr/bin/env python
"""
Standalone evaluation script for RLearN models.
This script evaluates RLearN reward models on episodes from a dataset,
generating comparison plots between ground truth rewards and model predictions.
Usage:
python src/lerobot/policies/rlearn/eval_script.py --model MODEL_NAME --dataset DATASET_REPO --episodes N
Example:
python src/lerobot/policies/rlearn/eval_script.py --model pepijn223/rlearn_18 --dataset pepijn223/phone_pipeline_pickup1 --episodes 2
"""
import argparse
import os
import sys
from pathlib import Path
# Add src to path for imports
sys.path.append(str(Path(__file__).parent.parent.parent.parent))
import warnings
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.stats import spearmanr
from tqdm import tqdm
warnings.filterwarnings("ignore")
# LeRobot imports
from lerobot.constants import OBS_IMAGE, OBS_IMAGES, OBS_LANGUAGE
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
def _to_chw_float01(img):
    """Return `img` as a CHW float tensor with values clamped to [0, 1]."""
    if isinstance(img, np.ndarray):
        img = torch.from_numpy(img)
    # Channels-last (HWC) input is detected by a trailing dim of 1/3/4 and moved to CHW.
    channels_last = img.dim() == 3 and img.shape[-1] in (1, 3, 4)
    if channels_last:
        img = img.permute(2, 0, 1)
    # uint8 images are rescaled from [0, 255]; everything else is just cast to float.
    img = img.float() / 255.0 if img.dtype == torch.uint8 else img.float()
    return torch.clamp(img, 0.0, 1.0)
def _get_language(frame_data):
    """Pull the language instruction out of a frame dict, defaulting when absent."""
    if OBS_LANGUAGE in frame_data:
        lang = frame_data[OBS_LANGUAGE]
        # Unwrap a batched list down to its first entry.
        if isinstance(lang, list) and lang:
            lang = lang[0]
    elif "task" in frame_data:
        lang = frame_data["task"]
    else:
        lang = None
    return lang if isinstance(lang, str) else "No language provided"
def _get_ground_truth_reward(frame_data):
    """Scan common key names for a ground-truth reward; return None if unavailable."""
    for key in ("reward", "rewards", "gt_reward", "progress"):
        if key not in frame_data:
            continue
        value = frame_data[key]
        # Unwrap single-element lists/arrays down to a scalar.
        if isinstance(value, (list, np.ndarray)) and np.array(value).size == 1:
            value = float(np.array(value).reshape(-1)[0])
        try:
            return float(value)
        except Exception:
            # Not coercible to float — keep scanning the remaining keys.
            pass
    return None
def extract_episode_frames_and_gt(dataset, episode_idx):
    """Load one episode: frames (T, C, H, W), language string, optional GT rewards.

    Returns (None, None, None) if the episode yields no image frames. The GT
    array is None when no frame carried a reward; otherwise missing entries
    are forward-filled (the first entry is backfilled from the first valid
    value, or 0.0 if there is none).
    """
    start = dataset.episode_data_index["from"][episode_idx].item()
    end = dataset.episode_data_index["to"][episode_idx].item()

    frames = []
    raw_rewards = []
    language = None
    for step in range(start, end):
        item = dataset[step]
        # Pick the image tensor: preferred keys first, then any "image"-like key.
        if OBS_IMAGES in item:
            img = item[OBS_IMAGES]
        elif OBS_IMAGE in item:
            img = item[OBS_IMAGE]
        else:
            candidates = [k for k in item.keys() if "image" in k.lower()]
            if not candidates:
                continue
            img = item[candidates[0]]
        frames.append(_to_chw_float01(img))
        # Capture the language once, from the first frame that has it.
        if language is None:
            language = _get_language(item)
        raw_rewards.append(_get_ground_truth_reward(item))

    if not frames:
        return None, None, None
    stacked = torch.stack(frames)  # (T, C, H, W)

    if all(r is None for r in raw_rewards):
        gt = None
    else:
        gt = np.array([np.nan if r is None else float(r) for r in raw_rewards], dtype=float)
        # Seed the first entry (backfill from the first valid value, else 0), then forward-fill.
        if np.isnan(gt[0]):
            valid = np.flatnonzero(~np.isnan(gt))
            gt[0] = gt[valid[0]] if len(valid) > 0 else 0.0
        for i in range(1, len(gt)):
            if np.isnan(gt[i]):
                gt[i] = gt[i - 1]
    return stacked, language or "No language provided", gt
@torch.no_grad()
def predict_rewards_sliding(model, frames, language, max_seq_len=16, batch_size=64, device="cuda", temporal_stride: int | None = None):
    """
    Sliding-window reward prediction over a whole episode.

    For each frame i, build a strided window of L frames ending at i (indices
    clamped into [0, T-1], so out-of-range positions repeat the first/last
    frame), run the model on batches of windows, and read the prediction at
    the window's last position (the current frame).

    Args:
        model: Policy exposing `config` and `predict_rewards(batch)`; the batch
            is a dict with OBS_IMAGES of shape (B, L, C, H, W) and OBS_LANGUAGE
            as a list of B strings. The model returns (B, L) per-position
            predictions, or (B,) as a fallback.
        frames: Tensor (T, C, H, W), already preprocessed to CHW float.
        language: Instruction string, broadcast to every window in a batch.
        max_seq_len: Hard cap on the window length L.
        batch_size: Number of windows per forward pass.
        device: Device the frames are moved to before windowing.
        temporal_stride: Frame step inside a window; defaults to the model
            config's `temporal_sampling_stride` (min 1).

    Returns:
        np.ndarray of shape (T,) with one reward per frame.
    """
    T = frames.shape[0]
    # Fall back to the function defaults when the model has no config attribute.
    cfg = getattr(model, "config", object())
    L = int(getattr(cfg, "max_seq_len", max_seq_len))
    L = min(L, max_seq_len)  # hard-cap at 16
    # Use the same temporal stride as training (skip s-1 frames, take 1)
    if temporal_stride is None:
        temporal_stride = int(getattr(cfg, "temporal_sampling_stride", 1))
    temporal_stride = max(1, int(temporal_stride))
    # Preprocessed tensor on device
    frames = frames.to(device)
    windows = []
    frame_positions = []  # Track which temporal position each frame should use
    left_pad_counts = []  # Number of left-pad (OOB) frames per window
    for i in range(T):
        # Window indices ending at i with stride s, e.g. [..., i-2s, i-s, i].
        idxs = [i - (L - 1 - j) * temporal_stride for j in range(L)]
        # Count indices before the episode start; these become repeated frame 0.
        pad_needed = sum(1 for k in idxs if k < 0)
        clamped = [0 if k < 0 else (T - 1 if k >= T else k) for k in idxs]
        window = frames[clamped]  # (L, C, H, W)
        # Use the last temporal position (current frame) for reading model output
        frame_pos = L - 1
        windows.append(window)
        frame_positions.append(frame_pos)
        left_pad_counts.append(pad_needed)
    preds = np.zeros(T, dtype=float)
    for s in range(0, T, batch_size):
        e = min(s + batch_size, T)
        batch_windows = torch.stack(windows[s:e])  # (B, L, C, H, W)
        batch_positions = frame_positions[s:e]
        batch = {OBS_IMAGES: batch_windows, OBS_LANGUAGE: [language] * (e - s)}  # expects (B, L, C, H, W)
        # Model returns (B, L) predictions for each temporal position
        values = model.predict_rewards(batch)  # torch.Tensor (B, L)
        # Eval-time padding rule: zero out predictions for left-padded (OOB)
        # positions so repeated boundary frames don't contribute.
        # NOTE(review): `len(left_pad_counts) >= (e - s)` is always true since
        # left_pad_counts has length T — presumably a leftover guard; confirm.
        if values.dim() == 2 and len(left_pad_counts) >= (e - s):
            for b_idx in range(e - s):
                pad_n = left_pad_counts[s + b_idx]
                if pad_n > 0:
                    values[b_idx, :pad_n] = 0.0
        if values.dim() == 2:
            # Read each frame's prediction at its position within its window
            # (always the last position, per frame_positions above).
            batch_preds = []
            for b_idx, pos in enumerate(batch_positions):
                batch_preds.append(values[b_idx, pos].item())
            preds[s:e] = np.array(batch_preds)
        else:
            # Fallback: if model returns (B,), use as is
            preds[s:e] = values.detach().float().cpu().numpy()
    return preds
def plot_episode_eval(episode_idx, gt, pred, language, save_path=None, show=False, title_prefix="RLearN Eval"):
    """Plot predicted vs ground-truth reward curves; save a PNG if `save_path` is set."""

    def _nz(v):
        # Display-time guard against NaN correlations (e.g. constant inputs).
        return 0.0 if np.isnan(v) else v

    timeline = np.arange(len(pred))
    plt.figure(figsize=(14, 8))
    plt.plot(timeline, pred, linewidth=2.5, marker="o", markersize=3, label="Predicted Reward", color="blue")
    if gt is None:
        # No GT available: compare against a linear 0→1 progress baseline.
        expected = np.linspace(0, 1, len(pred))
        plt.plot(timeline, expected, linestyle="--", linewidth=2.5, label="Expected Progress (0→1)", color="orange")
        corr, p = spearmanr(timeline, pred)
        corr_str = f"VOC-S ρ(t, Pred) = {_nz(corr):.3f} (p={_nz(p):.3f})"
    else:
        plt.plot(timeline, gt, linestyle="--", linewidth=2.5, label="Ground-Truth Reward", color="orange")
        # Rank correlation between GT and predictions.
        corr, p = spearmanr(gt, pred)
        corr_str = f"ρ(GT, Pred) = {_nz(corr):.3f} (p={_nz(p):.3f})"
    plt.title(f"{title_prefix} — Episode {episode_idx}\n{language}\n{corr_str}", fontsize=14)
    plt.xlabel("Frame Index", fontsize=12)
    plt.ylabel("Reward / Progress", fontsize=12)
    plt.legend(fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=200, bbox_inches="tight")
        print(f"Saved eval image to: {save_path}")
    if show:
        plt.show()
    else:
        plt.close()
def eval_episode_sliding(
    episode_idx, dataset, model, save_dir=".", device="cuda", max_seq_len=16, batch_size=64, title_prefix="RLearN Eval"
):
    """Evaluate one episode end-to-end and save a GT-vs-predicted plot.

    Loads the episode, runs sliding-window prediction, and writes the PNG.
    Returns the saved path, or None if the episode had no frames.
    """
    frames, language, gt = extract_episode_frames_and_gt(dataset, episode_idx)
    if frames is None:
        print(f"[Episode {episode_idx}] No frames found.")
        return None
    model.eval()
    pred = predict_rewards_sliding(
        model=model,
        frames=frames,
        language=language,
        max_seq_len=max_seq_len,
        batch_size=batch_size,
        device=device,
    )
    # Quick sanity stats before plotting.
    print(f"Episode {episode_idx}: T={len(pred)}, pred∈[{pred.min():.3f},{pred.max():.3f}]")
    if gt is not None:
        print(f"GT available: gt∈[{np.nanmin(gt):.3f},{np.nanmax(gt):.3f}]")
    save_path = f"{save_dir}/episode_{episode_idx:04d}_eval.png"
    plot_episode_eval(
        episode_idx=episode_idx,
        gt=gt,
        pred=pred,
        language=language,
        save_path=save_path,
        show=False,
        title_prefix=title_prefix,
    )
    return save_path
def main():
    """Main evaluation script for RLearN models.

    Parses CLI arguments, loads the dataset and model, runs sliding-window
    reward prediction on the requested episodes, and writes one comparison
    plot per episode. Returns a process exit code (0 on success, 1 on error).
    """
    parser = argparse.ArgumentParser(description="Evaluate RLearN model on episodes with GT vs Predicted rewards")
    parser.add_argument("--model", type=str, required=True, help="Model name/path (e.g., pepijn223/rlearn_mse5)")
    parser.add_argument("--dataset", type=str, required=True, help="Dataset repo (e.g., pepijn223/phone_pipeline_pickup1)")
    parser.add_argument("--episodes", type=int, default=5, help="Number of episodes to evaluate")
    parser.add_argument("--output", type=str, default="./eval_results", help="Output directory for images")
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu",
        help="Device to use",
    )
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size for sliding window evaluation")
    args = parser.parse_args()

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("🎯 RLearN Model Evaluation")
    print("=" * 60)
    print(f"Model: {args.model}")
    print(f"Dataset: {args.dataset}")
    print(f"Episodes: {args.episodes}")
    print(f"Device: {args.device}")
    print(f"Output: {output_dir}")
    print("=" * 60)

    try:
        # Load dataset
        print("📁 Loading dataset...")
        dataset = LeRobotDataset(
            repo_id=args.dataset,
            episodes=list(range(min(args.episodes, 50))),  # Load enough episodes
            download_videos=True,
        )
        print(f"✅ Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
        print(f" Features: {list(dataset.features.keys())}")
        print(f" FPS: {dataset.fps}")

        # Load model
        print("\n🤖 Loading model...")
        model = RLearNPolicy.from_pretrained(args.model)
        model = model.to(args.device)
        model.eval()
        print(f"✅ Model loaded on {args.device}")
        print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")
        print(f" Trainable: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
        print(f" Max sequence length: {model.config.max_seq_len}")

        # Select episodes to evaluate
        total_available = min(dataset.num_episodes, args.episodes)
        episode_indices = list(range(total_available))
        print(f"\n📊 Evaluating {len(episode_indices)} episodes...")
        print("=" * 60)

        # Run sliding window evaluation on each episode; keep going on failures.
        saved_paths = []
        for i, ep_idx in enumerate(episode_indices):
            print(f"\n[{i+1}/{len(episode_indices)}] Processing Episode {ep_idx}")
            print("-" * 40)
            try:
                save_path = eval_episode_sliding(
                    episode_idx=ep_idx,
                    dataset=dataset,
                    model=model,
                    save_dir=str(output_dir),
                    device=args.device,
                    batch_size=args.batch_size,
                    title_prefix="RLearN Ground Truth vs Predicted",
                )
                if save_path:
                    saved_paths.append(save_path)
            except Exception as e:
                print(f"❌ Error processing episode {ep_idx}: {e}")
                import traceback

                traceback.print_exc()
                continue

        # Summary
        print("\n" + "=" * 60)
        print("✅ EVALUATION COMPLETE")
        print(f"📈 Generated {len(saved_paths)} evaluation plots")
        print(f"📁 Results saved to: {output_dir}")
        print("\nGenerated files:")
        for path in saved_paths:
            print(path)
        if saved_paths:
            # Fix: these literals have no placeholders, so the f-prefix was unnecessary.
            print("\n💡 View the plots to compare ground truth vs predicted rewards!")
            print(" Each plot shows the model's sliding 16-frame window predictions")
            print(" against available ground truth rewards over the episode timeline.")
        return 0
    except Exception as e:
        print(f"❌ Error during evaluation: {e}")
        import traceback

        traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Use sys.exit so main()'s return value becomes the process exit status;
    # the builtin exit() comes from the site module and may be unavailable.
    sys.exit(main())
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,128 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Any
from lerobot.configs.types import PolicyFeature
from lerobot.constants import OBS_LANGUAGE
from lerobot.policies.rlearn.configuration_rlearn import RLearNConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
TokenizerProcessor,
UnnormalizerProcessor,
)
from lerobot.processor.pipeline import (
ComplementaryDataProcessor,
EnvTransition,
ProcessorStepRegistry,
TransitionKey,
)
def make_rlearn_processor(
    config: RLearNConfig, dataset_stats: dict[str, dict[str, Any]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build pre/post processors for RLearN.

    Responsibilities moved out of the model:
    - Ensure batching
    - Mirror complementary_data.task into observation.language when available
    - Tokenize language into observation.language.tokens / attention_mask
    - Move to/from device
    - Normalize inputs after the device transfer (so it can run on the accelerator)
    """
    tokenizer = TokenizerProcessor(
        tokenizer_name=config.text_model_name,  # SigLIP2 tokenizer keeps vocab aligned with the text tower
        max_length=64,
        padding="max_length",
        truncation=True,
        padding_side="right",
    )
    normalizer = NormalizerProcessor(
        features={**config.input_features, **config.output_features},
        norm_map=config.normalization_mapping,
        stats=dataset_stats,
    )
    input_steps = [
        RenameProcessor(rename_map={}),  # no renaming today; kept for extensibility
        ToBatchProcessor(),
        RLearnLanguageFromTaskProcessor(),
        tokenizer,
        DeviceProcessor(device=config.device),
        normalizer,  # placed after the transfer to use GPU acceleration
    ]
    output_steps = [
        DeviceProcessor(device="cpu"),
        UnnormalizerProcessor(
            features=config.output_features,
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
    ]
    return (
        RobotProcessor(steps=input_steps, name="robot_preprocessor"),
        RobotProcessor(steps=output_steps, name="robot_postprocessor"),
    )
@dataclass
@ProcessorStepRegistry.register(name="rlearn_language_from_task")
class RLearnLanguageFromTaskProcessor(ComplementaryDataProcessor):
    """Mirror complementary_data['task'] into observation['observation.language'].

    Lets the model consume the raw instruction string even when tokenization
    is not used; TokenizerProcessor can still add tokenized fields on top.
    An existing observation.language entry is never overwritten.
    """

    task_key: str = "task"

    def __call__(self, transition: EnvTransition) -> EnvTransition:  # type: ignore[override]
        extra = transition.get(TransitionKey.COMPLEMENTARY_DATA)
        if not extra or self.task_key not in extra:
            return transition
        task = extra.get(self.task_key)
        if task is None:
            return transition
        # Normalize to list[str]; bail out on any other shape.
        if isinstance(task, str):
            tasks = [task]
        elif isinstance(task, list) and all(isinstance(t, str) for t in task):
            tasks = task
        else:
            return transition
        observation = transition.get(TransitionKey.OBSERVATION) or {}
        if OBS_LANGUAGE not in observation:
            # Do not clobber a user-provided observation.language.
            observation[OBS_LANGUAGE] = tasks
        transition[TransitionKey.OBSERVATION] = observation
        return transition

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:  # noqa: D401
        """Feature schema is unchanged; this step only copies data at runtime."""
        return features

    def get_config(self) -> dict[str, Any]:
        """Serialize the configurable task key."""
        return {"task_key": self.task_key}
+101
View File
@@ -0,0 +1,101 @@
## General Value/Reward Learning:
I want to implement a general/universal vision and language value function or reward model for robotics/video tasks. Also called a video language conditioned reward model. Integrated with already existing LeRobot code if convenient, use the LeRobot Dataset for dataset and store the reward for a frame in the lerobot frame itself.
Inspired by these papers:
- ReWiND; https://arxiv.org/pdf/2505.10911 (Most applicable and main paper I want to implement ideas from) and code: https://github.com/lucidrains/rewind-reward-pytorch
- LIV; https://arxiv.org/pdf/2306.00958 (Most applicable and 2nd main paper I want to implement ideas from) and code https://github.com/penn-pal-lab/LIV
- VLC: Video-Language Critic: Transferable Reward Functions for Language-Conditioned Robotics: https://arxiv.org/pdf/2405.19988 (Most applicable and 3rd paper I want to implement ideas from) and code: https://github.com/minttusofia/video_language_critic
And these papers which are also relevant:
- https://www.dyna.co/dyna-1/research (Main company I want to reproduce the eventual results from)
- vip; https://arxiv.org/pdf/2210.00030
- uvd; https://arxiv.org/pdf/2310.08581
- vlm in context; https://arxiv.org/pdf/2411.04549
- https://www.youtube.com/watch?v=JfZYtpEisoM
Little less relevant but still similar papers:
- Learning Generalizable Robotic Reward Functions from “In-The-Wild” Human Videos,
- XIRL: Cross-embodiment Inverse Reinforcement Learning,
- Video-Language Critic: Transferable Reward https://arxiv.org/pdf/2405.19988
- Functions for Language-Conditioned Robotics,
- LORel, Language-Driven Representation Learning for Robotics https://sites.google.com/view/robotlorel
- RoboCLIP: One Demonstration is Enough to Learn Robot Policies https://arxiv.org/pdf/2310.07899
- Points2Rewards: learn first key points and then uses the keypoints to learn general value function/policy https://semrob.github.io/docs/2025_rss_semrob.github.io_paper20.pdf
- Language-Driven Representation Learning for Robotics: https://arxiv.org/pdf/2302.12766v1
- R3M: A Universal Visual Representation for Robot Manipulation: https://arxiv.org/pdf/2203.12601v3
Input should be the current image or whole video and the task goal specified in text/language. Output is current reward.
Architecture:
_ inputs: video o1:T (or current o1:t), language z;
_ DINO v3 ViT-B/16 (86M params): https://huggingface.co/facebook/dinov3-vitb16-pretrain-lvd1689m for vision encoding
\_ sentence-transformers/all-MiniLM-L12-v2: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 for text encoding \* Temporal module: small causal transformer ("cross-modal sequential aggregator"), with first-frame positional embedding (to avoid position cheating), frame-dropout, and stride sampling; outputs per-timestep logits.
Loss: See this chatgpt thread: https://chatgpt.com/s/t_68999a50a0b081919abc365cdd205e01
Past images: (for example, a reward model for "go to 3rd floor" has to know what floor it was on and what past actions it did; can we attend to or incorporate images of past decisions in some way?) Maybe via this paper: Learning Long-Context Diffusion Policies via Past-Token Prediction
Amount of frames needed for test/generalization: 1M frames? or ~20% of IPEC-COMMUNITY/bc_z_lerobot
Eval:
Implement something like VOC score, or rank order correlation between the learned reward and the ground-truth env reward from sim, or use something else to do additional evaluation
Ideas:
- Incorporate training on multiple horizons: as in label same dataset for longer horizons: make a sandwich (long), put cheese on bread (medium) and even smaller horizons: go down or close gripper (small)
- Incorporate navigation goals “walk towards the kitchen”; make sure we fix the CLIP contrastive-learning issue of positional text misunderstanding, where the model doesn't learn the difference between "horse right of cow" and "horse left of cow" or “Move right”; potentially train with more other data or even actionable world models such as Genie 3 (https://deepmind.google/discover/blog/genie-3-a-new-frontier-for-world-models/)
How to use a general reward model (use cases): - Train RL policy on it - Success detection - Do exploration - Do task via planning and search to optimize reward - Filter out bad episodes in large datasets from imitation learning
Potential Datasets: (start with dataset that is most clean for this and works best with chosen way of doing evals)
_ Epic-Kitchens-100
_ Something-Something v. 2 Dataset https://www.qualcomm.com/developer/software/something-something-v-2-dataset
_ Ego4D (3000 hours)
_ Open X-Embodiment (OXE)
\_ Agi bot world: https://huggingface.co/datasets/agibot-world/AgiBotWorld-Alpha
- GalexiAI dataset: https://opengalaxea.github.io/G0/
_ GTEA+ Gaze: https://cbs.ic.gatech.edu/fpv/
_ YouCook2 dataset
\_ HOWTO100M: https://www.di.ens.fr/willow/research/howto100m/
- Genie generated dataset?
### TODOs:
- Implement first architecture [x]
- Implement processors [x]
- Choose right loss metric(s) [x]
- Make dataset with script that generated the dataset (IPEC-COMMUNITY/bc_z_lerobot) ready in lerobot format (and be able to visualize in dataset visualizer)
- Annotate with ReWiND-style 0→1 progress rewards [x]
- Visualize to check [x]
- Implement eval score or metric that is robust and can deal with generalization/is a good metric to try different architectures. And use it in an eval jupyter notebook with visualization of the live reward next to the video for part of the dataset: VOC score and score with correct and incorrect language captions [x]
- Do first training [x]
- Implement on-the-fly progress label generation (no need for pre-annotated rewards) [x]
- Try different losses
- Only rewind loss [x]
- Exactly similar to: https://github.com/lucidrains/rewind-reward-pytorch/blob/main/rewind_reward_pytorch/rewind_reward.py#L11 [x]
- Try DINO v2 as encoder Base 86 M: with https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 [x]
- Test rewind (evaluate) [x]
- benchmark siglip 2 vs this implementation forward pass, debug speed [x]
- use siglip 2 [x]
- Fix evaluation bug !!! []
- Fix sample episode padding bug !!! []
- Overfit on one episode []
- Cleanup code? [] + enable language loss
- Convert python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=IPEC-COMMUNITY/bc_z_lerobot and train on 1 percent
- Then on 10 percent []
- Ablation: 16 successive frames vs 16 frames sampled with stride 2 or 4 []
- Add more artificial text to dataset generated by vlm (google gemini) []
- See google gemini vlm caption [] https://gemini.google.com/app/7e332ffaf32580f2
- Multiple captions per video, create method to generate as much data as possible etc [] https://arxiv.org/abs/2508.13446, https://arxiv.org/pdf/2412.04453
- Add other datasets from OXE mentioned in ReWiND []
- Extend evaluation []
- Ablation for size vision encoder, language encoder, temporal head []
- Ablation one mlp head per frame or single mlp head []
- Add other datasets mentioned here []
- How can we improve spatial aware learning? solve issue of Contrastive learning and position []
+10 -53
View File
@@ -28,7 +28,6 @@ import torch.nn.functional as F # noqa: N812
from torch import Tensor
from torch.distributions import MultivariateNormal, TanhTransform, Transform, TransformedDistribution
from lerobot.policies.normalize import NormalizeBuffer
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.sac.configuration_sac import SACConfig, is_image_feature
from lerobot.policies.utils import get_device_from_parameters
@@ -45,7 +44,6 @@ class SACPolicy(
def __init__(
self,
config: SACConfig | None = None,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
super().__init__(config)
config.validate_features()
@@ -53,7 +51,6 @@ class SACPolicy(
# Determine action dimension and initialize all components
continuous_action_dim = config.output_features["action"].shape[0]
self._init_normalization(dataset_stats)
self._init_encoders()
self._init_critics(continuous_action_dim)
self._init_actor(continuous_action_dim)
@@ -88,8 +85,7 @@ class SACPolicy(
observations_features = None
if self.shared_encoder and self.actor.encoder.has_images:
# Cache and normalize image features
observations_features = self.actor.encoder.get_cached_image_features(batch, normalize=True)
observations_features = self.actor.encoder.get_cached_image_features(batch)
actions, _, _ = self.actor(batch, observations_features)
@@ -391,28 +387,12 @@ class SACPolicy(
actor_loss = ((self.temperature * log_probs) - min_q_preds).mean()
return actor_loss
def _init_normalization(self, dataset_stats):
"""Initialize input/output normalization modules."""
self.normalize_inputs = nn.Identity()
self.normalize_targets = nn.Identity()
if self.config.dataset_stats is not None:
params = _convert_normalization_params_to_tensor(self.config.dataset_stats)
self.normalize_inputs = NormalizeBuffer(
self.config.input_features, self.config.normalization_mapping, params
)
stats = dataset_stats or params
self.normalize_targets = NormalizeBuffer(
self.config.output_features, self.config.normalization_mapping, stats
)
def _init_encoders(self):
"""Initialize shared or separate encoders for actor and critic."""
self.shared_encoder = self.config.shared_encoder
self.encoder_critic = SACObservationEncoder(self.config, self.normalize_inputs)
self.encoder_critic = SACObservationEncoder(self.config)
self.encoder_actor = (
self.encoder_critic
if self.shared_encoder
else SACObservationEncoder(self.config, self.normalize_inputs)
self.encoder_critic if self.shared_encoder else SACObservationEncoder(self.config)
)
def _init_critics(self, continuous_action_dim):
@@ -424,9 +404,7 @@ class SACPolicy(
)
for _ in range(self.config.num_critics)
]
self.critic_ensemble = CriticEnsemble(
encoder=self.encoder_critic, ensemble=heads, output_normalization=self.normalize_targets
)
self.critic_ensemble = CriticEnsemble(encoder=self.encoder_critic, ensemble=heads)
target_heads = [
CriticHead(
input_dim=self.encoder_critic.output_dim + continuous_action_dim,
@@ -434,9 +412,7 @@ class SACPolicy(
)
for _ in range(self.config.num_critics)
]
self.critic_target = CriticEnsemble(
encoder=self.encoder_critic, ensemble=target_heads, output_normalization=self.normalize_targets
)
self.critic_target = CriticEnsemble(encoder=self.encoder_critic, ensemble=target_heads)
self.critic_target.load_state_dict(self.critic_ensemble.state_dict())
if self.config.use_torch_compile:
@@ -490,10 +466,9 @@ class SACPolicy(
class SACObservationEncoder(nn.Module):
"""Encode image and/or state vector observations."""
def __init__(self, config: SACConfig, input_normalizer: nn.Module) -> None:
def __init__(self, config: SACConfig) -> None:
super().__init__()
self.config = config
self.input_normalization = input_normalizer
self._init_image_layers()
self._init_state_layers()
self._compute_output_dim()
@@ -568,11 +543,10 @@ class SACObservationEncoder(nn.Module):
def forward(
self, obs: dict[str, Tensor], cache: dict[str, Tensor] | None = None, detach: bool = False
) -> Tensor:
obs = self.input_normalization(obs)
parts = []
if self.has_images:
if cache is None:
cache = self.get_cached_image_features(obs, normalize=False)
cache = self.get_cached_image_features(obs)
parts.append(self._encode_images(cache, detach))
if self.has_env:
parts.append(self.env_encoder(obs["observation.environment_state"]))
@@ -585,7 +559,7 @@ class SACObservationEncoder(nn.Module):
"No parts to concatenate, you should have at least one image or environment state or state"
)
def get_cached_image_features(self, obs: dict[str, Tensor], normalize: bool = False) -> dict[str, Tensor]:
def get_cached_image_features(self, obs: dict[str, Tensor]) -> dict[str, Tensor]:
"""Extract and optionally cache image features from observations.
This function processes image observations through the vision encoder once and returns
@@ -597,26 +571,17 @@ class SACObservationEncoder(nn.Module):
- The vision encoder forward pass is typically the main computational bottleneck during training and inference
- Caching these features can provide 2-4x speedup in training and inference
Normalization behavior:
- When called from inside forward(): set normalize=False since inputs are already normalized
- When called from outside forward(): set normalize=True to ensure proper input normalization
Usage patterns:
- Called in select_action() with normalize=True
- Called in select_action()
- Called in learner.py's get_observation_features() to pre-compute features for all policy components
- Called internally by forward() with normalize=False
- Called internally by forward()
Args:
obs: Dictionary of observation tensors containing image keys
normalize: Whether to normalize observations before encoding
Set to True when calling directly from outside the encoder's forward method
Set to False when calling from within forward() where inputs are already normalized
Returns:
Dictionary mapping image keys to their corresponding encoded features
"""
if normalize:
obs = self.input_normalization(obs)
batched = torch.cat([obs[k] for k in self.image_keys], dim=0)
out = self.image_encoder(batched)
chunks = torch.chunk(out, len(self.image_keys), dim=0)
@@ -747,7 +712,6 @@ class CriticEnsemble(nn.Module):
Args:
encoder (SACObservationEncoder): encoder for observations.
ensemble (List[CriticHead]): list of critic heads.
output_normalization (nn.Module): normalization layer for actions.
init_final (float | None): optional initializer scale for final layers.
Forward returns a tensor of shape (num_critics, batch_size) containing Q-values.
@@ -757,13 +721,11 @@ class CriticEnsemble(nn.Module):
self,
encoder: SACObservationEncoder,
ensemble: list[CriticHead],
output_normalization: nn.Module,
init_final: float | None = None,
):
super().__init__()
self.encoder = encoder
self.init_final = init_final
self.output_normalization = output_normalization
self.critics = nn.ModuleList(ensemble)
def forward(
@@ -775,11 +737,6 @@ class CriticEnsemble(nn.Module):
device = get_device_from_parameters(self)
# Move each tensor in observations to device
observations = {k: v.to(device) for k, v in observations.items()}
# NOTE: We normalize actions it helps for sample efficiency
actions: dict[str, torch.tensor] = {"action": actions}
# NOTE: Normalization layer took dict in input and outputs a dict that why
actions = self.output_normalization(actions)["action"]
actions = actions.to(device)
obs_enc = self.encoder(observations, cache=observation_features)
+52
View File
@@ -0,0 +1,52 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.sac.configuration_sac import SACConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
UnnormalizerProcessor,
)
def make_sac_processor(
    config: SACConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the SAC input/output processor pipelines.

    The preprocessor renames (no-op map), normalizes input and output features,
    batches, and moves tensors to the configured device; the postprocessor moves
    results back to CPU and unnormalizes the output features.
    """
    all_features = {**config.input_features, **config.output_features}
    preprocessor = RobotProcessor(
        steps=[
            RenameProcessor(rename_map={}),
            NormalizerProcessor(
                features=all_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
            ToBatchProcessor(),
            DeviceProcessor(device=config.device),
        ],
        name="robot_preprocessor",
    )
    postprocessor = RobotProcessor(
        steps=[
            DeviceProcessor(device="cpu"),
            UnnormalizerProcessor(
                features=config.output_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
        ],
        name="robot_postprocessor",
    )
    return preprocessor, postprocessor
@@ -20,7 +20,6 @@ import torch
from torch import Tensor, nn
from lerobot.constants import OBS_IMAGE, REWARD
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
@@ -108,22 +107,12 @@ class Classifier(PreTrainedPolicy):
def __init__(
self,
config: RewardClassifierConfig,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
from transformers import AutoModel
super().__init__(config)
self.config = config
# Initialize normalization (standardized with the policy framework)
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
# Set up encoder
encoder = AutoModel.from_pretrained(self.config.model_name, trust_remote_code=True)
# Extract vision model if we're given a multimodal model
@@ -247,10 +236,6 @@ class Classifier(PreTrainedPolicy):
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict[str, Tensor]]:
"""Standard forward pass for training compatible with train.py."""
# Normalize inputs if needed
batch = self.normalize_inputs(batch)
batch = self.normalize_targets(batch)
# Extract images and labels
images, labels = self.extract_images_and_labels(batch)
@@ -0,0 +1,42 @@
# !/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
from lerobot.processor import (
DeviceProcessor,
IdentityProcessor,
NormalizerProcessor,
RobotProcessor,
)
def make_classifier_processor(
    config: RewardClassifierConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the reward-classifier pre/post processing pipelines.

    The preprocessor normalizes input features and output features (two separate
    steps) and moves tensors to the configured device; the postprocessor only
    moves results back to CPU.
    """
    normalize_inputs = NormalizerProcessor(
        features=config.input_features, norm_map=config.normalization_mapping, stats=dataset_stats
    )
    normalize_targets = NormalizerProcessor(
        features=config.output_features, norm_map=config.normalization_mapping, stats=dataset_stats
    )
    preprocessor = RobotProcessor(
        steps=[normalize_inputs, normalize_targets, DeviceProcessor(device=config.device)],
        name="classifier_preprocessor",
    )
    postprocessor = RobotProcessor(
        steps=[DeviceProcessor(device="cpu"), IdentityProcessor()],
        name="classifier_postprocessor",
    )
    return preprocessor, postprocessor
@@ -53,21 +53,13 @@ policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base")
"""
import math
import os
import re
from collections import deque
import safetensors
import torch
import torch.nn.functional as F # noqa: N812
from torch import Tensor, nn
from transformers import AutoProcessor
from lerobot.constants import ACTION, OBS_STATE
from lerobot.policies.normalize import (
Normalize,
Unnormalize,
)
from lerobot.constants import ACTION, OBS_LANGUAGE, OBS_STATE
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel
@@ -76,102 +68,6 @@ from lerobot.policies.utils import (
)
from lerobot.utils.utils import get_safe_dtype
# Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
_VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")


def canonicalise(k: str) -> str:
    """
    Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a
    normalisation-buffer key, collapsing them to the plain '.buffer_' form.
    """
    return re.sub(_VARIANT_RE, ".buffer_", k)
def standardise_state_dict(
    checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True
) -> tuple[dict[str, torch.Tensor], list[str]]:
    """
    Re-key `checkpoint` so that entries match the *reference* key set.

    When several variant keys collapse to the same canonical name, the first one
    is kept and the collision is logged. Returns the new dict plus the list of
    entries that could not be matched (those are kept under their original keys).
    """
    out: dict[str, torch.Tensor] = {}
    collisions: dict[str, list[str]] = {}
    unmatched: list[str] = []

    for key, tensor in checkpoint.items():
        canon = canonicalise(key)
        if canon not in ref_keys:
            unmatched.append(key)
            continue
        if canon in out:
            # Duplicate after collapsing: keep the first, record the collision.
            collisions.setdefault(canon, []).append(key)
        else:
            out[canon] = tensor

    if verbose:
        for canon, variants in collisions.items():
            print(f"[standardise_state_dict] '{canon}'{variants}")
        if unmatched:
            print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys")

    out.update({key: checkpoint[key] for key in unmatched})
    return out, unmatched
def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
    """
    Renames keys in a checkpoint dictionary based on the given rename string.

    Args:
        checkpoint (dict): The checkpoint dictionary.
        rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2".

    Returns:
        dict: The modified checkpoint with renamed keys.
    """
    mapping = dict(pair.split("//") for pair in rename_str.split(","))

    renamed: dict = {}
    for key, value in checkpoint.items():
        new_key = key
        # Apply every substring rename in order; later rules see earlier results.
        for old, new in mapping.items():
            if old in new_key:
                new_key = new_key.replace(old, new)
        renamed[new_key] = value
    return renamed
def load_smolvla(
    model: torch.nn.Module,
    filename: str | os.PathLike,
    *,
    device: str = "cpu",
    checkpoint_keys_mapping: str = "",
) -> torch.nn.Module:
    """Load a SmolVLA safetensors checkpoint into `model`, tolerating key renames.

    Args:
        model: Target module whose state dict defines the reference key set.
        filename: Path to the `.safetensors` checkpoint file.
        device: Device on which the checkpoint tensors are materialized.
        checkpoint_keys_mapping: Optional "old1//new1,old2//new2" rename spec
            applied to checkpoint keys before matching.

    Returns:
        The same `model`, with matched weights loaded (non-strict).

    Raises:
        RuntimeError: If keys are missing or unexpected beyond the normalization
            buffers that are deliberately skipped.
    """
    state_dict = safetensors.torch.load_file(filename, device=device)

    # Optional user-supplied renames (e.g. "model._orig_mod.//model.")
    if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping:
        state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping)

    state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys()))

    # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset
    norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs")
    state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)}

    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    if not all(key.startswith(norm_keys) for key in missing) or unexpected:
        # Bug fix: the previous code passed logging-style "%d" arguments to
        # RuntimeError, so the counts were never interpolated into the message.
        raise RuntimeError(
            f"SmolVLA {len(missing)} missing / {len(unexpected)} unexpected keys"
        )
    return model
def create_sinusoidal_pos_embedding(
time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
@@ -326,28 +222,17 @@ class SmolVLAPolicy(PreTrainedPolicy):
def __init__(
self,
config: SmolVLAConfig,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
"""
Args:
config: Policy configuration class instance or None, in which case the default instantiation of
the configuration class is used.
dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
that they will be passed with a call to `load_state_dict` before the policy is used.
"""
super().__init__(config)
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer
self.model = VLAFlowMatching(config)
self.reset()
@@ -357,23 +242,6 @@ class SmolVLAPolicy(PreTrainedPolicy):
ACTION: deque(maxlen=self.config.n_action_steps),
}
# HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues
@classmethod
def _load_as_safetensor(
cls,
model: "SmolVLAPolicy",
model_file: str,
map_location: str,
strict: bool,
):
safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)
return load_smolvla(
model,
model_file,
device=map_location,
checkpoint_keys_mapping="model._orig_mod.//model.",
)
def get_optim_params(self) -> dict:
return self.parameters()
@@ -389,7 +257,8 @@ class SmolVLAPolicy(PreTrainedPolicy):
images, img_masks = self.prepare_images(batch)
state = self.prepare_state(batch)
lang_tokens, lang_masks = self.prepare_language(batch)
lang_tokens = batch[f"{OBS_LANGUAGE}.tokens"]
lang_masks = batch[f"{OBS_LANGUAGE}.attention_mask"]
actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise)
@@ -397,8 +266,6 @@ class SmolVLAPolicy(PreTrainedPolicy):
original_action_dim = self.config.action_feature.shape[0]
actions = actions[:, :, :original_action_dim]
actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
if self.config.adapt_to_pi_aloha:
actions = self._pi_aloha_encode_actions(actions)
@@ -408,8 +275,6 @@ class SmolVLAPolicy(PreTrainedPolicy):
if self.config.adapt_to_pi_aloha:
batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
batch = self.normalize_inputs(batch)
return batch
@torch.no_grad()
@@ -450,11 +315,11 @@ class SmolVLAPolicy(PreTrainedPolicy):
if self.config.adapt_to_pi_aloha:
batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
batch = self.normalize_inputs(batch)
batch = self.normalize_targets(batch)
images, img_masks = self.prepare_images(batch)
state = self.prepare_state(batch)
lang_tokens, lang_masks = self.prepare_language(batch)
lang_tokens = batch[f"{OBS_LANGUAGE}.tokens"]
lang_masks = batch[f"{OBS_LANGUAGE}.attention_mask"]
actions = self.prepare_action(batch)
actions_is_pad = batch.get("actions_id_pad")
loss_dict = {}
@@ -518,30 +383,6 @@ class SmolVLAPolicy(PreTrainedPolicy):
img_masks.append(mask)
return images, img_masks
def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
"""Tokenize the text input"""
device = batch[OBS_STATE].device
tasks = batch["task"]
if isinstance(tasks, str):
tasks = [tasks]
if len(tasks) == 1:
tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])]
tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
tokenized_prompt = self.language_tokenizer.__call__(
tasks,
padding=self.config.pad_language_to,
padding_side="right",
max_length=self.config.tokenizer_max_length,
return_tensors="pt",
)
lang_tokens = tokenized_prompt["input_ids"].to(device=device)
lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
return lang_tokens, lang_masks
def _pi_aloha_decode_state(self, state):
# Flip the joints.
for motor_idx in [1, 2, 8, 9]:
@@ -0,0 +1,109 @@
#!/usr/bin/env python
# Copyright 2025 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
import torch
from lerobot.configs.types import PolicyFeature
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
TokenizerProcessor,
UnnormalizerProcessor,
)
from lerobot.processor.pipeline import EnvTransition, ProcessorStep, ProcessorStepRegistry, TransitionKey
def make_smolvla_processor(
    config: SmolVLAConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the SmolVLA input/output processor pipelines.

    The preprocessor renames (no-op map), normalizes input and output features,
    batches, appends a trailing newline to the task string, tokenizes the
    language input, and moves tensors to the configured device. The
    postprocessor moves results back to CPU and unnormalizes output features.
    """
    preprocessor_steps = [
        RenameProcessor(rename_map={}),  # To mimic the same processor as pretrained one
        NormalizerProcessor(
            features={**config.input_features, **config.output_features},
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
        ToBatchProcessor(),
        SmolVLANewLineProcessor(),
        TokenizerProcessor(
            tokenizer_name=config.vlm_model_name,
            padding=config.pad_language_to,
            padding_side="right",
            max_length=config.tokenizer_max_length,
        ),
        DeviceProcessor(device=config.device),
    ]
    postprocessor_steps = [
        DeviceProcessor(device="cpu"),
        UnnormalizerProcessor(
            features=config.output_features,
            norm_map=config.normalization_mapping,
            stats=dataset_stats,
        ),
    ]
    return (
        RobotProcessor(steps=preprocessor_steps, name="robot_preprocessor"),
        RobotProcessor(steps=postprocessor_steps, name="robot_postprocessor"),
    )
@ProcessorStepRegistry.register(name="smolvla_new_line_processor")
class SmolVLANewLineProcessor(ProcessorStep):
    """Ensure the task string(s) in complementary_data end with a newline."""

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        complementary_data = transition.get(TransitionKey.COMPLEMENTARY_DATA)
        if complementary_data is None or "task" not in complementary_data:
            return transition

        task = complementary_data["task"]
        if task is None:
            return transition

        if isinstance(task, str):
            # Single string: append the newline only when it is missing.
            complementary_data["task"] = task if task.endswith("\n") else f"{task}\n"
        elif isinstance(task, list) and all(isinstance(t, str) for t in task):
            # List of strings: normalize every entry the same way.
            complementary_data["task"] = [t if t.endswith("\n") else f"{t}\n" for t in task]
        # Anything that is neither a string nor a list of strings is left unchanged.
        return transition

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        """Adds nothing to the features."""
        return features

    def state_dict(self) -> dict[str, torch.Tensor]:
        """Return state dictionary (empty for this processor)."""
        return {}

    def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
        """Load state dictionary (no-op for this processor)."""
        pass

    def reset(self) -> None:
        """Reset processor state (no-op for this processor)."""
        pass

    def get_config(self) -> dict[str, Any]:
        """Return configuration for serialization."""
        return {}
+7 -17
View File
@@ -36,7 +36,6 @@ import torch.nn.functional as F # noqa: N812
from torch import Tensor
from lerobot.constants import ACTION, OBS_ENV_STATE, OBS_IMAGE, OBS_STATE, REWARD
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
from lerobot.policies.utils import get_device_from_parameters, get_output_shape, populate_queues
@@ -63,26 +62,19 @@ class TDMPCPolicy(PreTrainedPolicy):
config_class = TDMPCConfig
name = "tdmpc"
def __init__(self, config: TDMPCConfig, dataset_stats: dict[str, dict[str, Tensor]] | None = None):
def __init__(
self,
config: TDMPCConfig,
):
"""
Args:
config: Policy configuration class instance or None, in which case the default instantiation of
the configuration class is used.
dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
that they will be passed with a call to `load_state_dict` before the policy is used.
"""
super().__init__(config)
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.model = TDMPCTOLD(config)
self.model_target = deepcopy(self.model)
for param in self.model_target.parameters():
@@ -137,7 +129,6 @@ class TDMPCPolicy(PreTrainedPolicy):
actions = torch.clamp(actions, -1, +1)
actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
return actions
@torch.no_grad()
@@ -147,11 +138,12 @@ class TDMPCPolicy(PreTrainedPolicy):
if ACTION in batch:
batch.pop(ACTION)
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGE] = batch[next(iter(self.config.image_features))]
# NOTE: for offline evaluation, we have action in the batch, so we need to pop it out
if ACTION in batch:
batch.pop(ACTION)
self._queues = populate_queues(self._queues, batch)
@@ -320,11 +312,9 @@ class TDMPCPolicy(PreTrainedPolicy):
"""
device = get_device_from_parameters(self)
batch = self.normalize_inputs(batch)
if self.config.image_features:
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGE] = batch[next(iter(self.config.image_features))]
batch = self.normalize_targets(batch)
info = {}
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# Copyright 2024 Nicklas Hansen, Xiaolong Wang, Hao Su,
# and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
UnnormalizerProcessor,
)
def make_tdmpc_processor(
    config: TDMPCConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the TDMPC pre- and post-processing pipelines.

    The preprocessor renames keys (identity mapping by default), normalizes both
    input and output features with ``dataset_stats``, adds a batch dimension, and
    moves tensors to the configured device. The postprocessor moves model outputs
    back to CPU and unnormalizes the action features.

    Returns:
        A ``(preprocessor, postprocessor)`` pair of ``RobotProcessor`` pipelines.
    """
    normalized_features = {**config.input_features, **config.output_features}
    preprocessor = RobotProcessor(
        steps=[
            RenameProcessor(rename_map={}),
            NormalizerProcessor(
                features=normalized_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
            ToBatchProcessor(),
            DeviceProcessor(device=config.device),
        ],
        name="robot_preprocessor",
    )
    postprocessor = RobotProcessor(
        steps=[
            DeviceProcessor(device="cpu"),
            UnnormalizerProcessor(
                features=config.output_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
        ],
        name="robot_postprocessor",
    )
    return preprocessor, postprocessor
+3 -14
View File
@@ -28,7 +28,6 @@ import torchvision
from torch import Tensor, nn
from lerobot.constants import ACTION, OBS_IMAGES, OBS_STATE
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.utils import get_device_from_parameters, get_output_shape, populate_queues
from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
@@ -48,7 +47,6 @@ class VQBeTPolicy(PreTrainedPolicy):
def __init__(
self,
config: VQBeTConfig | None = None,
dataset_stats: dict[str, dict[str, Tensor]] | None = None,
):
"""
Args:
@@ -61,14 +59,6 @@ class VQBeTPolicy(PreTrainedPolicy):
config.validate_features()
self.config = config
self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
self.normalize_targets = Normalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.unnormalize_outputs = Unnormalize(
config.output_features, config.normalization_mapping, dataset_stats
)
self.vqbet = VQBeTModel(config)
self.reset()
@@ -128,7 +118,6 @@ class VQBeTPolicy(PreTrainedPolicy):
def predict_action_chunk(self, batch: dict[str, Tensor]) -> Tensor:
batch = {k: torch.stack(list(self._queues[k]), dim=1) for k in batch if k in self._queues}
actions = self.vqbet(batch, rollout=True)[:, : self.config.action_chunk_size]
actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
return actions
@torch.no_grad()
@@ -142,10 +131,12 @@ class VQBeTPolicy(PreTrainedPolicy):
# NOTE: for offline evaluation, we have action in the batch, so we need to pop it out
if ACTION in batch:
batch.pop(ACTION)
batch = self.normalize_inputs(batch)
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
# NOTE: It's important that this happens after stacking the images into a single key.
batch["observation.images"] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
# NOTE: for offline evaluation, we have action in the batch, so we need to pop it out
if ACTION in batch:
batch.pop(ACTION)
self._queues = populate_queues(self._queues, batch)
@@ -165,10 +156,8 @@ class VQBeTPolicy(PreTrainedPolicy):
def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
"""Run the batch through the model and compute the loss for training or validation."""
batch = self.normalize_inputs(batch)
batch = dict(batch) # shallow copy so that adding a key doesn't modify the original
batch[OBS_IMAGES] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
batch = self.normalize_targets(batch)
# VQ-BeT discretizes action using VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://huggingface.co/papers/2403.03181)
if not self.vqbet.action_head.vqvae_model.discretized.item():
# loss: total loss of training RVQ
@@ -0,0 +1,52 @@
#!/usr/bin/env python
# Copyright 2024 Seungjae Lee and Yibin Wang and Haritheja Etukuru
# and H. Jin Kim and Nur Muhammad Mahi Shafiullah and Lerrel Pinto
# and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
from lerobot.processor import (
DeviceProcessor,
NormalizerProcessor,
RenameProcessor,
RobotProcessor,
ToBatchProcessor,
UnnormalizerProcessor,
)
def make_vqbet_processor(
    config: VQBeTConfig, dataset_stats: dict[str, dict[str, torch.Tensor]] | None = None
) -> tuple[RobotProcessor, RobotProcessor]:
    """Build the VQ-BeT pre- and post-processing pipelines.

    The preprocessor renames keys (identity mapping by default, so users can
    remap their own keys), normalizes both input and output features with
    ``dataset_stats``, adds a batch dimension, and moves tensors to the
    configured device. The postprocessor moves model outputs back to CPU and
    unnormalizes the action features.

    Returns:
        A ``(preprocessor, postprocessor)`` pair of ``RobotProcessor`` pipelines.
    """
    normalized_features = {**config.input_features, **config.output_features}
    preprocessor = RobotProcessor(
        steps=[
            RenameProcessor(rename_map={}),
            NormalizerProcessor(
                features=normalized_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
            ToBatchProcessor(),
            DeviceProcessor(device=config.device),
        ],
        name="robot_preprocessor",
    )
    postprocessor = RobotProcessor(
        steps=[
            DeviceProcessor(device="cpu"),
            UnnormalizerProcessor(
                features=config.output_features,
                norm_map=config.normalization_mapping,
                stats=dataset_stats,
            ),
        ],
        name="robot_postprocessor",
    )
    return preprocessor, postprocessor
+6 -1
View File
@@ -14,8 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .batch_processor import ToBatchProcessor
from .device_processor import DeviceProcessor
from .normalize_processor import NormalizerProcessor, UnnormalizerProcessor
from .normalize_processor import NormalizerProcessor, UnnormalizerProcessor, hotswap_stats
from .observation_processor import VanillaObservationProcessor
from .pipeline import (
ActionProcessor,
@@ -32,6 +33,7 @@ from .pipeline import (
TruncatedProcessor,
)
from .rename_processor import RenameProcessor
from .tokenizer_processor import TokenizerProcessor
__all__ = [
"ActionProcessor",
@@ -42,12 +44,15 @@ __all__ = [
"InfoProcessor",
"NormalizerProcessor",
"UnnormalizerProcessor",
"hotswap_stats",
"ObservationProcessor",
"ProcessorStep",
"ProcessorStepRegistry",
"RenameProcessor",
"RewardProcessor",
"RobotProcessor",
"ToBatchProcessor",
"TokenizerProcessor",
"TransitionKey",
"TruncatedProcessor",
"VanillaObservationProcessor",
+139
View File
@@ -0,0 +1,139 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Any
import torch
from torch import Tensor
from lerobot.configs.types import PolicyFeature
from lerobot.constants import OBS_ENV_STATE, OBS_IMAGE, OBS_IMAGES, OBS_STATE
from lerobot.processor.pipeline import EnvTransition, ProcessorStepRegistry, TransitionKey
@dataclass
@ProcessorStepRegistry.register(name="to_batch_processor")
class ToBatchProcessor:
    """Add a leading batch dimension to unbatched transition components.

    Single transitions coming from an environment are unbatched; models expect
    batched inputs. This step promotes, in place:

    - state observations (``observation.state``, ``observation.environment_state``):
      1-D tensors ``(D,)`` -> ``(1, D)``
    - image observations (``observation.image`` and every ``observation.images.*``):
      3-D tensors ``(H, W, C)`` -> ``(1, H, W, C)``
    - actions: 1-D tensors ``(D,)`` -> ``(1, D)``
    - complementary data: a ``task`` string becomes a one-element list
      (task must be a string or list of strings); 0-D ``index`` / ``task_index``
      tensors become shape-``(1,)`` tensors

    Anything already batched is left untouched.

    Example:
        ```python
        # State: (7,) -> (1, 7)
        # Image: (224, 224, 3) -> (1, 224, 224, 3)
        # Action: (4,) -> (1, 4)
        # Task: "pick_cube" -> ["pick_cube"]
        # Already batched: (1, 7) -> (1, 7) [unchanged]
        ```
    """

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        """Batch every component of *transition* in place and return it."""
        self._process_observation(transition)
        self._process_action(transition)
        self._process_complementary_data(transition)
        return transition

    def _process_observation(self, transition: EnvTransition) -> None:
        """Promote 1-D states and 3-D images to batched shapes, in place."""
        observation = transition.get(TransitionKey.OBSERVATION)
        if observation is None:
            return

        # 1-D state vectors become (1, D).
        for state_key in (OBS_STATE, OBS_ENV_STATE):
            value = observation.get(state_key)
            if isinstance(value, Tensor) and value.dim() == 1:
                observation[state_key] = value.unsqueeze(0)

        # A single 3-D image becomes (1, H, W, C).
        image = observation.get(OBS_IMAGE)
        if isinstance(image, Tensor) and image.dim() == 3:
            observation[OBS_IMAGE] = image.unsqueeze(0)

        # Per-camera 3-D images become (1, H, W, C).
        camera_prefix = f"{OBS_IMAGES}."
        for key in list(observation):
            value = observation[key]
            if key.startswith(camera_prefix) and isinstance(value, Tensor) and value.dim() == 3:
                observation[key] = value.unsqueeze(0)

    def _process_action(self, transition: EnvTransition) -> None:
        """Promote a 1-D action tensor to shape (1, D), in place."""
        action = transition.get(TransitionKey.ACTION)
        if isinstance(action, Tensor) and action.dim() == 1:
            transition[TransitionKey.ACTION] = action.unsqueeze(0)

    def _process_complementary_data(self, transition: EnvTransition) -> None:
        """Batch the task string and scalar bookkeeping tensors, in place."""
        complementary_data = transition.get(TransitionKey.COMPLEMENTARY_DATA)
        if complementary_data is None:
            return

        # A bare task string becomes a single-element list.
        if isinstance(complementary_data.get("task"), str):
            complementary_data["task"] = [complementary_data["task"]]

        # 0-D index / task_index tensors become shape-(1,) tensors.
        for key in ("index", "task_index"):
            value = complementary_data.get(key)
            if isinstance(value, Tensor) and value.dim() == 0:
                complementary_data[key] = value.unsqueeze(0)

    def get_config(self) -> dict[str, Any]:
        """Serialize configuration; this step has none."""
        return {}

    def state_dict(self) -> dict[str, torch.Tensor]:
        """No tensor state to save."""
        return {}

    def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
        """No tensor state to load."""
        pass

    def reset(self) -> None:
        """Stateless step; nothing to reset."""
        pass

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        """Batching does not change the feature specification."""
        return features
+225
View File
@@ -0,0 +1,225 @@
# !/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from collections.abc import Iterable, Sequence
from copy import deepcopy
from typing import Any
import numpy as np
import torch
from scipy.spatial.transform import Rotation
from .pipeline import EnvTransition, TransitionKey
def _to_tensor(x: torch.Tensor | np.ndarray | Sequence[int | float]):
if isinstance(x, torch.Tensor):
return x
if isinstance(x, np.ndarray):
# Keep images (uint8 HWC) and python objects as-is
if x.dtype == np.uint8 or x.dtype == np.object_:
return x
# Scalars/arrays to float32 tensor
return torch.as_tensor(x, dtype=torch.float32)
# Anything else to float32 tensor
return torch.as_tensor(x, dtype=torch.float32)
def _from_tensor(x: Any):
if isinstance(x, torch.Tensor):
return x.item() if x.numel() == 1 else x.detach().cpu().numpy()
return x
def _is_image(arr: Any) -> bool:
return isinstance(arr, np.ndarray) and arr.dtype == np.uint8 and arr.ndim == 3
def _split_obs_to_state_and_images(obs: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Partition an observation dict into ``(state, images)``.

    Entries recognized by ``_is_image`` (uint8 HWC arrays) go to ``images``;
    everything else goes to ``state``. Insertion order is preserved.
    """
    images = {key: value for key, value in obs.items() if _is_image(value)}
    state = {key: value for key, value in obs.items() if key not in images}
    return state, images
def make_obs_act_transition(
    *, obs: dict[str, Any] | None = None, act: dict[str, Any] | None = None
) -> EnvTransition:
    """Build a fresh EnvTransition carrying *obs* and *act*.

    Missing observation/action default to empty dicts; info and complementary
    data start empty; reward/done/truncated start as ``None``.
    """
    transition: EnvTransition = {
        TransitionKey.OBSERVATION: obs if obs is not None else {},
        TransitionKey.ACTION: act if act is not None else {},
        TransitionKey.INFO: {},
        TransitionKey.COMPLEMENTARY_DATA: {},
        TransitionKey.REWARD: None,
        TransitionKey.DONE: None,
        TransitionKey.TRUNCATED: None,
    }
    return transition
def to_transition_teleop_action(action: dict[str, Any]) -> EnvTransition:
    """
    Convert a raw teleop action dict into an EnvTransition under the ACTION TransitionKey.

    Each entry is stored under an ``action.<name>`` key. ``Rotation`` objects and
    nested dicts are kept untouched; every other value is converted to a float32
    tensor (scalars are first wrapped in a numpy array).
    """
    converted: dict[str, Any] = {}
    for name, value in action.items():
        prefixed = f"action.{name}"
        # Structured values cannot be turned into tensors; keep them as-is.
        if isinstance(value, (Rotation, dict)):
            converted[prefixed] = value
        else:
            converted[prefixed] = _to_tensor(np.array(value) if np.isscalar(value) else value)
    return make_obs_act_transition(act=converted)
# TODO(Adil, Pepijn): Overtime we can maybe add these converters to pipeline.py itself
def to_transition_robot_observation(observation: dict[str, Any]) -> EnvTransition:
    """
    Convert a raw robot observation dict into an EnvTransition under the OBSERVATION TransitionKey.

    Non-image entries become float32 tensors under ``observation.state.<name>``;
    uint8 HWC images pass through unchanged under ``observation.images.<camera>``.
    """
    state, images = _split_obs_to_state_and_images(observation)
    converted: dict[str, Any] = {
        f"observation.state.{name}": _to_tensor(np.array(value) if np.isscalar(value) else value)
        for name, value in state.items()
    }
    for camera, frame in images.items():
        converted[f"observation.images.{camera}"] = frame
    return make_obs_act_transition(obs=converted)
def to_output_robot_action(transition: EnvTransition) -> dict[str, Any]:
    """
    Extract raw robot commands from an EnvTransition's ACTION dict.

    Only string keys of the form ``action.<name>.pos`` or ``action.<name>.vel``
    are kept; the ``action.`` prefix is stripped and values are cast to float.
    """
    prefix = "action."
    action_dict = transition.get(TransitionKey.ACTION) or {}
    result: dict[str, Any] = {}
    for key, value in action_dict.items():
        if not isinstance(key, str):
            continue
        if key.startswith(prefix) and key.endswith((".pos", ".vel")):
            result[key[len(prefix):]] = float(value)
    return result
def to_dataset_frame(
    transitions_or_transition: EnvTransition | Iterable[EnvTransition], features: dict[str, dict]
) -> dict[str, Any]:
    """
    Converts a single EnvTransition or an iterable of them into a flat,
    dataset-friendly dictionary for training or evaluation, according to
    the provided `features` spec.

    Args:
        transitions_or_transition: Either a single EnvTransition dict
            or an iterable of them (which will be merged, with later
            transitions overriding earlier ones key-by-key).
        features (dict[str, dict]):
            A feature specification dictionary:
            - 'action': dict with 'names': list of action feature names
            - 'observation.state': dict with 'names': list of state feature names
            - keys starting with 'observation.images.' are passed through

    Returns:
        batch (dict[str, Any]): Flat dictionary containing:
            - float32 numpy arrays for "observation.state" and "action"
              (features missing from the transition are filled with 0.0)
            - any image entries listed in `features`
            - next.{reward,done,truncated} when present
            - *_is_pad flags and task from complementary_data
    """
    action_names = features.get("action", {}).get("names", [])
    obs_state_names = features.get("observation.state", {}).get("names", [])
    image_keys = [k for k in features if k.startswith("observation.images.")]

    def _merge(base: EnvTransition, other: EnvTransition) -> EnvTransition:
        # Dict-valued components are merged key-by-key; scalar components
        # (reward/done/truncated) are overwritten wholesale.
        out = deepcopy(base)
        for key in (
            TransitionKey.OBSERVATION,
            TransitionKey.ACTION,
            TransitionKey.INFO,
            TransitionKey.COMPLEMENTARY_DATA,
        ):
            if other.get(key):
                out.setdefault(key, {}).update(deepcopy(other[key]))
        for k in (TransitionKey.REWARD, TransitionKey.DONE, TransitionKey.TRUNCATED):
            if k in other:
                out[k] = other[k]
        return out

    def _ensure_transition(obj) -> EnvTransition:
        # A single transition is any dict keyed (at least partly) by TransitionKey.
        # This check must precede the Iterable check: dicts are iterable too.
        if isinstance(obj, dict) and builtins.any(isinstance(k, TransitionKey) for k in obj):
            return obj
        # Otherwise accept an iterable of transitions and fold them together.
        if isinstance(obj, Iterable):
            items = list(obj)
            if not items:
                return {}
            acc = items[0]
            for t in items[1:]:
                acc = _merge(acc, t)
            return acc
        raise TypeError("Expected EnvTransition or iterable of them")

    tr = _ensure_transition(transitions_or_transition)
    obs = tr.get(TransitionKey.OBSERVATION, {}) or {}
    act = tr.get(TransitionKey.ACTION, {}) or {}
    batch: dict[str, Any] = {}

    # Images passthrough
    for k in image_keys:
        if k in obs:
            batch[k] = obs[k]

    # Observation.state vector (missing features default to 0.0)
    if obs_state_names:
        vals = [_from_tensor(obs.get(f"observation.state.{n}", 0.0)) for n in obs_state_names]
        batch["observation.state"] = np.asarray(vals, dtype=np.float32)

    # Action vector (missing features default to 0.0)
    if action_names:
        vals = [_from_tensor(act.get(f"action.{n}", 0.0)) for n in action_names]
        batch["action"] = np.asarray(vals, dtype=np.float32)

    # Next.* fields
    if tr.get(TransitionKey.REWARD) is not None:
        batch["next.reward"] = _from_tensor(tr[TransitionKey.REWARD])
    if tr.get(TransitionKey.DONE) is not None:
        batch["next.done"] = _from_tensor(tr[TransitionKey.DONE])
    if tr.get(TransitionKey.TRUNCATED) is not None:
        batch["next.truncated"] = _from_tensor(tr[TransitionKey.TRUNCATED])

    # Complementary data flags and task
    comp = tr.get(TransitionKey.COMPLEMENTARY_DATA) or {}
    if comp:
        # pad flags
        for k, v in comp.items():
            if k.endswith("_is_pad"):
                batch[k] = v
        # task label
        if comp.get("task") is not None:
            batch["task"] = comp["task"]

    return batch
+77 -14
View File
@@ -19,24 +19,63 @@ from typing import Any
import torch
from lerobot.configs.types import PolicyFeature
from lerobot.processor.pipeline import EnvTransition, TransitionKey
from lerobot.processor.pipeline import EnvTransition, ProcessorStepRegistry, TransitionKey
from lerobot.utils.utils import get_safe_torch_device
@ProcessorStepRegistry.register("device_processor")
@dataclass
class DeviceProcessor:
"""Processes transitions by moving tensors to the specified device.
"""Processes transitions by moving tensors to the specified device and optionally converting float dtypes.
This processor ensures that all tensors in the transition are moved to the
specified device (CPU or GPU) before they are returned.
specified device (CPU or GPU) before they are returned. It can also convert
floating-point tensors to a specified dtype while preserving non-float types
(int, long, bool, etc.).
"""
device: torch.device = "cpu"
device: str = "cpu"
float_dtype: str | None = None
_device: torch.device | None = None
def __post_init__(self):
self.device = get_safe_torch_device(self.device)
self._device = get_safe_torch_device(self.device)
self.device = self._device.type
self.non_blocking = "cuda" in str(self.device)
# Validate and convert float_dtype string to torch dtype
if self.float_dtype is not None:
dtype_mapping = {
"float16": torch.float16,
"float32": torch.float32,
"float64": torch.float64,
"bfloat16": torch.bfloat16,
"half": torch.float16,
"float": torch.float32,
"double": torch.float64,
}
if self.float_dtype not in dtype_mapping:
available_dtypes = list(dtype_mapping.keys())
raise ValueError(
f"Invalid float_dtype '{self.float_dtype}'. Available options: {available_dtypes}"
)
self._target_float_dtype = dtype_mapping[self.float_dtype]
else:
self._target_float_dtype = None
    def _process_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
        """Process a tensor by moving to device and optionally converting float dtype.

        Non-floating tensors (int, long, bool, ...) keep their dtype; only the
        device move applies to them.
        """
        # Move to device first; non_blocking is set in __post_init__ and is
        # only True for CUDA targets, where async host-to-device copies apply.
        tensor = tensor.to(self.device, non_blocking=self.non_blocking)
        # Convert float dtype if specified and tensor is floating point
        if self._target_float_dtype is not None and tensor.is_floating_point():
            tensor = tensor.to(dtype=self._target_float_dtype)
        return tensor
def __call__(self, transition: EnvTransition) -> EnvTransition:
# Create a copy of the transition
new_transition = transition.copy()
@@ -45,7 +84,7 @@ class DeviceProcessor:
observation = transition.get(TransitionKey.OBSERVATION)
if observation is not None:
new_observation = {
k: v.to(self.device, non_blocking=self.non_blocking) if isinstance(v, torch.Tensor) else v
k: self._process_tensor(v) if isinstance(v, torch.Tensor) else v
for k, v in observation.items()
}
new_transition[TransitionKey.OBSERVATION] = new_observation
@@ -53,30 +92,54 @@ class DeviceProcessor:
# Process action tensor
action = transition.get(TransitionKey.ACTION)
if action is not None and isinstance(action, torch.Tensor):
new_transition[TransitionKey.ACTION] = action.to(self.device, non_blocking=self.non_blocking)
new_transition[TransitionKey.ACTION] = self._process_tensor(action)
# Process reward tensor
reward = transition.get(TransitionKey.REWARD)
if reward is not None and isinstance(reward, torch.Tensor):
new_transition[TransitionKey.REWARD] = reward.to(self.device, non_blocking=self.non_blocking)
new_transition[TransitionKey.REWARD] = self._process_tensor(reward)
# Process done tensor
done = transition.get(TransitionKey.DONE)
if done is not None and isinstance(done, torch.Tensor):
new_transition[TransitionKey.DONE] = done.to(self.device, non_blocking=self.non_blocking)
new_transition[TransitionKey.DONE] = self._process_tensor(done)
# Process truncated tensor
truncated = transition.get(TransitionKey.TRUNCATED)
if truncated is not None and isinstance(truncated, torch.Tensor):
new_transition[TransitionKey.TRUNCATED] = truncated.to(
self.device, non_blocking=self.non_blocking
)
new_transition[TransitionKey.TRUNCATED] = self._process_tensor(truncated)
# Process complementary data tensors
complementary_data = transition.get(TransitionKey.COMPLEMENTARY_DATA)
if complementary_data is not None:
new_complementary_data = {}
# Process all items in complementary_data
for key, value in complementary_data.items():
if isinstance(value, torch.Tensor):
new_complementary_data[key] = self._process_tensor(value)
else:
new_complementary_data[key] = value
new_transition[TransitionKey.COMPLEMENTARY_DATA] = new_complementary_data
return new_transition
def get_config(self) -> dict[str, Any]:
"""Return configuration for serialization."""
return {"device": self.device}
return {"device": self.device, "float_dtype": self.float_dtype}
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def state_dict(self) -> dict[str, torch.Tensor]:
"""Return state dictionary (empty for this processor)."""
return {}
def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
"""Load state dictionary (no-op for this processor)."""
pass
def reset(self) -> None:
"""Reset processor state (no-op for this processor)."""
pass
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -0,0 +1,502 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Generic script to migrate any policy model with normalization layers to the new pipeline-based system.
This script:
1. Loads an existing pretrained policy model
2. Extracts normalization statistics from the model
3. Creates both preprocessor and postprocessor:
- Preprocessor: normalizes both inputs (observations) and outputs (actions) for training
- Postprocessor: unnormalizes outputs (actions) for inference
4. Removes normalization layers from the model state_dict
5. Saves the new model and both processors
Usage:
python src/lerobot/processor/migrate_policy_normalization.py \
--pretrained-path lerobot/act_aloha_sim_transfer_cube_human \
--policy-type act \
--push-to-hub
"""
import argparse
import importlib
import json
import os
from copy import deepcopy
from pathlib import Path
from typing import Any
import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as load_safetensors
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.processor.batch_processor import ToBatchProcessor
from lerobot.processor.device_processor import DeviceProcessor
from lerobot.processor.normalize_processor import NormalizerProcessor, UnnormalizerProcessor
from lerobot.processor.pipeline import RobotProcessor
from lerobot.processor.rename_processor import RenameProcessor
# Policy type to class mapping
POLICY_CLASSES = {
"act": "lerobot.policies.act.modeling_act.ACTPolicy",
"diffusion": "lerobot.policies.diffusion.modeling_diffusion.DiffusionPolicy",
"pi0": "lerobot.policies.pi0.modeling_pi0.PI0Policy",
"pi0fast": "lerobot.policies.pi0fast.modeling_pi0fast.PI0FASTPolicy",
"smolvla": "lerobot.policies.smolvla.modeling_smolvla.SmolVLAPolicy",
"tdmpc": "lerobot.policies.tdmpc.modeling_tdmpc.TDMPCPolicy",
"vqbet": "lerobot.policies.vqbet.modeling_vqbet.VQBeTPolicy",
"sac": "lerobot.policies.sac.modeling_sac.SACPolicy",
"classifier": "lerobot.policies.classifier.modeling_classifier.ClassifierPolicy",
}
def extract_normalization_stats(state_dict: dict[str, torch.Tensor]) -> dict[str, dict[str, torch.Tensor]]:
    """Collect per-feature normalization statistics from a policy state_dict.

    A key like ``normalize_inputs.buffer_observation_state.mean`` is parsed into
    ``stats["observation.state"]["mean"]``: the known prefix is stripped, the
    last dot-component is the stat type, and underscores in the remaining
    feature part become dots. Matching tensors are cloned; keys that match no
    prefix, or leave fewer than two components, are ignored.
    """
    # Ordered prefixes: the specific 'normalize_*'/'unnormalize_*' buffer forms
    # must be tried before the bare 'normalize.' / 'unnormalize.' ones.
    prefixes = (
        "normalize_inputs.buffer_",
        "unnormalize_outputs.buffer_",
        "normalize_targets.buffer_",
        "normalize.",
        "unnormalize.",
        "input_normalizer.",
        "output_normalizer.",
    )

    stats: dict[str, dict[str, torch.Tensor]] = {}
    for key, tensor in state_dict.items():
        # Only the first matching prefix is considered for a given key.
        prefix = next((p for p in prefixes if key.startswith(p)), None)
        if prefix is None:
            continue
        parts = key[len(prefix):].split(".")
        if len(parts) < 2:
            # Need at least '<feature>.<stat_type>'.
            continue
        stat_type = parts[-1]
        feature_name = ".".join(parts[:-1]).replace("_", ".")
        stats.setdefault(feature_name, {})[stat_type] = tensor.clone()
    return stats
def detect_features_and_norm_modes(
    config: dict[str, Any], stats: dict[str, dict[str, torch.Tensor]]
) -> tuple[dict[str, PolicyFeature], dict[FeatureType, NormalizationMode]]:
    """Detect features and normalization modes from config and stats.

    Precedence (order matters and must be preserved):
    1. Normalization modes come from ``config["normalization_mapping"]`` when present.
    2. Features come from ``config["features"]`` when present; otherwise they are
       inferred from the shapes of the tensors in ``stats``.
    3. Modes not set in step 1 are inferred from which stat pairs exist
       (mean/std -> MEAN_STD, min/max -> MIN_MAX), first match per feature type wins.
    4. Any still-missing feature type gets a hard-coded default.

    Feature types are guessed from substrings of the key/feature name
    ('image'/'visual' -> VISUAL, 'state' -> STATE, 'action' -> ACTION,
    otherwise STATE).

    Args:
        config: Parsed policy config.json contents.
        stats: Per-feature stat tensors, as returned by extract_normalization_stats.

    Returns:
        A ``(features, norm_modes)`` tuple mapping feature keys to PolicyFeature
        and feature types to NormalizationMode.
    """
    features = {}
    norm_modes = {}

    # First, check if there's a normalization_mapping in the config
    if "normalization_mapping" in config:
        print(f"Found normalization_mapping in config: {config['normalization_mapping']}")
        # Extract normalization modes from config
        for feature_name, mode_str in config["normalization_mapping"].items():
            # Convert string to NormalizationMode enum
            if mode_str == "mean_std":
                mode = NormalizationMode.MEAN_STD
            elif mode_str == "min_max":
                mode = NormalizationMode.MIN_MAX
            else:
                # Unknown modes are skipped (warn, don't fail the migration).
                print(f"Warning: Unknown normalization mode '{mode_str}' for feature '{feature_name}'")
                continue
            # Determine feature type from feature name
            if "image" in feature_name or "visual" in feature_name:
                feature_type = FeatureType.VISUAL
            elif "state" in feature_name:
                feature_type = FeatureType.STATE
            elif "action" in feature_name:
                feature_type = FeatureType.ACTION
            else:
                feature_type = FeatureType.STATE
            norm_modes[feature_type] = mode

    # Try to extract from config
    if "features" in config:
        for key, feature_config in config["features"].items():
            # 'shape' preferred, legacy configs may use 'dim'; ints become 1-tuples.
            shape = feature_config.get("shape", feature_config.get("dim"))
            shape = (shape,) if isinstance(shape, int) else tuple(shape)
            # Determine feature type
            if "image" in key or "visual" in key:
                feature_type = FeatureType.VISUAL
            elif "state" in key:
                feature_type = FeatureType.STATE
            elif "action" in key:
                feature_type = FeatureType.ACTION
            else:
                feature_type = FeatureType.STATE  # Default
            features[key] = PolicyFeature(feature_type, shape)

    # If no features in config, infer from stats
    if not features:
        for key, stat_dict in stats.items():
            # Get shape from any stat tensor
            tensor = next(iter(stat_dict.values()))
            shape = tuple(tensor.shape)
            # Determine feature type based on key (wider substring set than the
            # config path: also recognizes 'pixels', 'joint', 'position')
            if "image" in key or "visual" in key or "pixels" in key:
                feature_type = FeatureType.VISUAL
            elif "state" in key or "joint" in key or "position" in key:
                feature_type = FeatureType.STATE
            elif "action" in key:
                feature_type = FeatureType.ACTION
            else:
                feature_type = FeatureType.STATE
            features[key] = PolicyFeature(feature_type, shape)

    # If normalization modes weren't in config, determine based on available stats
    if not norm_modes:
        for key, stat_dict in stats.items():
            if key in features:
                # First feature of each type wins; later features never override.
                if "mean" in stat_dict and "std" in stat_dict:
                    feature_type = features[key].type
                    if feature_type not in norm_modes:
                        norm_modes[feature_type] = NormalizationMode.MEAN_STD
                elif "min" in stat_dict and "max" in stat_dict:
                    feature_type = features[key].type
                    if feature_type not in norm_modes:
                        norm_modes[feature_type] = NormalizationMode.MIN_MAX

    # Default normalization modes if not detected
    if FeatureType.VISUAL not in norm_modes:
        norm_modes[FeatureType.VISUAL] = NormalizationMode.MEAN_STD
    if FeatureType.STATE not in norm_modes:
        norm_modes[FeatureType.STATE] = NormalizationMode.MIN_MAX
    if FeatureType.ACTION not in norm_modes:
        norm_modes[FeatureType.ACTION] = NormalizationMode.MEAN_STD

    return features, norm_modes
def remove_normalization_layers(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Strip normalization-related entries from a model state_dict.

    Any key containing one of the known normalization markers (e.g.
    ``normalize_inputs.``, ``unnormalize_outputs.``) is dropped; every
    other entry is kept unchanged.
    """
    normalization_markers = (
        "normalize_inputs.",
        "unnormalize_outputs.",
        "normalize_targets.",  # target-normalization entries
        "normalize.",
        "unnormalize.",
        "input_normalizer.",
        "output_normalizer.",
        "normalizer.",
    )
    return {
        key: tensor
        for key, tensor in state_dict.items()
        if not any(marker in key for marker in normalization_markers)
    }
def convert_features_to_policy_features(features_dict: dict[str, dict]) -> dict[str, PolicyFeature]:
    """Convert features from old format to PolicyFeature objects."""
    converted: dict[str, PolicyFeature] = {}
    for name, spec in features_dict.items():
        # Infer the feature type from the key name; STATE is the fallback.
        if "image" in name or "visual" in name:
            ftype = FeatureType.VISUAL
        elif "state" in name:
            ftype = FeatureType.STATE
        elif "action" in name:
            ftype = FeatureType.ACTION
        else:
            ftype = FeatureType.STATE
        # "shape" takes priority; fall back to the legacy "dim" field,
        # which may be a bare int and is wrapped into a 1-tuple.
        raw_shape = spec.get("shape", spec.get("dim"))
        norm_shape = (raw_shape,) if isinstance(raw_shape, int) else tuple(raw_shape)
        converted[name] = PolicyFeature(ftype, norm_shape)
    return converted
def load_model_from_hub(
    repo_id: str, revision: str | None = None
) -> tuple[dict[str, torch.Tensor], dict[str, Any], dict[str, Any]]:
    """Download and load a policy checkpoint from the Hugging Face Hub.

    Args:
        repo_id: Hub repository ID (e.g. "user/model").
        revision: Optional git revision (branch, tag, or commit hash);
            defaults to the repo's default branch.

    Returns:
        A ``(state_dict, config, train_config)`` tuple where ``state_dict``
        holds the model tensors and the two dicts are the parsed
        ``config.json`` and ``train_config.json``.
    """
    # Download files (cached locally by huggingface_hub).
    safetensors_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", revision=revision)
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json", revision=revision)
    train_config_path = hf_hub_download(repo_id=repo_id, filename="train_config.json", revision=revision)
    # Load state_dict
    state_dict = load_safetensors(safetensors_path)
    # Load configs
    with open(config_path) as f:
        config = json.load(f)
    with open(train_config_path) as f:
        train_config = json.load(f)
    return state_dict, config, train_config
def main() -> None:
    """Migrate a pretrained policy checkpoint to the new processor pipeline.

    Steps performed:
      1. Load the model state_dict and configs (local dir or hub repo).
      2. Extract the normalization statistics baked into the old checkpoint.
      3. Strip normalization layers from the state_dict.
      4. Re-instantiate the policy from the cleaned config and weights.
      5. Build pre/post-processors that carry the normalization instead.
      6. Save model, processors, and model card (optionally push to hub).
    """
    parser = argparse.ArgumentParser(
        description="Migrate policy models with normalization layers to new pipeline system"
    )
    parser.add_argument(
        "--pretrained-path",
        type=str,
        required=True,
        help="Path to pretrained model (hub repo or local directory)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory for migrated model (default: same as pretrained-path)",
    )
    parser.add_argument("--push-to-hub", action="store_true", help="Push migrated model to hub")
    parser.add_argument(
        "--hub-repo-id",
        type=str,
        default=None,
        help="Hub repository ID for pushing (default: same as pretrained-path)",
    )
    parser.add_argument("--revision", type=str, default=None, help="Revision of the model to load")
    parser.add_argument("--private", action="store_true", help="Make the hub repository private")
    args = parser.parse_args()
    # Load model and config — a local directory takes precedence over a hub repo ID.
    print(f"Loading model from {args.pretrained_path}...")
    if os.path.isdir(args.pretrained_path):
        # Local directory
        state_dict = load_safetensors(os.path.join(args.pretrained_path, "model.safetensors"))
        with open(os.path.join(args.pretrained_path, "config.json")) as f:
            config = json.load(f)
        with open(os.path.join(args.pretrained_path, "train_config.json")) as f:
            train_config = json.load(f)
    else:
        # Hub repository
        state_dict, config, train_config = load_model_from_hub(args.pretrained_path, args.revision)
    # Extract normalization statistics (mean/std or min/max tensors stored in the old layers).
    print("Extracting normalization statistics...")
    stats = extract_normalization_stats(state_dict)
    print(f"Found normalization statistics for: {list(stats.keys())}")
    # Detect input features and normalization modes
    print("Detecting features and normalization modes...")
    features, norm_map = detect_features_and_norm_modes(config, stats)
    print(f"Detected features: {list(features.keys())}")
    print(f"Normalization modes: {norm_map}")
    # Remove normalization layers from state_dict
    print("Removing normalization layers from model...")
    new_state_dict = remove_normalization_layers(state_dict)
    removed_keys = set(state_dict.keys()) - set(new_state_dict.keys())
    if removed_keys:
        print(f"Removed {len(removed_keys)} normalization layer keys")
    # Determine output path
    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        if os.path.isdir(args.pretrained_path):
            # Sibling directory with a "_migrated" suffix next to the original.
            output_dir = Path(args.pretrained_path).parent / f"{Path(args.pretrained_path).name}_migrated"
        else:
            # Hub IDs contain "/", which is flattened to "_" for a local directory name.
            output_dir = Path(f"./{args.pretrained_path.replace('/', '_')}_migrated")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Clean up config - remove normalization_mapping field
    cleaned_config = dict(config)
    if "normalization_mapping" in cleaned_config:
        print("Removing 'normalization_mapping' field from config")
        del cleaned_config["normalization_mapping"]
    # "type" selects the policy class but is not a config-class constructor kwarg.
    policy_type = deepcopy(cleaned_config["type"])
    del cleaned_config["type"]
    # Instantiate the policy model with cleaned config and load the cleaned state dict
    print(f"Instantiating {policy_type} policy model...")
    policy_class_path = POLICY_CLASSES[policy_type]
    module_path, class_name = policy_class_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    policy_class = getattr(module, class_name)
    # Create config class instance — by convention the config module mirrors the
    # modeling module path with "configuration" substituted for "modeling".
    config_module_path = module_path.replace("modeling", "configuration")
    config_module = importlib.import_module(config_module_path)
    # Handle special cases for config class names that don't follow
    # the default "<TYPE-uppercased>Config" pattern.
    config_class_names = {
        "act": "ACTConfig",
        "diffusion": "DiffusionConfig",
        "pi0": "PI0Config",
        "pi0fast": "PI0FASTConfig",
        "smolvla": "SmolVLAConfig",
        "tdmpc": "TDMPCConfig",
        "vqbet": "VQBeTConfig",
        "sac": "SACConfig",
        "classifier": "ClassifierConfig",
    }
    config_class_name = config_class_names.get(policy_type, f"{policy_type.upper()}Config")
    config_class = getattr(config_module, config_class_name)
    # Convert input_features and output_features to PolicyFeature objects - these are mandatory
    if "input_features" not in cleaned_config:
        raise ValueError("Missing mandatory 'input_features' in config")
    if "output_features" not in cleaned_config:
        raise ValueError("Missing mandatory 'output_features' in config")
    cleaned_config["input_features"] = convert_features_to_policy_features(cleaned_config["input_features"])
    cleaned_config["output_features"] = convert_features_to_policy_features(cleaned_config["output_features"])
    # Create config instance from cleaned config dict
    policy_config = config_class(**cleaned_config)
    # Create policy instance - some policies expect dataset_stats
    policy = policy_class(policy_config)
    # Load the cleaned state dict; strict=True guarantees the only removed keys
    # were the normalization layers.
    policy.load_state_dict(new_state_dict, strict=True)
    print("Successfully loaded cleaned state dict into policy model")
    # Now create preprocessor and postprocessor with cleaned_config available
    print("Creating preprocessor and postprocessor...")
    # The pattern from existing processor factories:
    # - Preprocessor has two NormalizerProcessors: one for input_features, one for output_features
    # - Postprocessor has one UnnormalizerProcessor for output_features only
    # Get features from cleaned_config (now they're PolicyFeature objects)
    input_features = cleaned_config.get("input_features", {})
    output_features = cleaned_config.get("output_features", {})
    # Create preprocessor with two normalizers (following the pattern from processor factories)
    preprocessor_steps = [
        RenameProcessor(rename_map={}),
        NormalizerProcessor(
            features={**input_features, **output_features},
            norm_map=norm_map,
            stats=stats,
        ),
        ToBatchProcessor(),
        DeviceProcessor(device=policy_config.device),
    ]
    preprocessor = RobotProcessor(steps=preprocessor_steps, name="robot_preprocessor")
    # Create postprocessor with unnormalizer for outputs only; results come back on CPU.
    postprocessor_steps = [
        DeviceProcessor(device="cpu"),
        UnnormalizerProcessor(features=output_features, norm_map=norm_map, stats=stats),
    ]
    postprocessor = RobotProcessor(steps=postprocessor_steps, name="robot_postprocessor")
    # Determine hub repo ID if pushing to hub
    if args.push_to_hub:
        if args.hub_repo_id:
            hub_repo_id = args.hub_repo_id
        else:
            if not os.path.isdir(args.pretrained_path):
                # Use same repo with "_migrated" suffix
                hub_repo_id = f"{args.pretrained_path}_migrated"
            else:
                raise ValueError("--hub-repo-id must be specified when pushing local model to hub")
    else:
        hub_repo_id = None
    # Save preprocessor and postprocessor to root directory
    print(f"Saving preprocessor to {output_dir}...")
    preprocessor.save_pretrained(output_dir)
    if args.push_to_hub:
        preprocessor.push_to_hub(repo_id=hub_repo_id, private=args.private)
    print(f"Saving postprocessor to {output_dir}...")
    postprocessor.save_pretrained(output_dir)
    if args.push_to_hub:
        postprocessor.push_to_hub(repo_id=hub_repo_id, private=args.private)
    # Save model using the policy's save_pretrained method
    print(f"Saving model to {output_dir}...")
    policy.save_pretrained(
        output_dir, push_to_hub=args.push_to_hub, repo_id=hub_repo_id, private=args.private
    )
    # Generate and save model card
    print("Generating model card...")
    # Get metadata from original config
    dataset_repo_id = train_config.get("repo_id", "unknown")
    # NOTE(review): "license" shadows the builtin of the same name; harmless here
    # since the builtin is not used in this function.
    license = config.get("license", "apache-2.0")
    # Guard against an explicit "tags": null in the config, then make the
    # standard tags present exactly once.
    tags = config.get("tags", ["robotics", "lerobot", policy_type]) or ["robotics", "lerobot", policy_type]
    tags = set(tags).union({"robotics", "lerobot", policy_type})
    tags = list(tags)
    # Generate model card
    card = policy.generate_model_card(
        dataset_repo_id=dataset_repo_id, model_type=policy_type, license=license, tags=tags
    )
    # Save model card locally
    card.save(str(output_dir / "README.md"))
    print(f"Model card saved to {output_dir / 'README.md'}")
    # Push model card to hub if requested
    if args.push_to_hub:
        from huggingface_hub import HfApi

        api = HfApi()
        api.upload_file(
            path_or_fileobj=str(output_dir / "README.md"),
            path_in_repo="README.md",
            repo_id=hub_repo_id,
            repo_type="model",
            commit_message="Add model card for migrated model",
        )
        print("Model card pushed to hub")
    print("\nMigration complete!")
    print(f"Migrated model saved to: {output_dir}")
    if args.push_to_hub:
        print(f"Successfully pushed to https://huggingface.co/{hub_repo_id}")


if __name__ == "__main__":
    main()
+185 -41
View File
@@ -1,6 +1,7 @@
from __future__ import annotations
from collections.abc import Mapping
from copy import deepcopy
from dataclasses import dataclass, field
from typing import Any
@@ -10,7 +11,7 @@ from torch import Tensor
from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.processor.pipeline import EnvTransition, ProcessorStepRegistry, TransitionKey
from lerobot.processor.pipeline import EnvTransition, ProcessorStepRegistry, RobotProcessor, TransitionKey
def _convert_stats_to_tensors(stats: dict[str, dict[str, Any]]) -> dict[str, dict[str, Tensor]]:
@@ -115,7 +116,7 @@ class NormalizerProcessor:
if self.normalize_keys is not None and not isinstance(self.normalize_keys, set):
self.normalize_keys = set(self.normalize_keys)
def _normalize_obs(self, observation):
def _normalize_obs(self, observation, normalized_info):
if observation is None:
return None
@@ -128,7 +129,20 @@ class NormalizerProcessor:
processed = dict(observation)
for key in keys_to_norm:
if key not in processed or key not in self._tensor_stats:
if key not in processed or key not in self.features:
continue
# Check the normalization mode for this feature type
feature = self.features[key]
norm_mode = self.norm_map.get(feature.type, NormalizationMode.IDENTITY)
# Skip normalization if mode is IDENTITY
if norm_mode is NormalizationMode.IDENTITY:
normalized_info[key] = "IDENTITY"
continue
# Skip if no stats available for this key
if key not in self._tensor_stats:
continue
orig_val = processed[key]
@@ -139,16 +153,35 @@ class NormalizerProcessor:
)
stats = {k: v.to(tensor.device) for k, v in self._tensor_stats[key].items()}
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
processed[key] = (tensor - mean) / (std + self.eps)
elif "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
processed[key] = 2 * (tensor - min_val) / (max_val - min_val + self.eps) - 1
if norm_mode is NormalizationMode.MEAN_STD:
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
processed[key] = (tensor - mean) / (std + self.eps)
normalized_info[key] = "MEAN_STD"
elif norm_mode is NormalizationMode.MIN_MAX:
if "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
processed[key] = 2 * (tensor - min_val) / (max_val - min_val + self.eps) - 1
normalized_info[key] = "MIN_MAX"
else:
raise ValueError(f"Unsupported normalization mode: {norm_mode}")
return processed
def _normalize_action(self, action):
if action is None or "action" not in self._tensor_stats:
def _normalize_action(self, action, normalized_info):
if action is None:
return action
# Check the normalization mode for actions
norm_mode = self.norm_map.get(FeatureType.ACTION, NormalizationMode.IDENTITY)
# Skip normalization if mode is IDENTITY
if norm_mode is NormalizationMode.IDENTITY:
normalized_info["action"] = "IDENTITY"
return action
# Skip if no stats available for actions
if "action" not in self._tensor_stats:
return action
tensor = (
@@ -157,22 +190,42 @@ class NormalizerProcessor:
else torch.as_tensor(action, dtype=torch.float32)
)
stats = {k: v.to(tensor.device) for k, v in self._tensor_stats["action"].items()}
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
return (tensor - mean) / (std + self.eps)
if "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
return 2 * (tensor - min_val) / (max_val - min_val + self.eps) - 1
raise ValueError("Action stats must contain either ('mean','std') or ('min','max')")
if norm_mode is NormalizationMode.MEAN_STD:
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
normalized_info["action"] = "MEAN_STD"
return (tensor - mean) / (std + self.eps)
elif norm_mode is NormalizationMode.MIN_MAX:
if "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
normalized_info["action"] = "MIN_MAX"
return 2 * (tensor - min_val) / (max_val - min_val + self.eps) - 1
else:
raise ValueError(f"Unsupported normalization mode: {norm_mode}")
# If we reach here, the required stats for the normalization mode are not available
raise ValueError(f"Action stats must contain appropriate values for {norm_mode} normalization")
def __call__(self, transition: EnvTransition) -> EnvTransition:
observation = self._normalize_obs(transition.get(TransitionKey.OBSERVATION))
action = self._normalize_action(transition.get(TransitionKey.ACTION))
# Track what was normalized
normalized_info = {}
observation = self._normalize_obs(transition.get(TransitionKey.OBSERVATION), normalized_info)
action = self._normalize_action(transition.get(TransitionKey.ACTION), normalized_info)
# Create a new transition with normalized values
new_transition = transition.copy()
new_transition[TransitionKey.OBSERVATION] = observation
new_transition[TransitionKey.ACTION] = action
# Add normalization info to complementary data
if normalized_info:
comp_data = new_transition.get(TransitionKey.COMPLEMENTARY_DATA, {})
comp_data = {} if comp_data is None else dict(comp_data)
comp_data["normalized_keys"] = normalized_info
new_transition[TransitionKey.COMPLEMENTARY_DATA] = comp_data
return new_transition
def get_config(self) -> dict[str, Any]:
@@ -204,7 +257,7 @@ class NormalizerProcessor:
def reset(self):
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -253,14 +306,28 @@ class UnnormalizerProcessor:
self.stats = self.stats or {}
self._tensor_stats = _convert_stats_to_tensors(self.stats)
def _unnormalize_obs(self, observation):
def _unnormalize_obs(self, observation, unnormalized_info):
if observation is None:
return None
keys = [k for k, ft in self.features.items() if ft.type is not FeatureType.ACTION]
processed = dict(observation)
for key in keys:
if key not in processed or key not in self._tensor_stats:
if key not in processed or key not in self.features:
continue
# Check the normalization mode for this feature type
feature = self.features[key]
norm_mode = self.norm_map.get(feature.type, NormalizationMode.IDENTITY)
# Skip unnormalization if mode is IDENTITY
if norm_mode is NormalizationMode.IDENTITY:
unnormalized_info[key] = "IDENTITY"
continue
# Skip if no stats available for this key
if key not in self._tensor_stats:
continue
orig_val = processed[key]
tensor = (
orig_val.to(dtype=torch.float32)
@@ -268,39 +335,80 @@ class UnnormalizerProcessor:
else torch.as_tensor(orig_val, dtype=torch.float32)
)
stats = {k: v.to(tensor.device) for k, v in self._tensor_stats[key].items()}
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
processed[key] = tensor * std + mean
elif "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
processed[key] = (tensor + 1) / 2 * (max_val - min_val) + min_val
if norm_mode is NormalizationMode.MEAN_STD:
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
processed[key] = tensor * std + mean
unnormalized_info[key] = "MEAN_STD"
elif norm_mode is NormalizationMode.MIN_MAX:
if "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
processed[key] = (tensor + 1) / 2 * (max_val - min_val) + min_val
unnormalized_info[key] = "MIN_MAX"
else:
raise ValueError(f"Unsupported normalization mode: {norm_mode}")
return processed
def _unnormalize_action(self, action):
if action is None or "action" not in self._tensor_stats:
def _unnormalize_action(self, action, unnormalized_info):
if action is None:
return action
# Check the normalization mode for actions
norm_mode = self.norm_map.get(FeatureType.ACTION, NormalizationMode.IDENTITY)
# Skip unnormalization if mode is IDENTITY
if norm_mode is NormalizationMode.IDENTITY:
unnormalized_info["action"] = "IDENTITY"
return action
# Skip if no stats available for actions
if "action" not in self._tensor_stats:
return action
tensor = (
action.to(dtype=torch.float32)
if isinstance(action, torch.Tensor)
else torch.as_tensor(action, dtype=torch.float32)
)
stats = {k: v.to(tensor.device) for k, v in self._tensor_stats["action"].items()}
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
return tensor * std + mean
if "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
return (tensor + 1) / 2 * (max_val - min_val) + min_val
raise ValueError("Action stats must contain either ('mean','std') or ('min','max')")
if norm_mode is NormalizationMode.MEAN_STD:
if "mean" in stats and "std" in stats:
mean, std = stats["mean"], stats["std"]
unnormalized_info["action"] = "MEAN_STD"
return tensor * std + mean
elif norm_mode is NormalizationMode.MIN_MAX:
if "min" in stats and "max" in stats:
min_val, max_val = stats["min"], stats["max"]
unnormalized_info["action"] = "MIN_MAX"
return (tensor + 1) / 2 * (max_val - min_val) + min_val
else:
raise ValueError(f"Unsupported normalization mode: {norm_mode}")
# If we reach here, the required stats for the normalization mode are not available
raise ValueError(f"Action stats must contain appropriate values for {norm_mode} normalization")
def __call__(self, transition: EnvTransition) -> EnvTransition:
observation = self._unnormalize_obs(transition.get(TransitionKey.OBSERVATION))
action = self._unnormalize_action(transition.get(TransitionKey.ACTION))
# Track what was unnormalized
unnormalized_info = {}
observation = self._unnormalize_obs(transition.get(TransitionKey.OBSERVATION), unnormalized_info)
action = self._unnormalize_action(transition.get(TransitionKey.ACTION), unnormalized_info)
# Create a new transition with unnormalized values
new_transition = transition.copy()
new_transition[TransitionKey.OBSERVATION] = observation
new_transition[TransitionKey.ACTION] = action
# Add unnormalization info to complementary data
if unnormalized_info:
comp_data = new_transition.get(TransitionKey.COMPLEMENTARY_DATA, {})
comp_data = {} if comp_data is None else dict(comp_data)
comp_data["unnormalized_keys"] = unnormalized_info
new_transition[TransitionKey.COMPLEMENTARY_DATA] = comp_data
return new_transition
def get_config(self) -> dict[str, Any]:
@@ -327,5 +435,41 @@ class UnnormalizerProcessor:
def reset(self):
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
def hotswap_stats(robot_processor: RobotProcessor, stats: dict[str, dict[str, Any]]) -> RobotProcessor:
robot_processor = deepcopy(robot_processor)
for step in robot_processor.steps:
if isinstance(step, NormalizerProcessor) or isinstance(step, UnnormalizerProcessor):
step: NormalizerProcessor | UnnormalizerProcessor
step.stats = stats
step._tensor_stats = _convert_stats_to_tensors(stats)
return robot_processor
def rename_stats(stats: dict[str, dict[str, Any]], rename_map: dict[str, str]) -> dict[str, dict[str, Any]]:
"""Rename keys in the stats dictionary according to the provided mapping.
Args:
stats: The statistics dictionary with structure {feature_key: {stat_name: value}}
rename_map: Dictionary mapping old key names to new key names
Returns:
A new stats dictionary with renamed keys
Example:
>>> stats = {"observation.state": {"mean": 0.0, "std": 1.0}, "action": {"mean": 0.5, "std": 0.5}}
>>> rename_map = {"observation.state": "observation.robot_state"}
>>> new_stats = rename_stats(stats, rename_map)
>>> # new_stats will have "observation.robot_state" instead of "observation.state"
"""
renamed_stats = {}
for old_key, sub_stats in stats.items():
# Use the new key if it exists in the rename map, otherwise keep the old key
new_key = rename_map.get(old_key, old_key)
renamed_stats[new_key] = deepcopy(sub_stats)
return renamed_stats
@@ -106,9 +106,8 @@ class VanillaObservationProcessor(ObservationProcessor):
def observation(self, observation):
return self._process_observation(observation)
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
"""Transforms feature keys to a standardized contract.
This method handles several renaming patterns:
- Exact matches (e.g., 'pixels' -> 'OBS_IMAGE').
- Prefixed exact matches (e.g., 'observation.pixels' -> 'OBS_IMAGE').
+39 -31
View File
@@ -23,7 +23,7 @@ from copy import deepcopy
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Protocol, TypedDict
from typing import Any, Protocol, TypedDict, runtime_checkable
import torch
from huggingface_hub import ModelHubMixin, hf_hub_download
@@ -132,6 +132,7 @@ class ProcessorStepRegistry:
cls._registry.clear()
@runtime_checkable
class ProcessorStep(Protocol):
"""Structural typing interface for a single processor step.
@@ -145,7 +146,6 @@ class ProcessorStep(Protocol):
**Required**:
- ``__call__(transition: EnvTransition) -> EnvTransition``
- ``feature_contract(features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]``
Optional helper protocol:
* ``get_config() -> dict[str, Any]`` User-defined JSON-serializable
@@ -158,6 +158,8 @@ class ProcessorStep(Protocol):
* ``load_state_dict(state)`` Inverse of ``state_dict``. Receives a dict
containing torch tensors only.
* ``reset()`` Clear internal buffers at episode boundaries.
* ``transform_features(features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]``
If present, this method will be called to aggregate the dataset features of all steps.
Example separation:
- get_config(): {"name": "my_step", "learning_rate": 0.01, "window_size": 10}
@@ -174,7 +176,7 @@ class ProcessorStep(Protocol):
def reset(self) -> None: ...
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]: ...
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]: ...
def _default_batch_to_transition(batch: dict[str, Any]) -> EnvTransition: # noqa: D401
@@ -201,10 +203,16 @@ def _default_batch_to_transition(batch: dict[str, Any]) -> EnvTransition: # noq
observation_keys = {k: v for k, v in batch.items() if k.startswith("observation.")}
observation = observation_keys if observation_keys else None
# Extract padding and task keys for complementary data
# Extract padding, task, index, and task_index keys for complementary data
pad_keys = {k: v for k, v in batch.items() if "_is_pad" in k}
task_key = {"task": batch["task"]} if "task" in batch else {}
complementary_data = {**pad_keys, **task_key} if pad_keys or task_key else {}
index_key = {"index": batch["index"]} if "index" in batch else {}
task_index_key = {"task_index": batch["task_index"]} if "task_index" in batch else {}
complementary_data = (
{**pad_keys, **task_key, **index_key, **task_index_key}
if pad_keys or task_key or index_key or task_index_key
else {}
)
transition: EnvTransition = {
TransitionKey.OBSERVATION: observation,
@@ -231,7 +239,7 @@ def _default_transition_to_batch(transition: EnvTransition) -> dict[str, Any]:
"info": transition.get(TransitionKey.INFO, {}),
}
# Add padding and task data from complementary_data
# Add padding, task, index, and task_index data from complementary_data
complementary_data = transition.get(TransitionKey.COMPLEMENTARY_DATA)
if complementary_data:
pad_data = {k: v for k, v in complementary_data.items() if "_is_pad" in k}
@@ -240,6 +248,12 @@ def _default_transition_to_batch(transition: EnvTransition) -> dict[str, Any]:
if "task" in complementary_data:
batch["task"] = complementary_data["task"]
if "index" in complementary_data:
batch["index"] = complementary_data["index"]
if "task_index" in complementary_data:
batch["task_index"] = complementary_data["task_index"]
# Handle observation - flatten dict to observation.* keys if it's a dict
observation = transition.get(TransitionKey.OBSERVATION)
if isinstance(observation, dict):
@@ -342,7 +356,10 @@ class RobotProcessor(ModelHubMixin):
hook(idx, current_transition)
# Convert back to original format if needed
return self.to_output(current_transition) if called_with_batch else current_transition
if called_with_batch or self.to_output is not _default_transition_to_batch:
return self.to_output(current_transition)
else:
return current_transition
def _prepare_transition(self, data: EnvTransition | dict[str, Any]) -> tuple[EnvTransition, bool]:
"""Prepare and validate transition data for processing.
@@ -575,10 +592,9 @@ class RobotProcessor(ModelHubMixin):
if config_filename is None:
# Try common config names
common_names = [
"processor.json",
"preprocessor.json",
"postprocessor.json",
"robotprocessor.json",
"robot_processor.json",
"robot_preprocessor.json",
"robot_postprocessor.json",
]
config_path = None
for name in common_names:
@@ -808,23 +824,15 @@ class RobotProcessor(ModelHubMixin):
f"Step {i} ({type(step).__name__}) must define __call__(transition) -> EnvTransition"
)
fc = getattr(step, "feature_contract", None)
if not callable(fc):
raise TypeError(
f"Step {i} ({type(step).__name__}) must define feature_contract(features) -> dict[str, Any]"
)
def feature_contract(self, initial_features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, initial_features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
"""
Apply ALL steps in order. Each step must implement
feature_contract(features) and return a dict (full or incremental schema).
Apply ALL steps in order. Only if a step has a features method, it will be called.
We aggregate the dataset features of all steps.
"""
features: dict[str, PolicyFeature] = deepcopy(initial_features)
for _, step in enumerate(self.steps):
out = step.feature_contract(features)
if not isinstance(out, dict):
raise TypeError(f"{step.__class__.__name__}.feature_contract must return dict[str, Any]")
out = step.transform_features(features)
features = out
return features
@@ -884,7 +892,7 @@ class ObservationProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -944,7 +952,7 @@ class ActionProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1003,7 +1011,7 @@ class RewardProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1067,7 +1075,7 @@ class DoneProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1127,7 +1135,7 @@ class TruncatedProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1192,7 +1200,7 @@ class InfoProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1238,7 +1246,7 @@ class ComplementaryDataProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1260,5 +1268,5 @@ class IdentityProcessor:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
+1 -1
View File
@@ -43,7 +43,7 @@ class RenameProcessor(ObservationProcessor):
def get_config(self) -> dict[str, Any]:
return {"rename_map": self.rename_map}
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
"""Transforms:
- Each key in the observation that appears in `rename_map` is renamed to its value.
- Keys not in `rename_map` remain unchanged.
@@ -0,0 +1,229 @@
"""
Tokenizer processor for handling text tokenization in robot transitions.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
import torch
from lerobot.configs.types import FeatureType, PolicyFeature
from lerobot.constants import OBS_LANGUAGE
from lerobot.processor.pipeline import EnvTransition, ProcessorStepRegistry, TransitionKey
from lerobot.utils.import_utils import _transformers_available
if TYPE_CHECKING or _transformers_available:
from transformers import AutoTokenizer
else:
AutoTokenizer = None
@dataclass
@ProcessorStepRegistry.register(name="tokenizer_processor")
class TokenizerProcessor:
    """Tokenizes text tasks in complementary data using a huggingface tokenizer.

    This processor handles tokenization of task strings found in the complementary_data
    using a specified pretrained tokenizer from Hugging Face. It adds tokenized versions
    to the observation data for model processing while preserving the original task string.

    The processor supports both single strings and lists of strings as task inputs.

    Args:
        tokenizer_name: Name of the pretrained tokenizer to load from Hugging Face Hub
            (e.g., "bert-base-uncased", "microsoft/DialoGPT-medium"). This will be used
            with AutoTokenizer.from_pretrained(). If tokenizer is provided, this is ignored.
        tokenizer: A tokenizer object (e.g., from transformers library) that implements
            the __call__ method. If provided, tokenizer_name is ignored. This parameter
            is not serialized and must be provided via overrides when loading.
        max_length: Maximum sequence length for tokenization. Defaults to 512.
        task_key: Key in complementary_data containing the task text. Defaults to "task".
        padding: Padding strategy for tokenization. Defaults to "max_length".
        truncation: Whether to truncate sequences longer than max_length. Defaults to True.

    Examples:
        Using tokenizer name (auto-loaded):
        ```python
        processor = TokenizerProcessor(tokenizer_name="bert-base-uncased", max_length=128)
        ```

        Using custom tokenizer object:
        ```python
        from transformers import AutoTokenizer

        custom_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        processor = TokenizerProcessor(tokenizer=custom_tokenizer, max_length=128)
        ```
    """

    # NOTE(review): other registered steps in this package apply @dataclass before the
    # registry decorator; the reversed order here still works only if register() returns
    # the class unchanged — confirm.
    tokenizer_name: str | None = None
    tokenizer: Any | None = None  # Otherwise transformers is not available in the core dependencies
    max_length: int = 512
    task_key: str = "task"
    padding_side: str = "right"
    padding: str = "max_length"
    truncation: bool = True

    # Internal tokenizer instance (not serialized)
    _tokenizer: Any = field(default=None, init=False, repr=False)

    def __post_init__(self) -> None:
        """Initialize the tokenizer from the provided tokenizer or tokenizer name."""
        if not _transformers_available:
            raise ImportError(
                "The 'transformers' library is not installed. "
                "Please install it with `pip install 'lerobot[transformers-dep]'` to use TokenizerProcessor."
            )

        if self.tokenizer is not None:
            # Use provided tokenizer object directly
            self._tokenizer = self.tokenizer
        elif self.tokenizer_name is not None:
            if AutoTokenizer is None:
                raise ImportError("AutoTokenizer is not available")
            self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
        else:
            raise ValueError(
                "Either 'tokenizer' or 'tokenizer_name' must be provided. "
                "Pass a tokenizer object directly or a tokenizer name to auto-load."
            )

    def get_task(self, transition: EnvTransition) -> list[str] | None:
        """Extract and normalize task from complementary data.

        Args:
            transition: Input transition containing complementary_data.

        Returns:
            List of task strings if task is present, None otherwise.
        """
        complementary_data = transition.get(TransitionKey.COMPLEMENTARY_DATA)
        if complementary_data is None:
            return None
        if self.task_key not in complementary_data:
            return None

        task = complementary_data[self.task_key]
        if task is None:
            return None

        # Convert to list of strings; anything other than str / list[str] is ignored.
        if isinstance(task, str):
            return [task]
        elif isinstance(task, list) and all(isinstance(t, str) for t in task):
            return task
        return None

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        """Process the transition by tokenizing the task text.

        Args:
            transition: Input transition containing complementary_data with task text.

        Returns:
            Modified transition with tokenized task added to observation.

        Raises:
            ValueError: If tokenizer initialization failed.
        """
        task = self.get_task(transition)
        if task is None:
            # No task text present: pass the transition through unchanged.
            return transition

        # Tokenize the task
        tokenized_prompt = self._tokenize_text(task)

        # Get or create observation dict
        observation = transition.get(TransitionKey.OBSERVATION)
        if observation is None:
            observation = {}
        else:
            observation = dict(observation)  # Make a copy

        # Add tokenized data to observation
        input_ids = tokenized_prompt["input_ids"]
        attention_mask = tokenized_prompt.get("attention_mask")
        if attention_mask is None:
            # Some tokenizers (e.g., SigLIP text) may not return attention_mask; default to ones
            attention_mask = torch.ones_like(input_ids)

        observation[f"{OBS_LANGUAGE}.tokens"] = input_ids
        observation[f"{OBS_LANGUAGE}.attention_mask"] = attention_mask.to(dtype=torch.bool)
        # NOTE(review): uses the enum's .value as the key while sibling steps index
        # with the enum member itself — confirm EnvTransition accepts both forms.
        transition[TransitionKey.OBSERVATION.value] = observation  # type: ignore[misc]
        return transition

    def _tokenize_text(self, text: str | list[str]) -> dict[str, torch.Tensor]:
        """Tokenize text using the configured tokenizer.

        Args:
            text: Text string or list of strings to tokenize.

        Returns:
            Dictionary containing tokenized output with keys like 'input_ids', 'attention_mask'.
        """
        return self._tokenizer(
            text,
            max_length=self.max_length,
            truncation=self.truncation,
            padding=self.padding,
            padding_side=self.padding_side,
            return_tensors="pt",
        )

    def get_config(self) -> dict[str, Any]:
        """Return configuration for serialization.

        Note: Only tokenizer_name is saved, not the tokenizer object itself.
        When loading, provide the tokenizer via overrides if needed.
        """
        config = {
            "max_length": self.max_length,
            "task_key": self.task_key,
            "padding_side": self.padding_side,
            "padding": self.padding,
            "truncation": self.truncation,
        }
        # Only include tokenizer_name if it was used (not when tokenizer object was provided)
        if self.tokenizer_name is not None:
            config["tokenizer_name"] = self.tokenizer_name
        return config

    def state_dict(self) -> dict[str, torch.Tensor]:
        """Return state dictionary (empty for this processor)."""
        return {}

    def load_state_dict(self, state: dict[str, torch.Tensor]) -> None:
        """Load state dictionary (no-op for this processor)."""
        pass

    def reset(self) -> None:
        """Reset processor state (no-op for this processor)."""
        pass

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        """Add tokenized task features to the feature contract.

        Args:
            features: Input feature dictionary.

        Returns:
            Updated feature dictionary with tokenized task features added.
        """
        # Add features for tokenized output if they don't exist
        # Standard tokenizer output includes tokens and attention_mask
        tokens_key = f"{OBS_LANGUAGE}.tokens"
        attention_mask_key = f"{OBS_LANGUAGE}.attention_mask"

        if tokens_key not in features:
            features[tokens_key] = PolicyFeature(type=FeatureType.LANGUAGE, shape=(self.max_length,))
        if attention_mask_key not in features:
            features[attention_mask_key] = PolicyFeature(type=FeatureType.LANGUAGE, shape=(self.max_length,))
        return features
+149 -36
View File
@@ -59,7 +59,7 @@ python -m lerobot.record \
import logging
import time
from dataclasses import asdict, dataclass
from dataclasses import asdict, dataclass, field
from pathlib import Path
from pprint import pformat
@@ -72,10 +72,19 @@ from lerobot.configs import parser
from lerobot.configs.policies import PreTrainedConfig
from lerobot.datasets.image_writer import safe_stop_image_writer
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.utils import build_dataset_frame, hw_to_dataset_features
from lerobot.datasets.utils import hw_to_dataset_features
from lerobot.datasets.video_utils import VideoEncodingManager
from lerobot.policies.factory import make_policy
from lerobot.policies.factory import make_policy, make_processor
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.processor import RobotProcessor
from lerobot.processor.converters import (
to_dataset_frame,
to_output_robot_action,
to_transition_robot_observation,
to_transition_teleop_action,
)
from lerobot.processor.normalize_processor import rename_stats
from lerobot.processor.pipeline import IdentityProcessor, TransitionKey
from lerobot.robots import ( # noqa: F401
Robot,
RobotConfig,
@@ -149,6 +158,8 @@ class DatasetRecordConfig:
# Number of episodes to record before batch encoding videos
# Set to 1 for immediate encoding (default behavior), or higher for batched encoding
video_encoding_batch_size: int = 1
# Rename map for the observation to override the image and state keys
rename_map: dict[str, str] = field(default_factory=dict)
def __post_init__(self):
if self.single_task is None:
@@ -187,6 +198,36 @@ class RecordConfig:
return ["policy"]
""" --------------- record_loop() data flow --------------------------
[ Robot ]
V
[ robot.get_observation() ] ---> raw_obs
V
[ robot_observation_processor ] ---> obs_transition
V
.-----( ACTION LOGIC )------------------.
V V
[ From Teleoperator ] [ From Policy ]
| |
| [teleop.get_action] -> raw_action | [predict_action]
| | | |
| V | V
| [teleop_action_processor] | |
| | | |
'---> teleop_transition '---> policy_transition
| |
'-------------------------.-------------'
V
[ robot_action_processor ] --> robot_action_to_send
V
[ robot.send_action() ] -- (Robot Executes)
V
( Transitions are merged & added to Dataset )
V
( Rerun Log / Loop Wait )
"""
@safe_stop_image_writer
def record_loop(
robot: Robot,
@@ -195,15 +236,30 @@ def record_loop(
dataset: LeRobotDataset | None = None,
teleop: Teleoperator | list[Teleoperator] | None = None,
policy: PreTrainedPolicy | None = None,
preprocessor: RobotProcessor | None = None,
postprocessor: RobotProcessor | None = None,
control_time_s: int | None = None,
teleop_action_processor: RobotProcessor | None = None, # runs after teleop
robot_action_processor: RobotProcessor | None = None, # runs before robot
robot_observation_processor: RobotProcessor | None = None, # runs after robot
single_task: str | None = None,
display_data: bool = False,
):
teleop_action_processor = teleop_action_processor or RobotProcessor(
steps=[IdentityProcessor()], to_transition=to_transition_teleop_action, to_output=lambda tr: tr
)
robot_action_processor = robot_action_processor or RobotProcessor(
steps=[IdentityProcessor()], to_transition=lambda tr: tr, to_output=to_output_robot_action
)
robot_observation_processor = robot_observation_processor or RobotProcessor(
steps=[IdentityProcessor()], to_transition=to_transition_robot_observation, to_output=lambda tr: tr
)
if dataset is not None and dataset.fps != fps:
raise ValueError(f"The dataset fps should be equal to requested fps ({dataset.fps} != {fps}).")
teleop_arm = teleop_keyboard = None
if isinstance(teleop, list):
if isinstance(teleop, list): # For LeKiwi
teleop_keyboard = next((t for t in teleop if isinstance(t, KeyboardTeleop)), None)
teleop_arm = next(
(
@@ -219,9 +275,20 @@ def record_loop(
"For multi-teleop, the list must contain exactly one KeyboardTeleop and one arm teleoperator. Currently only supported for LeKiwi robot."
)
# if policy is given it needs cleaning up
if policy is not None:
# Reset policy and processor if they are provided
if policy is not None and preprocessor is not None and postprocessor is not None:
policy.reset()
preprocessor.reset()
postprocessor.reset()
# Reset custom pipelines
teleop_action_processor.reset()
robot_action_processor.reset()
robot_observation_processor.reset()
policy_transition = None
teleop_transition = None
obs_transition = None
timestamp = 0
start_episode_t = time.perf_counter()
@@ -232,51 +299,87 @@ def record_loop(
events["exit_early"] = False
break
observation = robot.get_observation()
# Get robot observation
obs = robot.get_observation()
if policy is not None or dataset is not None:
observation_frame = build_dataset_frame(dataset.features, observation, prefix="observation")
# Applies a pipeline to the raw robot observation, default is IdentityProcessor
obs_transition = robot_observation_processor(obs)
# Get action from either policy or teleop
if policy is not None and preprocessor is not None and postprocessor is not None:
if dataset is not None:
observation_frame = to_dataset_frame(
obs_transition, dataset.features
) # Convert the observation to the dataset format
if policy is not None:
action_values = predict_action(
observation_frame,
policy,
get_safe_torch_device(policy.config.device),
policy.config.use_amp,
observation=observation_frame,
policy=policy,
device=get_safe_torch_device(policy.config.device),
preprocessor=preprocessor,
postprocessor=postprocessor,
use_amp=policy.config.use_amp,
task=single_task,
robot_type=robot.robot_type,
)
action = {key: action_values[i].item() for i, key in enumerate(robot.action_features)}
elif policy is None and isinstance(teleop, Teleoperator):
action = teleop.get_action()
elif policy is None and isinstance(teleop, list):
# TODO(pepijn, steven): clean the record loop for use of multiple robots (possibly with pipeline)
action_names = dataset.features["action"]["names"]
policy_action = {f"action.{name}": float(action_values[i]) for i, name in enumerate(action_names)}
policy_transition = {
TransitionKey.ACTION: policy_action,
TransitionKey.COMPLEMENTARY_DATA: {},
}
elif isinstance(teleop, Teleoperator):
act = teleop.get_action()
# Applies a pipeline to the raw teleop action, default is IdentityProcessor
teleop_transition = teleop_action_processor(act)
elif isinstance(teleop, list):
arm_action = teleop_arm.get_action()
arm_action = {f"arm_{k}": v for k, v in arm_action.items()}
keyboard_action = teleop_keyboard.get_action()
base_action = robot._from_keyboard_to_base_action(keyboard_action)
action = {**arm_action, **base_action} if len(base_action) > 0 else arm_action
act = {**arm_action, **base_action} if len(base_action) > 0 else arm_action
teleop_transition = teleop_action_processor(act)
else:
logging.info(
"No policy or teleoperator provided, skipping action generation."
"This is likely to happen when resetting the environment without a teleop device."
"The robot won't be at its rest position at the start of the next episode."
"No policy or teleoperator provided, skipping action generation. "
"This is likely to happen during environment reset."
)
continue
# Still continue to next loop to respect timing
# Applies a pipeline to the action, default is IdentityProcessor
# IMPORTANT: action_pipeline.to_output must return a dict suitable for robot.send_action()
if policy_transition is not None:
robot_action_to_send = robot_action_processor(policy_transition)
else:
robot_action_to_send = robot_action_processor(teleop_transition)
# Send action to robot
# Action can eventually be clipped using `max_relative_target`,
# so action actually sent is saved in the dataset.
sent_action = robot.send_action(action)
# so action actually sent is saved in the dataset. action = postprocessor.process(action)
# TODO(pepijn, adil): we should use a pipeline step to clip the action, so the sent action is the action that we input to the robot.
_ = robot.send_action(robot_action_to_send)
# Write to dataset
if dataset is not None:
action_frame = build_dataset_frame(dataset.features, sent_action, prefix="action")
frame = {**observation_frame, **action_frame}
# If to_dataset_frame is provided, use it to merge the transitions.
merged = []
if obs_transition is not None: # The observation from the robot
merged.append(obs_transition)
if teleop_transition is not None: # The action from teleop
merged.append(teleop_transition)
if policy_transition is not None: # The action from policy
merged.append(policy_transition)
frame = to_dataset_frame(
merged if len(merged) > 1 else merged[0], dataset.features
) # Convert the observation to the dataset format
dataset.add_frame(frame, task=single_task)
if display_data:
log_rerun_data(observation, action)
log_rerun_data([obs_transition, teleop_transition or policy_transition])
dt_s = time.perf_counter() - start_loop_t
busy_wait(1 / fps - dt_s)
@@ -328,6 +431,18 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
# Load pretrained policy
policy = None if cfg.policy is None else make_policy(cfg.policy, ds_meta=dataset.meta)
preprocessor = None
postprocessor = None
if cfg.policy is not None:
preprocessor, postprocessor = make_processor(
policy_cfg=cfg.policy,
pretrained_path=cfg.policy.pretrained_path,
dataset_stats=rename_stats(dataset.meta.stats, cfg.dataset.rename_map),
preprocessor_overrides={
"device_processor": {"device": cfg.policy.device},
"rename_processor": {"rename_map": cfg.dataset.rename_map},
},
)
robot.connect()
if teleop is not None:
@@ -345,6 +460,8 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
fps=cfg.dataset.fps,
teleop=teleop,
policy=policy,
preprocessor=preprocessor,
postprocessor=postprocessor,
dataset=dataset,
control_time_s=cfg.dataset.episode_time_s,
single_task=cfg.dataset.single_task,
@@ -393,9 +510,5 @@ def record(cfg: RecordConfig) -> LeRobotDataset:
return dataset
def main():
record()
if __name__ == "__main__":
main()
record()
@@ -14,6 +14,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .config_so100_follower import SO100FollowerConfig, SO100FollowerEndEffectorConfig
from .config_so100_follower import SO100FollowerConfig
from .so100_follower import SO100Follower
from .so100_follower_end_effector import SO100FollowerEndEffector
@@ -39,35 +39,3 @@ class SO100FollowerConfig(RobotConfig):
# Set to `True` for backward compatibility with previous policies/dataset
use_degrees: bool = False
@RobotConfig.register_subclass("so100_follower_end_effector")
@dataclass
class SO100FollowerEndEffectorConfig(SO100FollowerConfig):
    """Configuration for the SO100FollowerEndEffector robot."""

    # Path to URDF file for kinematics
    # NOTE: It is highly recommended to use the urdf in the SO-ARM100 repo:
    # https://github.com/TheRobotStudio/SO-ARM100/blob/main/Simulation/SO101/so101_new_calib.urdf
    urdf_path: str | None = None

    # End-effector frame name in the URDF
    target_frame_name: str = "gripper_frame_link"

    # Default bounds for the end-effector position (in meters)
    end_effector_bounds: dict[str, list[float]] = field(
        default_factory=lambda: {
            "min": [-1.0, -1.0, -1.0],  # min x, y, z
            "max": [1.0, 1.0, 1.0],  # max x, y, z
        }
    )

    # Upper limit for the commanded gripper position
    # (presumably in the 0-100 normalized gripper range — confirm against the robot class)
    max_gripper_pos: float = 50

    # Per-axis translation step size applied to unit deltas (in meters)
    end_effector_step_sizes: dict[str, float] = field(
        default_factory=lambda: {
            "x": 0.02,
            "y": 0.02,
            "z": 0.02,
        }
    )
@@ -0,0 +1,447 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
import numpy as np
from scipy.spatial.transform import Rotation
from lerobot.configs.types import PolicyFeature
from lerobot.model.kinematics import RobotKinematics
from lerobot.processor.pipeline import (
ActionProcessor,
ComplementaryDataProcessor,
EnvTransition,
ObservationProcessor,
ProcessorStepRegistry,
TransitionKey,
)
from lerobot.robots.robot import Robot
@ProcessorStepRegistry.register("ee_reference_and_delta")
@dataclass
class EEReferenceAndDelta:
    """
    Compute the desired end-effector pose from the target pose and the current pose.

    Input ACTION keys (popped from the action):
        "action.enabled": bool-like flag; deltas are only applied while enabled
        "action.target_{x,y,z}": float — unit deltas, scaled by `end_effector_step_sizes`
        "action.target_{wx,wy,wz}": float — rotation vector (rad) composed with the
            latched reference orientation

    Required complementary data:
        "complementary_data.raw_joint_positions": dict of measured joint positions

    Output ACTION keys:
        "action.ee.{x,y,z,wx,wy,wz}": float — desired EE position + rotation vector
    """

    kinematics: RobotKinematics
    end_effector_step_sizes: dict
    motor_names: list[str]
    # Pose latched at the rising edge of "enabled"; target deltas are applied relative to it.
    reference_ee_pose: np.ndarray | None = field(default=None, init=False, repr=False)
    # Previous value of the "enabled" flag, used to detect the rising edge.
    _prev_enabled: bool = field(default=False, init=False, repr=False)
    # Last commanded pose, replayed while disabled so the arm holds position.
    _command_when_disabled: np.ndarray | None = field(default=None, init=False, repr=False)

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        act = transition.get(TransitionKey.ACTION) or {}
        comp = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}

        # Get joint positions from complementary data
        raw = comp.get("raw_joint_positions", None)
        if raw is None:
            raise ValueError(
                "raw_joint_positions is not in complementary data and is required for EEReferenceAndDelta"
            )

        q = np.array([float(raw[n]) for n in self.motor_names], dtype=float)
        # Current pose from FK on measured joints
        t_curr = self.kinematics.forward_kinematics(q)

        enabled = bool(act.pop("action.enabled", 0))
        tx = float(act.pop("action.target_x", 0.0))
        ty = float(act.pop("action.target_y", 0.0))
        tz = float(act.pop("action.target_z", 0.0))
        wx = float(act.pop("action.target_wx", 0.0))
        wy = float(act.pop("action.target_wy", 0.0))
        wz = float(act.pop("action.target_wz", 0.0))

        desired = None
        if enabled:
            # Latch a reference at the rising edge; also be defensive if None
            if not self._prev_enabled or self.reference_ee_pose is None:
                self.reference_ee_pose = t_curr.copy()
            ref = self.reference_ee_pose if self.reference_ee_pose is not None else t_curr
            # Translation: scale unit deltas by the configured per-axis step sizes.
            delta_p = np.array(
                [
                    tx * self.end_effector_step_sizes["x"],
                    ty * self.end_effector_step_sizes["y"],
                    tz * self.end_effector_step_sizes["z"],
                ],
                dtype=float,
            )
            # Rotation: the target rotvec is composed with the reference orientation
            # (applied in the reference frame, despite the "abs" name).
            r_abs = Rotation.from_rotvec([wx, wy, wz]).as_matrix()
            desired = np.eye(4, dtype=float)
            desired[:3, :3] = ref[:3, :3] @ r_abs
            desired[:3, 3] = ref[:3, 3] + delta_p
            # Remember the last enabled command so it can be replayed while disabled.
            self._command_when_disabled = desired.copy()
        else:
            # While disabled, keep sending the same command to avoid drift.
            if self._command_when_disabled is None:
                # If we've never had an enabled command yet, freeze current FK pose once.
                self._command_when_disabled = t_curr.copy()
            desired = self._command_when_disabled.copy()

        # Write action fields
        pos = desired[:3, 3]
        tw = Rotation.from_matrix(desired[:3, :3]).as_rotvec()
        act.update(
            {
                "action.ee.x": float(pos[0]),
                "action.ee.y": float(pos[1]),
                "action.ee.z": float(pos[2]),
                "action.ee.wx": float(tw[0]),
                "action.ee.wy": float(tw[1]),
                "action.ee.wz": float(tw[2]),
            }
        )
        self._prev_enabled = enabled
        transition[TransitionKey.ACTION] = act
        return transition

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        # This step adds no dataset features of its own.
        return features
@ProcessorStepRegistry.register("ee_bounds_and_safety")
@dataclass
class EEBoundsAndSafety(ActionProcessor):
    """
    Clip the end-effector position to the configured bounds and guard against
    sudden jumps between consecutive commands.

    Input ACTION keys:
        "action.ee.{x,y,z,wx,wy,wz}": float
    Output ACTION keys:
        "action.ee.{x,y,z,wx,wy,wz}": float

    Raises:
        ValueError: if the commanded position moves more than `max_ee_step_m`
            meters between two consecutive calls.
    """

    end_effector_bounds: dict
    max_ee_step_m: float = 0.05
    # TODO(review): twist steps are recorded in _last_twist but never checked
    # against this limit — implement or remove.
    max_ee_twist_step_rad: float = 0.20
    _last_pos: np.ndarray | None = field(default=None, init=False, repr=False)
    _last_twist: np.ndarray | None = field(default=None, init=False, repr=False)

    def action(self, act: dict | None) -> dict:
        """Validate and clip the EE pose in `act`; pass other actions through."""
        x = act.pop("action.ee.x", None)
        y = act.pop("action.ee.y", None)
        z = act.pop("action.ee.z", None)
        wx = act.pop("action.ee.wx", None)
        wy = act.pop("action.ee.wy", None)
        wz = act.pop("action.ee.wz", None)
        if None in (x, y, z, wx, wy, wz):
            # No complete EE pose present: nothing to validate.
            # NOTE(review): a *partial* pose is silently dropped here (the present
            # keys were popped and not restored) — confirm this is intended.
            return act

        pos = np.array([x, y, z], dtype=float)
        twist = np.array([wx, wy, wz], dtype=float)

        # Clip position to the workspace bounds.
        pos = np.clip(pos, self.end_effector_bounds["min"], self.end_effector_bounds["max"])

        # Check for jumps in position. (The previous version computed a clamped
        # position here and then raised anyway, discarding it — dead code removed.)
        if self._last_pos is not None:
            step = float(np.linalg.norm(pos - self._last_pos))
            if step > self.max_ee_step_m:
                raise ValueError(f"EE jump {step:.3f}m > {self.max_ee_step_m}m")

        self._last_pos = pos
        self._last_twist = twist

        act.update(
            {
                "action.ee.x": float(pos[0]),
                "action.ee.y": float(pos[1]),
                "action.ee.z": float(pos[2]),
                "action.ee.wx": float(twist[0]),
                "action.ee.wy": float(twist[1]),
                "action.ee.wz": float(twist[2]),
            }
        )
        return act

    def reset(self):
        """Forget the previous command so the next call cannot trip the jump check."""
        self._last_pos = None
        self._last_twist = None

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        # Because this is last step we specify the dataset features of this step that we want to be stored in the dataset
        features["action.ee.x"] = float
        features["action.ee.y"] = float
        features["action.ee.z"] = float
        features["action.ee.wx"] = float
        features["action.ee.wy"] = float
        features["action.ee.wz"] = float
        return features
@ProcessorStepRegistry.register("inverse_kinematics_ee_to_joints")
@dataclass
class InverseKinematicsEEToJoints:
    """
    Compute the desired joint positions from the desired end-effector pose.

    Input ACTION keys:
        "action.ee.{x,y,z,wx,wy,wz}": float
    Required complementary data:
        "complementary_data.raw_joint_positions": dict of measured joint positions
    Output ACTION keys:
        "action.<motor>.pos": float for each non-gripper motor
        "observation.state.gripper.pos": float (current gripper position passthrough)
    """

    kinematics: RobotKinematics
    motor_names: list[str]
    # Seed for the IK solver; either refreshed from measured joints every call or
    # warm-started from the previous solution (see `initial_guess_current_joints`).
    q_curr: np.ndarray | None = field(default=None, init=False, repr=False)
    initial_guess_current_joints: bool = True

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        act = transition.get(TransitionKey.ACTION) or {}
        comp = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}

        # Read (without removing) the desired EE pose components.
        x = act.get("action.ee.x", None)
        y = act.get("action.ee.y", None)
        z = act.get("action.ee.z", None)
        wx = act.get("action.ee.wx", None)
        wy = act.get("action.ee.wy", None)
        wz = act.get("action.ee.wz", None)
        if None in (x, y, z, wx, wy, wz):
            # Incomplete pose: nothing to solve, leave the action untouched.
            # (The previous version wrote the missing keys back as None values,
            # polluting the action dict; the keys were only read, never popped.)
            return transition

        # Get joint positions from complementary data
        raw = comp.get("raw_joint_positions", None)
        if raw is None:
            raise ValueError(
                "raw_joint_positions is not in complementary data and is required for InverseKinematicsEEToJoints"
            )

        if self.initial_guess_current_joints:
            # Use current measured joints as the IK initial guess.
            self.q_curr = np.array([float(raw[n]) for n in self.motor_names], dtype=float)
        elif self.q_curr is None:
            # First call with warm-start enabled: no previous solution yet,
            # fall back to the measured joints once.
            self.q_curr = np.array([float(raw[n]) for n in self.motor_names], dtype=float)

        # Build desired 4x4 transform from pos + rotvec (twist)
        t_des = np.eye(4, dtype=float)
        t_des[:3, :3] = Rotation.from_rotvec([wx, wy, wz]).as_matrix()
        t_des[:3, 3] = [x, y, z]

        # Compute inverse kinematics and warm-start the next call.
        q_target = self.kinematics.inverse_kinematics(self.q_curr, t_des)
        self.q_curr = q_target

        new_act = dict(act)
        for i, name in enumerate(self.motor_names):
            if name == "gripper":
                # NOTE(review): writes an observation key into the ACTION dict so
                # the current gripper position reaches the dataset frame — confirm
                # this is intended rather than emitting "action.gripper.pos".
                new_act["observation.state.gripper.pos"] = float(raw["gripper"])
            else:
                new_act[f"action.{name}.pos"] = float(q_target[i])
        transition[TransitionKey.ACTION] = new_act
        return transition

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        # We specify the dataset features of this step that we want to be stored in the dataset
        features["action.ee.x"] = float
        features["action.ee.y"] = float
        features["action.ee.z"] = float
        features["action.ee.wx"] = float
        features["action.ee.wy"] = float
        features["action.ee.wz"] = float
        features["observation.state.gripper.pos"] = float
        features["action.gripper.pos"] = float
        return features

    def reset(self):
        """Drop the warm-start state so the next call reseeds from measured joints."""
        self.q_curr = None
@ProcessorStepRegistry.register("gripper_velocity_to_joint")
@dataclass
class GripperVelocityToJoint:
    """
    Convert a gripper velocity command into an absolute gripper joint position.

    Input ACTION keys:
        "action.gripper": float — velocity-like command, scaled by `speed_factor`
    Output ACTION keys:
        "action.gripper.pos": float — clipped to [clip_min, clip_max]
    Also writes "observation.state.gripper.pos" (current position) into the observation.
    """

    motor_names: list[str]
    speed_factor: float = 20.0
    clip_min: float = 0.0
    clip_max: float = 100.0

    def __call__(self, transition: EnvTransition) -> EnvTransition:
        obs = transition.get(TransitionKey.OBSERVATION) or {}
        act = transition.get(TransitionKey.ACTION) or {}
        comp = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}

        if "action.gripper" not in act:
            return transition

        if "gripper" not in self.motor_names:
            # Robot has no gripper: drop the command instead of forwarding it.
            new_act = dict(act)
            new_act.pop("action.gripper", None)
            transition[TransitionKey.ACTION] = new_act
            return transition

        # Get current gripper position from complementary data.
        raw = comp.get("raw_joint_positions") or {}
        if "gripper" not in raw:
            # Previously this crashed with an opaque TypeError (float(None));
            # fail with an explicit error like the sibling steps do.
            raise ValueError(
                "raw_joint_positions must contain 'gripper' and is required for GripperVelocityToJoint"
            )
        curr_pos = float(raw["gripper"])

        # Integrate the velocity command into an absolute, clipped position.
        u = float(act.get("action.gripper", 0.0))
        delta = u * float(self.speed_factor)
        gripper_pos = float(np.clip(curr_pos + delta, self.clip_min, self.clip_max))

        new_act = dict(act)
        new_act["action.gripper.pos"] = gripper_pos
        new_act.pop("action.gripper", None)
        transition[TransitionKey.ACTION] = new_act

        # Expose the current position as an observation for the dataset frame.
        obs.update({"observation.state.gripper.pos": curr_pos})
        transition[TransitionKey.OBSERVATION] = obs
        return transition

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        # We specify the dataset features of this step that we want to be stored in the dataset
        features["observation.state.gripper.pos"] = float
        features["action.gripper.pos"] = float
        return features
@ProcessorStepRegistry.register("forward_kinematics_joints_to_ee")
@dataclass
class ForwardKinematicsJointsToEE(ObservationProcessor):
    """
    Derive the end-effector pose from the measured joint positions via FK.

    Input OBSERVATION keys:
        "observation.state.<motor>.pos": float, one per motor in `motor_names`
    Output OBSERVATION keys:
        "observation.state.ee.{x,y,z,wx,wy,wz}": float (position + rotation vector)
    """

    kinematics: RobotKinematics
    motor_names: list[str]

    def observation(self, obs: dict | None) -> dict:
        """Append the FK-derived EE pose to `obs` when all joint keys are present."""
        joint_keys = [f"observation.state.{name}.pos" for name in self.motor_names]
        if any(key not in obs for key in joint_keys):
            # Missing joints: leave the observation as-is.
            return obs

        joints = np.array([obs[key] for key in joint_keys], dtype=float)
        pose = self.kinematics.forward_kinematics(joints)
        translation = pose[:3, 3]
        rotvec = Rotation.from_matrix(pose[:3, :3]).as_rotvec()

        for axis, value in zip(("x", "y", "z"), translation):
            obs[f"observation.state.ee.{axis}"] = float(value)
        for axis, value in zip(("wx", "wy", "wz"), rotvec):
            obs[f"observation.state.ee.{axis}"] = float(value)
        return obs

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        # Declare the dataset features this step contributes.
        features.update(
            {f"observation.state.ee.{axis}": float for axis in ("x", "y", "z", "wx", "wy", "wz")}
        )
        return features
@ProcessorStepRegistry.register("add_robot_observation")
@dataclass
class AddRobotObservationAsComplimentaryData(ComplementaryDataProcessor):
    """
    Read the robot's current observation and stash it in the transition's
    complementary data.

    Joint positions are stored under complementary_data["raw_joint_positions"]
    as {"<motor_name>": <float position>, ...}.
    """

    robot: Robot

    def complementary_data(self, comp: dict | None) -> dict:
        """Return a copy of `comp` with the robot's joint positions added."""
        result = dict(comp) if comp is not None else {}
        positions = {}
        for key, value in self.robot.get_observation().items():
            # Only ".pos" entries are joint positions; strip the suffix for the key.
            if isinstance(key, str) and key.endswith(".pos"):
                positions[key.removesuffix(".pos")] = float(value)
        result["raw_joint_positions"] = positions
        return result

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        # This step adds no dataset features.
        return features
@@ -1,200 +0,0 @@
# !/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from typing import Any
import numpy as np
from lerobot.cameras import make_cameras_from_configs
from lerobot.errors import DeviceNotConnectedError
from lerobot.model.kinematics import RobotKinematics
from lerobot.motors import Motor, MotorNormMode
from lerobot.motors.feetech import FeetechMotorsBus
from . import SO100Follower
from .config_so100_follower import SO100FollowerEndEffectorConfig
logger = logging.getLogger(__name__)
class SO100FollowerEndEffector(SO100Follower):
    """
    SO100Follower robot with end-effector space control.
    This robot inherits from SO100Follower but transforms actions from
    end-effector space to joint space before sending them to the motors.
    """

    config_class = SO100FollowerEndEffectorConfig
    name = "so100_follower_end_effector"

    def __init__(self, config: SO100FollowerEndEffectorConfig):
        """Set up the motor bus, cameras and the kinematics solver.

        Raises:
            ValueError: if ``config.urdf_path`` is not set (required for FK/IK).
        """
        super().__init__(config)
        # Rebuild the bus with DEGREES normalization for the five arm joints so
        # the kinematics solver can work in degrees; the gripper keeps a 0-100 range.
        self.bus = FeetechMotorsBus(
            port=self.config.port,
            motors={
                "shoulder_pan": Motor(1, "sts3215", MotorNormMode.DEGREES),
                "shoulder_lift": Motor(2, "sts3215", MotorNormMode.DEGREES),
                "elbow_flex": Motor(3, "sts3215", MotorNormMode.DEGREES),
                "wrist_flex": Motor(4, "sts3215", MotorNormMode.DEGREES),
                "wrist_roll": Motor(5, "sts3215", MotorNormMode.DEGREES),
                "gripper": Motor(6, "sts3215", MotorNormMode.RANGE_0_100),
            },
            calibration=self.calibration,
        )
        self.cameras = make_cameras_from_configs(config.cameras)
        self.config = config
        # Initialize the kinematics module for the so100 robot
        if self.config.urdf_path is None:
            raise ValueError(
                "urdf_path must be provided in the configuration for end-effector control. "
                "Please set urdf_path in your SO100FollowerEndEffectorConfig."
            )
        self.kinematics = RobotKinematics(
            urdf_path=self.config.urdf_path,
            target_frame_name=self.config.target_frame_name,
        )
        # Store the bounds for end-effector position
        self.end_effector_bounds = self.config.end_effector_bounds
        # Cached robot state; lazily filled on the first send_action() and
        # cleared by reset() so the next action re-reads the hardware.
        self.current_ee_pos = None
        self.current_joint_pos = None

    @property
    def action_features(self) -> dict[str, Any]:
        """
        Define action features for end-effector control.
        Returns dictionary with dtype, shape, and names.
        """
        return {
            "dtype": "float32",
            "shape": (4,),
            "names": {"delta_x": 0, "delta_y": 1, "delta_z": 2, "gripper": 3},
        }

    def send_action(self, action: dict[str, Any]) -> dict[str, Any]:
        """
        Transform action from end-effector space to joint space and send to motors.
        Args:
            action: Dictionary with keys 'delta_x', 'delta_y', 'delta_z' for end-effector control
                or a numpy array with [delta_x, delta_y, delta_z]
        Returns:
            The joint-space action that was sent to the motors
        """
        if not self.is_connected:
            raise DeviceNotConnectedError(f"{self} is not connected.")
        # Convert action to numpy array if not already
        if isinstance(action, dict):
            if all(k in action for k in ["delta_x", "delta_y", "delta_z"]):
                # Scale the unit deltas by the configured per-axis step sizes.
                delta_ee = np.array(
                    [
                        action["delta_x"] * self.config.end_effector_step_sizes["x"],
                        action["delta_y"] * self.config.end_effector_step_sizes["y"],
                        action["delta_z"] * self.config.end_effector_step_sizes["z"],
                    ],
                    dtype=np.float32,
                )
                # Default gripper command of 1.0 means "hold" (see shift below).
                if "gripper" not in action:
                    action["gripper"] = [1.0]
                action = np.append(delta_ee, action["gripper"])
            else:
                logger.warning(
                    f"Expected action keys 'delta_x', 'delta_y', 'delta_z', got {list(action.keys())}"
                )
                # Fall back to a no-op action rather than raising mid-control-loop.
                action = np.zeros(4, dtype=np.float32)
        if self.current_joint_pos is None:
            # Read current joint positions
            current_joint_pos = self.bus.sync_read("Present_Position")
            self.current_joint_pos = np.array([current_joint_pos[name] for name in self.bus.motors])
        # Calculate current end-effector position using forward kinematics
        if self.current_ee_pos is None:
            self.current_ee_pos = self.kinematics.forward_kinematics(self.current_joint_pos)
        # Set desired end-effector position by adding delta
        desired_ee_pos = np.eye(4)
        desired_ee_pos[:3, :3] = self.current_ee_pos[:3, :3]  # Keep orientation
        # Add delta to position and clip to bounds
        desired_ee_pos[:3, 3] = self.current_ee_pos[:3, 3] + action[:3]
        if self.end_effector_bounds is not None:
            desired_ee_pos[:3, 3] = np.clip(
                desired_ee_pos[:3, 3],
                self.end_effector_bounds["min"],
                self.end_effector_bounds["max"],
            )
        # Compute inverse kinematics to get joint positions
        target_joint_values_in_degrees = self.kinematics.inverse_kinematics(
            self.current_joint_pos, desired_ee_pos
        )
        # Create joint space action dictionary
        joint_action = {
            f"{key}.pos": target_joint_values_in_degrees[i] for i, key in enumerate(self.bus.motors.keys())
        }
        # Handle gripper separately if included in action
        # Gripper delta action is in the range 0 - 2,
        # We need to shift the action to the range -1, 1 so that we can expand it to -Max_gripper_pos, Max_gripper_pos
        # Lower clip of 5 keeps the gripper away from its hard stop.
        joint_action["gripper.pos"] = np.clip(
            self.current_joint_pos[-1] + (action[-1] - 1) * self.config.max_gripper_pos,
            5,
            self.config.max_gripper_pos,
        )
        # Cache the commanded state as the new "current" state so the next step
        # integrates from the target instead of re-reading the bus.
        self.current_ee_pos = desired_ee_pos.copy()
        self.current_joint_pos = target_joint_values_in_degrees.copy()
        self.current_joint_pos[-1] = joint_action["gripper.pos"]
        # Send joint space action to parent class
        return super().send_action(joint_action)

    def get_observation(self) -> dict[str, Any]:
        """Return motor positions as ``<motor>.pos`` plus one frame per camera.

        Raises:
            DeviceNotConnectedError: if the robot is not connected.
        """
        if not self.is_connected:
            raise DeviceNotConnectedError(f"{self} is not connected.")
        # Read arm position
        start = time.perf_counter()
        obs_dict = self.bus.sync_read("Present_Position")
        obs_dict = {f"{motor}.pos": val for motor, val in obs_dict.items()}
        dt_ms = (time.perf_counter() - start) * 1e3
        logger.debug(f"{self} read state: {dt_ms:.1f}ms")
        # Capture images from cameras
        for cam_key, cam in self.cameras.items():
            start = time.perf_counter()
            obs_dict[cam_key] = cam.async_read()
            dt_ms = (time.perf_counter() - start) * 1e3
            logger.debug(f"{self} read {cam_key}: {dt_ms:.1f}ms")
        return obs_dict

    def reset(self):
        """Drop cached EE/joint state so the next send_action() re-reads the robot."""
        self.current_ee_pos = None
        self.current_joint_pos = None
+1
View File
@@ -69,6 +69,7 @@ def make_robot_from_config(config: RobotConfig) -> Robot:
raise ValueError(config.type)
# TODO(pepijn): Move to pipeline step to make sure we don't have to do this in the robot code and send action to robot is clean for use in dataset
def ensure_safe_goal_position(
goal_present_pos: dict[str, tuple[float, float]], max_relative_target: float | dict[float]
) -> dict[str, float]:
+2 -4
View File
@@ -1048,10 +1048,8 @@ def get_observation_features(
return None, None
with torch.no_grad():
observation_features = policy.actor.encoder.get_cached_image_features(observations, normalize=True)
next_observation_features = policy.actor.encoder.get_cached_image_features(
next_observations, normalize=True
)
observation_features = policy.actor.encoder.get_cached_image_features(observations)
next_observation_features = policy.actor.encoder.get_cached_image_features(next_observations)
return observation_features, next_observation_features
+182 -4
View File
@@ -14,12 +14,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import time
from contextlib import nullcontext
from pprint import pformat
from typing import Any
import torch
# Fix tokenizer parallelism conflicts with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from termcolor import colored
from torch.amp import GradScaler
from torch.optim import Optimizer
@@ -31,7 +36,7 @@ from lerobot.datasets.sampler import EpisodeAwareSampler
from lerobot.datasets.utils import cycle
from lerobot.envs.factory import make_env
from lerobot.optim.factory import make_optimizer_and_scheduler
from lerobot.policies.factory import make_policy
from lerobot.policies.factory import make_policy, make_processor
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.policies.utils import get_device_from_parameters
from lerobot.scripts.eval import eval_policy
@@ -67,10 +72,18 @@ def update_policy(
start_time = time.perf_counter()
device = get_device_from_parameters(policy)
policy.train()
# Forward pass timing
forward_start = time.perf_counter()
with torch.autocast(device_type=device.type) if use_amp else nullcontext():
loss, output_dict = policy.forward(batch)
# TODO(rcadene): policy.unnormalize_outputs(out_dict)
forward_time = time.perf_counter() - forward_start
# Backward pass timing
backward_start = time.perf_counter()
grad_scaler.scale(loss).backward()
backward_time = time.perf_counter() - backward_start
# Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**.
grad_scaler.unscale_(optimizer)
@@ -81,6 +94,9 @@ def update_policy(
error_if_nonfinite=False,
)
# Optimizer step timing
optim_start = time.perf_counter()
# Optimizer's gradients are already unscaled, so scaler.step does not unscale them,
# although it still skips optimizer.step() if the gradients contain infs or NaNs.
with lock if lock is not None else nullcontext():
@@ -97,6 +113,47 @@ def update_policy(
if has_method(policy, "update"):
# To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC).
policy.update()
optim_time = time.perf_counter() - optim_start
total_time = time.perf_counter() - start_time
# Collect timing statistics for RLearN policy (averaged reporting every minute)
if getattr(policy, "name", None) == "rlearn":
# Initialize timing accumulator if not exists
if not hasattr(policy, '_train_timing_stats'):
policy._train_timing_stats = {
'forward_times': [],
'backward_times': [],
'optim_times': [],
'total_times': [],
'last_print_time': time.perf_counter()
}
# Accumulate current step's timings
stats = policy._train_timing_stats
stats['forward_times'].append(forward_time * 1000)
stats['backward_times'].append(backward_time * 1000)
stats['optim_times'].append(optim_time * 1000)
stats['total_times'].append(total_time * 1000)
# Print averaged stats every minute (60 seconds)
current_time = time.perf_counter()
if current_time - stats['last_print_time'] >= 60.0:
n_samples = len(stats['forward_times'])
if n_samples > 0:
print(f"\nTraining Step Average Timing (last {n_samples} steps):")
print(f" Forward pass: {sum(stats['forward_times'])/n_samples:.2f} ms")
print(f" Backward pass: {sum(stats['backward_times'])/n_samples:.2f} ms")
print(f" Optimizer step: {sum(stats['optim_times'])/n_samples:.2f} ms")
print(f" Total update: {sum(stats['total_times'])/n_samples:.2f} ms")
print(f" Avg steps/sec: {1000.0/(sum(stats['total_times'])/n_samples):.2f}")
print("-" * 50)
# Reset stats for next minute
for key in stats:
if key != 'last_print_time':
stats[key] = []
stats['last_print_time'] = current_time
train_metrics.loss = loss.item()
train_metrics.grad_norm = grad_norm.item()
@@ -125,6 +182,18 @@ def train(cfg: TrainPipelineConfig):
torch.backends.cuda.matmul.allow_tf32 = True
logging.info("Creating dataset")
# Force PyAV backend for RLearN (proven to be fastest)
if getattr(cfg.policy, "type", None) == "rlearn":
# Override video backend to use PyAV
if hasattr(cfg.dataset, 'video_backend'):
original_backend = cfg.dataset.video_backend
cfg.dataset.video_backend = 'pyav'
logging.info(f"RLearN: Forcing video_backend from '{original_backend}' to 'pyav' for better performance")
else:
cfg.dataset.video_backend = 'pyav'
logging.info("RLearN: Setting video_backend to 'pyav' for better performance")
dataset = make_dataset(cfg)
# Create environment used for evaluating checkpoints during training on simulation data.
@@ -136,9 +205,18 @@ def train(cfg: TrainPipelineConfig):
eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
logging.info("Creating policy")
# Pass episode_data_index for RLearN to calculate proper progress
episode_data_index = dataset.episode_data_index if hasattr(dataset, "episode_data_index") else None
policy = make_policy(
cfg=cfg.policy,
ds_meta=dataset.meta,
episode_data_index=episode_data_index,
)
preprocessor, postprocessor = make_processor(
policy_cfg=cfg.policy, pretrained_path=cfg.policy.pretrained_path, dataset_stats=dataset.meta.stats
)
logging.info("Creating optimizer and scheduler")
@@ -170,6 +248,15 @@ def train(cfg: TrainPipelineConfig):
drop_n_last_frames=cfg.policy.drop_n_last_frames,
shuffle=True,
)
elif cfg.policy.type == "rlearn":
# For RLearN, drop first 15 frames to avoid padding issues with temporal windows
shuffle = False
sampler = EpisodeAwareSampler(
dataset.episode_data_index,
drop_n_first_frames=15, # Skip frames that would need padding
drop_n_last_frames=0,
shuffle=True,
)
else:
shuffle = True
sampler = None
@@ -182,6 +269,9 @@ def train(cfg: TrainPipelineConfig):
sampler=sampler,
pin_memory=device.type == "cuda",
drop_last=False,
persistent_workers=cfg.num_workers > 0, # Keep workers alive between epochs
prefetch_factor=3, # Prefetch for video pipeline
timeout=30, # Prevent hanging on video decode errors
)
dl_iter = cycle(dataloader)
@@ -194,6 +284,12 @@ def train(cfg: TrainPipelineConfig):
"update_s": AverageMeter("updt_s", ":.3f"),
"dataloading_s": AverageMeter("data_s", ":.3f"),
}
# RLearN-only: pixels per second throughput
try:
if getattr(policy, "name", None) == "rlearn":
train_metrics["pix_s"] = AverageMeter("pix/s", ":.1f")
except Exception:
pass
train_tracker = MetricsTracker(
cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step
@@ -201,9 +297,17 @@ def train(cfg: TrainPipelineConfig):
logging.info("Start offline training on a fixed dataset")
for _ in range(step, cfg.steps):
start_time = time.perf_counter()
# Data loading timing
data_start = time.perf_counter()
batch = next(dl_iter)
train_tracker.dataloading_s = time.perf_counter() - start_time
data_loading_time = time.perf_counter() - data_start
# Preprocessing timing
preprocess_start = time.perf_counter()
batch = preprocessor(batch)
preprocess_time = time.perf_counter() - preprocess_start
train_tracker.dataloading_s = data_loading_time + preprocess_time
for key in batch:
if isinstance(batch[key], torch.Tensor):
@@ -220,6 +324,73 @@ def train(cfg: TrainPipelineConfig):
use_amp=cfg.policy.use_amp,
)
# RLearN-only: compute pixel throughput (pixels per second)
if getattr(policy, "name", None) == "rlearn":
def _count_pixels(x: torch.Tensor) -> int:
# Expect shapes: (B,T,C,H,W) or (B,C,H,W)
if x.dim() == 5:
b, t, _, h, w = x.shape
return int(b * t * h * w)
if x.dim() == 4:
b, _, h, w = x.shape
return int(b * h * w)
return 0
total_pixels = 0
for k, v in batch.items():
if "image" not in k.lower():
continue
if isinstance(v, torch.Tensor):
total_pixels += _count_pixels(v)
elif isinstance(v, list) and len(v) > 0 and isinstance(v[0], torch.Tensor):
# list of T tensors shaped (B,C,H,W)
total_pixels += sum(_count_pixels(t) for t in v)
# Avoid div-by-zero
meter = train_tracker.update_s
upd_s = meter.val if isinstance(meter, AverageMeter) else float(meter)
upd_s = max(upd_s, 1e-8)
pix_per_s = float(total_pixels) / upd_s
try:
train_tracker.pix_s = pix_per_s
except Exception:
pass
# Collect data pipeline timing for RLearN (averaged reporting every minute)
if getattr(policy, "name", None) == "rlearn":
# Initialize data timing accumulator if not exists
if not hasattr(policy, '_data_timing_stats'):
policy._data_timing_stats = {
'data_loading_times': [],
'preprocess_times': [],
'last_print_time': time.perf_counter()
}
# Accumulate current step's data timings
data_stats = policy._data_timing_stats
data_stats['data_loading_times'].append(data_loading_time * 1000)
data_stats['preprocess_times'].append(preprocess_time * 1000)
# Print averaged stats every minute (60 seconds)
current_time = time.perf_counter()
if current_time - data_stats['last_print_time'] >= 60.0:
n_samples = len(data_stats['data_loading_times'])
if n_samples > 0:
avg_data_loading = sum(data_stats['data_loading_times']) / n_samples
avg_preprocessing = sum(data_stats['preprocess_times']) / n_samples
print(f"\nData Pipeline Average Timing (last {n_samples} steps):")
print(f" Data loading: {avg_data_loading:.2f} ms")
print(f" Preprocessing: {avg_preprocessing:.2f} ms")
print(f" Total data pipeline: {avg_data_loading + avg_preprocessing:.2f} ms")
print("-" * 50)
# Reset stats for next minute
for key in data_stats:
if key != 'last_print_time':
data_stats[key] = []
data_stats['last_print_time'] = current_time
# Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
# increment `step` here.
step += 1
@@ -228,6 +399,7 @@ def train(cfg: TrainPipelineConfig):
is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
if is_log_step:
logging.info(train_tracker)
if wandb_logger:
@@ -240,7 +412,7 @@ def train(cfg: TrainPipelineConfig):
if cfg.save_checkpoint and is_saving_step:
logging.info(f"Checkpoint policy after step {step}")
checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)
save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler)
save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler, preprocessor)
update_last_checkpoint(checkpoint_dir)
if wandb_logger:
wandb_logger.log_policy(checkpoint_dir)
@@ -278,12 +450,18 @@ def train(cfg: TrainPipelineConfig):
wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval")
if eval_env:
eval_env.close()
logging.info("End of training")
if cfg.policy.push_to_hub:
policy.push_model_to_hub(cfg)
if preprocessor:
preprocessor.push_to_hub(cfg.policy.repo_id)
if postprocessor:
postprocessor.push_to_hub(cfg.policy.repo_id)
def main():
+1 -1
View File
@@ -109,7 +109,7 @@ def teleop_loop(
action = teleop.get_action()
if display_data:
observation = robot.get_observation()
log_rerun_data(observation, action)
log_rerun_data(observation=observation, action=action)
robot.send_action(action)
dt_s = time.perf_counter() - loop_start
@@ -0,0 +1,18 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config_phone import PhoneConfig
from .phone import Phone
@@ -0,0 +1,36 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from enum import Enum

import numpy as np

from ..config import TeleoperatorConfig
class PhoneOS(Enum):
    """Operating system of the teleoperation phone; selects the streaming backend."""

    # Android streams pose via the `teleop` package (WebXR).
    ANDROID = "android"
    # iOS streams pose via the HEBI Mobile I/O app (ARKit).
    IOS = "ios"
@TeleoperatorConfig.register_subclass("phone")
@dataclass
class PhoneConfig(TeleoperatorConfig):
    """Configuration for phone-based teleoperation (iOS ARKit or Android WebXR)."""

    # Which OS the phone runs; determines the pose-streaming backend used.
    phone_os: PhoneOS = PhoneOS.IOS
    # Offset (meters, x/y/z) from the phone camera to the reference point:
    # iPhone 14 Pro camera is 2cm off center and 4cm above center.
    # Annotated as a real dataclass field with a default_factory: the original
    # bare class attribute was a single mutable np.ndarray shared by every
    # instance, and was invisible to dataclass __init__/repr/asdict.
    camera_offset: np.ndarray = field(default_factory=lambda: np.array([0.0, -0.02, 0.04]))
+246
View File
@@ -0,0 +1,246 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Docs:
# hebi: https://docs.hebi.us/tools.html#mobile-io
# teleop: https://github.com/SpesRobotics/teleop
import logging
import threading
import time
import hebi
import numpy as np
from scipy.spatial.transform import Rotation
from teleop import Teleop
from lerobot.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
from lerobot.teleoperators.phone.config_phone import PhoneConfig, PhoneOS
from lerobot.teleoperators.teleoperator import Teleoperator
logger = logging.getLogger(__name__)
class Phone(Teleoperator):
    """
    Phone-based teleoperator using ARKit (iOS via HEBI Mobile I/O App) or the teleop Python package (Android via WebXR API).
    For HEBI Mobile I/O we also expose 8 analog (a1-a8) and 8 digital (b1-b8) inputs.
    Press and hold **B1** to enable teleoperation. While enabled, the first B1 press
    captures a reference pose and rotation, when disabled and pressed again the position is reapplied.
    """

    config_class = PhoneConfig
    name = "phone"

    def __init__(self, config: PhoneConfig):
        super().__init__(config)
        self.config = config
        # iOS backend: HEBI lookup group. Android backend: teleop server running
        # in a daemon thread.
        self._group = None
        self._teleop = None
        self._teleop_thread = None
        # Latest pose/message written by the Android callback (teleop thread)
        # and read by the control loop.
        self._latest_pose = None
        self._latest_message = None
        # True while teleoperation is enabled (B1 held on iOS / 'move' on Android).
        self._enabled: bool = False
        # Reference pose captured at calibration time; None until calibrate().
        self._calib_pos: np.ndarray | None = None
        self._calib_rot_inv: Rotation | None = None

    @property
    def is_connected(self) -> bool:
        # Connected means the backend matching the configured OS is initialized.
        return (self.config.phone_os == PhoneOS.IOS and self._group is not None) or (
            self.config.phone_os == PhoneOS.ANDROID and self._teleop is not None
        )

    def connect(self) -> None:
        """Connect to the phone backend and run initial calibration.

        Raises:
            DeviceAlreadyConnectedError: if already connected.
            RuntimeError: if the HEBI Mobile I/O module cannot be found (iOS).
            ValueError: if the configured phone_os is not a known PhoneOS.
        """
        if self.is_connected:
            raise DeviceAlreadyConnectedError(f"{self} already connected")
        if self.config.phone_os == PhoneOS.IOS:
            logger.info("Connecting to IPhone, make sure to open the HEBI Mobile I/O app.")
            lookup = hebi.Lookup()
            # Give the lookup a moment to discover modules on the network.
            time.sleep(2.0)
            group = lookup.get_group_from_names(["HEBI"], ["mobileIO"])
            if group is None:
                raise RuntimeError("Mobile I/O not found — check name/family settings in the app.")
            self._group = group
            logger.info(f"{self} connected to HEBI group with {group.size} module(s).")
        elif self.config.phone_os == PhoneOS.ANDROID:
            logger.info("Starting teleop stream for Android...")
            self._teleop = Teleop()
            self._teleop.subscribe(self._android_callback)
            # Daemon thread so a forgotten disconnect() does not block interpreter exit.
            self._teleop_thread = threading.Thread(target=self._teleop.run, daemon=True)
            self._teleop_thread.start()
            logger.info(f"{self} connected, teleop stream started.")
        else:
            raise ValueError(f"Invalid config phone_os: {self.config.phone_os}")
        self.calibrate()

    def calibrate(self) -> None:
        """Capture the current phone pose as the calibration reference.

        Blocks until the user triggers the capture (B1 on iOS, 'move' on Android).
        """
        print(
            "Hold the phone so that: top edge points forward in same direction as the robot (robot +x) and screen points up (robot +z)"
        )
        if self.config.phone_os == PhoneOS.IOS:
            print("Press and hold B1 in the HEBI Mobile I/O app to capture this pose...\n")
        else:
            print("Touch and move on the WebXR page to capture this pose...\n")
        pos, rot = self._wait_for_capture_trigger()
        # Store position and the inverse rotation so later poses can be
        # expressed relative to this reference frame.
        self._calib_pos = pos.copy()
        self._calib_rot_inv = rot.inv()
        self._enabled = False
        print("Calibration done\n")

    def _reapply_position_calibration(self, pos: np.ndarray) -> None:
        # Re-anchor only the position reference (rotation reference is kept).
        self._calib_pos = pos.copy()

    @property
    def is_calibrated(self) -> bool:
        return (self._calib_pos is not None) and (self._calib_rot_inv is not None)

    @property
    def action_features(self) -> dict[str, type]:
        return {
            "phone.pos": np.ndarray,  # shape (3,)
            "phone.rot": Rotation,  # scipy.spatial.transform.Rotation
            "phone.raw_inputs": dict,  # analogs/buttons or webXR meta
            "phone.enabled": bool,
        }

    def _wait_for_capture_trigger(self) -> tuple[np.ndarray, Rotation]:
        """Wait trigger for calibration: iOS: B1. Android: 'move'."""
        while True:
            ok, pos, rot, pose = self._read_current_pose()
            if not ok:
                # No pose yet; poll again shortly.
                time.sleep(0.01)
                continue
            if self.config.phone_os == PhoneOS.IOS:
                io = getattr(pose, "io", None)
                b = getattr(io, "b", None) if io is not None else None
                b1 = False
                if b is not None:
                    b1 = bool(b.get_int(1))
                if b1:
                    return pos, rot
            else:
                msg = self._latest_message or {}
                if bool(msg.get("move", False)):
                    return pos, rot
            time.sleep(0.01)

    def _read_current_pose(self) -> tuple[bool, np.ndarray | None, Rotation | None, object | None]:
        """Return (ok, position, rotation, raw_pose) for the latest phone sample.

        ok is False when no pose data is available yet; the other values are
        then None. Position is compensated for the configured camera offset.
        """
        if self.config.phone_os == PhoneOS.IOS:
            fbk = self._group.get_next_feedback()
            pose = fbk[0]
            ar_pos = getattr(pose, "ar_position", None)
            ar_quat = getattr(pose, "ar_orientation", None)
            if ar_pos is None or ar_quat is None:
                return False, None, None, None
            quat_xyzw = np.concatenate((ar_quat[1:], [ar_quat[0]]))  # wxyz to xyzw
            rot = Rotation.from_quat(quat_xyzw)
            # Shift from the camera's position to the phone reference point.
            pos = ar_pos - rot.apply(self.config.camera_offset)
            return True, pos, rot, pose
        else:
            p = self._latest_pose
            if p is None:
                return False, None, None, None
            # Android delivers a 4x4 homogeneous transform matrix.
            rot = Rotation.from_matrix(p[:3, :3])
            pos = p[:3, 3] - rot.apply(self.config.camera_offset)
            pose = self._latest_pose
            return True, pos, rot, pose

    @property
    def feedback_features(self) -> dict[str, type]:
        # No haptic or other feedback implemented yet
        # NOTE(review): returns None despite the dict annotation — confirm
        # callers tolerate None before relying on this property.
        pass

    def configure(self) -> None:
        # No additional configuration required for phone teleop
        pass

    def _android_callback(self, pose: np.ndarray, message: dict) -> None:
        # Called from the teleop thread with each new WebXR sample.
        self._latest_pose = pose
        self._latest_message = message
        time.sleep(0.001)  # 1ms delay to avoid race condition

    def get_action(self) -> dict:
        """Return the calibrated phone pose and raw inputs, or {} if unavailable."""
        ok, raw_pos, raw_rot, pose = self._read_current_pose()
        if not ok or not self.is_calibrated:
            return {}
        # Collect raw inputs (B1 / analogs on iOS, move/scale on Android)
        raw_inputs: dict[str, float | int | bool] = {}
        if self.config.phone_os == PhoneOS.IOS:
            io = getattr(pose, "io", None)
            if io is not None:
                bank_a, bank_b = io.a, io.b
                if bank_a:
                    for ch in range(1, 9):
                        if bank_a.has_float(ch):
                            raw_inputs[f"a{ch}"] = float(bank_a.get_float(ch))
                if bank_b:
                    for ch in range(1, 9):
                        if bank_b.has_int(ch):
                            raw_inputs[f"b{ch}"] = int(bank_b.get_int(ch))
                        elif hasattr(bank_b, "has_bool") and bank_b.has_bool(ch):
                            raw_inputs[f"b{ch}"] = int(bank_b.get_bool(ch))
        else:
            msg = self._latest_message or {}
            raw_inputs["move"] = bool(msg.get("move", False))
            raw_inputs["scale"] = float(msg.get("scale", 1.0))
            raw_inputs["reservedButtonA"] = bool(msg.get("reservedButtonA", False))
            raw_inputs["reservedButtonB"] = bool(msg.get("reservedButtonB", False))
        if self.config.phone_os == PhoneOS.IOS:
            enable = bool(raw_inputs.get("b1", 0))
        else:
            enable = bool(raw_inputs.get("move", False))
        # Rising edge then re-capture calibration immediately from current raw pose
        if enable and not self._enabled:
            self._reapply_position_calibration(raw_pos)
        # Apply calibration
        pos_cal = self._calib_rot_inv.apply(raw_pos - self._calib_pos)
        rot_cal = self._calib_rot_inv * raw_rot
        self._enabled = enable
        return {
            "phone.pos": pos_cal,
            "phone.rot": rot_cal,
            "phone.raw_inputs": raw_inputs,
            "phone.enabled": self._enabled,
        }

    def send_feedback(self, feedback: dict[str, float]) -> None:
        # We could add haptic feedback (vibrations) here, but it's not implemented yet
        raise NotImplementedError

    def disconnect(self) -> None:
        """Tear down the active backend.

        Raises:
            DeviceNotConnectedError: if not connected.
        """
        if not self.is_connected:
            raise DeviceNotConnectedError(f"{self} is not connected.")
        if self.config.phone_os == PhoneOS.IOS:
            self._group = None
        else:
            self._teleop = None
            if self._teleop_thread and self._teleop_thread.is_alive():
                # Bounded join: the daemon thread dies with the process anyway.
                self._teleop_thread.join(timeout=1.0)
            self._teleop_thread = None
            self._latest_pose = None
@@ -0,0 +1,87 @@
# !/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from lerobot.configs.types import PolicyFeature
from lerobot.processor.pipeline import ActionProcessor, ProcessorStepRegistry
from lerobot.teleoperators.phone.config_phone import PhoneOS
@ProcessorStepRegistry.register("map_phone_action_to_robot_action")
@dataclass
class MapPhoneActionToRobotAction(ActionProcessor):
    """
    Map calibrated phone pose (actions) to the inputs for robot actions
    Expected input ACTION keys:
    {
        "action.phone.enabled": bool,
        "action.phone.pos": np.ndarray,
        "action.phone.rot": Rotation,
        "action.phone.raw_inputs": dict,
    }
    Output ACTION keys:
    {
        "action.enabled": bool,
        "action.target_{x,y,z,wx,wy,wz}" : float
        "action.gripper": float,
    }
    """

    # Determines which raw inputs drive the gripper (analog a3 on iOS,
    # reserved buttons on Android).
    platform: PhoneOS
    # Reserved for rising-edge detection of the enable signal; not used yet.
    _enabled_prev: bool = field(default=False, init=False, repr=False)

    def action(self, act: dict | None) -> dict:
        """Consume the phone.* keys from *act* (in place) and emit robot target keys."""
        # The annotation admits None; start from an empty action instead of
        # crashing on .pop() (fixes a TypeError for act=None).
        if act is None:
            act = {}
        # Pop them from the action
        enabled = act.pop("action.phone.enabled", 0)
        pos = act.pop("action.phone.pos", None)
        rot = act.pop("action.phone.rot", None)
        inputs = act.pop("action.phone.raw_inputs", {})
        if pos is None or rot is None:
            # No pose available: forward the remaining action untouched.
            return act
        rotvec = rot.as_rotvec()  # Absolute orientation as rotvec
        # Map certain inputs to certain actions
        if self.platform == PhoneOS.IOS:
            gripper = float(inputs.get("a3", 0.0))
        else:
            a = float(inputs.get("reservedButtonA", 0.0))
            b = float(inputs.get("reservedButtonB", 0.0))
            gripper = (
                a - b
            )  # Positive if a is pressed, negative if b is pressed, 0 if both or neither are pressed
        # For some actions we need to invert the axis
        act.update(
            {
                "action.enabled": enabled,
                "action.target_x": -pos[1] if enabled else 0.0,
                "action.target_y": pos[0] if enabled else 0.0,
                "action.target_z": pos[2] if enabled else 0.0,
                "action.target_wx": rotvec[1] if enabled else 0.0,
                "action.target_wy": rotvec[0] if enabled else 0.0,
                "action.target_wz": -rotvec[2] if enabled else 0.0,
                "action.gripper": gripper,  # Still send gripper action when disabled
            }
        )
        return act

    def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
        """This step does not add or remove policy features."""
        return features
+7
View File
@@ -31,6 +31,7 @@ from termcolor import colored
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.utils import DEFAULT_FEATURES
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.processor import RobotProcessor, TransitionKey
from lerobot.robots import Robot
@@ -101,6 +102,8 @@ def predict_action(
observation: dict[str, np.ndarray],
policy: PreTrainedPolicy,
device: torch.device,
preprocessor: RobotProcessor,
postprocessor: RobotProcessor,
use_amp: bool,
task: str | None = None,
robot_type: str | None = None,
@@ -122,10 +125,14 @@ def predict_action(
observation["task"] = task if task else ""
observation["robot_type"] = robot_type if robot_type else ""
observation = preprocessor(observation)
# Compute the next action with the policy
# based on the current observation
action = policy.select_action(observation)
action: torch.Tensor = postprocessor({TransitionKey.ACTION: action})[TransitionKey.ACTION]
# Remove batch dimension
action = action.squeeze(0)
+1
View File
@@ -58,6 +58,7 @@ def is_package_available(pkg_name: str, return_version: bool = False) -> tuple[b
_torch_available, _torch_version = is_package_available("torch", return_version=True)
_transformers_available = is_package_available("transformers")
_gym_xarm_available = is_package_available("gym_xarm")
_gym_aloha_available = is_package_available("gym_aloha")
_gym_pusht_available = is_package_available("gym_pusht")
+7 -1
View File
@@ -74,6 +74,7 @@ def save_checkpoint(
policy: PreTrainedPolicy,
optimizer: Optimizer,
scheduler: LRScheduler | None = None,
preprocessor=None,
) -> None:
"""This function creates the following directory structure:
@@ -81,7 +82,9 @@ def save_checkpoint(
pretrained_model/
config.json # policy config
model.safetensors # policy weights
train_config.json # train config
train_config.json # train config
processor.json # processor config (if preprocessor provided)
step_*.safetensors # processor state files (if any)
training_state/
optimizer_param_groups.json # optimizer param groups
optimizer_state.safetensors # optimizer state
@@ -95,10 +98,13 @@ def save_checkpoint(
policy (PreTrainedPolicy): The policy to save.
optimizer (Optimizer | None, optional): The optimizer to save the state from. Defaults to None.
scheduler (LRScheduler | None, optional): The scheduler to save the state from. Defaults to None.
preprocessor: The preprocessor/pipeline to save. Defaults to None.
"""
pretrained_dir = checkpoint_dir / PRETRAINED_MODEL_DIR
policy.save_pretrained(pretrained_dir)
cfg.save_pretrained(pretrained_dir)
if preprocessor is not None:
preprocessor.save_pretrained(pretrained_dir)
save_training_state(checkpoint_dir, step, optimizer, scheduler)
+86 -15
View File
@@ -12,12 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numbers
import os
from typing import Any
import numpy as np
import rerun as rr
from lerobot.processor.pipeline import EnvTransition, TransitionKey
def _init_rerun(session_name: str = "lerobot_control_loop") -> None:
"""Initializes the Rerun SDK for visualizing the control loop."""
@@ -28,19 +31,87 @@ def _init_rerun(session_name: str = "lerobot_control_loop") -> None:
rr.spawn(memory_limit=memory_limit)
def log_rerun_data(observation: dict[str | Any], action: dict[str | Any]):
for obs, val in observation.items():
if isinstance(val, float):
rr.log(f"observation.{obs}", rr.Scalar(val))
elif isinstance(val, np.ndarray):
if val.ndim == 1:
for i, v in enumerate(val):
rr.log(f"observation.{obs}_{i}", rr.Scalar(float(v)))
def _is_scalar(x):
return (
isinstance(x, numbers.Real)
or isinstance(x, (np.integer, np.floating))
or (isinstance(x, np.ndarray) and x.ndim == 0)
)
def log_rerun_data(
data: list[dict[str | Any] | EnvTransition] | dict[str | Any] | EnvTransition | None = None,
*,
observation: dict[str, Any] | None = None,
action: dict[str, Any] | None = None,
) -> None:
items = data if isinstance(data, list) else ([data] if data is not None else [])
obs = {} if observation is None else dict(observation)
act = {} if action is None else dict(action)
for idx, item in enumerate(items):
if not isinstance(item, dict):
continue
if any(isinstance(k, TransitionKey) for k in item.keys()):
o = item.get(TransitionKey.OBSERVATION) or {}
a = item.get(TransitionKey.ACTION) or {}
if isinstance(o, dict):
obs.update(o)
if isinstance(a, dict):
act.update(a)
continue
keys = list(item.keys())
has_obs = any(str(k).startswith("observation.") for k in keys)
has_act = any(str(k).startswith("action.") for k in keys)
if has_obs or has_act:
if has_obs:
obs.update(item)
if has_act:
act.update(item)
else:
# No prefixes: assume first is observation, second is action, others are observation
if idx == 0:
obs.update(item)
elif idx == 1:
act.update(item)
else:
rr.log(f"observation.{obs}", rr.Image(val), static=True)
for act, val in action.items():
if isinstance(val, float):
rr.log(f"action.{act}", rr.Scalar(val))
elif isinstance(val, np.ndarray):
for i, v in enumerate(val):
rr.log(f"action.{act}_{i}", rr.Scalar(float(v)))
obs.update(item)
for k, v in obs.items():
if v is None:
continue
key = k if str(k).startswith("observation.") else f"observation.{k}"
if _is_scalar(v):
rr.log(key, rr.Scalar(float(v)))
elif isinstance(v, np.ndarray):
arr = v
# Convert CHW -> HWC when needed
if arr.ndim == 3 and arr.shape[0] in (1, 3, 4) and arr.shape[-1] not in (1, 3, 4):
arr = np.transpose(arr, (1, 2, 0))
if arr.ndim == 1:
for i, vi in enumerate(arr):
rr.log(f"{key}_{i}", rr.Scalar(float(vi)))
else:
rr.log(key, rr.Image(arr), static=True)
for k, v in act.items():
if v is None:
continue
key = k if str(k).startswith("action.") else f"action.{k}"
if _is_scalar(v):
rr.log(key, rr.Scalar(float(v)))
elif isinstance(v, np.ndarray):
if v.ndim == 1:
for i, vi in enumerate(v):
rr.log(f"{key}_{i}", rr.Scalar(float(vi)))
else:
# Fall back to flattening higher-dimensional arrays
flat = v.flatten()
for i, vi in enumerate(flat):
rr.log(f"{key}_{i}", rr.Scalar(float(vi)))
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env python
"""
Quick benchmark to test video decoding speed across different backends.
"""
import time
from pathlib import Path
import torch
def test_video_backend(video_path, backend_name, num_frames=10):
"""Test video decoding speed for a specific backend."""
try:
from lerobot.datasets.video_utils import decode_video_frames
# Create timestamps for first N frames
fps = 30 # Assume 30fps, adjust if needed
timestamps = [i / fps for i in range(num_frames)]
# Time the decoding
start_time = time.perf_counter()
frames = decode_video_frames(video_path, timestamps, tolerance_s=1e-4, backend=backend_name)
decode_time = time.perf_counter() - start_time
frames_decoded = frames.shape[1] if frames.dim() > 1 else frames.shape[0]
ms_per_frame = (decode_time * 1000) / max(frames_decoded, 1)
print(f"{backend_name:12} | {decode_time*1000:6.1f}ms total | {ms_per_frame:6.1f}ms/frame | {frames_decoded} frames")
return decode_time, frames_decoded
except Exception as e:
print(f"{backend_name:12} | ERROR: {str(e)[:50]}...")
return float('inf'), 0
def main():
    """Download a sample LeRobot dataset, locate one of its video files, and
    benchmark every known decoding backend on it, printing a recommendation.

    Side effects only (network download, console output); returns None.
    """
    print("📦 Downloading dataset to get video file locations...")
    try:
        from lerobot.datasets.lerobot_dataset import LeRobotDataset
        # Download the dataset - this will tell us exactly where it's stored
        dataset = LeRobotDataset("kenmacken/record-test-2", download_videos=True)
        print(f"✅ Dataset downloaded to: {dataset.root}")
        print(f" Video keys: {dataset.meta.video_keys}")
        print(f" Total episodes: {dataset.meta.total_episodes}")
        # Get actual video file paths from the dataset
        video_files = []
        for ep_idx in range(min(2, dataset.meta.total_episodes)): # Test first 2 episodes max
            for vid_key in dataset.meta.video_keys:
                video_path = dataset.root / dataset.meta.get_video_file_path(ep_idx, vid_key)
                if video_path.exists():
                    video_files.append(video_path)
                    break # Just need one video file for testing
            if video_files:
                break
        if not video_files:
            print("❌ No video files found after download!")
            return
    except Exception as e:
        # Broad catch is deliberate: any download/API failure falls back to a
        # filesystem search of the usual cache locations.
        print(f"❌ Error downloading dataset: {e}")
        # Fallback to manual search
        possible_paths = [
            Path.home() / ".cache/huggingface/lerobot/kenmacken/record-test-2",
            Path("/tmp/huggingface/lerobot/kenmacken/record-test-2"),
            Path("./datasets/record-test-2"),
        ]
        video_files = []
        print("Trying fallback search...")
        for path in possible_paths:
            print(f" Checking: {path}")
            if path.exists():
                files = list(path.rglob("*.mp4"))
                if files:
                    video_files = files
                    print(f" ✅ Found {len(files)} video files!")
                    break
        if not video_files:
            print("❌ No video files found!")
            return
    test_video = video_files[0]
    print(f"Testing video: {test_video.name}")
    print(f"File size: {test_video.stat().st_size / 1024 / 1024:.1f} MB")
    print("-" * 60)
    backends = ["torchcodec", "pyav", "video_reader"]
    results = {}  # backend name -> (decode_time_seconds, frames_decoded)
    for backend in backends:
        decode_time, frames = test_video_backend(test_video, backend)
        results[backend] = (decode_time, frames)
    print("-" * 60)
    print("RECOMMENDATION:")
    # Find fastest backend
    # Failed backends report decode_time == inf; drop them before ranking.
    valid_results = {k: v for k, v in results.items() if v[0] != float('inf')}
    if valid_results:
        fastest = min(valid_results.items(), key=lambda x: x[1][0])
        print(f"🚀 Use '{fastest[0]}' - fastest backend!")
        print(f" Add to your config: video_backend: \"{fastest[0]}\"")
        # NOTE(review): max() compares (time, frames) tuples lexicographically;
        # since time is the first element, [0] still yields the slowest time.
        slowest_time = max(valid_results.values())[0]
        speedup = slowest_time / fastest[1][0]
        print(f" Speedup vs slowest: {speedup:.1f}x faster")
    else:
        print("❌ No backends worked!")
if __name__ == "__main__":
    main()
+132
View File
@@ -0,0 +1,132 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from datasets import Dataset
from huggingface_hub import DatasetCard
from lerobot.datasets.push_dataset_to_hub.utils import calculate_episode_data_index
from lerobot.datasets.utils import create_lerobot_dataset_card, hf_transform_to_torch, merge_features
def test_default_parameters():
    """A card built with no arguments carries the stock LeRobot metadata."""
    card = create_lerobot_dataset_card()
    assert isinstance(card, DatasetCard)
    default_config = {
        "config_name": "default",
        "data_files": "data/*/*.parquet",
    }
    assert card.data.tags == ["LeRobot"]
    assert card.data.task_categories == ["robotics"]
    assert card.data.configs == [default_config]
def test_with_tags():
    """User-supplied tags are appended after the mandatory "LeRobot" tag."""
    custom = ["tag1", "tag2"]
    assert create_lerobot_dataset_card(tags=custom).data.tags == ["LeRobot", *custom]
def test_calculate_episode_data_index():
    """Episode start/end indices are derived from the episode_index column."""
    table = {
        "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        "index": [0, 1, 2, 3, 4, 5],
        "episode_index": [0, 0, 1, 2, 2, 2],
    }
    dataset = Dataset.from_dict(table)
    dataset.set_transform(hf_transform_to_torch)
    bounds = calculate_episode_data_index(dataset)
    assert torch.equal(bounds["from"], torch.tensor([0, 2, 3]))
    assert torch.equal(bounds["to"], torch.tensor([2, 3, 6]))
def test_merge_simple_vectors():
    """Merging two vector features unions their names and recomputes the shape."""
    left = {"action": {"dtype": "float32", "shape": (2,), "names": ["ee.x", "ee.y"]}}
    right = {"action": {"dtype": "float32", "shape": (2,), "names": ["ee.y", "ee.z"]}}
    merged = merge_features(left, right)
    assert "action" in merged
    assert merged["action"]["dtype"] == "float32"
    # Name order is preserved and duplicates are dropped (de-duplication).
    assert merged["action"]["names"] == ["ee.x", "ee.y", "ee.z"]
    # Shape is recomputed from the merged name list.
    assert merged["action"]["shape"] == (3,)
def test_merge_multiple_groups_order_and_dedup():
    """A three-way merge keeps first-seen name order and drops repeats."""
    groups = [
        {"action": {"dtype": "float32", "shape": (2,), "names": ["a", "b"]}},
        {"action": {"dtype": "float32", "shape": (2,), "names": ["b", "c"]}},
        {"action": {"dtype": "float32", "shape": (3,), "names": ["a", "c", "d"]}},
    ]
    merged = merge_features(*groups)
    assert merged["action"]["names"] == ["a", "b", "c", "d"]
    assert merged["action"]["shape"] == (4,)
def test_non_vector_last_wins_for_images():
    """For image features (non-vectors) the last group's spec overwrites earlier ones."""
    low_res = {
        "observation.images.front": {
            "dtype": "image",
            "shape": (3, 480, 640),
            "names": ["channels", "height", "width"],
        }
    }
    high_res = {
        "observation.images.front": {
            "dtype": "image",
            "shape": (3, 720, 1280),
            "names": ["channels", "height", "width"],
        }
    }
    merged = merge_features(low_res, high_res)
    spec = merged["observation.images.front"]
    assert spec["shape"] == (3, 720, 1280)
    assert spec["dtype"] == "image"
def test_dtype_mismatch_raises():
    """Conflicting dtypes for the same feature name are rejected."""
    as_f32 = {"action": {"dtype": "float32", "shape": (1,), "names": ["a"]}}
    as_f64 = {"action": {"dtype": "float64", "shape": (1,), "names": ["b"]}}
    with pytest.raises(ValueError, match="dtype mismatch for 'action'"):
        merge_features(as_f32, as_f64)
def test_non_dict_passthrough_last_wins():
    """Entries whose values are not dicts pass through; the last group wins."""
    merged = merge_features({"misc": 123}, {"misc": 456})
    assert merged["misc"] == 456
-55
View File
@@ -1,55 +0,0 @@
#!/usr/bin/env python
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from datasets import Dataset
from huggingface_hub import DatasetCard
from lerobot.datasets.push_dataset_to_hub.utils import calculate_episode_data_index
from lerobot.datasets.utils import create_lerobot_dataset_card, hf_transform_to_torch
def test_default_parameters():
card = create_lerobot_dataset_card()
assert isinstance(card, DatasetCard)
assert card.data.tags == ["LeRobot"]
assert card.data.task_categories == ["robotics"]
assert card.data.configs == [
{
"config_name": "default",
"data_files": "data/*/*.parquet",
}
]
def test_with_tags():
tags = ["tag1", "tag2"]
card = create_lerobot_dataset_card(tags=tags)
assert card.data.tags == ["LeRobot", "tag1", "tag2"]
def test_calculate_episode_data_index():
dataset = Dataset.from_dict(
{
"timestamp": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
"index": [0, 1, 2, 3, 4, 5],
"episode_index": [0, 0, 1, 2, 2, 2],
},
)
dataset.set_transform(hf_transform_to_torch)
episode_data_index = calculate_episode_data_index(dataset)
assert torch.equal(episode_data_index["from"], torch.tensor([0, 2, 3]))
assert torch.equal(episode_data_index["to"], torch.tensor([2, 3, 6]))
+188
View File
@@ -0,0 +1,188 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test script for RLearN evaluation metrics.
This script tests the VOC-S and success/failure detection metrics with synthetic data
to ensure they work correctly before running on real datasets.
"""
import numpy as np
from lerobot.policies.rlearn.evaluation import (
compute_success_failure_detection,
compute_voc_s,
generate_mismatched_languages,
)
def test_voc_s():
    """Test VOC-S computation with synthetic data.

    Exercises four reward-curve shapes: perfectly increasing, perfectly
    decreasing, random, and a mixture — checking the sign/magnitude of the
    reported correlation statistics each time.
    """
    print("Testing VOC-S computation...")
    # Test case 1: Perfect positive correlation (0 -> 1)
    perfect_positive = [np.linspace(0, 1, 20) for _ in range(10)]
    results = compute_voc_s(perfect_positive)
    print("Perfect positive correlation:")
    print(f" Mean: {results['voc_s_mean']:.4f} (should be ~1.0)")
    print(f" IQM: {results['voc_s_iqm']:.4f} (should be ~1.0)")
    assert results["voc_s_mean"] > 0.95, f"Expected >0.95, got {results['voc_s_mean']}"
    # Test case 2: Perfect negative correlation (1 -> 0)
    perfect_negative = [np.linspace(1, 0, 20) for _ in range(10)]
    results = compute_voc_s(perfect_negative)
    print("Perfect negative correlation:")
    print(f" Mean: {results['voc_s_mean']:.4f} (should be ~-1.0)")
    print(f" IQM: {results['voc_s_iqm']:.4f} (should be ~-1.0)")
    assert results["voc_s_mean"] < -0.95, f"Expected <-0.95, got {results['voc_s_mean']}"
    # Test case 3: No correlation (random)
    # Seeded so the |mean| < 0.3 tolerance below is deterministic.
    np.random.seed(42)
    random_rewards = [np.random.random(20) for _ in range(50)]
    results = compute_voc_s(random_rewards)
    print("Random correlation:")
    print(f" Mean: {results['voc_s_mean']:.4f} (should be ~0.0)")
    print(f" IQM: {results['voc_s_iqm']:.4f} (should be ~0.0)")
    assert abs(results["voc_s_mean"]) < 0.3, f"Expected ~0, got {results['voc_s_mean']}"
    # Test case 4: Mixed correlations — only printed, no assertion on the mix.
    mixed = []
    mixed.extend([np.linspace(0, 1, 15) for _ in range(5)]) # Positive
    mixed.extend([np.linspace(1, 0, 15) for _ in range(5)]) # Negative
    mixed.extend([np.random.random(15) for _ in range(5)]) # Random
    results = compute_voc_s(mixed)
    print("Mixed correlations:")
    print(f" Mean: {results['voc_s_mean']:.4f}")
    print(f" IQM: {results['voc_s_iqm']:.4f}")
    print(f" Std: {results['voc_s_std']:.4f}")
    print("✓ VOC-S tests passed!\n")
def test_success_failure_detection():
    """Test success/failure detection with synthetic data.

    Checks three regimes: fully separable reward distributions (accuracy 1.0),
    identical distributions (accuracy ~0.5), and partially overlapping ones
    (printed only, no assertion).
    """
    print("Testing Success/Failure Detection...")
    # Test case 1: Clear separation (correct > incorrect)
    correct_rewards = [np.linspace(0, 1, 20) for _ in range(20)] # Always increasing
    incorrect_rewards = [np.linspace(0, 0.3, 20) for _ in range(20)] # Lower final values
    results = compute_success_failure_detection(correct_rewards, incorrect_rewards)
    print("Clear separation test:")
    print(f" Detection accuracy: {results['detection_accuracy']:.4f} (should be 1.0)")
    print(f" Mean correct: {results['mean_correct_final']:.4f}")
    print(f" Mean incorrect: {results['mean_incorrect_final']:.4f}")
    print(f" Separation score: {results['separation_score']:.4f}")
    assert results["detection_accuracy"] == 1.0, f"Expected 1.0, got {results['detection_accuracy']}"
    # Test case 2: No separation (same distributions with some randomness)
    np.random.seed(42)
    same_rewards_1 = [np.random.normal(0.5, 0.05, 15) for _ in range(20)]
    same_rewards_2 = [np.random.normal(0.5, 0.05, 15) for _ in range(20)]
    results = compute_success_failure_detection(same_rewards_1, same_rewards_2)
    print("No separation test:")
    print(f" Detection accuracy: {results['detection_accuracy']:.4f} (should be ~0.5)")
    print(f" Separation score: {results['separation_score']:.4f} (should be ~0.0)")
    # Relax the assertion since random data can vary
    assert 0.2 <= results["detection_accuracy"] <= 0.8, (
        f"Expected ~0.5 (±0.3), got {results['detection_accuracy']}"
    )
    # Test case 3: Partial separation — inspect output only, no assertion.
    np.random.seed(42)
    partial_correct = [np.random.normal(0.7, 0.1, 10) for _ in range(20)]
    partial_incorrect = [np.random.normal(0.4, 0.1, 10) for _ in range(20)]
    results = compute_success_failure_detection(partial_correct, partial_incorrect)
    print("Partial separation test:")
    print(f" Detection accuracy: {results['detection_accuracy']:.4f}")
    print(f" Separation score: {results['separation_score']:.4f}")
    print("✓ Success/Failure Detection tests passed!\n")
def test_mismatch_generation():
    """Test mismatch language generation.

    Verifies that generated mismatches (a) keep the input length, (b) differ
    from their originals, and (c) come from the template pool when custom
    templates are given.
    """
    print("Testing mismatch language generation...")
    original_languages = [
        "pick up the red ball",
        "put the cup on the table",
        "open the drawer",
        "close the door",
    ]
    # Test with default templates
    mismatched = generate_mismatched_languages(original_languages)
    print(f"Original languages: {len(original_languages)}")
    print(f"Mismatched languages: {len(mismatched)}")
    assert len(mismatched) == len(original_languages)
    # Ensure they're actually different
    for orig, mismatch in zip(original_languages, mismatched, strict=False):
        print(f" '{orig}' -> '{mismatch}'")
        assert orig != mismatch, "Mismatch should be different from original"
    # Test with custom templates
    custom_templates = ["dance", "sing", "jump"]
    mismatched_custom = generate_mismatched_languages(original_languages, custom_templates)
    print("\nWith custom templates:")
    for orig, mismatch in zip(original_languages, mismatched_custom, strict=False):
        print(f" '{orig}' -> '{mismatch}'")
        # Fix: assert inside the loop so EVERY mismatch is checked against the
        # template pool (previously only the last loop value was validated).
        assert mismatch in custom_templates
    print("✓ Mismatch generation tests passed!\n")
def test_edge_cases():
    """Test edge cases and error handling.

    Covers empty input, single-frame episodes, constant rewards, and the
    ValueError raised for mismatched detection inputs.
    """
    print("Testing edge cases...")
    # Empty input: zero episodes and a neutral mean.
    empty_results = compute_voc_s([])
    assert empty_results["num_episodes"] == 0
    assert empty_results["voc_s_mean"] == 0.0
    # Single frame episodes (should be skipped)
    single_frame = [np.array([0.5]) for _ in range(5)]
    results = compute_voc_s(single_frame)
    assert results["num_episodes"] == 0, "Single-frame episodes should be skipped"
    # Constant rewards (should give correlation = 0)
    constant_rewards = [np.ones(10) * 0.5 for _ in range(5)]
    results = compute_voc_s(constant_rewards)
    print(f"Constant rewards correlation: {results['voc_s_mean']:.4f} (should be 0.0)")
    assert results["voc_s_mean"] == 0.0
    # Mismatched array lengths for detection must raise.
    # Fix: use try/except/else with an explicit raise instead of `assert False`
    # inside the try — bare asserts are stripped under `python -O`, which would
    # have made a missing ValueError pass silently.
    try:
        compute_success_failure_detection([np.array([1, 2])], [])
    except ValueError:
        pass  # Expected
    else:
        raise AssertionError("Should have raised ValueError")
    print("✓ Edge case tests passed!\n")
+244
View File
@@ -0,0 +1,244 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lerobot.configs.types import FeatureType, PolicyFeature
from lerobot.constants import OBS_IMAGES, OBS_LANGUAGE, REWARD
from lerobot.policies.factory import make_processor
from lerobot.policies.rlearn.configuration_rlearn import RLearNConfig
from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
from tests.utils import require_package
@require_package("transformers")
@require_package("sentence_transformers")
def test_rlearn_instantiation_and_forward_tensor_batch():
    """Instantiate RLearN and run a forward pass with a (B, T, C, H, W) tensor input using a real model and real text."""
    cfg = RLearNConfig(
        vision_model_name="facebook/dinov3-vitb16-pretrain-lvd1689m",
        text_model_name="sentence-transformers/all-MiniLM-L12-v2",
        push_to_hub=False,
        freeze_backbones=True,
    )
    cfg.input_features = {
        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
    }
    cfg.output_features = {
        REWARD: PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
    }
    policy = RLearNPolicy(cfg)
    B, T, C, H, W = 2, 3, 3, 256, 256
    batch = {
        OBS_IMAGES: torch.rand(B, T, C, H, W),
        # Fix: torch.randint's `high` is exclusive, so high=1 always produced
        # zeros — a degenerate reward signal. high=2 gives binary 0/1 labels,
        # consistent with the sibling list-batch test.
        REWARD: torch.randint(low=0, high=2, size=(B, T)).float(),
        OBS_LANGUAGE: ["move the green cube into the box" for _ in range(B)],
    }
    loss, logs = policy.forward(batch)
    assert isinstance(loss, torch.Tensor)
    assert "loss" in logs
@require_package("transformers")
@require_package("sentence_transformers")
def test_rlearn_instantiation_and_forward_list_batch_with_language():
    """Instantiate RLearN and run a forward pass with a list-of-frames input and real language using a real model."""
    cfg = RLearNConfig(
        vision_model_name="facebook/dinov3-vitb16-pretrain-lvd1689m",
        text_model_name="sentence-transformers/all-MiniLM-L12-v2",
        push_to_hub=False,
        freeze_backbones=True,
    )
    cfg.input_features = {
        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
    }
    cfg.output_features = {
        REWARD: PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
    }
    policy = RLearNPolicy(cfg)
    B, T, C, H, W = 2, 4, 3, 256, 256
    # Frames supplied as a T-long list of (B, C, H, W) tensors rather than one
    # stacked (B, T, C, H, W) tensor — exercises the list input path.
    frames = [torch.rand(B, C, H, W) for _ in range(T)]
    batch = {
        OBS_IMAGES: frames, # list[(B, C, H, W)]
        # randint(high=2) yields binary 0/1 reward labels (high is exclusive).
        REWARD: torch.randint(low=0, high=2, size=(B, T)).float(),
        OBS_LANGUAGE: ["move the red cube into the box" for _ in range(B)],
    }
    loss, logs = policy.forward(batch)
    assert isinstance(loss, torch.Tensor)
    assert "loss" in logs
@require_package("transformers")
@require_package("sentence_transformers")
def test_rlearn_composite_loss_shapes_and_terms():
    """Smoke test composite loss: checks presence of terms and valid gradients."""
    cfg = RLearNConfig(
        vision_model_name="facebook/dinov3-vitb16-pretrain-lvd1689m",
        text_model_name="sentence-transformers/all-MiniLM-L12-v2",
        push_to_hub=False,
        freeze_backbones=True,
        # Enable both auxiliary objectives so their loss terms appear in logs.
        use_video_rewind=True,
        rewind_prob=0.5,
        use_mismatch_loss=True,
    )
    cfg.input_features = {
        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
    }
    cfg.output_features = {
        REWARD: PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
    }
    policy = RLearNPolicy(cfg)
    B, T, C, H, W = 2, 3, 3, 256, 256
    # Progress labels y in [0,1]
    y = torch.linspace(0, 1, T).unsqueeze(0).repeat(B, 1)
    batch = {
        OBS_IMAGES: torch.rand(B, T, C, H, W),
        REWARD: y.clone(),
        OBS_LANGUAGE: ["stack the blocks" for _ in range(B)],
    }
    loss, logs = policy.forward(batch)
    assert isinstance(loss, torch.Tensor) and torch.isfinite(loss)
    # Expect ReWiND loss terms (progress and mismatch)
    assert "loss_progress" in logs
    assert "loss_mismatch" in logs
@require_package("transformers")
@require_package("sentence_transformers")
def test_rlearn_preprocessor_tokenizes_and_copies_task():
    """The RLearN preprocessor copies the 'task' strings into the language key
    and emits per-sample token/attention-mask tensors of batch size B."""
    cfg = RLearNConfig(
        vision_model_name="facebook/dinov3-vitb16-pretrain-lvd1689m",
        text_model_name="sentence-transformers/all-MiniLM-L12-v2",
        device="cpu",
        push_to_hub=False,
    )
    cfg.input_features = {
        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 64, 64)),
    }
    cfg.output_features = {
        REWARD: PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
    }
    pre, post = make_processor(cfg, dataset_stats=None)
    B, C, H, W = 2, 3, 64, 64
    batch = {
        "observation.image": torch.rand(B, C, H, W),
        REWARD: torch.zeros(B),
        "task": ["pick the cube", "place it in the box"],
    }
    processed = pre(batch)
    assert isinstance(processed, dict)
    assert f"{OBS_LANGUAGE}.tokens" in processed
    assert f"{OBS_LANGUAGE}.attention_mask" in processed
    assert OBS_LANGUAGE in processed
    tokens = processed[f"{OBS_LANGUAGE}.tokens"]
    attn = processed[f"{OBS_LANGUAGE}.attention_mask"]
    # Tokenizer output is (B, seq_len) for both tokens and attention mask.
    assert tokens.dim() == 2 and attn.dim() == 2
    assert tokens.shape[0] == B and attn.shape[0] == B
@require_package("transformers")
@require_package("sentence_transformers")
def test_rlearn_preprocessor_string_task_and_to_batch():
    """An unbatched image plus a single task string are auto-batched: the image
    gains a leading batch dim and the task becomes a one-element language list."""
    cfg = RLearNConfig(
        vision_model_name="facebook/dinov3-vitb16-pretrain-lvd1689m",
        text_model_name="sentence-transformers/all-MiniLM-L12-v2",
        device="cpu",
        push_to_hub=False,
    )
    cfg.input_features = {
        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 64, 64)),
    }
    cfg.output_features = {
        REWARD: PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
    }
    pre, post = make_processor(cfg, dataset_stats=None)
    # Unbatched image and single string task
    batch = {
        "observation.image": torch.rand(3, 64, 64),
        REWARD: torch.tensor(0.0),
        "task": "move the green cube into the box",
    }
    processed = pre(batch)
    # Image should have batch dim now
    assert processed["observation.image"].dim() == 4 and processed["observation.image"].shape[0] == 1
    # Language copy and tokenization should exist
    assert OBS_LANGUAGE in processed and isinstance(processed[OBS_LANGUAGE], list)
    assert f"{OBS_LANGUAGE}.tokens" in processed
    assert f"{OBS_LANGUAGE}.attention_mask" in processed
@require_package("transformers")
@require_package("sentence_transformers")
def test_rlearn_pipeline_end_to_end_forward():
    """End-to-end: preprocessor + model forward using RLearN pipeline on synthetic data."""
    cfg = RLearNConfig(
        vision_model_name="facebook/dinov3-vitb16-pretrain-lvd1689m",
        text_model_name="sentence-transformers/all-MiniLM-L12-v2",
        device="cpu",
        push_to_hub=False,
        freeze_backbones=True,
        use_video_rewind=True,
    )
    cfg.input_features = {
        "observation.image": PolicyFeature(type=FeatureType.VISUAL, shape=(3, 224, 224)),
    }
    cfg.output_features = {
        REWARD: PolicyFeature(type=FeatureType.REWARD, shape=(1,)),
    }
    # Build processors and model
    pre, post = make_processor(cfg, dataset_stats=None)
    policy = RLearNPolicy(cfg)
    B, T, C, H, W = 2, 3, 3, 256, 256
    # Monotonic progress labels in [0, 1], one row per batch element.
    y = torch.linspace(0, 1, T).unsqueeze(0).repeat(B, 1)
    raw = {
        # Provide as observation.image to let preprocessor map/normalize and batch
        "observation.image": torch.rand(B, C, H, W), # not time-major to test ToBatch
        REWARD: y[:, :1].clone(), # single step label; pipeline keeps structure
        "task": ["insert the peg", "insert the peg"],
    }
    processed = pre(raw)
    # Integrate preprocessor output with model forward
    # The processed (B, C, H, W) image is tiled along a new time axis to build
    # the (B, T, C, H, W) clip the model expects.
    loss, logs = policy.forward(
        {
            OBS_IMAGES: processed.get(OBS_IMAGES, processed.get("observation.image"))
            .unsqueeze(1)
            .repeat(1, T, 1, 1, 1),
            REWARD: y.clone(),
            OBS_LANGUAGE: processed[OBS_LANGUAGE],
        }
    )
    assert isinstance(loss, torch.Tensor) and torch.isfinite(loss)
@@ -0,0 +1,59 @@
#!/usr/bin/env python
import torch
from lerobot.policies.rlearn.configuration_rlearn import RLearNConfig
from lerobot.policies.rlearn.evaluation import RLearnEvaluator
from lerobot.policies.rlearn.modeling_rlearn import RLearNPolicy
def test_temporal_evaluation():
    """Test that evaluation creates proper temporal sequences with past frames.

    Runs RLearnEvaluator.predict_episode_rewards on a random episode and
    checks one reward is produced per input frame, including when the episode
    is shorter than the model's max_seq_len.
    """
    # Create a simple config
    config = RLearNConfig(
        max_seq_len=4, # Small for testing
        dim_model=64, # Small for testing
        n_heads=2,
        n_layers=2,
    )
    # Create model (will be randomly initialized)
    model = RLearNPolicy(config)
    model.eval()
    # Create evaluator
    evaluator = RLearnEvaluator(model, device="cpu")
    # Create test episode: 8 frames of 3x64x64 images
    T, C, H, W = 8, 3, 64, 64
    frames = torch.randn(T, C, H, W)
    language = "test instruction"
    print(f"Input episode shape: {frames.shape}")
    print(f"Model expects sequences of length: {config.max_seq_len}")
    # Test the evaluation
    rewards = evaluator.predict_episode_rewards(frames, language, batch_size=4)
    print(f"Output rewards shape: {rewards.shape}")
    print(f"Rewards: {rewards}")
    # Verify we get one reward per frame
    assert len(rewards) == T, f"Expected {T} rewards, got {len(rewards)}"
    print("✅ Test passed! Evaluation correctly processes temporal sequences.")
    # Test with very short episode (shorter than max_seq_len)
    short_frames = torch.randn(2, C, H, W) # Only 2 frames
    short_rewards = evaluator.predict_episode_rewards(short_frames, language)
    print(f"\nShort episode shape: {short_frames.shape}")
    print(f"Short rewards shape: {short_rewards.shape}")
    assert len(short_rewards) == 2, f"Expected 2 rewards, got {len(short_rewards)}"
    print("✅ Short episode test passed!")
if __name__ == "__main__":
    test_temporal_evaluation()
+3
View File
@@ -39,6 +39,7 @@ from lerobot.policies.factory import (
get_policy_class,
make_policy,
make_policy_config,
make_processor,
)
from lerobot.policies.normalize import Normalize, Unnormalize
from lerobot.policies.pretrained import PreTrainedPolicy
@@ -151,6 +152,7 @@ def test_policy(ds_repo_id, env_name, env_kwargs, policy_name, policy_kwargs):
# Check that we can make the policy object.
dataset = make_dataset(train_cfg)
preprocessor, _ = make_processor(train_cfg.policy, None)
policy = make_policy(train_cfg.policy, ds_meta=dataset.meta)
assert isinstance(policy, PreTrainedPolicy)
@@ -224,6 +226,7 @@ def test_act_backbone_lr():
assert cfg.policy.optimizer_lr_backbone == 0.001
dataset = make_dataset(cfg)
preprocessor, _ = make_processor(cfg.policy, None)
policy = make_policy(cfg.policy, ds_meta=dataset.meta)
optimizer, _ = make_optimizer_and_scheduler(cfg, policy)
assert len(optimizer.param_groups) == 2
File diff suppressed because it is too large Load Diff
+196
View File
@@ -0,0 +1,196 @@
import numpy as np
import pytest
import torch
from lerobot.processor.converters import (
to_dataset_frame,
to_output_robot_action,
to_transition_robot_observation,
to_transition_teleop_action,
)
from lerobot.processor.pipeline import TransitionKey
def test_to_transition_teleop_action_prefix_and_tensor_conversion():
    """Teleop action dicts are wrapped into a transition with 'action.'-prefixed
    keys: scalars/arrays become torch tensors, uint8 images pass through."""
    # Scalars, arrays, and "image-like" uint8 arrays are supported
    img = np.zeros((8, 12, 3), dtype=np.uint8)
    act = {
        "ee.x": 0.5, # scalar to torch tensor
        "delta": np.array([1.0, 2.0]), # ndarray to torch tensor
        "raw_img": img, # uint8 HWC to passthrough ndarray
    }
    tr = to_transition_teleop_action(act)
    # Should be an EnvTransition-like dict with ACTION populated
    assert isinstance(tr, dict)
    assert TransitionKey.ACTION in tr
    assert "action.ee.x" in tr[TransitionKey.ACTION]
    assert "action.delta" in tr[TransitionKey.ACTION]
    assert "action.raw_img" in tr[TransitionKey.ACTION]
    # Types: scalars/arrays -> torch tensor; images to np.ndarray
    assert isinstance(tr[TransitionKey.ACTION]["action.ee.x"], torch.Tensor)
    assert tr[TransitionKey.ACTION]["action.ee.x"].item() == pytest.approx(0.5)
    assert isinstance(tr[TransitionKey.ACTION]["action.delta"], torch.Tensor)
    assert tr[TransitionKey.ACTION]["action.delta"].shape == (2,)
    assert torch.allclose(tr[TransitionKey.ACTION]["action.delta"], torch.tensor([1.0, 2.0]))
    # Image survives untouched: same dtype and HWC shape as the input.
    assert isinstance(tr[TransitionKey.ACTION]["action.raw_img"], np.ndarray)
    assert tr[TransitionKey.ACTION]["action.raw_img"].dtype == np.uint8
    assert tr[TransitionKey.ACTION]["action.raw_img"].shape == (8, 12, 3)
    # Observation is created as empty dict by make_transition
    assert TransitionKey.OBSERVATION in tr
    assert isinstance(tr[TransitionKey.OBSERVATION], dict)
    assert tr[TransitionKey.OBSERVATION] == {}
def test_to_transition_robot_observation_state_vs_images_split():
    """Robot observations are split into 'observation.state.*' tensor entries
    and 'observation.images.*' passthrough ndarrays inside the transition."""
    # Create an observation with mixed content
    img = np.full((10, 20, 3), 255, dtype=np.uint8) # image (uint8 HWC)
    obs = {
        "j1.pos": 10.0, # scalar to state to torch tensor
        "j2.pos": np.float32(20.0), # scalar np to state to torch tensor
        "image_front": img, # to images passthrough
        "flag": np.int32(7), # scalar to state to torch tensor
        "arr": np.array([1.5, 2.5]), # vector to state to torch tensor
    }
    tr = to_transition_robot_observation(obs)
    assert isinstance(tr, dict)
    assert TransitionKey.OBSERVATION in tr
    out = tr[TransitionKey.OBSERVATION]
    # Check state keys are present and converted to tensors
    for k in ("j1.pos", "j2.pos", "flag", "arr"):
        key = f"observation.state.{k}"
        assert key in out
        v = out[key]
        # Scalars become 0-d tensors; the vector keeps its 1-d shape.
        if k != "arr":
            assert isinstance(v, torch.Tensor) and v.ndim == 0
        else:
            assert isinstance(v, torch.Tensor) and v.ndim == 1 and v.shape == (2,)
    # Check image present as is
    assert "observation.images.image_front" in out
    assert isinstance(out["observation.images.image_front"], np.ndarray)
    assert out["observation.images.image_front"].dtype == np.uint8
    assert out["observation.images.image_front"].shape == (10, 20, 3)
    # ACTION should be empty dict by make_transition
    assert TransitionKey.ACTION in tr
    assert isinstance(tr[TransitionKey.ACTION], dict)
    assert tr[TransitionKey.ACTION] == {}
def test_to_output_robot_action_strips_prefix_and_filters_pos_keys_only():
    """Only 'action.*.pos' entries survive; the 'action.' prefix is stripped
    and the values are converted to plain floats."""
    transition = {
        TransitionKey.ACTION: {
            "action.j1.pos": 11.0,                     # kept as "j1.pos"
            "action.gripper.pos": torch.tensor(33.0),  # kept: tensors accepted
            "action.ee.x": 0.5,                        # dropped: not a ".pos" key
            "misc": "ignore_me",                       # dropped: no "action." prefix
        }
    }

    result = to_output_robot_action(transition)

    assert set(result.keys()) == {"j1.pos", "gripper.pos"}
    expected = {"j1.pos": 11.0, "gripper.pos": 33.0}
    for key, value in expected.items():
        # Values are plain floats, regardless of the input type.
        assert isinstance(result[key], float)
        assert result[key] == pytest.approx(value)
def test_to_dataset_frame_merge_and_pack_vectors_and_metadata():
    """Merging a teleop and a robot transition packs named scalars into the
    dataset's action/state vectors and forwards images and metadata."""
    # Dataset feature specs, as stored in dataset.meta["features"].
    features = {
        # Action vector: 3 named elements in a fixed order.
        "action": {
            "dtype": "float32",
            "shape": (3,),
            "names": ["j1.pos", "j2.pos", "gripper.pos"],
        },
        # Observation state vector: 2 named elements.
        "observation.state": {
            "dtype": "float32",
            "shape": (2,),
            "names": ["j1.pos", "j2.pos"],
        },
        # Image spec (video/image dtype acceptable).
        "observation.images.front": {
            "dtype": "image",
            "shape": (480, 640, 3),
            "names": ["h", "w", "c"],
        },
    }

    frame = np.random.randint(0, 255, size=(480, 640, 3), dtype=np.uint8)
    # Teleop side: carries the action and the complementary data.
    teleop = {
        TransitionKey.OBSERVATION: {},
        TransitionKey.ACTION: {
            "action.j1.pos": torch.tensor(1.1),
            "action.j2.pos": torch.tensor(2.2),
            # gripper.pos intentionally absent -> expected to default to 0.0
            "action.ee.x": 0.5,  # not in features["action"]["names"] -> ignored
        },
        TransitionKey.COMPLEMENTARY_DATA: {
            "frame_is_pad": True,
            "task": "Pick cube",
        },
    }
    # Robot side: carries state, image and episode metadata.
    robot = {
        TransitionKey.OBSERVATION: {
            "observation.state.j1.pos": torch.tensor(10.0),
            "observation.state.j2.pos": torch.tensor(20.0),
            "observation.images.front": frame,
        },
        TransitionKey.REWARD: torch.tensor(5.0),
        TransitionKey.DONE: True,
        TransitionKey.TRUNCATED: False,
        TransitionKey.INFO: {"note": "ok"},
    }

    batch = to_dataset_frame([teleop, robot], features)

    # Image: same content (shared memory or an equal copy), dtype/shape intact.
    assert "observation.images.front" in batch
    packed_img = batch["observation.images.front"]
    assert packed_img.shape == frame.shape
    assert packed_img.dtype == np.uint8
    assert np.shares_memory(packed_img, frame) or np.array_equal(packed_img, frame)

    # State vector packs named entries in feature order.
    assert "observation.state" in batch
    state = batch["observation.state"]
    assert isinstance(state, np.ndarray) and state.dtype == np.float32
    assert state.shape == (2,)
    assert state[0] == pytest.approx(10.0)
    assert state[1] == pytest.approx(20.0)

    # Action vector: feature order, missing names default to 0.0.
    assert "action" in batch
    action = batch["action"]
    assert isinstance(action, np.ndarray) and action.dtype == np.float32
    assert action.shape == (3,)
    assert action[0] == pytest.approx(1.1)
    assert action[1] == pytest.approx(2.2)
    assert action[2] == pytest.approx(0.0)  # default for the missing gripper.pos

    # next.* episode metadata.
    assert batch["next.reward"] == pytest.approx(5.0)
    assert batch["next.done"] is True
    assert batch["next.truncated"] is False

    # Complementary data is forwarded as-is.
    assert batch["frame_is_pad"] is True
    assert batch["task"] == "Pick cube"
+885
View File
@@ -0,0 +1,885 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import pytest
import torch
from lerobot.configs.types import FeatureType, PolicyFeature
from lerobot.processor import DeviceProcessor, RobotProcessor
from lerobot.processor.pipeline import TransitionKey
def create_transition(
    observation=None, action=None, reward=None, done=None, truncated=None, info=None, complementary_data=None
):
    """Build a transition dict, including only the fields that are not None."""
    fields = (
        (TransitionKey.OBSERVATION, observation),
        (TransitionKey.ACTION, action),
        (TransitionKey.REWARD, reward),
        (TransitionKey.DONE, done),
        (TransitionKey.TRUNCATED, truncated),
        (TransitionKey.INFO, info),
        (TransitionKey.COMPLEMENTARY_DATA, complementary_data),
    )
    return {key: value for key, value in fields if value is not None}
def test_basic_functionality():
    """All tensors of a transition stay on CPU when the target is 'cpu'."""
    proc = DeviceProcessor(device="cpu")

    tr = create_transition(
        observation={"observation.state": torch.randn(10), "observation.image": torch.randn(3, 224, 224)},
        action=torch.randn(5),
        reward=torch.tensor(1.0),
        done=torch.tensor(False),
        truncated=torch.tensor(False),
    )

    out = proc(tr)

    # Every tensor in the result should report the CPU device.
    for obs_key in ("observation.state", "observation.image"):
        assert out[TransitionKey.OBSERVATION][obs_key].device.type == "cpu"
    for key in (TransitionKey.ACTION, TransitionKey.REWARD, TransitionKey.DONE, TransitionKey.TRUNCATED):
        assert out[key].device.type == "cpu"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_cuda_functionality():
    """CPU tensors are all moved to CUDA when the target is 'cuda'."""
    proc = DeviceProcessor(device="cuda")

    tr = create_transition(
        observation={"observation.state": torch.randn(10), "observation.image": torch.randn(3, 224, 224)},
        action=torch.randn(5),
        reward=torch.tensor(1.0),
        done=torch.tensor(False),
        truncated=torch.tensor(False),
    )

    out = proc(tr)

    # Every tensor in the result should report the CUDA device.
    for obs_key in ("observation.state", "observation.image"):
        assert out[TransitionKey.OBSERVATION][obs_key].device.type == "cuda"
    for key in (TransitionKey.ACTION, TransitionKey.REWARD, TransitionKey.DONE, TransitionKey.TRUNCATED):
        assert out[key].device.type == "cuda"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_specific_cuda_device():
    """An explicit device index such as 'cuda:0' is honoured."""
    proc = DeviceProcessor(device="cuda:0")
    out = proc(create_transition(observation={"observation.state": torch.randn(10)}, action=torch.randn(5)))

    state = out[TransitionKey.OBSERVATION]["observation.state"]
    act = out[TransitionKey.ACTION]
    for tensor in (state, act):
        assert tensor.device.type == "cuda"
        assert tensor.device.index == 0
def test_non_tensor_values():
    """Non-tensor payloads pass through the processor unchanged."""
    proc = DeviceProcessor(device="cpu")

    obs = {
        "observation.state": torch.randn(10),
        "observation.metadata": {"key": "value"},  # plain dict payload
        "observation.list": [1, 2, 3],  # plain list payload
    }
    out = proc(create_transition(observation=obs, action=torch.randn(5), info={"episode": 1, "step": 42}))

    # Tensors are still tensors after processing.
    assert isinstance(out[TransitionKey.OBSERVATION]["observation.state"], torch.Tensor)
    assert isinstance(out[TransitionKey.ACTION], torch.Tensor)
    # Non-tensor values are preserved verbatim.
    assert out[TransitionKey.OBSERVATION]["observation.metadata"] == {"key": "value"}
    assert out[TransitionKey.OBSERVATION]["observation.list"] == [1, 2, 3]
    assert out[TransitionKey.INFO] == {"episode": 1, "step": 42}
def test_none_values():
    """Fields passed as None are simply absent from the result."""
    proc = DeviceProcessor(device="cpu")

    # Missing observation: only the action is present and processed.
    out = proc(create_transition(observation=None, action=torch.randn(5)))
    assert TransitionKey.OBSERVATION not in out
    assert out[TransitionKey.ACTION].device.type == "cpu"

    # Missing action: only the observation is present and processed.
    out = proc(create_transition(observation={"observation.state": torch.randn(10)}, action=None))
    assert out[TransitionKey.OBSERVATION]["observation.state"].device.type == "cpu"
    assert TransitionKey.ACTION not in out
def test_empty_observation():
    """An empty observation dict stays an empty dict."""
    proc = DeviceProcessor(device="cpu")
    out = proc(create_transition(observation={}, action=torch.randn(5)))
    assert out[TransitionKey.OBSERVATION] == {}
    assert out[TransitionKey.ACTION].device.type == "cpu"
def test_scalar_tensors():
    """0-d tensors keep their values through device processing."""
    proc = DeviceProcessor(device="cpu")
    out = proc(
        create_transition(
            observation={"observation.scalar": torch.tensor(1.5)},
            action=torch.tensor(2.0),
            reward=torch.tensor(0.5),
        )
    )
    assert out[TransitionKey.OBSERVATION]["observation.scalar"].item() == 1.5
    assert out[TransitionKey.ACTION].item() == 2.0
    assert out[TransitionKey.REWARD].item() == 0.5
def test_dtype_preservation():
    """Without float_dtype, every tensor keeps its original dtype."""
    proc = DeviceProcessor(device="cpu")
    obs = {
        "observation.float32": torch.randn(5, dtype=torch.float32),
        "observation.float64": torch.randn(5, dtype=torch.float64),
        "observation.int32": torch.randint(0, 10, (5,), dtype=torch.int32),
        "observation.bool": torch.tensor([True, False, True], dtype=torch.bool),
    }
    out = proc(create_transition(observation=obs, action=torch.randn(3, dtype=torch.float16)))

    expected = {
        "observation.float32": torch.float32,
        "observation.float64": torch.float64,
        "observation.int32": torch.int32,
        "observation.bool": torch.bool,
    }
    for key, dtype in expected.items():
        assert out[TransitionKey.OBSERVATION][key].dtype == dtype
    assert out[TransitionKey.ACTION].dtype == torch.float16
def test_shape_preservation():
    """Tensor shapes survive the device move untouched."""
    proc = DeviceProcessor(device="cpu")
    shapes = {
        "observation.1d": (10,),
        "observation.2d": (5, 10),
        "observation.3d": (3, 224, 224),
        "observation.4d": (2, 3, 224, 224),
    }
    obs = {key: torch.randn(*shape) for key, shape in shapes.items()}
    out = proc(create_transition(observation=obs, action=torch.randn(2, 5, 3)))

    for key, shape in shapes.items():
        assert out[TransitionKey.OBSERVATION][key].shape == shape
    assert out[TransitionKey.ACTION].shape == (2, 5, 3)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mixed_devices():
    """CPU tensors and tensors already on CUDA both end up on CUDA."""
    proc = DeviceProcessor(device="cuda")
    obs = {
        "observation.cpu": torch.randn(5),  # starts on CPU
        "observation.cuda": torch.randn(5).cuda(),  # already on CUDA
    }
    out = proc(create_transition(observation=obs, action=torch.randn(3).cuda()))

    for key in ("observation.cpu", "observation.cuda"):
        assert out[TransitionKey.OBSERVATION][key].device.type == "cuda"
    assert out[TransitionKey.ACTION].device.type == "cuda"
def test_non_blocking_flag():
    """non_blocking is False for CPU targets and True for CUDA targets."""
    assert DeviceProcessor(device="cpu").non_blocking is False
    if torch.cuda.is_available():
        for cuda_device in ("cuda", "cuda:0"):
            assert DeviceProcessor(device=cuda_device).non_blocking is True
def test_serialization_methods():
    """get_config/state_dict/load_state_dict/reset behave as documented."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    proc = DeviceProcessor(device=device)

    assert proc.get_config() == {"device": device, "float_dtype": None}
    # No learned state: state_dict is empty, load/reset are no-ops.
    assert proc.state_dict() == {}
    proc.load_state_dict({})
    assert proc.device == device
    proc.reset()
    assert proc.device == device
def test_features():
    """transform_features is the identity — the very same object comes back."""
    proc = DeviceProcessor(device="cpu")
    features = {
        "observation.state": PolicyFeature(type=FeatureType.STATE, shape=(10,)),
        "action": PolicyFeature(type=FeatureType.ACTION, shape=(5,)),
    }
    transformed = proc.transform_features(features)
    assert transformed == features
    assert transformed is features  # identity, not a copy
def test_integration_with_robot_processor():
    """DeviceProcessor composes with other steps inside a RobotProcessor."""
    from lerobot.constants import OBS_STATE
    from lerobot.processor import ToBatchProcessor

    pipeline = RobotProcessor(
        steps=[ToBatchProcessor(), DeviceProcessor(device="cpu")], name="test_pipeline"
    )
    tr = create_transition(observation={OBS_STATE: torch.randn(10)}, action=torch.randn(5))
    out = pipeline(tr)

    # Batching adds a leading dim of 1; tensors stay on CPU.
    # The result keys by TransitionKey.OBSERVATION, with OBS_STATE nested inside.
    state = out[TransitionKey.OBSERVATION][OBS_STATE]
    assert state.shape[0] == 1
    assert state.device.type == "cpu"
    assert out[TransitionKey.ACTION].shape[0] == 1
    assert out[TransitionKey.ACTION].device.type == "cpu"
def test_save_and_load_pretrained():
    """A DeviceProcessor round-trips through save_pretrained/from_pretrained."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipeline = RobotProcessor(
        steps=[DeviceProcessor(device=device, float_dtype="float16")], name="device_test_processor"
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        pipeline.save_pretrained(tmpdir)
        restored = RobotProcessor.from_pretrained(tmpdir)

    assert len(restored.steps) == 1
    step = restored.steps[0]
    assert isinstance(step, DeviceProcessor)
    # Device string is normalized on load (e.g. "cuda:0" -> "cuda");
    # getattr keeps the access safe if the attribute were missing.
    assert getattr(step, "device", None) == device.split(":")[0]
    assert getattr(step, "float_dtype", None) == "float16"
def test_registry_functionality():
    """The step is registered under the 'device_processor' key."""
    from lerobot.processor.pipeline import ProcessorStepRegistry

    assert ProcessorStepRegistry.get("device_processor") is DeviceProcessor
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_performance_with_large_tensors():
    """Large tensors transfer without errors (exercises the non_blocking path)."""
    proc = DeviceProcessor(device="cuda")
    obs = {
        "observation.large_image": torch.randn(10, 3, 512, 512),  # large image batch
        "observation.features": torch.randn(10, 2048),  # large feature vector
    }
    out = proc(create_transition(observation=obs, action=torch.randn(10, 100)))

    # All tensors must land on CUDA.
    for key in ("observation.large_image", "observation.features"):
        assert out[TransitionKey.OBSERVATION][key].device.type == "cuda"
    assert out[TransitionKey.ACTION].device.type == "cuda"
def test_reward_done_truncated_types():
    """Scalar reward/done/truncated pass through; tensor ones are moved."""
    proc = DeviceProcessor(device="cpu")

    # Plain Python scalars are left untouched.
    out = proc(
        create_transition(
            observation={"observation.state": torch.randn(5)},
            action=torch.randn(3),
            reward=1.0,
            done=False,
            truncated=True,
        )
    )
    assert out[TransitionKey.REWARD] == 1.0
    assert out[TransitionKey.DONE] is False
    assert out[TransitionKey.TRUNCATED] is True

    # Tensor variants are moved to the target device.
    out = proc(
        create_transition(
            observation={"observation.state": torch.randn(5)},
            action=torch.randn(3),
            reward=torch.tensor(1.0),
            done=torch.tensor(False),
            truncated=torch.tensor(True),
        )
    )
    for key in (TransitionKey.REWARD, TransitionKey.DONE, TransitionKey.TRUNCATED):
        assert isinstance(out[key], torch.Tensor)
        assert out[key].device.type == "cpu"
def test_complementary_data_preserved():
    """Test that non-tensor complementary_data entries are preserved unchanged.

    A tensor entry is also included in the input, but this test only asserts
    on the metadata-style (non-tensor) entries.
    """
    processor = DeviceProcessor(device="cpu")
    complementary_data = {
        "task": "pick_object",
        "episode_id": 42,
        "metadata": {"sensor": "camera_1"},
        "observation_is_pad": torch.tensor([False, False, True]),  # tensor entry; not asserted below
    }
    transition = create_transition(
        observation={"observation.state": torch.randn(5)}, complementary_data=complementary_data
    )
    result = processor(transition)
    # Check that complementary_data is preserved
    assert TransitionKey.COMPLEMENTARY_DATA in result
    assert result[TransitionKey.COMPLEMENTARY_DATA]["task"] == "pick_object"
    assert result[TransitionKey.COMPLEMENTARY_DATA]["episode_id"] == 42
    assert result[TransitionKey.COMPLEMENTARY_DATA]["metadata"] == {"sensor": "camera_1"}
    # NOTE(review): a previous comment here claimed DeviceProcessor leaves
    # complementary_data tensors untouched, but test_complementary_data_index_fields
    # in this file asserts that such tensors ARE processed — confirm the intended
    # behavior and consider asserting on "observation_is_pad" as well.
def test_float_dtype_conversion():
    """float_dtype='float16' converts floating tensors and only those."""
    proc = DeviceProcessor(device="cpu", float_dtype="float16")

    obs = {
        "observation.float32": torch.randn(5, dtype=torch.float32),
        "observation.float64": torch.randn(5, dtype=torch.float64),
        "observation.int32": torch.randint(0, 10, (5,), dtype=torch.int32),
        "observation.int64": torch.randint(0, 10, (5,), dtype=torch.int64),
        "observation.bool": torch.tensor([True, False, True], dtype=torch.bool),
    }
    out = proc(
        create_transition(
            observation=obs,
            action=torch.randn(3, dtype=torch.float32),
            reward=torch.tensor(1.0, dtype=torch.float32),
        )
    )

    # Floating-point tensors are cast to float16.
    for key in ("observation.float32", "observation.float64"):
        assert out[TransitionKey.OBSERVATION][key].dtype == torch.float16
    assert out[TransitionKey.ACTION].dtype == torch.float16
    assert out[TransitionKey.REWARD].dtype == torch.float16
    # Integer/bool tensors keep their original dtype.
    for key, dtype in (
        ("observation.int32", torch.int32),
        ("observation.int64", torch.int64),
        ("observation.bool", torch.bool),
    ):
        assert out[TransitionKey.OBSERVATION][key].dtype == dtype
def test_float_dtype_none():
    """float_dtype=None disables any dtype conversion."""
    proc = DeviceProcessor(device="cpu", float_dtype=None)
    obs = {
        "observation.float32": torch.randn(5, dtype=torch.float32),
        "observation.float64": torch.randn(5, dtype=torch.float64),
        "observation.int32": torch.randint(0, 10, (5,), dtype=torch.int32),
    }
    out = proc(create_transition(observation=obs, action=torch.randn(3, dtype=torch.float64)))

    # All dtypes come out exactly as they went in.
    for key, dtype in (
        ("observation.float32", torch.float32),
        ("observation.float64", torch.float64),
        ("observation.int32", torch.int32),
    ):
        assert out[TransitionKey.OBSERVATION][key].dtype == dtype
    assert out[TransitionKey.ACTION].dtype == torch.float64
def test_float_dtype_bfloat16():
    """float_dtype='bfloat16' casts all floating tensors to bfloat16."""
    proc = DeviceProcessor(device="cpu", float_dtype="bfloat16")
    out = proc(
        create_transition(
            observation={"observation.state": torch.randn(5, dtype=torch.float32)},
            action=torch.randn(3, dtype=torch.float64),
        )
    )
    assert out[TransitionKey.OBSERVATION]["observation.state"].dtype == torch.bfloat16
    assert out[TransitionKey.ACTION].dtype == torch.bfloat16
def test_float_dtype_float64():
    """float_dtype='float64' up-casts lower-precision float tensors."""
    proc = DeviceProcessor(device="cpu", float_dtype="float64")
    out = proc(
        create_transition(
            observation={"observation.state": torch.randn(5, dtype=torch.float16)},
            action=torch.randn(3, dtype=torch.float32),
        )
    )
    assert out[TransitionKey.OBSERVATION]["observation.state"].dtype == torch.float64
    assert out[TransitionKey.ACTION].dtype == torch.float64
def test_float_dtype_invalid():
    """An unknown float_dtype name is rejected at construction time."""
    with pytest.raises(ValueError, match="Invalid float_dtype 'invalid_dtype'"):
        DeviceProcessor(device="cpu", float_dtype="invalid_dtype")
def test_float_dtype_aliases():
    """Torch-style aliases ('half'/'float'/'double') map to the right dtypes."""
    aliases = {
        "half": torch.float16,
        "float": torch.float32,
        "double": torch.float64,
    }
    for alias, dtype in aliases.items():
        assert DeviceProcessor(device="cpu", float_dtype=alias)._target_float_dtype == dtype
def test_float_dtype_with_mixed_tensors():
    """Only floating tensors are converted; uint8/bool/long stay put."""
    proc = DeviceProcessor(device="cpu", float_dtype="float32")
    obs = {
        "observation.image": torch.randint(0, 255, (3, 64, 64), dtype=torch.uint8),
        "observation.state": torch.randn(10, dtype=torch.float64),
        "observation.mask": torch.tensor([True, False, True], dtype=torch.bool),
        "observation.indices": torch.tensor([1, 2, 3], dtype=torch.long),
    }
    out = proc(create_transition(observation=obs, action=torch.randn(5, dtype=torch.float16)))

    expected = {
        "observation.image": torch.uint8,  # unchanged
        "observation.state": torch.float32,  # converted from float64
        "observation.mask": torch.bool,  # unchanged
        "observation.indices": torch.long,  # unchanged
    }
    for key, dtype in expected.items():
        assert out[TransitionKey.OBSERVATION][key].dtype == dtype
    assert out[TransitionKey.ACTION].dtype == torch.float32  # converted from float16
def test_float_dtype_serialization():
    """float_dtype appears verbatim in get_config()."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    assert DeviceProcessor(device=device, float_dtype="float16").get_config() == {
        "device": device,
        "float_dtype": "float16",
    }
    # None is serialized as None, not dropped.
    assert DeviceProcessor(device="cpu", float_dtype=None).get_config() == {
        "device": "cpu",
        "float_dtype": None,
    }
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_float_dtype_with_cuda():
    """Device move and float conversion combine: floats become float16 on CUDA."""
    proc = DeviceProcessor(device="cuda", float_dtype="float16")
    obs = {
        "observation.float32": torch.randn(5, dtype=torch.float32),
        "observation.int64": torch.tensor([1, 2, 3], dtype=torch.int64),
    }
    out = proc(create_transition(observation=obs, action=torch.randn(3, dtype=torch.float64)))

    float_obs = out[TransitionKey.OBSERVATION]["observation.float32"]
    int_obs = out[TransitionKey.OBSERVATION]["observation.int64"]
    act = out[TransitionKey.ACTION]
    assert float_obs.device.type == "cuda" and float_obs.dtype == torch.float16
    assert int_obs.device.type == "cuda" and int_obs.dtype == torch.int64  # ints untouched
    assert act.device.type == "cuda" and act.dtype == torch.float16
def test_complementary_data_index_fields():
    """index/task_index tensors inside complementary_data are processed too."""
    proc = DeviceProcessor(device="cpu")
    comp = {
        "task": ["pick_cube"],
        "index": torch.tensor([42], dtype=torch.int64),
        "task_index": torch.tensor([3], dtype=torch.int64),
        "episode_id": 123,  # non-tensor entry
    }
    out = proc(
        create_transition(
            observation={"observation.state": torch.randn(1, 7)},
            action=torch.randn(1, 4),
            complementary_data=comp,
        )
    )

    processed = out[TransitionKey.COMPLEMENTARY_DATA]
    # Tensor entries stay tensors, on the target device, with equal values.
    for key in ("index", "task_index"):
        assert isinstance(processed[key], torch.Tensor)
        assert processed[key].device.type == "cpu"
        assert torch.equal(processed[key], comp[key])
    # Non-tensor entries are untouched.
    assert processed["task"] == ["pick_cube"]
    assert processed["episode_id"] == 123
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_complementary_data_index_fields_cuda():
    """index/task_index tensors follow the processor onto cuda:0."""
    proc = DeviceProcessor(device="cuda:0")
    comp = {
        "index": torch.tensor([100, 101], dtype=torch.int64),
        "task_index": torch.tensor([5], dtype=torch.int64),
    }
    out = proc(create_transition(complementary_data=comp))

    processed = out[TransitionKey.COMPLEMENTARY_DATA]
    for key in ("index", "task_index"):
        assert processed[key].device.type == "cuda"
        assert processed[key].device.index == 0
def test_complementary_data_without_index_fields():
    """complementary_data with only non-tensor entries passes through cleanly."""
    proc = DeviceProcessor(device="cpu")
    out = proc(create_transition(complementary_data={"task": ["navigate"], "episode_id": 456}))

    processed = out[TransitionKey.COMPLEMENTARY_DATA]
    assert processed["task"] == ["navigate"]
    assert processed["episode_id"] == 456
def test_complementary_data_mixed_tensors():
    """Tensor entries are processed; lists/dicts/ints are left untouched."""
    proc = DeviceProcessor(device="cpu")
    comp = {
        "task": ["pick_and_place"],
        "index": torch.tensor([42], dtype=torch.int64),
        "task_index": torch.tensor([3], dtype=torch.int64),
        "metrics": [1.0, 2.0, 3.0],  # list, not a tensor
        "config": {"speed": "fast"},  # dict
        "episode_id": 789,  # int
    }
    out = proc(create_transition(complementary_data=comp))

    processed = out[TransitionKey.COMPLEMENTARY_DATA]
    # Tensor entries remain tensors after processing.
    assert isinstance(processed["index"], torch.Tensor)
    assert isinstance(processed["task_index"], torch.Tensor)
    # Non-tensor entries are equal to what was passed in.
    for key in ("task", "metrics", "config", "episode_id"):
        assert processed[key] == comp[key]
def test_complementary_data_float_dtype_conversion():
    """float_dtype converts only floating tensors inside complementary_data."""
    proc = DeviceProcessor(device="cpu", float_dtype="float16")
    comp = {
        "index": torch.tensor([42], dtype=torch.int64),
        "task_index": torch.tensor([3], dtype=torch.int64),
        "float_tensor": torch.tensor([1.5, 2.5], dtype=torch.float32),  # should convert
    }
    out = proc(create_transition(complementary_data=comp))

    processed = out[TransitionKey.COMPLEMENTARY_DATA]
    # Integer tensors keep their dtype; the float tensor is cast.
    assert processed["index"].dtype == torch.int64
    assert processed["task_index"].dtype == torch.int64
    assert processed["float_tensor"].dtype == torch.float16
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_complementary_data_full_pipeline_cuda():
    """A full transition, complementary_data included, moves to CUDA + float16."""
    proc = DeviceProcessor(device="cuda:0", float_dtype="float16")
    comp = {
        "task": ["reach_target"],
        "index": torch.tensor([1000], dtype=torch.int64),
        "task_index": torch.tensor([10], dtype=torch.int64),
    }
    out = proc(
        create_transition(
            observation={"observation.state": torch.randn(1, 7, dtype=torch.float32)},
            action=torch.randn(1, 4, dtype=torch.float32),
            reward=torch.tensor(1.5, dtype=torch.float32),
            done=torch.tensor(False),
            complementary_data=comp,
        )
    )

    # Everything ends up on CUDA.
    assert out[TransitionKey.OBSERVATION]["observation.state"].device.type == "cuda"
    for key in (TransitionKey.ACTION, TransitionKey.REWARD, TransitionKey.DONE):
        assert out[key].device.type == "cuda"
    processed = out[TransitionKey.COMPLEMENTARY_DATA]
    assert processed["index"].device.type == "cuda"
    assert processed["task_index"].device.type == "cuda"
    # Floats are cast to float16; int64 tensors keep their dtype.
    assert out[TransitionKey.OBSERVATION]["observation.state"].dtype == torch.float16
    assert out[TransitionKey.ACTION].dtype == torch.float16
    assert out[TransitionKey.REWARD].dtype == torch.float16
    assert processed["index"].dtype == torch.int64
    assert processed["task_index"].dtype == torch.int64
def test_complementary_data_empty():
    """Test empty complementary_data handling."""
    processor = DeviceProcessor(device="cpu")
    result = processor(
        create_transition(
            observation={"observation.state": torch.randn(1, 7)},
            complementary_data={},
        )
    )
    # An empty complementary_data dict passes through unchanged.
    assert result[TransitionKey.COMPLEMENTARY_DATA] == {}
def test_complementary_data_none():
    """Test None complementary_data handling."""
    processor = DeviceProcessor(device="cpu")
    result = processor(
        create_transition(
            observation={"observation.state": torch.randn(1, 7)},
            complementary_data=None,
        )
    )
    # As in the input, complementary_data stays absent from the result.
    assert TransitionKey.COMPLEMENTARY_DATA not in result
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_policy_processor_integration():
    """Test integration with policy processors - input on GPU, output on CPU."""
    from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
    from lerobot.constants import ACTION, OBS_STATE
    from lerobot.processor import NormalizerProcessor, ToBatchProcessor, UnnormalizerProcessor

    # Feature definitions with zero-mean / unit-std stats.
    features = {
        OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(10,)),
        ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(5,)),
    }
    stats = {
        OBS_STATE: {"mean": torch.zeros(10), "std": torch.ones(10)},
        ACTION: {"mean": torch.zeros(5), "std": torch.ones(5)},
    }
    norm_map = {
        FeatureType.STATE: NormalizationMode.MEAN_STD,
        FeatureType.ACTION: NormalizationMode.MEAN_STD,
    }

    # Preprocessor: normalize, add a batch dimension, then move to the GPU.
    input_processor = RobotProcessor(
        steps=[
            NormalizerProcessor(features=features, norm_map=norm_map, stats=stats),
            ToBatchProcessor(),
            DeviceProcessor(device="cuda"),
        ],
        name="test_preprocessor",
    )
    # Postprocessor: move back to the CPU, then unnormalize the action.
    output_processor = RobotProcessor(
        steps=[
            DeviceProcessor(device="cpu"),
            UnnormalizerProcessor(features={ACTION: features[ACTION]}, norm_map=norm_map, stats=stats),
        ],
        name="test_postprocessor",
    )

    # CPU-resident input data flows through the preprocessor.
    input_result = input_processor(
        create_transition(observation={OBS_STATE: torch.randn(10)}, action=torch.randn(5))
    )
    # Tensors should be batched (leading dim of 1) and on the GPU.
    # The result keys on TransitionKey.OBSERVATION, with observation.state inside.
    batched_state = input_result[TransitionKey.OBSERVATION][OBS_STATE]
    batched_action = input_result[TransitionKey.ACTION]
    assert batched_state.device.type == "cuda" and batched_state.shape[0] == 1
    assert batched_action.device.type == "cuda" and batched_action.shape[0] == 1

    # A simulated GPU model output goes through the postprocessor.
    output_result = output_processor(create_transition(action=torch.randn(1, 5).cuda()))
    # The action ends up back on the CPU, unnormalized, with its batched shape intact.
    assert output_result[TransitionKey.ACTION].device.type == "cpu"
    assert output_result[TransitionKey.ACTION].shape == (1, 5)
File diff suppressed because it is too large Load Diff
+10 -10
View File
@@ -410,13 +410,13 @@ def test_equivalent_with_image_dict():
torch.testing.assert_close(original_result[key], processor_result[key])
def test_image_processor_feature_contract_pixels_to_image(policy_feature_factory):
def test_image_processor_features_pixels_to_image(policy_feature_factory):
processor = VanillaObservationProcessor()
features = {
"pixels": policy_feature_factory(FeatureType.VISUAL, (3, 64, 64)),
"keep": policy_feature_factory(FeatureType.ENV, (1,)),
}
out = processor.feature_contract(features.copy())
out = processor.transform_features(features.copy())
assert OBS_IMAGE in out and out[OBS_IMAGE] == features["pixels"]
assert "pixels" not in out
@@ -424,13 +424,13 @@ def test_image_processor_feature_contract_pixels_to_image(policy_feature_factory
assert_contract_is_typed(out)
def test_image_processor_feature_contract_observation_pixels_to_image(policy_feature_factory):
def test_image_processor_features_observation_pixels_to_image(policy_feature_factory):
processor = VanillaObservationProcessor()
features = {
"observation.pixels": policy_feature_factory(FeatureType.VISUAL, (3, 64, 64)),
"keep": policy_feature_factory(FeatureType.ENV, (1,)),
}
out = processor.feature_contract(features.copy())
out = processor.transform_features(features.copy())
assert OBS_IMAGE in out and out[OBS_IMAGE] == features["observation.pixels"]
assert "observation.pixels" not in out
@@ -438,7 +438,7 @@ def test_image_processor_feature_contract_observation_pixels_to_image(policy_fea
assert_contract_is_typed(out)
def test_image_processor_feature_contract_multi_camera_and_prefixed(policy_feature_factory):
def test_image_processor_features_multi_camera_and_prefixed(policy_feature_factory):
processor = VanillaObservationProcessor()
features = {
"pixels.front": policy_feature_factory(FeatureType.VISUAL, (3, 64, 64)),
@@ -446,7 +446,7 @@ def test_image_processor_feature_contract_multi_camera_and_prefixed(policy_featu
"observation.pixels.rear": policy_feature_factory(FeatureType.VISUAL, (3, 64, 64)),
"keep": policy_feature_factory(FeatureType.ENV, (7,)),
}
out = processor.feature_contract(features.copy())
out = processor.transform_features(features.copy())
assert f"{OBS_IMAGES}.front" in out and out[f"{OBS_IMAGES}.front"] == features["pixels.front"]
assert f"{OBS_IMAGES}.wrist" in out and out[f"{OBS_IMAGES}.wrist"] == features["pixels.wrist"]
@@ -456,14 +456,14 @@ def test_image_processor_feature_contract_multi_camera_and_prefixed(policy_featu
assert_contract_is_typed(out)
def test_state_processor_feature_contract_environment_and_agent_pos(policy_feature_factory):
def test_state_processor_features_environment_and_agent_pos(policy_feature_factory):
processor = VanillaObservationProcessor()
features = {
"environment_state": policy_feature_factory(FeatureType.STATE, (3,)),
"agent_pos": policy_feature_factory(FeatureType.STATE, (7,)),
"keep": policy_feature_factory(FeatureType.ENV, (1,)),
}
out = processor.feature_contract(features.copy())
out = processor.transform_features(features.copy())
assert OBS_ENV_STATE in out and out[OBS_ENV_STATE] == features["environment_state"]
assert OBS_STATE in out and out[OBS_STATE] == features["agent_pos"]
@@ -472,13 +472,13 @@ def test_state_processor_feature_contract_environment_and_agent_pos(policy_featu
assert_contract_is_typed(out)
def test_state_processor_feature_contract_prefixed_inputs(policy_feature_factory):
def test_state_processor_features_prefixed_inputs(policy_feature_factory):
proc = VanillaObservationProcessor()
features = {
"observation.environment_state": policy_feature_factory(FeatureType.STATE, (2,)),
"observation.agent_pos": policy_feature_factory(FeatureType.STATE, (4,)),
}
out = proc.feature_contract(features.copy())
out = proc.transform_features(features.copy())
assert OBS_ENV_STATE in out and out[OBS_ENV_STATE] == features["observation.environment_state"]
assert OBS_STATE in out and out[OBS_STATE] == features["observation.agent_pos"]
+297 -50
View File
@@ -26,6 +26,7 @@ import torch
import torch.nn as nn
from lerobot.configs.types import FeatureType, PolicyFeature
from lerobot.datasets.pipeline_features import aggregate_pipeline_dataset_features
from lerobot.processor import EnvTransition, ProcessorStepRegistry, RobotProcessor
from lerobot.processor.pipeline import TransitionKey
from tests.conftest import assert_contract_is_typed
@@ -90,8 +91,8 @@ class MockStep:
def reset(self) -> None:
self.counter = 0
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -112,8 +113,8 @@ class MockStepWithoutOptionalMethods:
return transition
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -168,8 +169,8 @@ class MockStepWithTensorState:
self.running_mean.zero_()
self.running_count.zero_()
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -662,8 +663,8 @@ class MockModuleStep(nn.Module):
self.running_mean.zero_()
self.counter = 0
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -744,8 +745,8 @@ class MockNonModuleStepWithState:
self.step_count.zero_()
self.history.clear()
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -799,8 +800,8 @@ class MockStepWithNonSerializableParam:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -838,8 +839,8 @@ class RegisteredMockStep:
def reset(self) -> None:
pass
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
@@ -1382,8 +1383,8 @@ def test_state_file_naming_with_registry():
def load_state_dict(self, state):
self.state_tensor = state["state_tensor"]
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
try:
@@ -1439,8 +1440,8 @@ def test_override_with_nested_config():
def get_config(self):
return {"name": self.name, "simple_param": self.simple_param, "nested_config": self.nested_config}
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
try:
@@ -1531,8 +1532,8 @@ def test_override_with_callables():
def get_config(self):
return {"name": self.name}
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
try:
@@ -1639,6 +1640,109 @@ def test_state_file_naming_with_multiple_processors():
assert loaded_post.steps[0].window_size == 10
def test_default_batch_to_transition_with_index_fields():
    """Test that _default_batch_to_transition handles index and task_index fields correctly."""
    from lerobot.processor.pipeline import _default_batch_to_transition

    # Batch carrying both index and task_index alongside the usual keys.
    batch = {
        "observation.state": torch.randn(1, 7),
        "action": torch.randn(1, 4),
        "next.reward": 1.5,
        "next.done": False,
        "task": ["pick_cube"],
        "index": torch.tensor([42], dtype=torch.int64),
        "task_index": torch.tensor([3], dtype=torch.int64),
    }
    transition = _default_batch_to_transition(batch)

    # The basic transition structure is present.
    for key in (TransitionKey.OBSERVATION, TransitionKey.ACTION, TransitionKey.COMPLEMENTARY_DATA):
        assert key in transition

    # index, task_index and task all land in complementary_data ...
    comp_data = transition[TransitionKey.COMPLEMENTARY_DATA]
    for name in ("index", "task_index", "task"):
        assert name in comp_data
    # ... with their values carried over unchanged.
    assert torch.equal(comp_data["index"], batch["index"])
    assert torch.equal(comp_data["task_index"], batch["task_index"])
    assert comp_data["task"] == batch["task"]
def test_default_transition_to_batch_with_index_fields():
    """Test that _default_transition_to_batch handles index and task_index fields correctly."""
    from lerobot.processor.pipeline import _default_transition_to_batch

    # Transition whose complementary_data carries index and task_index.
    comp_data = {
        "task": ["navigate"],
        "index": torch.tensor([100], dtype=torch.int64),
        "task_index": torch.tensor([5], dtype=torch.int64),
    }
    transition = create_transition(
        observation={"observation.state": torch.randn(1, 7)},
        action=torch.randn(1, 4),
        reward=1.5,
        done=False,
        complementary_data=comp_data,
    )
    batch = _default_transition_to_batch(transition)

    # index, task_index and task are surfaced as top-level batch keys ...
    for name in ("index", "task_index", "task"):
        assert name in batch
    # ... with their values carried over unchanged.
    assert torch.equal(batch["index"], transition[TransitionKey.COMPLEMENTARY_DATA]["index"])
    assert torch.equal(batch["task_index"], transition[TransitionKey.COMPLEMENTARY_DATA]["task_index"])
    assert batch["task"] == transition[TransitionKey.COMPLEMENTARY_DATA]["task"]
def test_batch_to_transition_without_index_fields():
    """Test that conversion works without index and task_index fields."""
    from lerobot.processor.pipeline import _default_batch_to_transition

    # Batch with no index/task_index keys.
    batch = {
        "observation.state": torch.randn(1, 7),
        "action": torch.randn(1, 4),
        "task": ["pick_cube"],
    }
    comp_data = _default_batch_to_transition(batch)[TransitionKey.COMPLEMENTARY_DATA]

    # Only "task" should appear; index/task_index were never provided.
    assert "task" in comp_data
    assert "index" not in comp_data
    assert "task_index" not in comp_data
def test_transition_to_batch_without_index_fields():
    """Test that conversion works without index and task_index fields."""
    from lerobot.processor.pipeline import _default_transition_to_batch

    # Transition whose complementary_data lacks index/task_index.
    transition = create_transition(
        observation={"observation.state": torch.randn(1, 7)},
        action=torch.randn(1, 4),
        complementary_data={"task": ["navigate"]},
    )
    batch = _default_transition_to_batch(transition)

    # Only "task" should appear; index/task_index were never provided.
    assert "task" in batch
    assert "index" not in batch
    assert "task_index" not in batch
def test_override_with_device_strings():
"""Test overriding device parameters with string values."""
@@ -1663,8 +1767,8 @@ def test_override_with_device_strings():
def load_state_dict(self, state):
self.buffer = state["buffer"]
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test feature_contract here
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
# We do not test features here
return features
try:
@@ -1757,21 +1861,16 @@ def test_save_load_with_custom_converter_functions():
class NonCompliantStep:
"""Intentionally non-compliant: missing feature_contract."""
"""Intentionally non-compliant: missing features."""
def __call__(self, transition: EnvTransition) -> EnvTransition:
return transition
def test_construction_rejects_step_without_feature_contract():
with pytest.raises(TypeError, match=r"must define feature_contract\(features\) -> dict\[str, Any\]"):
RobotProcessor([NonCompliantStep()])
class NonCallableStep:
"""Intentionally non-compliant: missing __call__."""
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return features
@@ -1790,7 +1889,7 @@ class FeatureContractAddStep:
def __call__(self, transition: EnvTransition) -> EnvTransition:
return transition
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
features[self.key] = self.value
return features
@@ -1805,7 +1904,7 @@ class FeatureContractMutateStep:
def __call__(self, transition: EnvTransition) -> EnvTransition:
return transition
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
features[self.key] = self.fn(features.get(self.key))
return features
@@ -1817,7 +1916,7 @@ class FeatureContractBadReturnStep:
def __call__(self, transition: EnvTransition) -> EnvTransition:
return transition
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
return ["not-a-dict"]
@@ -1830,12 +1929,12 @@ class FeatureContractRemoveStep:
def __call__(self, transition: EnvTransition) -> EnvTransition:
return transition
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
features.pop(self.key, None)
return features
def test_feature_contract_orders_and_merges(policy_feature_factory):
def test_features_orders_and_merges(policy_feature_factory):
p = RobotProcessor(
[
FeatureContractAddStep("a", policy_feature_factory(FeatureType.STATE, (1,))),
@@ -1843,14 +1942,14 @@ def test_feature_contract_orders_and_merges(policy_feature_factory):
FeatureContractAddStep("b", policy_feature_factory(FeatureType.ENV, (2,))),
]
)
out = p.feature_contract({})
out = p.transform_features({})
assert out["a"].type == FeatureType.STATE and out["a"].shape == (3,)
assert out["b"].type == FeatureType.ENV and out["b"].shape == (2,)
assert_contract_is_typed(out)
def test_feature_contract_respects_initial_without_mutation(policy_feature_factory):
def test_features_respects_initial_without_mutation(policy_feature_factory):
initial = {
"seed": policy_feature_factory(FeatureType.STATE, (7,)),
"nested": policy_feature_factory(FeatureType.ENV, (0,)),
@@ -1863,7 +1962,7 @@ def test_feature_contract_respects_initial_without_mutation(policy_feature_facto
),
]
)
out = p.feature_contract(initial_features=initial)
out = p.transform_features(initial_features=initial)
assert out["seed"].shape == (8,)
assert out["nested"].shape == (5,)
@@ -1874,13 +1973,7 @@ def test_feature_contract_respects_initial_without_mutation(policy_feature_facto
assert_contract_is_typed(out)
def test_feature_contract_type_error_on_bad_step():
p = RobotProcessor([FeatureContractAddStep(), FeatureContractBadReturnStep()])
with pytest.raises(TypeError, match=r"\w+\.feature_contract must return dict\[str, Any\]"):
_ = p.feature_contract({})
def test_feature_contract_execution_order_tracking():
def test_features_execution_order_tracking():
class Track:
def __init__(self, label):
self.label = label
@@ -1888,32 +1981,186 @@ def test_feature_contract_execution_order_tracking():
def __call__(self, transition: EnvTransition) -> EnvTransition:
return transition
def feature_contract(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
def transform_features(self, features: dict[str, PolicyFeature]) -> dict[str, PolicyFeature]:
code = {"A": 1, "B": 2, "C": 3}[self.label]
pf = features.get("order", PolicyFeature(type=FeatureType.ENV, shape=()))
features["order"] = PolicyFeature(type=pf.type, shape=pf.shape + (code,))
return features
out = RobotProcessor([Track("A"), Track("B"), Track("C")]).feature_contract({})
out = RobotProcessor([Track("A"), Track("B"), Track("C")]).transform_features({})
assert out["order"].shape == (1, 2, 3)
def test_feature_contract_remove_key(policy_feature_factory):
def test_features_remove_key(policy_feature_factory):
p = RobotProcessor(
[
FeatureContractAddStep("a", policy_feature_factory(FeatureType.STATE, (1,))),
FeatureContractRemoveStep("a"),
]
)
out = p.feature_contract({})
out = p.transform_features({})
assert "a" not in out
def test_feature_contract_remove_from_initial(policy_feature_factory):
def test_features_remove_from_initial(policy_feature_factory):
initial = {
"keep": policy_feature_factory(FeatureType.STATE, (1,)),
"drop": policy_feature_factory(FeatureType.STATE, (1,)),
}
p = RobotProcessor([FeatureContractRemoveStep("drop")])
out = p.feature_contract(initial_features=initial)
out = p.transform_features(initial_features=initial)
assert "drop" not in out and out["keep"] == initial["keep"]
@dataclass
class AddActionEEAndJointFeatures:
    """Adds both EE and JOINT action features."""

    def __call__(self, tr):
        # Transitions pass through untouched; only features are transformed.
        return tr

    def transform_features(self, features: dict) -> dict:
        # Register end-effector (ee.*) and joint (j*.pos) action entries as floats.
        for key in ("action.ee.x", "action.ee.y", "action.j1.pos", "action.j2.pos"):
            features[key] = float
        return features
@dataclass
class AddObservationStateFeatures:
    """Adds state features (and optionally an image spec to test precedence)."""

    add_front_image: bool = False
    front_image_shape: tuple = (240, 320, 3)

    def __call__(self, tr):
        # Transitions pass through untouched; only features are transformed.
        return tr

    def transform_features(self, features: dict) -> dict:
        # State entries: one end-effector coordinate plus one joint position.
        features.update(
            {
                "observation.state.ee.x": float,
                "observation.state.j1.pos": float,
            }
        )
        if self.add_front_image:
            # Optionally advertise a front-camera image spec as well.
            features["observation.images.front"] = self.front_image_shape
        return features
def test_aggregate_joint_action_only():
    """Joint-only patterns yield a single packed "action" feature and no state."""
    pipeline = RobotProcessor([AddActionEEAndJointFeatures()])
    out = aggregate_pipeline_dataset_features(
        pipeline=pipeline,
        initial_features={"front": (480, 640, 3)},
        use_videos=True,
        patterns=["action.j1.pos", "action.j2.pos"],
    )
    # Only "action" with the joint names should be present.
    assert "action" in out and "observation.state" not in out
    action = out["action"]
    assert action["dtype"] == "float32"
    assert set(action["names"]) == {"j1.pos", "j2.pos"}
    assert action["shape"] == (len(action["names"]),)
def test_aggregate_ee_action_and_observation_with_videos():
    """EE action + state patterns aggregate alongside camera video features."""
    pipeline = RobotProcessor([AddActionEEAndJointFeatures(), AddObservationStateFeatures()])
    initial = {"front": (480, 640, 3), "side": (720, 1280, 3)}
    out = aggregate_pipeline_dataset_features(
        pipeline=pipeline,
        initial_features=initial,
        use_videos=True,
        patterns=["action.ee", "observation.state"],
    )

    # Action packs only the EE names.
    assert "action" in out
    assert set(out["action"]["names"]) == {"ee.x", "ee.y"}
    assert out["action"]["dtype"] == "float32"

    # Observation state packs both ee.x and j1.pos as one vector.
    assert "observation.state" in out
    assert set(out["observation.state"]["names"]) == {"ee.x", "j1.pos"}
    assert out["observation.state"]["dtype"] == "float32"

    # Cameras from initial_features show up as video features.
    for cam in ("front", "side"):
        key = f"observation.images.{cam}"
        assert key in out
        assert out[key]["dtype"] == "video"
        assert out[key]["shape"] == initial[cam]
        assert out[key]["names"] == ["height", "width", "channels"]
def test_aggregate_both_action_types():
    """EE and joint patterns together pack into one combined action vector."""
    out = aggregate_pipeline_dataset_features(
        pipeline=RobotProcessor([AddActionEEAndJointFeatures()]),
        initial_features={},
        use_videos=True,
        patterns=["action.ee", "action.j1", "action.j2.pos"],
    )
    assert "action" in out
    expected_names = {"ee.x", "ee.y", "j1.pos", "j2.pos"}
    assert set(out["action"]["names"]) == expected_names
    assert out["action"]["shape"] == (len(expected_names),)
def test_aggregate_images_when_use_videos_false():
    """With use_videos=False, camera features are absent from the output."""
    pipeline = RobotProcessor([AddObservationStateFeatures(add_front_image=True)])
    out = aggregate_pipeline_dataset_features(
        pipeline=pipeline,
        initial_features={"back": (480, 640, 3)},
        use_videos=False,  # camera keys are omitted when videos are disabled
        patterns=None,
    )
    # Neither the initial camera nor the step-provided one survives.
    assert "observation.images.back" not in out
    assert "observation.images.front" not in out
def test_aggregate_images_when_use_videos_true():
    """With use_videos=True, both step and initial cameras become video features."""
    pipeline = RobotProcessor([AddObservationStateFeatures(add_front_image=True)])
    initial = {"back": (480, 640, 3)}
    out = aggregate_pipeline_dataset_features(
        pipeline=pipeline,
        initial_features=initial,
        use_videos=True,
        patterns=None,
    )
    front_key = "observation.images.front"
    back_key = "observation.images.back"
    assert front_key in out
    assert back_key in out
    assert out[front_key]["dtype"] == "video"
    assert out[back_key]["dtype"] == "video"
    assert out[back_key]["shape"] == initial["back"]
def test_initial_camera_not_overridden_by_step_image():
    """A step-provided image spec wins over an initial_features camera entry.

    aggregate_pipeline_dataset_features should keep the step's value
    (setdefault behavior on initial cameras).
    """
    pipeline = RobotProcessor(
        [AddObservationStateFeatures(add_front_image=True, front_image_shape=(240, 320, 3))]
    )
    out = aggregate_pipeline_dataset_features(
        pipeline=pipeline,
        initial_features={"front": (480, 640, 3)},  # must NOT override the step's (240, 320, 3)
        use_videos=True,
        patterns=["observation.images.front"],
    )
    key = "observation.images.front"
    assert key in out
    assert out[key]["shape"] == (240, 320, 3)  # from the step, not from initial_features
+6 -6
View File
@@ -410,7 +410,7 @@ def test_value_types_preserved():
assert processed_obs["old_list"] == [1, 2, 3]
def test_feature_contract_basic_renaming(policy_feature_factory):
def test_features_basic_renaming(policy_feature_factory):
processor = RenameProcessor(rename_map={"a": "x", "b": "y"})
features = {
"a": policy_feature_factory(FeatureType.STATE, (2,)),
@@ -418,7 +418,7 @@ def test_feature_contract_basic_renaming(policy_feature_factory):
"c": policy_feature_factory(FeatureType.ENV, (1,)),
}
out = processor.feature_contract(features.copy())
out = processor.transform_features(features.copy())
# Values preserved and typed
assert out["x"] == features["a"]
@@ -430,14 +430,14 @@ def test_feature_contract_basic_renaming(policy_feature_factory):
assert set(features) == {"a", "b", "c"}
def test_feature_contract_overlapping_keys(policy_feature_factory):
def test_features_overlapping_keys(policy_feature_factory):
# Overlapping renames: both 'a' and 'b' exist. 'a'->'b', 'b'->'c'
processor = RenameProcessor(rename_map={"a": "b", "b": "c"})
features = {
"a": policy_feature_factory(FeatureType.STATE, (1,)),
"b": policy_feature_factory(FeatureType.STATE, (2,)),
}
out = processor.feature_contract(features)
out = processor.transform_features(features)
assert set(out) == {"b", "c"}
assert out["b"] == features["a"]  # 'a' renamed to 'b'
@@ -445,7 +445,7 @@ def test_feature_contract_overlapping_keys(policy_feature_factory):
assert_contract_is_typed(out)
def test_feature_contract_chained_processors(policy_feature_factory):
def test_features_chained_processors(policy_feature_factory):
# Chain two rename processors at the contract level
processor1 = RenameProcessor(rename_map={"pos": "agent_position", "img": "camera_image"})
processor2 = RenameProcessor(
@@ -458,7 +458,7 @@ def test_feature_contract_chained_processors(policy_feature_factory):
"img": policy_feature_factory(FeatureType.VISUAL, (3, 64, 64)),
"extra": policy_feature_factory(FeatureType.ENV, (1,)),
}
out = pipeline.feature_contract(initial_features=spec)
out = pipeline.transform_features(initial_features=spec)
assert set(out) == {"observation.state", "observation.image", "extra"}
assert out["observation.state"] == spec["pos"]
+727
View File
@@ -0,0 +1,727 @@
"""
Tests for the TokenizerProcessor class.
"""
import tempfile
from unittest.mock import patch
import pytest
import torch
from lerobot.configs.types import FeatureType, PolicyFeature
from lerobot.constants import OBS_LANGUAGE
from lerobot.processor.pipeline import RobotProcessor, TransitionKey
from lerobot.processor.tokenizer_processor import TokenizerProcessor
from tests.utils import require_package
def create_transition(
    observation=None, action=None, reward=None, done=None, truncated=None, info=None, complementary_data=None
):
    """Helper function to create test transitions."""
    # Pair each TransitionKey with its argument in positional order.
    keys = (
        TransitionKey.OBSERVATION,
        TransitionKey.ACTION,
        TransitionKey.REWARD,
        TransitionKey.DONE,
        TransitionKey.TRUNCATED,
        TransitionKey.INFO,
        TransitionKey.COMPLEMENTARY_DATA,
    )
    values = (observation, action, reward, done, truncated, info, complementary_data)
    return dict(zip(keys, values))
class MockTokenizer:
"""Mock tokenizer for testing that mimics transformers tokenizer interface."""
def __init__(self, vocab_size: int = 1000):
self.vocab_size = vocab_size
def __call__(
self,
text: str | list[str],
max_length: int = 512,
truncation: bool = True,
padding: str = "max_length",
padding_side: str = "right",
return_tensors: str = "pt",
**kwargs,
) -> dict[str, torch.Tensor]:
"""Mock tokenization that returns deterministic tokens based on text."""
if isinstance(text, str):
texts = [text]
else:
texts = text
batch_size = len(texts)
# Create mock input_ids and attention_mask
input_ids = torch.zeros(batch_size, max_length, dtype=torch.long)
attention_mask = torch.zeros(batch_size, max_length, dtype=torch.long)
for i, txt in enumerate(texts):
# Simple mock: use hash of text to generate deterministic tokens
text_hash = hash(txt) % self.vocab_size
seq_len = min(len(txt.split()), max_length)
# Fill input_ids with simple pattern based on text
for j in range(seq_len):
input_ids[i, j] = (text_hash + j) % self.vocab_size
# Set attention mask for non-padded positions
attention_mask[i, :seq_len] = 1
result = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Return single sequence for single input to match transformers behavior
if len(texts) == 1:
result = {k: v.squeeze(0) for k, v in result.items()}
return result
@pytest.fixture
def mock_tokenizer():
    """Provide a mock tokenizer for testing."""
    # Small vocab keeps the generated token ids easy to reason about in asserts.
    return MockTokenizer(vocab_size=100)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_basic_tokenization(mock_auto_tokenizer):
    """Test basic string tokenization functionality."""
    # AutoTokenizer.from_pretrained is patched to hand back the mock tokenizer.
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10)

    result = processor(create_transition(complementary_data={"task": "pick up the red cube"}))

    # The original task string survives untouched.
    assert result[TransitionKey.COMPLEMENTARY_DATA]["task"] == "pick up the red cube"

    # Token tensors are attached to the observation under the language keys.
    observation = result[TransitionKey.OBSERVATION]
    assert f"{OBS_LANGUAGE}.tokens" in observation
    assert f"{OBS_LANGUAGE}.attention_mask" in observation

    # Single-string input yields unbatched (max_length,) tensors.
    tokens = observation[f"{OBS_LANGUAGE}.tokens"]
    attention_mask = observation[f"{OBS_LANGUAGE}.attention_mask"]
    assert isinstance(tokens, torch.Tensor)
    assert isinstance(attention_mask, torch.Tensor)
    assert tokens.shape == (10,)
    assert attention_mask.shape == (10,)
@require_package("transformers")
def test_basic_tokenization_with_tokenizer_object():
    """Test basic string tokenization functionality using tokenizer object directly."""
    processor = TokenizerProcessor(tokenizer=MockTokenizer(vocab_size=100), max_length=10)

    result = processor(create_transition(complementary_data={"task": "pick up the red cube"}))

    # The original task string survives untouched.
    assert result[TransitionKey.COMPLEMENTARY_DATA]["task"] == "pick up the red cube"

    # Token tensors are attached to the observation under the language keys.
    observation = result[TransitionKey.OBSERVATION]
    assert f"{OBS_LANGUAGE}.tokens" in observation
    assert f"{OBS_LANGUAGE}.attention_mask" in observation

    # Single-string input yields unbatched (max_length,) tensors.
    tokens = observation[f"{OBS_LANGUAGE}.tokens"]
    attention_mask = observation[f"{OBS_LANGUAGE}.attention_mask"]
    assert isinstance(tokens, torch.Tensor) and tokens.shape == (10,)
    assert isinstance(attention_mask, torch.Tensor) and attention_mask.shape == (10,)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_list_of_strings_tokenization(mock_auto_tokenizer):
    """Test tokenization of a list of strings."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=8)

    result = processor(
        create_transition(complementary_data={"task": ["pick up cube", "place on table"]})
    )

    # The original task list survives untouched.
    assert result[TransitionKey.COMPLEMENTARY_DATA]["task"] == ["pick up cube", "place on table"]

    # Batched token tensors: one row per task string, padded to max_length.
    observation = result[TransitionKey.OBSERVATION]
    assert observation[f"{OBS_LANGUAGE}.tokens"].shape == (2, 8)
    assert observation[f"{OBS_LANGUAGE}.attention_mask"].shape == (2, 8)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_custom_keys(mock_auto_tokenizer):
    """Tokens land under the standard observation keys even with a custom task_key."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer", task_key="instruction", max_length=5)

    result = processor(create_transition(complementary_data={"instruction": "move forward"}))

    obs = result[TransitionKey.OBSERVATION]
    # The output key names do not depend on the configured task_key.
    assert f"{OBS_LANGUAGE}.tokens" in obs
    assert f"{OBS_LANGUAGE}.attention_mask" in obs
    assert obs[f"{OBS_LANGUAGE}.tokens"].shape == (5,)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_none_complementary_data(mock_auto_tokenizer):
    """A transition without complementary_data passes through untouched."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer")

    transition = create_transition(complementary_data=None)
    # Nothing to tokenize -> the processor is a no-op.
    assert processor(transition) == transition
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_missing_task_key(mock_auto_tokenizer):
    """When the configured task key is absent, the transition is returned unchanged."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer")

    transition = create_transition(complementary_data={"other_field": "some value"})
    assert processor(transition) == transition
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_none_task_value(mock_auto_tokenizer):
    """A None task value is skipped, returning the transition unchanged."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer")

    transition = create_transition(complementary_data={"task": None})
    assert processor(transition) == transition
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_unsupported_task_type(mock_auto_tokenizer):
    """Non-string task values (or mixed-type lists) are ignored, not tokenized."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer")

    # Neither a bare integer nor a list mixing strings and ints is tokenizable;
    # both must come back unchanged rather than raising.
    for bad_task in (123, ["text", 123, "more text"]):
        transition = create_transition(complementary_data={"task": bad_task})
        assert processor(transition) == transition
@require_package("transformers")
def test_no_tokenizer_error():
    """Test that ValueError is raised when neither tokenizer nor tokenizer_name is provided."""
    # The match string must stay in sync with the message raised by TokenizerProcessor.__init__.
    with pytest.raises(ValueError, match="Either 'tokenizer' or 'tokenizer_name' must be provided"):
        TokenizerProcessor()
@require_package("transformers")
def test_invalid_tokenizer_name_error():
    """Test that error is raised when invalid tokenizer_name is provided."""
    with patch("lerobot.processor.tokenizer_processor.AutoTokenizer") as mock_auto_tokenizer:
        # Simulate AutoTokenizer.from_pretrained failing to load the model
        mock_auto_tokenizer.from_pretrained.side_effect = Exception("Model not found")
        # The constructor should surface the underlying loading error unchanged.
        with pytest.raises(Exception, match="Model not found"):
            TokenizerProcessor(tokenizer_name="invalid-tokenizer")
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_get_config_with_tokenizer_name(mock_auto_tokenizer):
    """get_config() round-trips every constructor argument, including tokenizer_name."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(
        tokenizer_name="test-tokenizer",
        max_length=256,
        task_key="instruction",
        padding="longest",
        truncation=False,
    )

    # The serialized config must contain exactly these entries, no more and no less.
    assert processor.get_config() == {
        "tokenizer_name": "test-tokenizer",
        "max_length": 256,
        "task_key": "instruction",
        "padding_side": "right",
        "padding": "longest",
        "truncation": False,
    }
@require_package("transformers")
def test_get_config_with_tokenizer_object():
    """When a tokenizer object is supplied, get_config() omits tokenizer_name."""
    processor = TokenizerProcessor(
        tokenizer=MockTokenizer(vocab_size=100),
        max_length=256,
        task_key="instruction",
        padding="longest",
        truncation=False,
    )

    config = processor.get_config()

    # Only serializable settings are present; a live tokenizer object cannot
    # be stored, so no tokenizer_name entry appears.
    assert config == {
        "max_length": 256,
        "task_key": "instruction",
        "padding_side": "right",
        "padding": "longest",
        "truncation": False,
    }
    assert "tokenizer_name" not in config
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_state_dict_methods(mock_auto_tokenizer):
    """The processor carries no tensor state: state_dict is empty, loading is a no-op."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer")

    assert processor.state_dict() == {}
    # Loading an empty state must be accepted silently.
    processor.load_state_dict({})
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_reset_method(mock_auto_tokenizer):
    """reset() is a harmless no-op for this stateless processor."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    # Must complete without raising.
    TokenizerProcessor(tokenizer_name="test-tokenizer").reset()
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_integration_with_robot_processor(mock_auto_tokenizer):
    """TokenizerProcessor composes with RobotProcessor without disturbing other fields."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    pipeline = RobotProcessor([TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=6)])

    transition = create_transition(
        observation={"state": torch.tensor([1.0, 2.0])},
        action=torch.tensor([0.1, 0.2]),
        complementary_data={"task": "test task"},
    )
    result = pipeline(transition)

    assert TransitionKey.OBSERVATION in result
    obs = result[TransitionKey.OBSERVATION]
    # Tokenized outputs are present and padded to the configured max_length.
    for key in (f"{OBS_LANGUAGE}.tokens", f"{OBS_LANGUAGE}.attention_mask"):
        assert key in obs
        assert obs[key].shape == (6,)

    # Pre-existing observation/action tensors are passed through untouched.
    assert torch.equal(obs["state"], transition[TransitionKey.OBSERVATION]["state"])
    assert torch.equal(result[TransitionKey.ACTION], transition[TransitionKey.ACTION])
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_save_and_load_pretrained_with_tokenizer_name(mock_auto_tokenizer):
    """A processor built from a tokenizer_name survives a save/load round trip."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    pipeline = RobotProcessor(
        [TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=32, task_key="instruction")]
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        pipeline.save_pretrained(temp_dir)
        # The tokenizer is rebuilt from the saved config on load.
        restored = RobotProcessor.from_pretrained(temp_dir)

        result = restored(create_transition(complementary_data={"instruction": "test instruction"}))
        assert TransitionKey.OBSERVATION in result
        obs = result[TransitionKey.OBSERVATION]
        assert f"{OBS_LANGUAGE}.tokens" in obs
        assert f"{OBS_LANGUAGE}.attention_mask" in obs
@require_package("transformers")
def test_save_and_load_pretrained_with_tokenizer_object():
    """A tokenizer object is not serialized; loading needs an explicit override."""
    mock_tokenizer = MockTokenizer(vocab_size=100)
    pipeline = RobotProcessor(
        [TokenizerProcessor(tokenizer=mock_tokenizer, max_length=32, task_key="instruction")]
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        pipeline.save_pretrained(temp_dir)
        # Re-inject the tokenizer object via overrides, since it was not saved.
        restored = RobotProcessor.from_pretrained(
            temp_dir, overrides={"tokenizer_processor": {"tokenizer": mock_tokenizer}}
        )

        result = restored(create_transition(complementary_data={"instruction": "test instruction"}))
        assert TransitionKey.OBSERVATION in result
        obs = result[TransitionKey.OBSERVATION]
        assert f"{OBS_LANGUAGE}.tokens" in obs
        assert f"{OBS_LANGUAGE}.attention_mask" in obs
@require_package("transformers")
def test_registry_functionality():
    """The step is registered under the name "tokenizer_processor"."""
    from lerobot.processor.pipeline import ProcessorStepRegistry

    assert "tokenizer_processor" in ProcessorStepRegistry.list()
    # The registry must hand back the class itself, not an instance or a copy.
    assert ProcessorStepRegistry.get("tokenizer_processor") is TokenizerProcessor
@require_package("transformers")
def test_features_basic():
    """transform_features keeps existing entries and appends the language features."""
    processor = TokenizerProcessor(tokenizer=MockTokenizer(vocab_size=100), max_length=128)

    features = processor.transform_features(
        {
            "observation.state": PolicyFeature(type=FeatureType.STATE, shape=(10,)),
            "action": PolicyFeature(type=FeatureType.ACTION, shape=(5,)),
        }
    )

    # Input features survive untouched.
    assert "observation.state" in features
    assert "action" in features

    # New LANGUAGE features sized by max_length are appended.
    for key in (f"{OBS_LANGUAGE}.tokens", f"{OBS_LANGUAGE}.attention_mask"):
        assert key in features
        assert features[key].type == FeatureType.LANGUAGE
        assert features[key].shape == (128,)
@require_package("transformers")
def test_features_with_custom_max_length():
    """Feature shapes follow the configured max_length."""
    processor = TokenizerProcessor(tokenizer=MockTokenizer(vocab_size=100), max_length=64)

    features = processor.transform_features({})

    # Both language features are created, shaped by max_length.
    for key in (f"{OBS_LANGUAGE}.tokens", f"{OBS_LANGUAGE}.attention_mask"):
        assert key in features
        assert features[key].shape == (64,)
@require_package("transformers")
def test_features_existing_features():
    """Pre-existing language features win over the processor's own shapes."""
    processor = TokenizerProcessor(tokenizer=MockTokenizer(vocab_size=100), max_length=256)

    existing = {
        f"{OBS_LANGUAGE}.tokens": PolicyFeature(type=FeatureType.LANGUAGE, shape=(100,)),
        f"{OBS_LANGUAGE}.attention_mask": PolicyFeature(type=FeatureType.LANGUAGE, shape=(100,)),
    }
    features = processor.transform_features(existing)

    # Shapes stay (100,) rather than being replaced with (256,).
    assert features[f"{OBS_LANGUAGE}.tokens"].shape == (100,)
    assert features[f"{OBS_LANGUAGE}.attention_mask"].shape == (100,)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_tokenization_parameters(mock_auto_tokenizer):
    """Every configured tokenization parameter is forwarded to the tokenizer call."""

    class RecordingTokenizer:
        """Tokenizer stub that records the args/kwargs of its last call."""

        def __init__(self):
            self.last_call_args = None
            self.last_call_kwargs = None

        def __call__(self, *args, **kwargs):
            self.last_call_args = args
            self.last_call_kwargs = kwargs
            # Minimal valid encoding so the processor can proceed.
            return {
                "input_ids": torch.zeros(16, dtype=torch.long),
                "attention_mask": torch.ones(16, dtype=torch.long),
            }

    recorder = RecordingTokenizer()
    mock_auto_tokenizer.from_pretrained.return_value = recorder
    processor = TokenizerProcessor(
        tokenizer_name="test-tokenizer",
        max_length=16,
        padding="longest",
        truncation=False,
        padding_side="left",
    )

    processor(create_transition(complementary_data={"task": "test task"}))

    # The single task string is wrapped in a list before tokenization,
    # and every configured option reaches the tokenizer untouched.
    assert recorder.last_call_args == (["test task"],)
    assert recorder.last_call_kwargs["max_length"] == 16
    assert recorder.last_call_kwargs["padding"] == "longest"
    assert recorder.last_call_kwargs["padding_side"] == "left"
    assert recorder.last_call_kwargs["truncation"] is False
    assert recorder.last_call_kwargs["return_tensors"] == "pt"
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_preserves_other_complementary_data(mock_auto_tokenizer):
    """Tokenizing the task leaves all sibling complementary-data fields intact."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer")

    result = processor(
        create_transition(
            complementary_data={
                "task": "test task",
                "episode_id": 123,
                "timestamp": 456.789,
                "other_field": {"nested": "data"},
            }
        )
    )

    comp_data = result[TransitionKey.COMPLEMENTARY_DATA]
    # Every original field survives with its value unchanged.
    assert comp_data["task"] == "test task"
    assert comp_data["episode_id"] == 123
    assert comp_data["timestamp"] == 456.789
    assert comp_data["other_field"] == {"nested": "data"}

    obs = result[TransitionKey.OBSERVATION]
    # Tokenized outputs are still produced alongside.
    assert f"{OBS_LANGUAGE}.tokens" in obs
    assert f"{OBS_LANGUAGE}.attention_mask" in obs
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_deterministic_tokenization(mock_auto_tokenizer):
    """Running the same transition twice yields identical token tensors."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10)

    transition = create_transition(complementary_data={"task": "consistent test"})
    first = processor(transition)[TransitionKey.OBSERVATION]
    second = processor(transition)[TransitionKey.OBSERVATION]

    # Both runs must produce element-wise identical tensors.
    assert torch.equal(first[f"{OBS_LANGUAGE}.tokens"], second[f"{OBS_LANGUAGE}.tokens"])
    assert torch.equal(first[f"{OBS_LANGUAGE}.attention_mask"], second[f"{OBS_LANGUAGE}.attention_mask"])
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_empty_string_task(mock_auto_tokenizer):
    """An empty task string still produces a padded token tensor."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=8)

    result = processor(create_transition(complementary_data={"task": ""}))

    obs = result[TransitionKey.OBSERVATION]
    # The mock tokenizer accepts empty strings, so tokens are still emitted.
    assert f"{OBS_LANGUAGE}.tokens" in obs
    assert obs[f"{OBS_LANGUAGE}.tokens"].shape == (8,)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_very_long_task(mock_auto_tokenizer):
    """With truncation enabled, an over-long task is clipped to max_length."""
    mock_auto_tokenizer.from_pretrained.return_value = MockTokenizer(vocab_size=100)
    processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=5, truncation=True)

    long_task = " ".join(["word"] * 100)
    result = processor(create_transition(complementary_data={"task": long_task}))

    obs = result[TransitionKey.OBSERVATION]
    # Both outputs are truncated/padded down to max_length.
    assert obs[f"{OBS_LANGUAGE}.tokens"].shape == (5,)
    assert obs[f"{OBS_LANGUAGE}.attention_mask"].shape == (5,)
@require_package("transformers")
@patch("lerobot.processor.tokenizer_processor.AutoTokenizer")
def test_custom_padding_side(mock_auto_tokenizer):
    """The processor's padding_side setting reaches the underlying tokenizer call."""

    class PaddingSideRecorder:
        """Tokenizer stub that records the padding_side of every call."""

        def __init__(self):
            self.padding_side_calls = []

        def __call__(
            self,
            text,
            max_length=512,
            truncation=True,
            padding="max_length",
            padding_side="right",
            return_tensors="pt",
            **kwargs,
        ):
            self.padding_side_calls.append(padding_side)
            # Minimal valid encoding so the processor can proceed.
            return {
                "input_ids": torch.zeros(max_length, dtype=torch.long),
                "attention_mask": torch.ones(max_length, dtype=torch.long),
            }

    recorder = PaddingSideRecorder()
    mock_auto_tokenizer.from_pretrained.return_value = recorder
    transition = create_transition(complementary_data={"task": "test task"})

    # Left padding is forwarded verbatim.
    TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10, padding_side="left")(transition)
    assert recorder.padding_side_calls[-1] == "left"

    # So is the (default) right padding.
    TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10, padding_side="right")(transition)
    assert recorder.padding_side_calls[-1] == "right"
+205
View File
@@ -0,0 +1,205 @@
import importlib
import sys
from types import SimpleNamespace
import numpy as np
import pytest
from lerobot.processor.pipeline import TransitionKey
@pytest.fixture
def mock_rerun(monkeypatch):
    """
    Provide a mock `rerun` module so tests don't depend on the real library.
    Also reload the module-under-test so it binds to this mock `rr`.
    """
    # Record of every rr.log(key, obj, **kwargs) call made by the code under test.
    calls = []
    class DummyScalar:
        # Stand-in for rr.Scalar: coerces the logged value to float.
        def __init__(self, value):
            self.value = float(value)
    class DummyImage:
        # Stand-in for rr.Image: keeps the raw array so tests can assert on shape.
        def __init__(self, arr):
            self.arr = arr
    def dummy_log(key, obj, **kwargs):
        # Append (key, object, kwargs) so helpers below can query logged data.
        calls.append((key, obj, kwargs))
    dummy_rr = SimpleNamespace(
        Scalar=DummyScalar,
        Image=DummyImage,
        log=dummy_log,
        init=lambda *a, **k: None,
        spawn=lambda *a, **k: None,
    )
    # Inject fake module into sys.modules BEFORE the reload below, so that
    # `import rerun` inside the module under test resolves to this stub.
    monkeypatch.setitem(sys.modules, "rerun", dummy_rr)
    # Now import and reload the module under test, to bind to our rerun mock
    import lerobot.utils.visualization_utils as vu
    importlib.reload(vu)
    # Expose both the reloaded module and the call recorder
    yield vu, calls
def _keys(calls):
"""Helper to extract just the keys logged to rr.log"""
return [k for (k, _obj, _kw) in calls]
def _obj_for(calls, key):
"""Find the first object logged under a given key."""
for k, obj, _kw in calls:
if k == key:
return obj
raise KeyError(f"Key {key} not found in calls: {calls}")
def _kwargs_for(calls, key):
for k, _obj, kw in calls:
if k == key:
return kw
raise KeyError(f"Key {key} not found in calls: {calls}")
def test_log_rerun_data_envtransition_scalars_and_image(mock_rerun):
    """An EnvTransition dict logs scalars for numbers/vectors and an HWC image."""
    vu, calls = mock_rerun

    transition = {
        TransitionKey.OBSERVATION: {
            "observation.state.temperature": np.float32(25.0),
            # CHW image should be converted to HWC for rr.Image
            "observation.camera": np.zeros((3, 10, 20), dtype=np.uint8),
        },
        TransitionKey.ACTION: {
            "action.throttle": 0.7,
            # 1D array should log individual Scalars with suffix _i
            "action.vector": np.array([1.0, 2.0], dtype=np.float32),
        },
    }
    vu.log_rerun_data(transition)

    # One scalar per numeric entry, one scalar per vector element,
    # and a single image entry for the camera.
    assert set(_keys(calls)) == {
        "observation.state.temperature",
        "observation.camera",
        "action.throttle",
        "action.vector_0",
        "action.vector_1",
    }

    temperature = _obj_for(calls, "observation.state.temperature")
    assert type(temperature).__name__ == "DummyScalar"
    assert temperature.value == pytest.approx(25.0)

    throttle = _obj_for(calls, "action.throttle")
    assert type(throttle).__name__ == "DummyScalar"
    assert throttle.value == pytest.approx(0.7)

    for idx, expected in enumerate([1.0, 2.0]):
        element = _obj_for(calls, f"action.vector_{idx}")
        assert type(element).__name__ == "DummyScalar"
        assert element.value == pytest.approx(expected)

    # CHW input must come out transposed to HWC, logged as a static entity.
    camera = _obj_for(calls, "observation.camera")
    assert type(camera).__name__ == "DummyImage"
    assert camera.arr.shape == (10, 20, 3)
    assert _kwargs_for(calls, "observation.camera").get("static", False) is True
def test_log_rerun_data_plain_list_ordering_and_prefixes(mock_rerun):
    """Two un-prefixed dicts are prefixed 'observation.' / 'action.' by position."""
    vu, calls = mock_rerun

    first_dict = {
        "temp": 1.5,
        # Already HWC image => should stay as-is
        "img": np.zeros((5, 6, 3), dtype=np.uint8),
        "none": None,  # should be skipped
    }
    second_dict = {
        "throttle": 0.3,
        "vec": np.array([9, 8, 7], dtype=np.float32),
    }
    vu.log_rerun_data([first_dict, second_dict])

    # The first dict is logged under observation.*, the second under action.*;
    # None values are dropped and vectors expand to one key per element.
    assert set(_keys(calls)) == {
        "observation.temp",
        "observation.img",
        "action.throttle",
        "action.vec_0",
        "action.vec_1",
        "action.vec_2",
    }

    temp = _obj_for(calls, "observation.temp")
    assert type(temp).__name__ == "DummyScalar"
    assert temp.value == pytest.approx(1.5)

    throttle = _obj_for(calls, "action.throttle")
    assert type(throttle).__name__ == "DummyScalar"
    assert throttle.value == pytest.approx(0.3)

    # An already-HWC image is logged unchanged and marked static.
    img = _obj_for(calls, "observation.img")
    assert type(img).__name__ == "DummyImage"
    assert img.arr.shape == (5, 6, 3)
    assert _kwargs_for(calls, "observation.img").get("static", False) is True

    for i, val in enumerate([9, 8, 7]):
        element = _obj_for(calls, f"action.vec_{i}")
        assert type(element).__name__ == "DummyScalar"
        assert element.value == pytest.approx(val)
def test_log_rerun_data_kwargs_only(mock_rerun):
    # log_rerun_data can be driven purely by keyword dicts with a None positional
    # payload; keys that already carry prefixes are used verbatim.
    vu, calls = mock_rerun
    vu.log_rerun_data(
        None,
        observation={"observation.temp": 10.0, "observation.gray": np.zeros((8, 8, 1), dtype=np.uint8)},
        action={"action.a": 1.0},
    )
    keys = set(_keys(calls))
    assert "observation.temp" in keys
    assert "observation.gray" in keys
    assert "action.a" in keys
    # Numeric entries come through as scalars carrying their original values.
    temp = _obj_for(calls, "observation.temp")
    assert type(temp).__name__ == "DummyScalar"
    assert temp.value == pytest.approx(10.0)
    # Single-channel HWC images are logged without transposition, marked static.
    img = _obj_for(calls, "observation.gray")
    assert type(img).__name__ == "DummyImage"
    assert img.arr.shape == (8, 8, 1)  # remains HWC
    assert _kwargs_for(calls, "observation.gray").get("static", False) is True
    a = _obj_for(calls, "action.a")
    assert type(a).__name__ == "DummyScalar"
    assert a.value == pytest.approx(1.0)