feat(processor): multiple improvements to the pipeline porting (#1749)

* [Port codebase pipeline] General fixes for RL and scripts (#1748)

* Refactor dataset configuration in documentation and codebase

- Updated dataset configuration keys from `dataset_root` to `root` and `num_episodes` to `num_episodes_to_record` for consistency.
- Adjusted replay episode handling by renaming `episode` to `replay_episode`.
- Enhanced documentation
- added specific processor to transform from policy actions to delta actions

* Added Robot action to tensor processor
Added new processor script for dealing with gym specific action processing

* removed RobotAction2Tensor processor; improved choosing observations in actor

* nit in delta action

* added missing reset functions to kinematics

* Adapt teleoperate and replay to pipeline similar to record

* refactor(processors): move to inheritance (#1750)

* fix(teleoperator): improvements phone implementation (#1752)

* fix(teleoperator): protect shared state in phone implementation

* refactor(teleop): separate classes in phone

* fix: solve breaking changes (#1753)

* refactor(policies): multiple improvements (#1754)

* refactor(processor): simpler logic in device processor (#1755)

* refactor(processor): euclidean distance in delta action processor (#1757)

* refactor(processor): improvements to joint observations processor migration (#1758)

* refactor(processor): improvements to tokenizer migration (#1759)

* refactor(processor): improvements to tokenizer migration

* fix(tests): tokenizer tests regression from #1750

* fix(processors): fix float comparison and config in hil processors (#1760)

* chore(teleop): remove unnecessary callbacks in KeyboardEndEffectorTeleop (#1761)

* refactor(processor): improvements normalize pipeline migration (#1756)

* refactor(processor): several improvements normalize processor step

* refactor(processor): more improvements normalize processor

* refactor(processor): more changes to normalizer

* refactor(processor): take a different approach to DRY

* refactor(processor): final design

* chore(record): revert comment and continue deleted (#1764)

* refactor(examples): pipeline phone examples (#1769)

* refactor(examples): phone teleop + teleop script

* refactor(examples): phone replay + replay

* chore(examples): rename phone example files & folders

* feat(processor): fix improvements to the pipeline porting (#1796)

* refactor(processor): enhance tensor device handling in normalization process (#1795)

* refactor(tests): remove unsupported device detection test for complementary data (#1797)

* chore(tests): update ToBatchProcessor test (#1798)

* refactor(tests): remove in-place mutation tests for actions and complementary data in batch processor

* test(tests): add tests for action and task processing in batch processor

* add names for android and ios phone (#1799)

* use _tensor_stats in normalize processor (#1800)

* fix(normalize_processor): correct device reference for tensor epsilon handling (#1801)

* add point 5 add missing feature contracts (#1806)

* Fix PR comments 1452 (#1807)

* use key to determine image

* Address rest of PR comments

* use PolicyFeatures in transform_features

---------

Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>

---------

Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
Co-authored-by: Adil Zouitine <adilzouitinegm@gmail.com>
Co-authored-by: Pepijn <138571049+pkooij@users.noreply.github.com>
This commit is contained in:
Steven Palma
2025-08-31 20:38:52 +02:00
committed by GitHub
parent 35c5d43255
commit ce665160ae
55 changed files with 1549 additions and 2024 deletions
+58 -38
View File
@@ -98,7 +98,11 @@ def test_basic_tokenization(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10)
transition = create_transition(complementary_data={"task": "pick up the red cube"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": "pick up the red cube"},
)
result = processor(transition)
@@ -126,7 +130,11 @@ def test_basic_tokenization_with_tokenizer_object():
processor = TokenizerProcessor(tokenizer=mock_tokenizer, max_length=10)
transition = create_transition(complementary_data={"task": "pick up the red cube"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": "pick up the red cube"},
)
result = processor(transition)
@@ -156,7 +164,11 @@ def test_list_of_strings_tokenization(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=8)
transition = create_transition(complementary_data={"task": ["pick up cube", "place on table"]})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": ["pick up cube", "place on table"]},
)
result = processor(transition)
@@ -180,7 +192,11 @@ def test_custom_keys(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer", task_key="instruction", max_length=5)
transition = create_transition(complementary_data={"instruction": "move forward"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"instruction": "move forward"},
)
result = processor(transition)
@@ -421,7 +437,11 @@ def test_save_and_load_pretrained_with_tokenizer_name(mock_auto_tokenizer):
loaded_processor = RobotProcessor.from_pretrained(temp_dir)
# Test that loaded processor works
transition = create_transition(complementary_data={"instruction": "test instruction"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"instruction": "test instruction"},
)
result = loaded_processor(transition)
assert TransitionKey.OBSERVATION in result
@@ -448,7 +468,11 @@ def test_save_and_load_pretrained_with_tokenizer_object():
)
# Test that loaded processor works
transition = create_transition(complementary_data={"instruction": "test instruction"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"instruction": "test instruction"},
)
result = loaded_processor(transition)
assert TransitionKey.OBSERVATION in result
@@ -569,7 +593,11 @@ def test_tokenization_parameters(mock_auto_tokenizer):
padding_side="left",
)
transition = create_transition(complementary_data={"task": "test task"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": "test task"},
)
processor(transition)
@@ -592,12 +620,14 @@ def test_preserves_other_complementary_data(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer")
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={
"task": "test task",
"episode_id": 123,
"timestamp": 456.789,
"other_field": {"nested": "data"},
}
},
)
result = processor(transition)
@@ -624,7 +654,11 @@ def test_deterministic_tokenization(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10)
transition = create_transition(complementary_data={"task": "consistent test"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": "consistent test"},
)
result1 = processor(transition)
result2 = processor(transition)
@@ -648,7 +682,11 @@ def test_empty_string_task(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=8)
transition = create_transition(complementary_data={"task": ""})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": ""},
)
result = processor(transition)
@@ -669,7 +707,11 @@ def test_very_long_task(mock_auto_tokenizer):
processor = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=5, truncation=True)
long_task = " ".join(["word"] * 100) # Very long task
transition = create_transition(complementary_data={"task": long_task})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": long_task},
)
result = processor(transition)
@@ -714,7 +756,11 @@ def test_custom_padding_side(mock_auto_tokenizer):
# Test left padding
processor_left = TokenizerProcessor(tokenizer_name="test-tokenizer", max_length=10, padding_side="left")
transition = create_transition(complementary_data={"task": "test task"})
transition = create_transition(
observation={"state": torch.tensor([1.0, 2.0])},
action=torch.tensor([0.1, 0.2]),
complementary_data={"task": "test task"},
)
processor_left(transition)
assert tracking_tokenizer.padding_side_calls[-1] == "left"
@@ -873,32 +919,6 @@ def test_device_detection_from_action():
assert attention_mask.device.type == "cuda"
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@require_package("transformers")
def test_device_detection_from_complementary_data():
"""Test that device is detected from tensors in complementary_data."""
mock_tokenizer = MockTokenizer(vocab_size=100)
processor = TokenizerProcessor(tokenizer=mock_tokenizer, max_length=10)
# Create transition with tensor in complementary_data
transition = create_transition(
observation={"metadata": {"key": "value"}}, # No tensors
complementary_data={
"task": "comp data test",
"index": torch.tensor([42]).cuda(), # Tensor in complementary_data
},
)
result = processor(transition)
# Check that tokenized tensors match complementary_data tensor's device
tokens = result[TransitionKey.OBSERVATION][f"{OBS_LANGUAGE}.tokens"]
attention_mask = result[TransitionKey.OBSERVATION][f"{OBS_LANGUAGE}.attention_mask"]
assert tokens.device.type == "cuda"
assert attention_mask.device.type == "cuda"
@require_package("transformers")
def test_device_detection_preserves_dtype():
"""Test that device detection doesn't affect dtype of tokenized tensors."""