Compare commits


2 Commits

Author | SHA1 | Message | Date
Steven Palma | a5b29d4301 | chore(installation): remove libero installation patch (#2416) | 2025-11-10 11:51:52 +01:00
  * chore(installation): remove libero installation patch
  * fix(ci): exclude groot for unbound deps test
Steven Palma | a4aa316470 | fix(dataset): fix data access bottleneck for faster training (#2408) | 2025-11-07 21:54:44 +01:00
7 changed files with 43 additions and 42 deletions
+3 -3
@@ -83,11 +83,11 @@ jobs:
           fi
       - name: Remove Tags with Git dependencies
-        # TODO(Steven): Temporary patch to remove libero and pi from PyPi 0.4.0 release due to its reliance on git dependencies.
+        # TODO(Steven): Temporary patch to remove pi from PyPi 0.4.0 release due to its reliance on git dependencies.
         run: |
           echo "::info:: Checking for Git dependencies to remove from pyproject.toml..."
-          grep -E '@ git\+https|lerobot\[pi\]|lerobot\[libero\]' pyproject.toml | sed 's/^/::warning:: Removing line: /' || true
-          sed -E -i '/@ git\+https|lerobot\[pi\]|lerobot\[libero\]/d' pyproject.toml
+          grep -E '@ git\+https|lerobot\[pi\]' pyproject.toml | sed 's/^/::warning:: Removing line: /' || true
+          sed -E -i '/@ git\+https|lerobot\[pi\]/d' pyproject.toml
           echo "::info:: Git dependencies removed. Proceeding with build."
       - name: Install build dependencies
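
Note (not part of the diff): a minimal Python sketch of what the narrowed pattern above keeps and drops, run against made-up pyproject.toml lines. The same alternation drives both the grep preview and the sed deletion.

import re

pattern = re.compile(r"@ git\+https|lerobot\[pi\]")

sample_lines = [  # hypothetical pyproject.toml entries, for illustration only
    'pi = ["openpi @ git+https://github.com/Physical-Intelligence/openpi.git"]',
    'all = ["lerobot[pi]", "lerobot[dynamixel]"]',
    'dynamixel = ["dynamixel-sdk>=3.7.31"]',
]

kept = [line for line in sample_lines if not pattern.search(line)]
print(kept)  # only the last line survives, mirroring sed -E -i '/.../d'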
+1 -1
@@ -70,7 +70,7 @@ jobs:
           echo "Dependencies unbound:" && cat pyproject.toml
       - name: Install lerobot with all extras
-        run: uv sync --all-extras
+        run: uv sync --all-extras --no-extra groot # TODO(Steven): Make flash-attn optional
       - name: Run pytest (all extras)
         run: uv run pytest tests -vv
+1 -1
@@ -186,7 +186,7 @@ For a full list of optional dependencies, see:
 https://pypi.org/project/lerobot/
 > [!NOTE]
-> For lerobot 0.4.0, if you want to install libero or pi tags, you will have to do: `pip install "lerobot[pi,libero]@git+https://github.com/huggingface/lerobot.git"`.
+> For lerobot 0.4.0, if you want to install pi tags, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
 >
 > This will be solved in the next patch release
+1 -1
@@ -82,7 +82,7 @@ For a full list of optional dependencies, see:
 https://pypi.org/project/lerobot/
 > [!NOTE]
-> For lerobot 0.4.0, if you want to install libero or pi, you will have to do: `pip install "lerobot[pi,libero]@git+https://github.com/huggingface/lerobot.git"`
+> For lerobot 0.4.0, if you want to install pi, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`
 ### Troubleshooting
-5
@@ -28,11 +28,6 @@ LIBERO is now part of our **multi-eval supported simulation**, meaning you can b
 To Install LIBERO, after following LeRobot official instructions, just do:
 `pip install -e ".[libero]"`
-> [!NOTE]
-> For lerobot 0.4.0, if you want to install libero tag, you will have to do: `pip install "lerobot[libero]@git+https://github.com/huggingface/lerobot.git"`.
->
-> This will be solved in the next patch release
 ### Single-suite evaluation
 Evaluate a policy on one LIBERO suite:
+20 -5
@@ -940,11 +940,26 @@ class LeRobotDataset(torch.utils.data.Dataset):
         return query_timestamps

     def _query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict:
-        return {
-            key: torch.stack(self.hf_dataset[q_idx][key])
-            for key, q_idx in query_indices.items()
-            if key not in self.meta.video_keys
-        }
+        """
+        Query dataset for indices across keys, skipping video keys.
+
+        Tries column-first [key][indices] for speed, falls back to row-first.
+
+        Args:
+            query_indices: Dict mapping keys to index lists to retrieve
+
+        Returns:
+            Dict with stacked tensors of queried data (video keys excluded)
+        """
+        result: dict = {}
+        for key, q_idx in query_indices.items():
+            if key in self.meta.video_keys:
+                continue
+            try:
+                result[key] = torch.stack(self.hf_dataset[key][q_idx])
+            except (KeyError, TypeError, IndexError):
+                result[key] = torch.stack(self.hf_dataset[q_idx][key])
+        return result

     def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict[str, torch.Tensor]:
         """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function
+17 -26

@@ -50,9 +50,9 @@ from typing import Any
 import jsonlines
+import pandas as pd
-import pyarrow.parquet as pq
+import pyarrow as pa
 import tqdm
-from datasets import Dataset, concatenate_datasets
+from datasets import Dataset, Features, Image
 from huggingface_hub import HfApi, snapshot_download
 from requests import HTTPError
@@ -68,7 +68,6 @@ from lerobot.datasets.utils import (
     LEGACY_EPISODES_STATS_PATH,
     LEGACY_TASKS_PATH,
     cast_stats_to_numpy,
-    embed_images,
     flatten_dict,
     get_file_size_in_mb,
     get_parquet_file_size_in_mb,
@@ -175,33 +174,25 @@ def convert_tasks(root, new_root):
     write_tasks(df_tasks, new_root)

-def concat_data_files(
-    paths_to_cat: list[Path], new_root: Path, chunk_idx: int, file_idx: int, image_keys: list[str]
-):
-    """Concatenate multiple parquet data files into a single file.
-
-    Args:
-        paths_to_cat: List of parquet file paths to concatenate
-        new_root: Root directory for the new dataset
-        chunk_idx: Chunk index for the output file
-        file_idx: File index within the chunk
-        image_keys: List of feature keys that contain images
-    """
-    datasets_list: list[Dataset] = [Dataset.from_parquet(str(file)) for file in paths_to_cat]
-    concatenated_ds: Dataset = concatenate_datasets(datasets_list)
-
-    if len(image_keys) > 0:
-        logging.debug(f"Embedding {len(image_keys)} image features for optimal training performance")
-        concatenated_ds = embed_images(concatenated_ds)
+def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
+    # TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
+    dataframes = [pd.read_parquet(file) for file in paths_to_cat]
+    # Concatenate all DataFrames along rows
+    concatenated_df = pd.concat(dataframes, ignore_index=True)

     path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
     path.parent.mkdir(parents=True, exist_ok=True)
-    table = concatenated_ds.with_format("arrow")[:]
-    writer = pq.ParquetWriter(path, schema=table.schema, compression="snappy", use_dictionary=True)
-    writer.write_table(table)
-    writer.close()
+
+    if len(image_keys) > 0:
+        schema = pa.Schema.from_pandas(concatenated_df)
+        features = Features.from_arrow_schema(schema)
+        for key in image_keys:
+            features[key] = Image()
+        schema = features.arrow_schema
+    else:
+        schema = None
+
+    concatenated_df.to_parquet(path, index=False, schema=schema)

 def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int):
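
Note (not part of the diff): the new write path, condensed into a standalone sketch. Shard paths and the image column name below are hypothetical. The concatenated frame is written as-is; only the Arrow schema is patched so the written parquet advertises image columns as datasets Image features.

import pandas as pd
import pyarrow as pa
from datasets import Features, Image

shard_paths = ["file-000.parquet", "file-001.parquet"]  # hypothetical inputs
image_keys = ["observation.images.top"]  # hypothetical image column

# Row-wise concatenation of the shards, as in the new concat_data_files.
df = pd.concat([pd.read_parquet(p) for p in shard_paths], ignore_index=True)

if image_keys:
    # Infer an Arrow schema from the frame, then override image columns with the
    # datasets Image() feature so readers decode them as images, not raw structs.
    features = Features.from_arrow_schema(pa.Schema.from_pandas(df))
    for key in image_keys:
        features[key] = Image()
    schema = features.arrow_schema
else:
    schema = None

# pandas' pyarrow engine forwards schema= to pa.Table.from_pandas.
df.to_parquet("data-000.parquet", index=False, schema=schema)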