Files
Qizhi Chen 67091bc4a7 💥 Add generic converter adapter hooks (#107)
* Add generic converter adapter hooks

Co-authored-by: Codex <codex@openai.com>

* Require conversion task repo ids

Co-authored-by: Codex <codex@openai.com>

* Remove conversion task runtime repo id check

Co-authored-by: Codex <codex@openai.com>

* Apply suggestion from @gemini-code-assist[bot]

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

---------

Co-authored-by: Codex <codex@openai.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-06-21 14:53:09 -07:00

39 lines
865 B
Python

from collections.abc import Mapping, Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
# Mapping of string keys to arbitrary values; used below as the type of
# ConversionTask.metadata.
TaskMetadata = Mapping[str, Any]
# Mapping of string keys to plain dicts.
# NOTE(review): presumably feature name -> feature description; nothing in the
# visible code uses this alias, so confirm the schema against its callers.
FeatureSpec = Mapping[str, dict]
@dataclass(frozen=True)
class ConversionTask:
    """One independently convertible raw input file and adapter metadata.

    The dataclass is frozen, so fields cannot be reassigned after
    construction.

    NOTE: with ``frozen=True`` and the default ``eq=True`` the generated
    ``__hash__`` hashes every field, so ``hash(task)`` raises ``TypeError``
    while ``metadata`` holds a ``dict`` (including the default empty one).
    """

    # Path of the raw input file for this task.
    input_path: Path
    # Output location for this task.
    # NOTE(review): presumably where the converted result is written — confirm.
    output_path: Path
    # String repository id associated with the task; its exact format is not
    # visible in this file.
    local_repo_id: str
    # Adapter-specific key/value data; each instance gets its own fresh empty
    # dict when the argument is omitted.
    metadata: TaskMetadata = field(default_factory=dict)
def setup_logger():
    """Reconfigure the datatrove logger to write to stdout and return it.

    Calls ``remove()`` on the shared logger, then registers ``sys.stdout`` as
    a sink with level ``"INFO"`` and colorized output.

    NOTE(review): the datatrove logger appears to be loguru-style, where
    ``remove()`` without arguments drops every previously registered sink —
    confirm against the datatrove version in use.

    Returns:
        The logger object imported from ``datatrove.utils.logging``.
    """
    # Imports are kept local so that merely importing this module neither
    # requires datatrove nor touches logging configuration.
    from sys import stdout

    from datatrove.utils.logging import logger as datatrove_logger

    datatrove_logger.remove()
    datatrove_logger.add(stdout, level="INFO", colorize=True)
    return datatrove_logger
def unique_strings(values: Sequence[str]) -> list[str]:
    """Return the distinct items of *values* in first-occurrence order.

    Args:
        values: Strings to de-duplicate; may be empty.

    Returns:
        A new list containing each distinct value once, ordered by where it
        first appears in *values*. The input is not modified.
    """
    # dict keys are unique and keep insertion order, so fromkeys() performs
    # the order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(values))