# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This Dockerfile builds a GPU inference pod for `lerobot-policy-server`
# (remote inference over Zenoh). It starts from the minimal `-base` flavor of
# the NVIDIA CUDA image, which is sufficient because the cu128 PyTorch wheels
# bundle their own CUDA runtime (driver floor 570.86, see pyproject.toml
# [tool.uv]).

# Build and run (from the repository root, which is the build context):
#   docker build -f docker/Dockerfile.policy-server -t lerobot-policy-server .
#   docker run --gpus all -v ./server.yaml:/etc/lerobot/server.yaml lerobot-policy-server
#
# Extra policy-family dependencies (e.g. pi0/smolvla need transformers) can be
# added at build time:
#   docker build -f docker/Dockerfile.policy-server \
#       --build-arg LEROBOT_EXTRAS="async pi0" -t lerobot-policy-server .

# Base image selection — stays on the same CUDA family as Dockerfile.internal.
# Both values are build arguments and can be overridden with --build-arg.
ARG CUDA_VERSION="12.8.1"
ARG OS_VERSION="24.04"
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${OS_VERSION}

# Stage-scoped build arguments: the Python version passed to `uv venv`, and
# the space-separated list of lerobot extras to install.
ARG PYTHON_VERSION="3.12"
ARG LEROBOT_EXTRAS=async

# Build-time only: stop apt/debconf from prompting during package installs.
# Declared as ARG (still visible to every RUN in this stage) so the setting is
# not persisted into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Put the project's virtual environment first on PATH so that its console
# scripts resolve by bare name without activating the venv.
ENV PATH=/lerobot/.venv/bin:$PATH

# Install system dependencies and uv, and create the non-root runtime user
# (all as root).
# Kept lean: no hardware/teleop libraries — this image only serves policies.
#
# The uv installer is downloaded to a file and then executed instead of being
# piped into `sh`: under the default /bin/sh there is no pipefail, so with
# `curl | sh` a failed or truncated download would be masked by the exit
# status of `sh` (hadolint DL4006).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
        git \
        libglib2.0-0 \
    && curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \
    && sh /tmp/uv-install.sh \
    && rm -f /tmp/uv-install.sh \
    && mv /root/.local/bin/uv /usr/local/bin/uv \
    && useradd --create-home --shell /bin/bash user_lerobot \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Create the application directory and set permissions.
# WORKDIR creates /lerobot if it is missing and makes it the working
# directory for every instruction that follows.
WORKDIR /lerobot
# Still running as root here: hand the directory over to the runtime user so
# the unprivileged steps that follow the USER switch can write into it.
RUN chown -R user_lerobot:user_lerobot /lerobot

# Switch to the non-root user; every later instruction and the container
# process itself run as user_lerobot.
USER user_lerobot

# Cache locations, all inside the non-root user's home directory.
# Model checkpoints are cached under HF_HOME — mount it as a volume
# (or a PVC in Kubernetes) so warm restarts skip the Hub download.
ENV HOME=/home/user_lerobot \
    HF_HOME=/home/user_lerobot/.cache/huggingface \
    HF_LEROBOT_HOME=/home/user_lerobot/.cache/huggingface/lerobot \
    TORCH_HOME=/home/user_lerobot/.cache/torch \
    TRITON_CACHE_DIR=/home/user_lerobot/.cache/triton

# Pre-create the cache directories as the runtime user. When a named volume is
# mounted on a path that does not exist in the image, Docker creates the mount
# point owned by root, which the non-root server process cannot write to; a
# directory that already exists lends its ownership to a freshly created
# volume instead.
RUN mkdir -p "${HF_HOME}" "${HF_LEROBOT_HOME}" "${TORCH_HOME}" "${TRITON_CACHE_DIR}"

# Create the project virtual environment in the working directory; uv
# provisions the interpreter matching the PYTHON_VERSION build argument.
RUN uv venv --python $PYTHON_VERSION

# Install lerobot from the build context together with the extras listed in
# LEROBOT_EXTRAS (space-separated). The default "async" extra brings in
# eclipse-zenoh + msgpack — see pyproject.toml [project.optional-dependencies].
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
COPY --chown=user_lerobot:user_lerobot src/ src/

# Build one `--extra <name>` flag per word of LEROBOT_EXTRAS. A loop is used
# rather than `printf -- '--extra %s '` because printf with no arguments still
# prints a dangling `--extra`, which makes `uv sync` fail when the build
# argument is empty or whitespace-only. With zero extras the loop body never
# runs and only the base dependencies are synced.
# LEROBOT_EXTRAS is intentionally unquoted so the shell splits it into words.
RUN extra_args=""; \
    for extra in ${LEROBOT_EXTRAS}; do \
        extra_args="${extra_args} --extra ${extra}"; \
    done; \
    uv sync --locked --no-cache ${extra_args}

# Port for the HTTP health endpoint and Prometheus metrics
# (manifest field `health_port`; a value of 0 disables it).
EXPOSE 9100/tcp

# ENTRYPOINT is the server binary; CMD holds its default arguments.
# The manifest is normally provided at /etc/lerobot/server.yaml, either as a
# Kubernetes ConfigMap mount or as a bind mount (docker run -v). Individual
# fields can also be overridden on the command line, e.g.
# --model.repo_or_path=lerobot/pi0_towels. Arguments given to `docker run`
# replace CMD entirely, so repeat `--manifest …` when adding overrides.
ENTRYPOINT ["lerobot-policy-server"]
CMD ["--manifest", "/etc/lerobot/server.yaml"]