# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Profiles LeRobot policies (weekly, on relevant PRs, or on demand) inside the
# GPU Docker image and optionally publishes results to a Hub dataset.
name: Model Profiling

on:
  # Weekly run: Sundays at 00:00 UTC.
  schedule:
    - cron: "0 0 * * 0"
  # PR runs are restricted to files that affect profiling behavior.
  pull_request:
    branches:
      - main
      - feat/libero-benchmark
    paths:
      - .github/workflows/model_profiling.yml
      - profiling/model_profiling_specs.json
      - scripts/ci/run_model_profiling.py
      - src/lerobot/configs/train.py
      - src/lerobot/scripts/lerobot_train.py
      - src/lerobot/utils/profiling_utils.py
      - tests/scripts/test_model_profiling.py
  workflow_dispatch:
    inputs:
      git_ref:
        description: Git ref to profile when no commit SHA is provided
        required: false
        type: string
        default: main
      git_commit:
        description: Optional exact commit SHA to profile
        required: false
        type: string
        default: ""
      policies:
        description: Optional comma-separated policy filter
        required: false
        type: string
        default: ""
      profile_mode:
        description: Torch profiler mode
        required: false
        type: choice
        options:
          - trace
          - summary
        default: trace
      publish_results:
        description: Publish results to the profiling dataset when a Hub token is available
        required: false
        type: boolean
        default: true
      results_repo:
        description: Dataset repo name or fully qualified repo id
        required: false
        type: string
        default: model-profiling-history

permissions:
  contents: read

# One in-flight run per (workflow, trigger, ref/commit); newer runs cancel older ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.inputs.git_commit || github.event.inputs.git_ref || github.ref_name || github.run_id }}
  cancel-in-progress: true

jobs:
  profile-models:
    name: Weekly Model Profiling
    runs-on:
      group: aws-g6-4xlarge-plus
    env:
      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
      # PRs always use the cheap summary mode and a fixed policy set;
      # dispatch/schedule honor the inputs (falling back to trace / all policies).
      PROFILE_MODE: ${{ github.event_name == 'pull_request' && 'summary' || github.event.inputs.profile_mode || 'trace' }}
      POLICY_FILTER: ${{ github.event_name == 'pull_request' && 'act,diffusion,pi0,pi05,smolvla,groot,xvla,wall_x' || github.event.inputs.policies || '' }}
      RESULTS_REPO: ${{ github.event.inputs.results_repo || 'model-profiling-history' }}
      # Publish only from the weekly schedule, or from a dispatch that opted in.
      SHOULD_PUBLISH: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_results == 'true') }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
          lfs: true
          ref: ${{ github.event.pull_request.head.sha || github.event.inputs.git_commit || github.event.inputs.git_ref || 'main' }}

      - name: Pull GPU image
        run: docker pull huggingface/lerobot-gpu:latest

      - name: Run model profiling
        env:
          HOST_GIT_COMMIT: ${{ github.event.pull_request.head.sha || github.event.inputs.git_commit || github.sha }}
          PROFILE_GIT_REF: ${{ github.head_ref || github.ref_name || github.event.inputs.git_ref || 'main' }}
          PROFILE_PR_NUMBER: ${{ github.event.pull_request.number || '' }}
        run: |
          set -eux
          mkdir -p profiling-results
          docker run --rm --gpus all \
            --user "$(id -u):$(id -g)" \
            --shm-size=16g \
            -e HOME=/tmp/lerobot-home \
            -e HF_HOME=/tmp/hf \
            -e HF_LEROBOT_HOME=/tmp/hf-lerobot \
            -e TORCH_HOME=/tmp/torch-home \
            -e TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor-cache \
            -e UV_PROJECT_ENVIRONMENT=/tmp/lerobot-venv \
            -e UV_CACHE_DIR=/tmp/uv-cache \
            -e UV_PYTHON_PREFERENCE=only-system \
            -e XDG_DATA_HOME=/tmp/xdg-data \
            -e XDG_CACHE_HOME=/tmp/xdg-cache \
            -e HOST_GIT_COMMIT="${HOST_GIT_COMMIT}" \
            -e PROFILE_GIT_REF="${PROFILE_GIT_REF}" \
            -e PROFILE_PR_NUMBER="${PROFILE_PR_NUMBER}" \
            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
            -e HF_TOKEN="${HF_USER_TOKEN}" \
            -e PROFILE_MODE="${PROFILE_MODE}" \
            -e POLICY_FILTER="${POLICY_FILTER}" \
            -e RESULTS_REPO="${RESULTS_REPO}" \
            -e SHOULD_PUBLISH="${SHOULD_PUBLISH}" \
            -v "${GITHUB_WORKSPACE}:/workspace" \
            -w /workspace \
            huggingface/lerobot-gpu:latest \
            bash -c '
              set -euxo pipefail
              mkdir -p "${HOME}" "${HF_HOME}" "${HF_LEROBOT_HOME}" "${TORCH_HOME}" "${UV_CACHE_DIR}" "${XDG_CACHE_HOME}" "${XDG_DATA_HOME}" "${TORCHINDUCTOR_CACHE_DIR}"
              # Work on a writable copy so uv/pip never mutate the mounted workspace.
              rm -rf /tmp/lerobot-src
              cp -a /workspace/. /tmp/lerobot-src
              cd /tmp/lerobot-src
              if [[ -n "${HF_USER_TOKEN:-}" ]]; then
                hf auth login --token "${HF_USER_TOKEN}" --add-to-git-credential 2>/dev/null || true
              fi
              policies_to_run=()
              if [[ -n "${POLICY_FILTER}" ]]; then
                IFS="," read -ra policies_to_run <<< "${POLICY_FILTER}"
              else
                policies_to_run=(act diffusion groot multi_task_dit pi0 pi0_fast pi05 smolvla wall_x xvla)
              fi
              # Maps a policy name to the uv extra it needs (empty for act).
              policy_extras() {
                case "$1" in
                  act) ;;
                  diffusion) echo "diffusion" ;;
                  groot) echo "groot" ;;
                  multi_task_dit) echo "multi_task_dit" ;;
                  pi0|pi0_fast|pi05) echo "pi" ;;
                  smolvla) echo "smolvla" ;;
                  wall_x) echo "wallx" ;;
                  xvla) echo "xvla" ;;
                  *)
                    echo "Unknown profiling policy $1" >&2
                    return 1
                    ;;
                esac
              }
              # Policies whose dep-install may fail due to environment constraints
              # (e.g. groot requires compiling flash-attn, which needs nvcc; the CI
              # image only ships the CUDA runtime). Install failures for these are
              # logged as warnings and do not fail the job. See the TODO next to
              # `lerobot[groot]` in pyproject.toml.
              is_install_failure_tolerated() {
                case "$1" in
                  groot) return 0 ;;
                  *) return 1 ;;
                esac
              }
              overall_status=0
              for raw_policy in "${policies_to_run[@]}"; do
                policy="$(echo "${raw_policy}" | xargs)"
                [[ -z "${policy}" ]] && continue
                echo "::group::Profile ${policy}"
                extra="$(policy_extras "${policy}")" || { overall_status=1; echo "::endgroup::"; continue; }
                # Fresh, isolated dependency resolution per policy so that
                # incompatible extras (e.g. flash-attn for groot) never block
                # the rest of the matrix.
                sync_cmd=(uv sync --locked --extra training --extra test)
                if [[ -n "${extra}" ]]; then
                  sync_cmd+=(--extra "${extra}")
                fi
                # flash-attn does not declare torch as a build-time dep, so its
                # isolated build env fails with ModuleNotFoundError. Torch is a
                # core lerobot dep and is already resolved here, so we disable
                # build isolation for flash-attn specifically.
                sync_cmd+=(--no-build-isolation-package flash-attn)
                if ! "${sync_cmd[@]}"; then
                  if is_install_failure_tolerated "${policy}"; then
                    echo "::warning::Dependency install failed for ${policy} (known-fragile); skipping."
                  else
                    echo "Dependency install failed for ${policy}; skipping." >&2
                    overall_status=1
                  fi
                  echo "::endgroup::"
                  continue
                fi
                cmd=(
                  uv run python scripts/ci/run_model_profiling.py
                  --output_dir=/workspace/profiling-results
                  --hub_org=lerobot
                  --results_repo="${RESULTS_REPO}"
                  --profile_mode="${PROFILE_MODE}"
                  --git_commit="${HOST_GIT_COMMIT}"
                  --git_ref="${PROFILE_GIT_REF}"
                  --pr_number="${PROFILE_PR_NUMBER}"
                  --policies "${policy}"
                )
                if [[ "${SHOULD_PUBLISH}" == "true" && -n "${HF_USER_TOKEN:-}" ]]; then
                  cmd+=(--publish)
                fi
                if ! "${cmd[@]}"; then
                  echo "Profiling failed for ${policy}." >&2
                  overall_status=1
                fi
                echo "::endgroup::"
              done
              exit "${overall_status}"
            '

      - name: Upload profiling artifacts
        if: always()
        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
        with:
          name: model-profiling-results
          path: profiling-results
          if-no-files-found: warn