# Workflow file captured from the GitHub Actions run page for
# PR #506 — "Fix LocalBackend fork_checkpoint to overwrite initial LoRA for vLLM".
name: Prek

on:
  pull_request:
  push:
    branches: [main]

# `contents: write` is required so build-cache can upload cache archives
# as release assets via the GitHub API.
permissions:
  contents: write

env:
  # Base CUDA devel image; keep in sync with `container.image` in every job below.
  CI_BASE_IMAGE: "pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel"
  CI_PYTHON_MM: "3.11"
  # Release tag and asset-name prefix under which uv cache tarball parts are stored.
  CI_UV_CACHE_RELEASE_TAG: "prek-uv-cache"
  CI_UV_CACHE_ASSET_PREFIX: "prek-uv-cache"
  # Build parallelism knobs; these feed the cache fingerprint, so changing
  # them invalidates the prebuilt uv cache.
  CI_APEX_PARALLEL_BUILD: "8"
  CI_APEX_NVCC_THREADS: "1"
  CI_UV_BUILD_SLOTS: "2"
  UV_CACHE_DIR: "/root/.cache/uv"
  UV_LINK_MODE: "copy"
  TORCH_CUDA_ARCH_LIST: "9.0"
jobs:
  # Determine whether a prebuilt uv cache matching the current dependency
  # fingerprint already exists as a complete set of release assets.
  cache-status:
    runs-on: art-large-runner
    outputs:
      cache-hit: ${{ steps.check.outputs.cache-hit }}
      fingerprint: ${{ steps.fingerprint.outputs.fingerprint }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Compute expected uv cache fingerprint
        id: fingerprint
        run: |
          fp="$(python3 scripts/ci/compute_uv_fingerprint.py \
            --pyproject pyproject.toml \
            --uv-lock uv.lock \
            --base-image "${CI_BASE_IMAGE}" \
            --python-mm "${CI_PYTHON_MM}" \
            --torch-cuda-arch-list "${TORCH_CUDA_ARCH_LIST}" \
            --ci-apex-parallel-build "${CI_APEX_PARALLEL_BUILD}" \
            --ci-apex-nvcc-threads "${CI_APEX_NVCC_THREADS}")"
          echo "fingerprint=${fp}" >> "${GITHUB_OUTPUT}"
          echo "Expected uv cache fingerprint: ${fp}"
      - name: Check if uv cache exists
        id: check
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Pass the step output through env rather than interpolating
          # `${{ }}` directly into the script (script-injection hardening).
          FINGERPRINT: ${{ steps.fingerprint.outputs.fingerprint }}
        run: |
          fingerprint="${FINGERPRINT}"
          part_prefix="${CI_UV_CACHE_ASSET_PREFIX}-${fingerprint}.tar.zst.part-"
          release_api="https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/tags/${CI_UV_CACHE_RELEASE_TAG}"
          release_json="$(curl -fsSL \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "${release_api}" || true)"
          if [ -z "${release_json}" ]; then
            echo "Cache release '${CI_UV_CACHE_RELEASE_TAG}' not found."
            echo "cache-hit=false" >> "${GITHUB_OUTPUT}"
            exit 0
          fi
          # A hit requires a gapless part set part-000..part-(N-1); the Python
          # source below must stay at the block scalar's base indentation
          # (leading whitespace would be a Python IndentationError).
          hit="$(RELEASE_JSON="${release_json}" PART_PREFIX="${part_prefix}" python3 -c "
          import json, os, re
          payload = json.loads(os.environ['RELEASE_JSON'])
          prefix = os.environ['PART_PREFIX']
          pattern = re.compile(r'^' + re.escape(prefix) + r'(\d{3})$')
          parts = sorted(
          int(m.group(1))
          for a in payload.get('assets', [])
          for m in [pattern.match(a.get('name', ''))]
          if m and a.get('id') is not None
          )
          print('true' if parts and parts == list(range(len(parts))) else 'false')
          ")"
          echo "cache-hit=${hit}" >> "${GITHUB_OUTPUT}"
          echo "Cache hit: ${hit}"

  # Build the uv cache from scratch and publish it as release assets.
  # Skipped entirely when cache-status reports a hit.
  build-cache:
    needs: cache-status
    if: needs.cache-status.outputs.cache-hit != 'true'
    runs-on: art-cache-builder
    container:
      # Keep in sync with env.CI_BASE_IMAGE above.
      image: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
    steps:
      - name: Install CI dependencies
        run: |
          apt-get update
          apt-get install -y --no-install-recommends ca-certificates curl git zstd libibverbs-dev
          rm -rf /var/lib/apt/lists/*
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "/root/.local/bin" >> "${GITHUB_PATH}"
      - name: Install gh CLI
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          GH_DL_URL="$(curl -fsSL \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            https://api.github.com/repos/cli/cli/releases/latest \
            | python3 -c "import json,sys;r=json.load(sys.stdin);print([a['browser_download_url'] for a in r['assets'] if a['name'].endswith('_linux_amd64.tar.gz')][0])")"
          curl -fsSL "${GH_DL_URL}" | tar xz --strip-components=1 -C /usr/local
          gh version
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Mark workspace as a safe git directory
        run: |
          git config --global --add safe.directory "${GITHUB_WORKSPACE}"
      - name: Build and upload uv cache
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          bash scripts/ci/build_and_push_uv_cache.sh \
            --base-image "${CI_BASE_IMAGE}" \
            --python-mm "${CI_PYTHON_MM}"

  # Restore the prebuilt uv cache, sync the full environment, and run prek
  # hooks (lint, format, typecheck, uv.lock, tests).
  quality-checks:
    needs: [cache-status, build-cache]
    # Run whether build-cache ran or was skipped, but not after failure/cancel.
    if: ${{ !failure() && !cancelled() }}
    runs-on: art-large-runner
    container:
      # Keep in sync with env.CI_BASE_IMAGE above.
      image: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
    steps:
      - name: Install CI dependencies
        run: |
          apt-get update
          apt-get install -y --no-install-recommends ca-certificates curl git zstd libibverbs-dev
          rm -rf /var/lib/apt/lists/*
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "/root/.local/bin" >> "${GITHUB_PATH}"
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Mark workspace as a safe git directory
        run: |
          git config --global --add safe.directory "${GITHUB_WORKSPACE}"
      - name: Restore prebuilt uv cache
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Pass the job output through env rather than interpolating
          # `${{ }}` directly into the script (script-injection hardening).
          FINGERPRINT: ${{ needs.cache-status.outputs.fingerprint }}
        run: |
          release_api="https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/tags/${CI_UV_CACHE_RELEASE_TAG}"
          fingerprint="${FINGERPRINT}"
          part_prefix="${CI_UV_CACHE_ASSET_PREFIX}-${fingerprint}.tar.zst.part-"
          release_json="$(curl -fsSL \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "${release_api}" || true)"
          if [ -z "${release_json}" ]; then
            echo "::error::Missing cache release '${CI_UV_CACHE_RELEASE_TAG}'."
            exit 1
          fi
          # Select a complete, gapless part set; emits "<asset_id> <name>" per part.
          part_selection_file="/tmp/uv-cache-part-selection.txt"
          if ! RELEASE_JSON="${release_json}" PART_PREFIX="${part_prefix}" python3 -c "import json, os, re, sys; payload=json.loads(os.environ['RELEASE_JSON']); part_prefix=os.environ['PART_PREFIX']; pattern=re.compile(r'^' + re.escape(part_prefix) + r'(\\d{3})$'); parts=[]; [parts.append((int(m.group(1)), int(a.get('id')), a.get('name'))) for a in payload.get('assets', []) for m in [pattern.match(a.get('name', ''))] if m and a.get('id') is not None]; parts.sort(key=lambda x: x[0]); indices=[p[0] for p in parts]; expected=list(range(len(parts))); print('\\n'.join(f'{asset_id} {name}' for _, asset_id, name in parts)) if parts and indices == expected else (_ for _ in ()).throw(SystemExit(2 if not parts else 3))" > "${part_selection_file}"; then
            echo "::error::No complete uv cache part set found for prefix '${part_prefix}'."
            exit 1
          fi
          part_count="$(wc -l < "${part_selection_file}" | tr -d ' ')"
          echo "Using uv cache part set '${part_prefix}*' (${part_count} parts)."
          parts_dir="/tmp/uv-cache-parts"
          part_paths_file="/tmp/uv-cache-part-paths.txt"
          rm -rf "${parts_dir}"
          mkdir -p "${parts_dir}"
          awk -v d="${parts_dir}" '{print d "/" $2}' "${part_selection_file}" > "${part_paths_file}"
          # Download all parts in parallel (8 at a time) by asset id.
          PARTS_DIR="${parts_dir}" GITHUB_TOKEN="${GITHUB_TOKEN}" GITHUB_REPOSITORY="${GITHUB_REPOSITORY}" \
          xargs -n 2 -P 8 sh -c '
            asset_id="$1"
            asset_name="$2"
            part_path="${PARTS_DIR}/${asset_name}"
            curl -fsSL -L \
              -H "Authorization: Bearer ${GITHUB_TOKEN}" \
              -H "Accept: application/octet-stream" \
              "https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/assets/${asset_id}" \
              -o "${part_path}"
          ' sh < "${part_selection_file}"
          while IFS= read -r part_path; do
            [ -s "${part_path}" ] || {
              echo "::error::Missing or empty cache part: ${part_path}"
              exit 1
            }
          done < "${part_paths_file}"
          rm -rf "${UV_CACHE_DIR}"
          mkdir -p "${UV_CACHE_DIR}"
          # Concatenate the ordered parts, decompress, and unpack into the uv cache dir.
          while IFS= read -r part_path; do
            cat "${part_path}"
          done < "${part_paths_file}" | zstd -d -c | tar -xf - -C "${UV_CACHE_DIR}"
          du -sh "${UV_CACHE_DIR}"
      - name: Install dependencies (with all optional extras for complete type checking)
        run: |
          # pyproject.toml is patched with CI build overrides below; restore the
          # pristine copy on exit so the workspace stays clean for later steps.
          original_pyproject="$(mktemp)"
          cp pyproject.toml "${original_pyproject}"
          cleanup() {
            mv "${original_pyproject}" pyproject.toml
          }
          trap cleanup EXIT
          # Point the apex build at the cuDNN bundled in the project venv.
          py_mm="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
          cudnn_path="${GITHUB_WORKSPACE}/.venv/lib/python${py_mm}/site-packages/nvidia/cudnn"
          export CUDNN_PATH="${cudnn_path}"
          export CUDNN_HOME="${cudnn_path}"
          export CUDNN_INCLUDE_PATH="${cudnn_path}/include"
          export CUDNN_LIBRARY_PATH="${cudnn_path}/lib"
          export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}"
          export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
          export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
          export UV_CONCURRENT_BUILDS="${CI_UV_BUILD_SLOTS}"
          export CMAKE_BUILD_PARALLEL_LEVEL="${CI_APEX_PARALLEL_BUILD}"
          export MAX_JOBS="${CI_APEX_PARALLEL_BUILD}"
          export NINJAFLAGS="-j${CI_APEX_PARALLEL_BUILD}"
          python3 scripts/ci/apply_ci_uv_build_overrides.py \
            --pyproject pyproject.toml \
            --apex-parallel-build "${CI_APEX_PARALLEL_BUILD}" \
            --apex-nvcc-threads "${CI_APEX_NVCC_THREADS}"
          echo "CI uv build overrides: APEX_PARALLEL_BUILD=${CI_APEX_PARALLEL_BUILD}, NVCC_APPEND_FLAGS=--threads ${CI_APEX_NVCC_THREADS}, UV_CONCURRENT_BUILDS=${CI_UV_BUILD_SLOTS}"
          uv --version
          uv sync --all-extras --group dev --frozen
      - name: Run prek hooks (lint, format, typecheck, uv.lock, tests)
        run: |
          uv run --no-sync prek run --all-files
      - name: Run unit tests (via prek)
        run: |
          uv run --no-sync prek run pytest