From 6ee799ef00755f2ff063e8f01fc19fef2664553b Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 23:10:37 -0400 Subject: [PATCH] Fix smoke/d12 runner: dedupe logs, disambiguate git ref, robust HF + self-delete on early failure --- runs/runpod/d12.sh | 28 +++++++++++++++++++++++----- runs/runpod/smoke.sh | 28 +++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh index 9119c87d..2203916f 100755 --- a/runs/runpod/d12.sh +++ b/runs/runpod/d12.sh @@ -33,11 +33,18 @@ NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" BACKUP_PID="" mkdir -p /workspace -exec > >(tee -a "$LOG_FILE") 2>&1 +# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE. +# Don't add a second tee here — would write every line twice. echo "[runner] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN" +# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs +# even if we fail before the venv is activated. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[runner] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true + cleanup() { local rc=$? set +e @@ -76,9 +83,19 @@ cleanup() { fi echo "[runner] self-deleting pod $RUNPOD_POD_ID" - runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ - curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ - "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" + # Preinstalled runpodctl may be older (legacy 'remove pod') or newer ('pod delete'). + # Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors. + if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then + : + elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then + : + else + echo "[runner] runpodctl delete failed via both syntaxes, using REST API" + curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \ + echo "[runner] WARN: REST delete also failed — pod may need manual cleanup" + fi exit "$rc" } trap cleanup EXIT @@ -90,7 +107,8 @@ trap cleanup EXIT rm -rf "$WORKDIR" git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" cd "$WORKDIR" -git checkout "$NANOCHAT_REF" +# `--` disambiguates ref-vs-file (some images create a `dev` file in HOME) +git checkout "$NANOCHAT_REF" -- echo "[runner] HEAD = $(git rev-parse HEAD)" sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh index d023a764..0f60bc58 100755 --- a/runs/runpod/smoke.sh +++ b/runs/runpod/smoke.sh @@ -25,10 +25,17 @@ LOG_FILE="/workspace/runner.log" NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" mkdir -p /workspace -exec > >(tee -a "$LOG_FILE") 2>&1 +# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE. +# Don't add a second tee here — would write every line twice. echo "[smoke] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" +# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs +# even if we fail before the venv is activated. Try pip3, then python3 -m pip. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[smoke] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true + cleanup() { local rc=$? set +e @@ -45,9 +52,19 @@ cleanup() { echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX" echo "[smoke] self-deleting pod $RUNPOD_POD_ID" - runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ - curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ - "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" + # The preinstalled runpodctl may be older (legacy 'remove pod' syntax) or newer ('pod delete'). + # Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors. + if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then + : + elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then + : + else + echo "[smoke] runpodctl delete failed via both syntaxes, using REST API" + curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \ + echo "[smoke] WARN: REST delete also failed — pod may need manual cleanup" + fi exit "$rc" } trap cleanup EXIT @@ -60,7 +77,8 @@ trap cleanup EXIT rm -rf "$WORKDIR" git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" cd "$WORKDIR" -git checkout "$NANOCHAT_REF" +# `--` disambiguates ref-vs-file (some images create a `dev` file in HOME) +git checkout "$NANOCHAT_REF" -- echo "[smoke] HEAD = $(git rev-parse HEAD)" # Env + uv