Fix smoke/d12 runner: dedupe logs, disambiguate git ref, robust HF + self-delete on early failure

This commit is contained in:
Hayden Free 2026-04-25 23:10:37 -04:00
parent 86cf10c693
commit 6ee799ef00
2 changed files with 46 additions and 10 deletions

View File

@ -33,11 +33,18 @@ NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
BACKUP_PID=""
mkdir -p /workspace
exec > >(tee -a "$LOG_FILE") 2>&1
# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE.
# Don't add a second tee here — it would write every line to the log twice.
echo "[runner] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID"
echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN"
# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs
# even if we fail before the venv is activated.
{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
echo "[runner] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true
cleanup() {
local rc=$?
set +e
@ -76,9 +83,19 @@ cleanup() {
fi
echo "[runner] self-deleting pod $RUNPOD_POD_ID"
runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \
curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID"
# The preinstalled runpodctl may be older (legacy 'remove pod' syntax) or newer ('pod delete').
# Try new, then legacy, then REST API. curl -f exits non-zero on HTTP errors (>= 400);
# -sS hides the progress meter but still prints error messages.
if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then
:
elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then
:
else
echo "[runner] runpodctl delete failed via both syntaxes, using REST API"
curl -fsS -X DELETE \
-H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \
echo "[runner] WARN: REST delete also failed — pod may need manual cleanup"
fi
exit "$rc"
}
trap cleanup EXIT
@ -90,7 +107,8 @@ trap cleanup EXIT
rm -rf "$WORKDIR"
git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
cd "$WORKDIR"
git checkout "$NANOCHAT_REF"
# Trailing `--` makes git treat $NANOCHAT_REF as a ref rather than a path when a
# same-named file exists (some images create a `dev` file in HOME).
git checkout "$NANOCHAT_REF" --
echo "[runner] HEAD = $(git rev-parse HEAD)"
sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh

View File

@ -25,10 +25,17 @@ LOG_FILE="/workspace/runner.log"
NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p /workspace
exec > >(tee -a "$LOG_FILE") 2>&1
# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE.
# Don't add a second tee here — it would write every line to the log twice.
echo "[smoke] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID"
# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs
# even if we fail before the venv is activated. Try pip3, then python3 -m pip.
{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
echo "[smoke] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true
cleanup() {
local rc=$?
set +e
@ -45,9 +52,19 @@ cleanup() {
echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX"
echo "[smoke] self-deleting pod $RUNPOD_POD_ID"
runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \
curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID"
# The preinstalled runpodctl may be older (legacy 'remove pod' syntax) or newer ('pod delete').
# Try new, then legacy, then REST API. curl -f exits non-zero on HTTP errors (>= 400);
# -sS hides the progress meter but still prints error messages.
if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then
:
elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then
:
else
echo "[smoke] runpodctl delete failed via both syntaxes, using REST API"
curl -fsS -X DELETE \
-H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \
echo "[smoke] WARN: REST delete also failed — pod may need manual cleanup"
fi
exit "$rc"
}
trap cleanup EXIT
@ -60,7 +77,8 @@ trap cleanup EXIT
rm -rf "$WORKDIR"
git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
cd "$WORKDIR"
git checkout "$NANOCHAT_REF"
# Trailing `--` makes git treat $NANOCHAT_REF as a ref rather than a path when a
# same-named file exists (some images create a `dev` file in HOME).
git checkout "$NANOCHAT_REF" --
echo "[smoke] HEAD = $(git rev-parse HEAD)"
# Env + uv