mirror of
https://github.com/karpathy/nanochat.git
synced 2026-06-16 11:09:09 +00:00
Use 'hf' (huggingface-cli is deprecated); REST-first self-delete
This commit is contained in:
parent
e26bf8719b
commit
66b92b108c
|
|
@ -59,7 +59,7 @@ cleanup() {
|
|||
if [ "$rc" -eq 0 ]; then
|
||||
echo "[runner] success — final upload to $HF_REPO"
|
||||
if [ -d "$NANOCHAT_BASE_DIR" ]; then
|
||||
huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
|
||||
hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
|
||||
--repo-type model --commit-message "final rc=0 $TS" || \
|
||||
echo "[runner] WARN: final upload failed"
|
||||
fi
|
||||
|
|
@ -70,31 +70,30 @@ cleanup() {
|
|||
[ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true
|
||||
[ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true)
|
||||
|
||||
huggingface-cli upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \
|
||||
hf upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \
|
||||
--repo-type model --commit-message "failure rc=$rc logs $TS" || \
|
||||
echo "[runner] WARN: log upload failed"
|
||||
|
||||
if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then
|
||||
echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)"
|
||||
huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \
|
||||
hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \
|
||||
--repo-type model --commit-message "failure rc=$rc cache $TS" || true
|
||||
fi
|
||||
echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}"
|
||||
fi
|
||||
|
||||
echo "[runner] self-deleting pod $RUNPOD_POD_ID"
|
||||
# Preinstalled runpodctl may be older (legacy 'remove pod') or newer ('pod delete').
|
||||
# Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors.
|
||||
if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then
|
||||
:
|
||||
elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then
|
||||
:
|
||||
# REST API first — pod-scoped key has delete permission and the API is reliable.
|
||||
# The pod's preinstalled runpodctl is unreliable (often missing config or 'pod' subcommand).
|
||||
if curl -fsS -X DELETE \
|
||||
-H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
|
||||
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then
|
||||
echo "[runner] REST delete request accepted"
|
||||
else
|
||||
echo "[runner] runpodctl delete failed via both syntaxes, using REST API"
|
||||
curl -fsS -X DELETE \
|
||||
-H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
|
||||
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \
|
||||
echo "[runner] WARN: REST delete also failed — pod may need manual cleanup"
|
||||
echo "[runner] REST delete failed, trying runpodctl as fallback"
|
||||
runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \
|
||||
runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \
|
||||
echo "[runner] WARN: all delete methods failed — pod may need manual cleanup"
|
||||
fi
|
||||
exit "$rc"
|
||||
}
|
||||
|
|
@ -147,7 +146,7 @@ echo "[runner] === FA3 PROBE END ==="
|
|||
while true; do
|
||||
sleep "$BACKUP_INTERVAL"
|
||||
if [ -d "$NANOCHAT_BASE_DIR" ]; then
|
||||
huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
|
||||
hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
|
||||
--repo-type model \
|
||||
--commit-message "checkpoint $(date -Iseconds)" >> /workspace/backup.log 2>&1 || true
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -46,24 +46,23 @@ cleanup() {
|
|||
cp /workspace/*.log /tmp/smoke-out/ 2>/dev/null || true
|
||||
echo "rc=$rc ts=$TS pod=$RUNPOD_POD_ID" > /tmp/smoke-out/result.txt
|
||||
[ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/smoke-out/git-head.txt || true)
|
||||
huggingface-cli upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \
|
||||
hf upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \
|
||||
--repo-type model --commit-message "smoke rc=$rc $TS" || \
|
||||
echo "[smoke] WARN: HF upload failed"
|
||||
|
||||
echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX"
|
||||
echo "[smoke] self-deleting pod $RUNPOD_POD_ID"
|
||||
# The preinstalled runpodctl may be older (legacy 'remove pod' syntax) or newer ('pod delete').
|
||||
# Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors.
|
||||
if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then
|
||||
:
|
||||
elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then
|
||||
:
|
||||
# REST API first — pod-scoped key has delete permission and the API is reliable.
|
||||
# The pod's preinstalled runpodctl is unreliable (often missing config or 'pod' subcommand).
|
||||
if curl -fsS -X DELETE \
|
||||
-H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
|
||||
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then
|
||||
echo "[smoke] REST delete request accepted"
|
||||
else
|
||||
echo "[smoke] runpodctl delete failed via both syntaxes, using REST API"
|
||||
curl -fsS -X DELETE \
|
||||
-H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
|
||||
"https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \
|
||||
echo "[smoke] WARN: REST delete also failed — pod may need manual cleanup"
|
||||
echo "[smoke] REST delete failed, trying runpodctl as fallback"
|
||||
runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \
|
||||
runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \
|
||||
echo "[smoke] WARN: all delete methods failed — pod may need manual cleanup"
|
||||
fi
|
||||
exit "$rc"
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user