Refactor run_t4_quick_test.sh to check for existing installations of uv and rustc before installing them, and improve the data download logic to skip files that already exist. Update checkpoint directory naming in the training scripts for consistency.

z 2025-10-27 10:24:24 +08:00
parent 7456de29b9
commit 2020eb9973
4 changed files with 29 additions and 11 deletions
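
The shell-script half of this change applies one guard pattern throughout: probe for the tool or file first, and only run the install or download when the probe fails, which makes the script safe to re-run. Below is a minimal standalone sketch of that pattern, assembled from the commands in the diff; the strict-mode header is an assumption, not part of the original script.

#!/usr/bin/env bash
set -euo pipefail  # assumption: the original script may not enable strict mode

# Install uv only if its binary is not already on PATH.
if ! command -v uv &> /dev/null; then
    curl -LsSf https://astral.sh/uv/install.sh | sh
fi

# Download the eval bundle only if the archive is not already present.
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -f "eval_bundle.zip" ]; then
    curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
fi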

View File

@@ -13,7 +13,11 @@ export NANOCHAT_BASE_DIR=".cache/nanochat"
 mkdir -p $NANOCHAT_BASE_DIR
 # Check for and install uv
-command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+# First check whether uv is already installed
+if ! command -v uv &> /dev/null; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+fi
 # Set up the virtual environment
 [ -d ".venv" ] || uv venv
@@ -30,22 +34,36 @@ echo "📊 Wandb run name: $WANDB_RUN"
 python -m nanochat.report reset
 # Install Rust and build the tokenizer
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+if ! command -v rustc &> /dev/null; then
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+fi
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
 # Download the eval data
 EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
 if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    echo "📥 Downloading the eval bundle..."
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
+    if [ ! -f "eval_bundle.zip" ]; then
+        echo "📥 Downloading the eval bundle..."
+        curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
+    else
+        echo "🗂️ eval_bundle.zip already exists, skipping download."
+    fi
     unzip -q eval_bundle.zip
     rm eval_bundle.zip
     mv eval_bundle $NANOCHAT_BASE_DIR
+else
+    echo "✅ Eval bundle already exists, skipping download."
 fi
 # Download the identity conversations data
-curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+if [ ! -f "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" ]; then
+    curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+else
+    echo "✅ identity_conversations.jsonl already exists, skipping download."
+fi
 echo "📊 Starting data preparation..."

View File

@@ -256,8 +256,8 @@ for step in range(num_iterations):
 if master_process:
     base_dir = get_base_dir()
     depth = model.config.n_layer
-    model_tag = f"t4_d{depth}" # base the model tag on the depth of the base model
-    checkpoint_dir = os.path.join(base_dir, "t4_chatsft_checkpoints", model_tag)
+    model_tag = f"d{depth}" # base the model tag on the depth of the base model
+    checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", model_tag)
     model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
     save_checkpoint(
         checkpoint_dir,

View File

@@ -209,8 +209,8 @@ while True:
 # save checkpoint at the end of the run (only on master process)
 if master_process and last_step and not dry_run:
-    output_dirname = f"t4_d{depth}"
-    checkpoint_dir = os.path.join(base_dir, "t4_mid_checkpoints", output_dirname)
+    output_dirname = f"d{depth}"
+    checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname)
     save_checkpoint(
         checkpoint_dir,
         step,

View File

@@ -238,8 +238,8 @@ for step in range(num_iterations + 1):
 # save checkpoint at the end of the run (only on master process)
 if master_process and last_step:
-    output_dirname = model_tag if model_tag else f"t4_d{depth}"
-    checkpoint_dir = os.path.join(base_dir, "t4_checkpoints", output_dirname)
+    output_dirname = model_tag if model_tag else f"d{depth}"
+    checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
     save_checkpoint(
         checkpoint_dir,
         step,
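
With the t4_ prefixes dropped, the three training stages write under stage-named directories that share a single d{depth} model tag. For example, with NANOCHAT_BASE_DIR=.cache/nanochat as set in the quick-test script, and an illustrative depth of 20, the resulting layout would be:

.cache/nanochat/base_checkpoints/d20      # base (pretraining) checkpoints
.cache/nanochat/mid_checkpoints/d20       # mid-training checkpoints
.cache/nanochat/chatsft_checkpoints/d20   # chat SFT checkpoints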