Refactor run_t4_quick_test.sh to check for existing installations of uv and rustc before installing them, and improve the data download logic to skip files that already exist. Update checkpoint directory naming in the training scripts for consistency.

z 2025-10-27 10:24:24 +08:00
parent 7456de29b9
commit 2020eb9973
4 changed files with 29 additions and 11 deletions
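
The shell-script half of this change applies one guard pattern throughout: probe for the tool or file first, and only run the install or download when the probe fails, which makes the script safe to re-run. Below is a minimal standalone sketch of that pattern, assembled from the commands in the diff; the strict-mode header is an assumption, not part of the original script.

#!/usr/bin/env bash
set -euo pipefail  # assumption: the original script may not enable strict mode

# Install uv only if its binary is not already on PATH.
if ! command -v uv &> /dev/null; then
    curl -LsSf https://astral.sh/uv/install.sh | sh
fi

# Download the eval bundle only if the archive is not already present.
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -f "eval_bundle.zip" ]; then
    curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
fi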

View File

@@ -13,7 +13,11 @@ export NANOCHAT_BASE_DIR=".cache/nanochat"
 mkdir -p $NANOCHAT_BASE_DIR
 # Check for and install uv
-command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+# First check whether uv is already installed
+if ! command -v uv &> /dev/null; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+fi
 # Set up the virtual environment
 [ -d ".venv" ] || uv venv
@@ -30,22 +34,36 @@ echo "📊 Wandb run name: $WANDB_RUN"
 python -m nanochat.report reset
 # Install Rust and build the tokenizer
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+if ! command -v rustc &> /dev/null; then
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+fi
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
 # Download the eval data
 EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
 if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    echo "📥 Downloading the eval bundle..."
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
+    if [ ! -f "eval_bundle.zip" ]; then
+        echo "📥 Downloading the eval bundle..."
+        curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
+    else
+        echo "🗂️ eval_bundle.zip already exists, skipping download."
+    fi
     unzip -q eval_bundle.zip
     rm eval_bundle.zip
     mv eval_bundle $NANOCHAT_BASE_DIR
+else
+    echo "✅ Eval bundle already exists, skipping download."
 fi
 # Download the identity conversations data
-curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+if [ ! -f "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" ]; then
+    curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+else
+    echo "✅ identity_conversations.jsonl already exists, skipping download."
+fi
 echo "📊 Starting data preparation..."

View File

@@ -256,8 +256,8 @@ for step in range(num_iterations):
 if master_process:
     base_dir = get_base_dir()
     depth = model.config.n_layer
-    model_tag = f"t4_d{depth}" # base the model tag on the depth of the base model
-    checkpoint_dir = os.path.join(base_dir, "t4_chatsft_checkpoints", model_tag)
+    model_tag = f"d{depth}" # base the model tag on the depth of the base model
+    checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", model_tag)
     model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
     save_checkpoint(
         checkpoint_dir,

View File

@@ -209,8 +209,8 @@ while True:
 # save checkpoint at the end of the run (only on master process)
 if master_process and last_step and not dry_run:
-    output_dirname = f"t4_d{depth}"
-    checkpoint_dir = os.path.join(base_dir, "t4_mid_checkpoints", output_dirname)
+    output_dirname = f"d{depth}"
+    checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname)
     save_checkpoint(
         checkpoint_dir,
         step,

View File

@@ -238,8 +238,8 @@ for step in range(num_iterations + 1):
 # save checkpoint at the end of the run (only on master process)
 if master_process and last_step:
-    output_dirname = model_tag if model_tag else f"t4_d{depth}"
-    checkpoint_dir = os.path.join(base_dir, "t4_checkpoints", output_dirname)
+    output_dirname = model_tag if model_tag else f"d{depth}"
+    checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
     save_checkpoint(
         checkpoint_dir,
         step,
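
With the t4_ prefixes dropped, the three training stages write under stage-named directories that share a single d{depth} model tag. For example, with NANOCHAT_BASE_DIR=.cache/nanochat as set in the quick-test script, and an illustrative depth of 20, the resulting layout would be:

.cache/nanochat/base_checkpoints/d20      # base (pretraining) checkpoints
.cache/nanochat/mid_checkpoints/d20       # mid-training checkpoints
.cache/nanochat/chatsft_checkpoints/d20   # chat SFT checkpoints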