From 9196ff6fc0e8179a7671882b59cd350c7f2d72bf Mon Sep 17 00:00:00 2001
From: Muheng
Date: Thu, 8 Jan 2026 13:34:34 +0000
Subject: [PATCH] Save optimizer state once per step; make LRs env-tunable;
 prep 8-GPU LR-sweep speedrun

Optimizer state is no longer sharded across ranks, so save it once per
step in a single file, with a fallback to the legacy per-rank files when
resuming older runs. In scripts/base_train.py, allow LEARNING_RATE and
MIN_LR to be overridden from the environment, encode them in model_tag,
checkpoint every 10k steps instead of 1k, and drop the in-loop sampling
block. Trim speedrun_moe.sh down to an 8-GPU pretraining run that logs
to a file named after the LR settings.

---
 nanochat/checkpoint_manager.py | 25 ++++++++++++-----
 scripts/base_train.py          | 40 ++++++++++++-------------
 speedrun_moe.sh                | 59 ++++++++---------------------------
 3 files changed, 47 insertions(+), 77 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index b7f784a..780f212 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -20,6 +20,14 @@ def log0(message):
     if int(os.environ.get('RANK', 0)) == 0:
         logger.info(message)
 
+def _optimizer_path(checkpoint_dir, step):
+    return os.path.join(checkpoint_dir, f"optim_{step:06d}.pt")
+
+
+def _legacy_optimizer_path(checkpoint_dir, step, rank):
+    return os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
+
+
 def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
     if rank == 0:
         os.makedirs(checkpoint_dir, exist_ok=True)
@@ -32,11 +40,11 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data,
         with open(meta_path, "w", encoding="utf-8") as f:
             json.dump(meta_data, f, indent=2)
         logger.info(f"Saved metadata to: {meta_path}")
-    # Note that optimizer state is sharded across ranks, so each rank must save its own.
-    if optimizer_data is not None:
-        optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
-        torch.save(optimizer_data, optimizer_path)
-        logger.info(f"Saved optimizer state to: {optimizer_path}")
+    # Save optimizer state once per step (non-sharded optimizer).
+    if optimizer_data is not None:
+        optimizer_path = _optimizer_path(checkpoint_dir, step)
+        torch.save(optimizer_data, optimizer_path)
+        logger.info(f"Saved optimizer state to: {optimizer_path}")
 
 def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0):
     # Load the model state
@@ -45,7 +53,12 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0):
     # Load the optimizer state if requested
     optimizer_data = None
     if load_optimizer:
-        optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
+        optimizer_path = _optimizer_path(checkpoint_dir, step)
+        if not os.path.exists(optimizer_path):
+            # Fall back to legacy sharded checkpoints: try this rank's file, then rank 0's.
+            optimizer_path = _legacy_optimizer_path(checkpoint_dir, step, rank)
+        if rank != 0 and not os.path.exists(optimizer_path):
+            optimizer_path = _legacy_optimizer_path(checkpoint_dir, step, 0)
         optimizer_data = torch.load(optimizer_path, map_location=device)
     # Load the metadata
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 4894ad1..1a2b296 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -39,6 +39,17 @@ from scripts.base_eval import evaluate_model
 
 print_banner()
 
+# Allow env overrides for common LR knobs used in cluster runs.
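+# (A missing or empty variable falls back to the default; a malformed value fails fast.)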
+def _get_env_float(name, default):
+    val = os.getenv(name)
+    if val is None or val == "":
+        return default
+    try:
+        return float(val)
+    except ValueError as exc:
+        raise ValueError(f"Invalid {name} env value: {val}") from exc
+
 # -----------------------------------------------------------------------------
 # User settings
 run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
@@ -75,14 +86,14 @@ embedding_lr = 0.0006 # learning rate for the embedding parameters (Adam)
 unembedding_lr = 0.0006 # learning rate for the unembedding parameters (Adam)
 weight_decay = 0.1 # weight decay (matches nanoMoE weight_decay=1e-1)
 matrix_lr = 0.0006 # learning rate for the matrix parameters (Muon)
-learning_rate = 6e-4 # learning rate for AdamW optimizer (matches nanoMoE: 6e-4)
+learning_rate = _get_env_float("LEARNING_RATE", 6e-4) # learning rate for AdamW optimizer (default 6e-4, matches nanoMoE)
 betas = (0.9, 0.95) # betas for AdamW optimizer (matches nanoMoE: beta1=0.9, beta2=0.95)
 grad_clip = 1.0 # gradient clipping value (0.0 = disabled)
 decay_lr = True # whether to decay the learning rate (matches train_nano_moe.py)
 # Learning rate decay parameters (matching train.py and train_nano_moe.py)
 warmup_iters = 2000 # how many steps to warm up for (matches train.py default)
 lr_decay_iters = 50000 # learning rate decay iterations (matches train_nano_moe.py)
-min_lr = 6e-5 # minimum learning rate (matches train.py default, which equals 6e-4 * 0.1)
+min_lr = _get_env_float("MIN_LR", 6e-5) # minimum learning rate (default 6e-5 = 6e-4 * 0.1, matches train.py)
 final_lr_frac = 0.1 # final learning rate as fraction of initial learning rate (for compatibility)
 resume_from_step = -1 # resume training from this step of the optimization (-1 = disable)
 
@@ -93,11 +104,11 @@ log_interval = 10 # every how many steps to log training metrics (matches nanoMo
 core_metric_every = -1 # every how many steps to evaluate the core metric (-1 = disable)
 core_metric_max_per_task = -1 # examples per task in estimating the core metric
 sample_every = 200000000 # every how many steps to sample from the model
-save_every = 1000 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run)
+save_every = 10000 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run)
 # System
 compile = True # use PyTorch 2.0 to compile the model to be faster (matches nanoMoE)
 # Output
-model_tag = "" # optionally override the model tag for the output checkpoint directory name
+model_tag = f"d6_min_lr{min_lr}_max_lr{learning_rate}" # model tag for the output checkpoint directory name (encodes the LR sweep settings)
 
 # now allow CLI to override the settings via the configurator lol
 config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
@@ -345,26 +356,7 @@ while True:
         })
         model.train()
 
-    # once in a while: sample from the model (only on master process)
-    # use the original uncompiled model because the inputs keep changing shape
-    if master_process and (last_step or (step > 0 and step % sample_every == 0)):
-        model.eval()
-        prompts = [
-            "The capital of France is",
-            "The chemical symbol of gold is",
-            "If yesterday was Friday, then tomorrow will be",
-            "The opposite of hot is",
-            "The planets of the solar system are:",
-            "My favorite color is",
-            "If 5*x + 3 = 13, then x is",
-        ]
-        engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation
-        for prompt in prompts:
-            tokens = tokenizer(prompt, prepend="<|bos|>")
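+    # (periodic in-loop sampling was removed; sample from saved checkpoints instead)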
-            with autocast_ctx:
-                sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
-            print0(tokenizer.decode(sample[0]))
-        model.train()
 
     # save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
     if last_step or (step > 0 and step != resume_from_step and save_every > 0 and step % save_every == 0):
diff --git a/speedrun_moe.sh b/speedrun_moe.sh
index 7207604..6b1b37e 100644
--- a/speedrun_moe.sh
+++ b/speedrun_moe.sh
@@ -11,16 +11,18 @@
 # WANDB_RUN=speedrun screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh
 
 # Default intermediate artifacts directory is in ~/.cache/nanochat-moe
-USER = "dpq23"
+export USER="limh23"
 export OMP_NUM_THREADS=1
 export NANOCHAT_BASE_DIR="/thullms/$USER/.cache/nanochat-moe"
-export NANOCHAT_DATA_DIR="/thullms/$USER"
+export NANOCHAT_DATA_DIR="/thullms/$USER/.cache/nanochat-moe-data"
 mkdir -p $NANOCHAT_BASE_DIR
 mkdir -p $NANOCHAT_DATA_DIR
+
+
 # Use tokenizer from nanochat (not nanochat-moe)
 # Create a symlink to nanochat's tokenizer directory if it doesn't exist
-NANOCHAT_TOKENIZER_DIR="/thullms/$USER/.cache/nanochat/tokenizer"
+NANOCHAT_TOKENIZER_DIR="$HOME/.cache/nanochat/tokenizer"
 MOE_TOKENIZER_DIR="$NANOCHAT_BASE_DIR/tokenizer"
 if [ -d "$NANOCHAT_TOKENIZER_DIR" ] && [ ! -e "$MOE_TOKENIZER_DIR" ]; then
     echo "Creating symlink to nanochat tokenizer: $MOE_TOKENIZER_DIR -> $NANOCHAT_TOKENIZER_DIR"
@@ -93,6 +95,7 @@ fi
 # export UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
 # uv sync --extra gpu
 # # activate venv so that `python` uses the project's venv instead of system python
+cd "$HOME/nanochat-MoE" || exit 1
 source .venv/bin/activate
 # 
 # -----------------------------------------------------------------------------
@@ -153,53 +156,15 @@
 # echo "Waiting for dataset download to complete..."
 # wait $DATASET_DOWNLOAD_PID
+
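+# Export the LR knobs so the torchrun worker processes inherit them (read by scripts/base_train.py).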
+export MIN_LR=${MIN_LR:-6e-5}
+export LEARNING_RATE=${LEARNING_RATE:-6e-4}
 # Number of processes/GPUs to use
-NPROC_PER_NODE=2
+NPROC_PER_NODE=8
 # Master port for distributed training (default: 29500)
 # Set this to avoid port conflicts when running multiple torchrun tasks simultaneously
 # Example: MASTER_PORT=29501 bash speedrun.sh
 MASTER_PORT=${MASTER_PORT:-29501}
 # 
 # # # pretrain the d20 model
-MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts_moe.base_train -- --depth=20 --run=$WANDB_RUN
-# evaluate the model on a larger chunk of train/val data and draw some samples
-MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts_moe.base_loss
-# evaluate the model on CORE tasks
-MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts_moe.base_eval
-
-# # -----------------------------------------------------------------------------
-# # Midtraining (teach the model conversation special tokens, tool use, multiple choice)
-
-# # download 2.3MB of synthetic identity conversations to impart a personality to nanochat
-# # see dev/gen_sft_data.py for details on how this data was prepared and to get a sense of how you can easily tune it
-# curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
-
-# # run midtraining and eval the model
-# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN
-# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid
-
-# # -----------------------------------------------------------------------------
-# # Supervised Finetuning (domain adaptation to each sequence all by itself per row)
-
-# # train sft and re-eval right away (should see a small bump)
-# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
-# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
-
-# # chat with the model over CLI! Leave out the -p to chat interactively
-# python -m scripts.chat_cli -p "Why is the sky blue?"
-
-# even better, chat with your model over a pretty WebUI ChatGPT style
-# python -m scripts.chat_web
-
-# # -----------------------------------------------------------------------------
-# # Reinforcement Learning. Optional, and currently only on GSM8K
-# # (optional)
-
-# # run reinforcement learning
-# MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN
-# # eval the RL model only on GSM8K
-# MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K
-
-# # -----------------------------------------------------------------------------
-# # Generate the full report by putting together all the sections
-# report.md is the output and will be copied to current directory for convenience
-python -m nanochat_moe.report generate
+MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train >> "$NANOCHAT_BASE_DIR/d6_min_lr${MIN_LR}_max_lr${LEARNING_RATE}.log" 2>&1
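-- 
Usage sketch (values illustrative): with this patch applied, an LR sweep
run can be launched as

    MIN_LR=3e-5 LEARNING_RATE=3e-4 MASTER_PORT=29502 bash speedrun_moe.sh

The shell-side strings name the log file, while scripts/base_train.py
parses the same variables via _get_env_float() and bakes the parsed
floats into model_tag, so each setting gets its own checkpoint directory
and log.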