global bs and d_model configurable

This commit is contained in:
Muheng 2026-01-10 02:30:44 +00:00
parent 1f09520820
commit 1dc944f734
2 changed files with 14 additions and 6 deletions

View File

@ -89,8 +89,15 @@ num_iterations = 50000 # explicit number of steps (matches nanoMoE max_iters=500
target_flops = -1.0 # calculate num_iterations to reach target_flops. Useful for scaling laws experiments (-1 = disable)
target_param_data_ratio = -1 # calculate num_iterations to maintain fixed data:param ratio (Chinchilla=20) (-1 = disable)
# Optimization
device_batch_size = 12 # per-device batch size (matches nanoMoE batch_size=12)
total_batch_size = 491520 # total desired batch size in #tokens (matches nanoMoE: 12 * 1024 * 40 = 491,520 for 8 GPUs)
device_batch_size = _get_env_int("MICRO_BS", 12) # per-device batch size (matches nanoMoE batch_size=12)
# NOTE: fixed typo `glabal_batch_size` -> `global_batch_size`; the misspelled name
# leaked into config_keys/user_config logging and broke the intended
# --global_batch_size command-line override.
global_batch_size = _get_env_int("GLOBAL_BS", 480) # total batch size in #sequences (overrides total_batch_size if > 0)
gpu_number = _get_env_int("NPROC_PER_NODE", -1)
if global_batch_size > 0:
    # total_batch_size is measured in tokens, so scale sequence count by sequence length
    total_batch_size = max_seq_len * global_batch_size
    if gpu_number > 0:
        # each optimizer step must split evenly into per-device micro-batches across all GPUs
        assert total_batch_size % (device_batch_size * gpu_number) == 0, "GLOBAL_BS must be compatible with MICRO_BS and NPROC_PER_NODE"
else:
    total_batch_size = 491520 # total desired batch size in #tokens (matches nanoMoE: 12 * 1024 * 40 = 491,520 for 8 GPUs)
embedding_lr = 0.0006 # learning rate for the embedding parameters (Adam)
unembedding_lr = 0.0006 # learning rate for the unembedding parameters (Adam)
weight_decay = 0.1 # weight decay (matches nanoMoE weight_decay=1e-1)
@ -123,7 +130,7 @@ model_tag = os.getenv("MODEL_TAG", "") # optionally override the model tag for t
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
if model_tag == "":
model_tag = f"d{depth}_min_lr{min_lr}_max_lr{learning_rate}"
model_tag = f"d{depth}_e{n_exp}_min_lr{min_lr}_max_lr{learning_rate}"
user_config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------
@ -158,7 +165,7 @@ print0(f"Vocab size: {vocab_size:,}")
# Model kwargs are derived from the desired depth of the model
# For nanoMoE, we use n_layer, n_head, n_embd directly
n_layer = depth
model_dim = 384 # matches train_nano_moe.py
model_dim = _get_env_int("MODEL_DIM", 384) # matches train_nano_moe.py
num_heads = 6 # matches train_nano_moe.py
n_head = num_heads
n_embd = model_dim

View File

@ -156,13 +156,14 @@ fi
# echo "Waiting for dataset download to complete..."
# wait $DATASET_DOWNLOAD_PID
MODEL_DIM=${MODEL_DIM:-384}
GLOBAL_BS=${GLOBAL_BS:-480}
MIN_LR=${MIN_LR:-6e-5}
LEARNING_RATE=${LEARNING_RATE:-6e-4}
DEPTH=${DEPTH:-${N_LAYER:-6}}
MODEL_TAG=${MODEL_TAG:-d${DEPTH}_min_lr${MIN_LR}_max_lr${LEARNING_RATE}}
# Number of processes/GPUs to use
NPROC_PER_NODE=8
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
# Master port for distributed training (default: 29500)
# Set this to avoid port conflicts when running multiple torchrun tasks simultaneously
# Example: MASTER_PORT=29501 bash speedrun.sh