global bs and d_model configurable

This commit is contained in:
Muheng 2026-01-10 02:30:44 +00:00
parent 1f09520820
commit 1dc944f734
2 changed files with 14 additions and 6 deletions

View File

@ -89,8 +89,15 @@ num_iterations = 50000 # explicit number of steps (matches nanoMoE max_iters=500
target_flops = -1.0 # calculate num_iterations to reach target_flops. Useful for scaling laws experiments (-1 = disable)
target_param_data_ratio = -1 # calculate num_iterations to maintain fixed data:param ratio (Chinchilla=20) (-1 = disable)
# Optimization
device_batch_size = 12 # per-device batch size (matches nanoMoE batch_size=12)
total_batch_size = 491520 # total desired batch size in #tokens (matches nanoMoE: 12 * 1024 * 40 = 491,520 for 8 GPUs)
device_batch_size = _get_env_int("MICRO_BS", 12) # per-device batch size (matches nanoMoE batch_size=12)
# NOTE: fixed typo `glabal_batch_size` -> `global_batch_size`; the misspelled name
# leaked into config_keys/user_config logging and broke the intended
# --global_batch_size command-line override.
global_batch_size = _get_env_int("GLOBAL_BS", 480) # total batch size in #sequences (overrides total_batch_size if > 0)
gpu_number = _get_env_int("NPROC_PER_NODE", -1)
if global_batch_size > 0:
    # total_batch_size is measured in tokens, so scale sequence count by sequence length
    total_batch_size = max_seq_len * global_batch_size
    if gpu_number > 0:
        # each optimizer step must split evenly into per-device micro-batches across all GPUs
        assert total_batch_size % (device_batch_size * gpu_number) == 0, "GLOBAL_BS must be compatible with MICRO_BS and NPROC_PER_NODE"
else:
    total_batch_size = 491520 # total desired batch size in #tokens (matches nanoMoE: 12 * 1024 * 40 = 491,520 for 8 GPUs)
embedding_lr = 0.0006 # learning rate for the embedding parameters (Adam)
unembedding_lr = 0.0006 # learning rate for the unembedding parameters (Adam)
weight_decay = 0.1 # weight decay (matches nanoMoE weight_decay=1e-1)
@ -123,7 +130,7 @@ model_tag = os.getenv("MODEL_TAG", "") # optionally override the model tag for t
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
if model_tag == "":
model_tag = f"d{depth}_min_lr{min_lr}_max_lr{learning_rate}"
model_tag = f"d{depth}_e{n_exp}_min_lr{min_lr}_max_lr{learning_rate}"
user_config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------
@ -158,7 +165,7 @@ print0(f"Vocab size: {vocab_size:,}")
# Model kwargs are derived from the desired depth of the model
# For nanoMoE, we use n_layer, n_head, n_embd directly
n_layer = depth
model_dim = 384 # matches train_nano_moe.py
model_dim = _get_env_int("MODEL_DIM", 384) # matches train_nano_moe.py
num_heads = 6 # matches train_nano_moe.py
n_head = num_heads
n_embd = model_dim

View File

@ -156,13 +156,14 @@ fi
# echo "Waiting for dataset download to complete..."
# wait $DATASET_DOWNLOAD_PID
MODEL_DIM=${MODEL_DIM:-384}
GLOBAL_BS=${GLOBAL_BS:-480}
MIN_LR=${MIN_LR:-6e-5}
LEARNING_RATE=${LEARNING_RATE:-6e-4}
DEPTH=${DEPTH:-${N_LAYER:-6}}
MODEL_TAG=${MODEL_TAG:-d${DEPTH}_min_lr${MIN_LR}_max_lr${LEARNING_RATE}}
# Number of processes/GPUs to use
NPROC_PER_NODE=8
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
# Master port for distributed training (default: 29500)
# Set this to avoid port conflicts when running multiple torchrun tasks simultaneously
# Example: MASTER_PORT=29501 bash speedrun.sh