From 5ca0950c9c86674066e1336409dc68e0eca77b51 Mon Sep 17 00:00:00 2001
From: Quanyi Mo
Date: Sun, 2 Nov 2025 22:03:02 -0800
Subject: [PATCH] update README.md

---
 README.md              | 39 +++++++++++++++++++++++++++++++++++++++
 nanochat/common.py     | 13 +++++++------
 nanochat/dataloader.py |  6 +++---
 scripts/base_train.py  | 26 +++++++++++++++++++-------
 4 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 05a214b..aa624dd 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,45 @@
 This repo is a full-stack implementation of an LLM like ChatGPT in a single, clean, minimal, hackable, dependency-lite codebase. nanochat is designed to run on a single 8XH100 node via scripts like [speedrun.sh](speedrun.sh), that run the entire pipeline start to end. This includes tokenization, pretraining, finetuning, evaluation, inference, and web serving over a simple UI so that you can talk to your own LLM just like ChatGPT. nanochat will become the capstone project of the course LLM101n being developed by Eureka Labs.
 
+## Running on 24GB VRAM GPUs (3090/4090)
+
+**New!** If you have a consumer GPU with 24GB VRAM (like an RTX 3090 or 4090), you can now train nanochat on a single GPU using the optimized [speedrun_1x3090.sh](speedrun_1x3090.sh) script:
+
+```bash
+bash speedrun_1x3090.sh
+```
+
+Changes to the source code to fit within 24GB of VRAM:
+- **Reduced batch sizes**: `device_batch_size` reduced from 32 → 4 in base training and midtraining, and from 4 → 2 in SFT
+- **Separate validation batch size**: `device_batch_size_val = 1` for the memory-intensive validation passes
+- **Fixed throughput reporting**: corrected the tokens/sec calculation to account for gradient accumulation
+- **Streamlined sampling**: model sampling runs only at the final training step, to save time and memory
+
+Training on a single 3090 takes approximately 55x longer than the 8XH100 setup (~220 hours instead of ~4), but produces similar results thanks to automatic gradient accumulation, which maintains the same effective `total_batch_size` of 524288 tokens.
+
+3090 metrics:
+
+| Metric          | BASE     | MID      | SFT      | RL       |
+|-----------------|----------|----------|----------|----------|
+| CORE            | 0.1863   | -        | -        | -        |
+| ARC-Challenge   | -        | 0.3072   | 0.3055   | -        |
+| ARC-Easy        | -        | 0.3674   | 0.3927   | -        |
+| GSM8K           | -        | 0.0273   | 0.0538   | 0.0895   |
+| HumanEval       | -        | 0.0854   | 0.0732   | -        |
+| MMLU            | -        | 0.3145   | 0.3110   | -        |
+| ChatCORE        | -        | 0.0863   | 0.0945   | -        |
+
+[8XH100 metrics](https://github.com/karpathy/nanochat/discussions/1) by Andrej Karpathy:
+
+| Metric          | BASE     | MID      | SFT      | RL       |
+|-----------------|----------|----------|----------|----------|
+| CORE            | 0.2219   | -        | -        | -        |
+| ARC-Challenge   | -        | 0.2875   | 0.2807   | -        |
+| ARC-Easy        | -        | 0.3561   | 0.3876   | -        |
+| GSM8K           | -        | 0.0250   | 0.0455   | 0.0758   |
+| HumanEval       | -        | 0.0671   | 0.0854   | -        |
+| MMLU            | -        | 0.3111   | 0.3151   | -        |
+| ChatCORE        | -        | 0.0730   | 0.0884   | -        |
 ## Talk to it
 
 To get a sense of the endpoint of this repo, you can currently find [nanochat d32](https://github.com/karpathy/nanochat/discussions/8) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d32" means that this model has 32 layers in the Transformer neural network. This model has 1.9 billion parameters, it was trained on 38 billion tokens by simply running the single script [run1000.sh](run1000.sh), and the total cost of training was ~$800 (about 33 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to...
 
diff --git a/nanochat/common.py b/nanochat/common.py
index 8b10df9..22232d1 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -89,15 +89,16 @@ def get_dist_info():
     else:
         return False, 0, 0, 1
 
-def compute_init():
+def compute_init(device_type="cuda"): # cuda|cpu
     """Basic initialization that we keep doing over and over, so make common."""
 
     # CUDA is currently required
-    assert torch.cuda.is_available(), "CUDA is needed for a distributed run atm"
+    # assert torch.cuda.is_available(), "CUDA is needed for a distributed run atm"
 
     # Reproducibility
     torch.manual_seed(42)
-    torch.cuda.manual_seed(42)
+    if device_type == "cuda":
+        torch.cuda.manual_seed(42)
     # skipping full reproducibility for now, possibly investigate slowdown later
     # torch.use_deterministic_algorithms(True)
     # torch.backends.cudnn.deterministic = True
@@ -106,15 +107,15 @@ def compute_init():
 
     # Precision
     torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
 
-    # Distributed setup: Distributed Data Parallel (DDP), optional
+    # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    if ddp:
+    if ddp and device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
         dist.init_process_group(backend="nccl", device_id=device)
         dist.barrier()
     else:
-        device = torch.device("cuda")
+        device = torch.device(device_type) # cuda|cpu
     if ddp_rank == 0:
         logger.info(f"Distributed world size: {ddp_world_size}")
diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py
index c1636b1..12e7d8e 100644
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@@ -6,7 +6,7 @@ from nanochat.common import get_dist_info
 from nanochat.dataset import parquets_iter_batched
 from nanochat.tokenizer import get_tokenizer
 
-def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128):
+def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda"):
     """Stream pretraining text from parquet files, tokenize, yield training batches."""
     assert split in ["train", "val"], "split must be 'train' or 'val'"
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
@@ -44,6 +44,6 @@ def tokenizing_distributed_data_loader(B, T, split, tokeniz
         inputs_cpu = scratch[:-1].to(dtype=torch.int32)
         targets_cpu = scratch[1:]
         # Reshape to 2D and move to GPU async
-        inputs = inputs_cpu.view(B, T).to(device="cuda", dtype=torch.int32, non_blocking=True)
-        targets = targets_cpu.view(B, T).to(device="cuda", dtype=torch.int64, non_blocking=True)
+        inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
+        targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
         yield inputs, targets
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 628b806..6502035 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -6,6 +6,9 @@
 python base_train.py
 or distributed as:
 torchrun --nproc_per_node=8 base_train.py
+
+If you just want to see it run on CPU (you won't get far but it should run), try something like:
+python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --device_type=cpu --eval_tokens=512 --total_batch_size=512 --num_iterations=1000
 """
 
 import os
@@ -27,6 +30,8 @@ print_banner()
 # -----------------------------------------------------------------------------
 # User settings
 run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
+# Runtime
+device_type = "cuda" # cuda|cpu
 # Model architecture
 depth = 20 # the depth of the Transformer model to train, rest of the kwargs are derived
 max_seq_len = 2048 # max context length
@@ -58,9 +63,11 @@ user_config = {k: globals()[k] for k in config_keys} # will be useful for loggin
 # -----------------------------------------------------------------------------
 
 # Compute init
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
+ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
+synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
+get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
 
 # wandb logging init
 use_dummy_wandb = run == "dummy" or not master_process
@@ -97,7 +104,7 @@ model_config_kwargs = dict(sequence_len=max_seq_len, vocab_size=vocab_size, n_la
 with torch.device("meta"):
     model_config = GPTConfig(**model_config_kwargs)
     model = GPT(model_config)
-model.to_empty(device="cuda")
+model.to_empty(device=device)
 model.init_weights()
 orig_model = model # original, uncompiled model, for saving raw model state_dict
 model = torch.compile(model, dynamic=False) # TODO: dynamic True/False think through
@@ -134,8 +141,8 @@ adamw_optimizer, muon_optimizer = optimizers
 # Initialize the DataLoaders for train/val
 base_dir = get_base_dir()
 tokens_dir = os.path.join(base_dir, "tokenized_data")
-train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train")
-build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size_val, max_seq_len, split="val")
+train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train", device=device)
+build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size_val, max_seq_len, split="val", device=device)
 x, y = next(train_loader) # kick off load of the very first batch of data
 
 # -----------------------------------------------------------------------------
@@ -254,7 +266,7 @@ for step in range(num_iterations + 1):
     # -------------------------------------------------------------------------
     # single training step
     # evaluate the gradient
-    torch.cuda.synchronize()
+    synchronize()
     t0 = time.time()
     for micro_step in range(grad_accum_steps):
         with autocast_ctx:
@@ -277,7 +289,7 @@ for step in range(num_iterations + 1):
     for opt in optimizers:
         opt.step()
     model.zero_grad(set_to_none=True)
-    torch.cuda.synchronize()
+    synchronize()
     t1 = time.time()
     dt = t1 - t0
     # -------------------------------------------------------------------------
@@ -306,7 +318,7 @@ for step in range(num_iterations + 1):
         })
 
 # print a few more stats
-print0(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1024 / 1024:.2f}MiB")
+print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
 print0(f"Total training time: {total_training_time/60:.2f}m")
 print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
 
@@ -332,7 +344,7 @@ get_report().log(section="Base model training", data=[
         "MFU %": f"{mfu:.2f}%",
         "Total training flops": f"{flops_so_far:e}",
         "Total training time": f"{total_training_time/60:.2f}m",
-        "Peak memory usage": f"{torch.cuda.max_memory_allocated() / 1024 / 1024:.2f}MiB",
+        "Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
     }
 ])
 
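For reference, the "~55x longer but similar results" observation in the README section above reduces to token accounting: an optimizer step is defined by `total_batch_size` in tokens, and gradient accumulation simply takes more micro-steps to reach that count when the per-device batch shrinks. The sketch below is a minimal illustration in plain Python; the helper `grad_accum_steps_for` is hypothetical (not a function in the repo), and it assumes the accumulation count is derived as `total_batch_size // (device_batch_size * max_seq_len * world_size)`.

```python
# Hypothetical illustration of the effective-batch math described in the README section above.
# Assumption: grad accumulation steps = total_batch_size // tokens processed per micro-step.

TOTAL_BATCH_SIZE = 524288  # tokens per optimizer step (same in both setups)
MAX_SEQ_LEN = 2048         # tokens per sequence

def grad_accum_steps_for(device_batch_size: int, world_size: int) -> int:
    """Micro-steps needed per optimizer step to reach TOTAL_BATCH_SIZE tokens."""
    tokens_per_micro_step = device_batch_size * MAX_SEQ_LEN * world_size
    assert TOTAL_BATCH_SIZE % tokens_per_micro_step == 0, "must divide evenly"
    return TOTAL_BATCH_SIZE // tokens_per_micro_step

# 8XH100: 32 * 2048 * 8 = 524288 tokens per micro-step -> no accumulation needed
print(grad_accum_steps_for(device_batch_size=32, world_size=8))  # 1

# 1x3090: 4 * 2048 * 1 = 8192 tokens per micro-step -> accumulate 64 micro-steps
print(grad_accum_steps_for(device_batch_size=4, world_size=1))   # 64
```

Every optimizer step therefore consumes the same 524288 tokens in both configurations; the wall-clock gap comes from running those 64 micro-steps serially on a single, slower GPU.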
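The CPU-support changes above avoid sprinkling `if device_type == "cuda"` checks through the training loop by binding `synchronize` and `get_max_memory` to no-ops once, up front. A small self-contained sketch of that pattern (the matrix multiply is just a stand-in workload, not code from the repo):

```python
import time
import torch

device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)

# Same shim pattern as in scripts/base_train.py: CUDA-only calls become no-ops on CPU,
# so the timing and memory-reporting code below is identical for both device types.
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0

x = torch.randn(1024, 1024, device=device)

synchronize()            # flush pending GPU work before starting the timer (no-op on CPU)
t0 = time.time()
y = x @ x                # stand-in for one training micro-step
synchronize()            # make sure the work has finished before reading the clock
dt = time.time() - t0

print(f"step time: {dt*1e3:.2f}ms, peak memory: {get_max_memory() / 1024 / 1024:.2f}MiB")
```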