From 5ca0950c9c86674066e1336409dc68e0eca77b51 Mon Sep 17 00:00:00 2001
From: Quanyi Mo
Date: Sun, 2 Nov 2025 22:03:02 -0800
Subject: [PATCH] update README.md

---
 README.md              | 39 +++++++++++++++++++++++++++++++++++++++
 nanochat/common.py     | 13 +++++++------
 nanochat/dataloader.py |  6 +++---
 scripts/base_train.py  | 26 +++++++++++++++++++-------
 4 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 05a214b..aa624dd 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,45 @@
 This repo is a full-stack implementation of an LLM like ChatGPT in a single, clean, minimal, hackable, dependency-lite codebase. nanochat is designed to run on a single 8XH100 node via scripts like [speedrun.sh](speedrun.sh), that run the entire pipeline start to end. This includes tokenization, pretraining, finetuning, evaluation, inference, and web serving over a simple UI so that you can talk to your own LLM just like ChatGPT. nanochat will become the capstone project of the course LLM101n being developed by Eureka Labs.
 
+## Running on 24GB VRAM GPUs (3090/4090)
+
+**New!** If you have a consumer GPU with 24GB VRAM (like an RTX 3090 or 4090), you can now train nanochat on a single GPU using the optimized [speedrun_1x3090.sh](speedrun_1x3090.sh) script:
+
+```bash
+bash speedrun_1x3090.sh
+```
+
+Changes to the source code to fit within 24GB of VRAM:
+- **Reduced batch sizes**: `device_batch_size` reduced from 32 → 4 in base training and midtraining, and from 4 → 2 in SFT
+- **Separate validation batch size**: `device_batch_size_val = 1` for the memory-intensive validation passes
+- **Fixed throughput reporting**: corrected the tokens/sec calculation to account for gradient accumulation
+- **Streamlined sampling**: model sampling runs only at the final training step, to save time and memory
+
+Training on a single 3090 takes approximately 55x longer than the 8XH100 setup (~220 hours instead of ~4), but produces similar results thanks to automatic gradient accumulation, which maintains the same effective `total_batch_size` of 524288 tokens.
+
+3090 metrics:
+
+| Metric          | BASE     | MID      | SFT      | RL       |
+|-----------------|----------|----------|----------|----------|
+| CORE            | 0.1863   | -        | -        | -        |
+| ARC-Challenge   | -        | 0.3072   | 0.3055   | -        |
+| ARC-Easy        | -        | 0.3674   | 0.3927   | -        |
+| GSM8K           | -        | 0.0273   | 0.0538   | 0.0895   |
+| HumanEval       | -        | 0.0854   | 0.0732   | -        |
+| MMLU            | -        | 0.3145   | 0.3110   | -        |
+| ChatCORE        | -        | 0.0863   | 0.0945   | -        |
+
+[8XH100 metrics](https://github.com/karpathy/nanochat/discussions/1) by Andrej Karpathy:
+
+| Metric          | BASE     | MID      | SFT      | RL       |
+|-----------------|----------|----------|----------|----------|
+| CORE            | 0.2219   | -        | -        | -        |
+| ARC-Challenge   | -        | 0.2875   | 0.2807   | -        |
+| ARC-Easy        | -        | 0.3561   | 0.3876   | -        |
+| GSM8K           | -        | 0.0250   | 0.0455   | 0.0758   |
+| HumanEval       | -        | 0.0671   | 0.0854   | -        |
+| MMLU            | -        | 0.3111   | 0.3151   | -        |
+| ChatCORE        | -        | 0.0730   | 0.0884   | -        |
 ## Talk to it
 
 To get a sense of the endpoint of this repo, you can currently find [nanochat d32](https://github.com/karpathy/nanochat/discussions/8) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d32" means that this model has 32 layers in the Transformer neural network. This model has 1.9 billion parameters, it was trained on 38 billion tokens by simply running the single script [run1000.sh](run1000.sh), and the total cost of training was ~$800 (about 33 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to...
 
diff --git a/nanochat/common.py b/nanochat/common.py
index 8b10df9..22232d1 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -89,15 +89,16 @@ def get_dist_info():
     else:
         return False, 0, 0, 1
 
-def compute_init():
+def compute_init(device_type="cuda"): # cuda|cpu
     """Basic initialization that we keep doing over and over, so make common."""
 
     # CUDA is currently required
-    assert torch.cuda.is_available(), "CUDA is needed for a distributed run atm"
+    # assert torch.cuda.is_available(), "CUDA is needed for a distributed run atm"
 
     # Reproducibility
     torch.manual_seed(42)
-    torch.cuda.manual_seed(42)
+    if device_type == "cuda":
+        torch.cuda.manual_seed(42)
     # skipping full reproducibility for now, possibly investigate slowdown later
     # torch.use_deterministic_algorithms(True)
     # torch.backends.cudnn.deterministic = True
@@ -106,15 +107,15 @@ def compute_init():
 
     # Precision
     torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
 
-    # Distributed setup: Distributed Data Parallel (DDP), optional
+    # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    if ddp:
+    if ddp and device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
         dist.init_process_group(backend="nccl", device_id=device)
         dist.barrier()
     else:
-        device = torch.device("cuda")
+        device = torch.device(device_type) # cuda|cpu
     if ddp_rank == 0:
         logger.info(f"Distributed world size: {ddp_world_size}")
diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py
index c1636b1..12e7d8e 100644
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@@ -6,7 +6,7 @@ from nanochat.common import get_dist_info
 from nanochat.dataset import parquets_iter_batched
 from nanochat.tokenizer import get_tokenizer
 
-def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128):
+def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda"):
     """Stream pretraining text from parquet files, tokenize, yield training batches."""
     assert split in ["train", "val"], "split must be 'train' or 'val'"
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
@@ -44,6 +44,6 @@ def tokenizing_distributed_data_loader(B, T, split, tokeniz
         inputs_cpu = scratch[:-1].to(dtype=torch.int32)
         targets_cpu = scratch[1:]
         # Reshape to 2D and move to GPU async
-        inputs = inputs_cpu.view(B, T).to(device="cuda", dtype=torch.int32, non_blocking=True)
-        targets = targets_cpu.view(B, T).to(device="cuda", dtype=torch.int64, non_blocking=True)
+        inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
+        targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
         yield inputs, targets
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 628b806..6502035 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -6,6 +6,9 @@
 python base_train.py
 or distributed as:
 torchrun --nproc_per_node=8 base_train.py
+
+If you just want to see it run on CPU (you won't get far but it should run), try something like:
+python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --device_type=cpu --eval_tokens=512 --total_batch_size=512 --num_iterations=1000
 """
 
 import os
@@ -27,6 +30,8 @@ print_banner()
 # -----------------------------------------------------------------------------
 # User settings
 run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
+# Runtime
+device_type = "cuda" # cuda|cpu
 # Model architecture
 depth = 20 # the depth of the Transformer model to train, rest of the kwargs are derived
 max_seq_len = 2048 # max context length
@@ -58,9 +63,11 @@ user_config = {k: globals()[k] for k in config_keys} # will be useful for loggin
 # -----------------------------------------------------------------------------
 
 # Compute init
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
+ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
+synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
+get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
 
 # wandb logging init
 use_dummy_wandb = run == "dummy" or not master_process
@@ -97,7 +104,7 @@ model_config_kwargs = dict(sequence_len=max_seq_len, vocab_size=vocab_size, n_la
 with torch.device("meta"):
     model_config = GPTConfig(**model_config_kwargs)
     model = GPT(model_config)
-model.to_empty(device="cuda")
+model.to_empty(device=device)
 model.init_weights()
 orig_model = model # original, uncompiled model, for saving raw model state_dict
 model = torch.compile(model, dynamic=False) # TODO: dynamic True/False think through
@@ -134,8 +141,8 @@ adamw_optimizer, muon_optimizer = optimizers
 # Initialize the DataLoaders for train/val
 base_dir = get_base_dir()
 tokens_dir = os.path.join(base_dir, "tokenized_data")
-train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train")
-build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size_val, max_seq_len, split="val")
+train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train", device=device)
+build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size_val, max_seq_len, split="val", device=device)
 x, y = next(train_loader) # kick off load of the very first batch of data
 
 # -----------------------------------------------------------------------------
@@ -254,7 +266,7 @@ for step in range(num_iterations + 1):
     # -------------------------------------------------------------------------
     # single training step
     # evaluate the gradient
-    torch.cuda.synchronize()
+    synchronize()
     t0 = time.time()
     for micro_step in range(grad_accum_steps):
         with autocast_ctx:
@@ -277,7 +289,7 @@ for step in range(num_iterations + 1):
     for opt in optimizers:
         opt.step()
     model.zero_grad(set_to_none=True)
-    torch.cuda.synchronize()
+    synchronize()
     t1 = time.time()
     dt = t1 - t0
     # -------------------------------------------------------------------------
@@ -306,7 +318,7 @@ for step in range(num_iterations + 1):
         })
 
 # print a few more stats
-print0(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1024 / 1024:.2f}MiB")
+print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
 print0(f"Total training time: {total_training_time/60:.2f}m")
 print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
 
@@ -332,7 +344,7 @@ get_report().log(section="Base model training", data=[
         "MFU %": f"{mfu:.2f}%",
         "Total training flops": f"{flops_so_far:e}",
         "Total training time": f"{total_training_time/60:.2f}m",
-        "Peak memory usage": f"{torch.cuda.max_memory_allocated() / 1024 / 1024:.2f}MiB",
+        "Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
     }
 ])
 
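For reference, the "~55x longer but similar results" observation in the README section above reduces to token accounting: an optimizer step is defined by `total_batch_size` in tokens, and gradient accumulation simply takes more micro-steps to reach that count when the per-device batch shrinks. The sketch below is a minimal illustration in plain Python; the helper `grad_accum_steps_for` is hypothetical (not a function in the repo), and it assumes the accumulation count is derived as `total_batch_size // (device_batch_size * max_seq_len * world_size)`.

```python
# Hypothetical illustration of the effective-batch math described in the README section above.
# Assumption: grad accumulation steps = total_batch_size // tokens processed per micro-step.

TOTAL_BATCH_SIZE = 524288  # tokens per optimizer step (same in both setups)
MAX_SEQ_LEN = 2048         # tokens per sequence

def grad_accum_steps_for(device_batch_size: int, world_size: int) -> int:
    """Micro-steps needed per optimizer step to reach TOTAL_BATCH_SIZE tokens."""
    tokens_per_micro_step = device_batch_size * MAX_SEQ_LEN * world_size
    assert TOTAL_BATCH_SIZE % tokens_per_micro_step == 0, "must divide evenly"
    return TOTAL_BATCH_SIZE // tokens_per_micro_step

# 8XH100: 32 * 2048 * 8 = 524288 tokens per micro-step -> no accumulation needed
print(grad_accum_steps_for(device_batch_size=32, world_size=8))  # 1

# 1x3090: 4 * 2048 * 1 = 8192 tokens per micro-step -> accumulate 64 micro-steps
print(grad_accum_steps_for(device_batch_size=4, world_size=1))   # 64
```

Every optimizer step therefore consumes the same 524288 tokens in both configurations; the wall-clock gap comes from running those 64 micro-steps serially on a single, slower GPU.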
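The CPU-support changes above avoid sprinkling `if device_type == "cuda"` checks through the training loop by binding `synchronize` and `get_max_memory` to no-ops once, up front. A small self-contained sketch of that pattern (the matrix multiply is just a stand-in workload, not code from the repo):

```python
import time
import torch

device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)

# Same shim pattern as in scripts/base_train.py: CUDA-only calls become no-ops on CPU,
# so the timing and memory-reporting code below is identical for both device types.
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0

x = torch.randn(1024, 1024, device=device)

synchronize()            # flush pending GPU work before starting the timer (no-op on CPU)
t0 = time.time()
y = x @ x                # stand-in for one training micro-step
synchronize()            # make sure the work has finished before reading the clock
dt = time.time() - t0

print(f"step time: {dt*1e3:.2f}ms, peak memory: {get_max_memory() / 1024 / 1024:.2f}MiB")
```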