mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-08 02:10:31 +00:00
big, breaking change but large upside: swap previous FineWeb-EDU dataset to NVIDIA ClimbMix dataset. Requires people to download the data shards. The upside is that training a GPT-2 capability model now only takes ~2 hours, down from 2.76 hours, so this is a huge win data-wise
This commit is contained in:
parent
b07604ebaa
commit
324e69c45d
23
dev/LOG.md
23
dev/LOG.md
|
|
@ -4,6 +4,29 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026
|
|||
|
||||
---
|
||||
|
||||
## 2026-03-04: Dataset upgrade: FineWeb-EDU 100B → ClimbMix 400B
|
||||
|
||||
Switched the pretraining dataset from FineWeb-EDU 100B to ClimbMix 400B. This is by far the single biggest improvement to nanochat's GPT-2 speedrun time, bringing it down from **2 hours 46 minutes to 2 hours 1 minute** — a 27% reduction.
|
||||
|
||||
### What is ClimbMix?
|
||||
|
||||
ClimbMix 400B is a curated 400B-token pretraining mixture hosted at `karpathy/climbmix-400b-shuffle` on HuggingFace. It comes from [NVIDIA](https://huggingface.co/datasets/nvidia/Nemotron-ClimbMix). It is a blend of high-quality web text, code, math, and other sources, designed to be a better general-purpose pretraining dataset than FineWeb-EDU alone.
|
||||
|
||||
### What changed
|
||||
|
||||
- **Dataset**: `karpathy/fineweb-edu-100b-shuffle` → `karpathy/climbmix-400b-shuffle` (up to 6543 shards available vs the previous 1823 data shards, allowing for longer training in the future)
|
||||
- **Data directory**: `base_data/` → `base_data_climbmix/` (clean separation from legacy data)
|
||||
- **Model depth**: d26 → d24. ClimbMix trains more efficiently, so a smaller model reaches GPT-2 capability
|
||||
- **Shard count**: Only approx 150 data shards (~7B tokens) are now needed for GPT-2 capability
|
||||
- **Eval tokens**: doubled from 40 to 80 batches for more stable validation loss estimates
|
||||
- **Legacy fallback**: added a migration warning in `list_parquet_files()` that detects the old `base_data/` directory and falls back gracefully, so existing users see clear upgrade instructions on `git pull`
|
||||
|
||||
### Context
|
||||
|
||||
This is the sixth attempt at beating FineWeb-EDU on CORE score — the previous five all failed (see entries on 2026-02-17, 2026-02-10, 2026-01-12 below). ClimbMix is the first dataset to convincingly surpass it, and the margin is large enough to also shrink the model from d26 to d24.
|
||||
|
||||
---
|
||||
|
||||
## 2026-03-02: SoftCap tuning
|
||||
|
||||
Quick experiment to tune logit softcap on d24 scale. Tried 5..30. 5 was terrible, the rest of them were all about equal with the exception of 20, which was the best. Minor but solid improvement: val loss improved by ~1e-3 (0.716 -> 0.715). Setting as default.
|
||||
|
|
|
|||
|
|
@ -32,7 +32,8 @@ def _document_batches(split, resume_state_dict, tokenizer_batch_size):
|
|||
"""
|
||||
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
|
||||
|
||||
parquet_paths = list_parquet_files()
|
||||
warn_on_legacy = ddp_rank == 0 and split == "train" # rank 0 on train split will warn on legacy
|
||||
parquet_paths = list_parquet_files(warn_on_legacy=warn_on_legacy)
|
||||
assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
|
||||
parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
|
||||
|
||||
|
|
|
|||
|
|
@ -20,19 +20,43 @@ from nanochat.common import get_base_dir
|
|||
# The specifics of the current pretraining dataset
|
||||
|
||||
# The URL on the internet where the data is hosted and downloaded from on demand
|
||||
BASE_URL = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"
|
||||
MAX_SHARD = 1822 # the last datashard is shard_01822.parquet
|
||||
BASE_URL = "https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle/resolve/main"
|
||||
MAX_SHARD = 6542 # the last datashard is shard_06542.parquet
|
||||
index_to_filename = lambda index: f"shard_{index:05d}.parquet" # format of the filenames
|
||||
base_dir = get_base_dir()
|
||||
DATA_DIR = os.path.join(base_dir, "base_data")
|
||||
os.makedirs(DATA_DIR, exist_ok=True)
|
||||
DATA_DIR = os.path.join(base_dir, "base_data_climbmix")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# These functions are useful utilities to other modules, can/should be imported
|
||||
|
||||
def list_parquet_files(data_dir=None):
|
||||
def list_parquet_files(data_dir=None, warn_on_legacy=False):
|
||||
""" Looks into a data dir and returns full paths to all parquet files. """
|
||||
data_dir = DATA_DIR if data_dir is None else data_dir
|
||||
|
||||
# Legacy-supporting code due to the upgrade from FinewebEdu-100B to ClimbMix-400B
|
||||
# This code will eventually be deleted.
|
||||
if not os.path.exists(data_dir):
|
||||
if warn_on_legacy:
|
||||
print()
|
||||
print("=" * 80)
|
||||
print(" WARNING: DATASET UPGRADE REQUIRED")
|
||||
print("=" * 80)
|
||||
print()
|
||||
print(f" Could not find: {data_dir}")
|
||||
print()
|
||||
print(" nanochat recently switched from FinewebEdu-100B to ClimbMix-400B.")
|
||||
print(" Everyone who does `git pull` as of March 4, 2026 is expected to see this message.")
|
||||
print(" To upgrade to the new ClimbMix-400B dataset, run these two commands:")
|
||||
print()
|
||||
print(" python -m nanochat.dataset -n 170 # download ~170 shards, enough for GPT-2, adjust as desired")
|
||||
print(" python -m scripts.tok_train # re-train tokenizer on new ClimbMix data")
|
||||
print()
|
||||
print(" For now, falling back to your old FinewebEdu-100B dataset...")
|
||||
print("=" * 80)
|
||||
print()
|
||||
# attempt a fallback to the legacy data directory
|
||||
data_dir = os.path.join(base_dir, "base_data")
|
||||
|
||||
parquet_files = sorted([
|
||||
f for f in os.listdir(data_dir)
|
||||
if f.endswith('.parquet') and not f.endswith('.tmp')
|
||||
|
|
@ -110,13 +134,21 @@ def download_single_file(index):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Download FineWeb-Edu 100BT dataset shards")
|
||||
parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of shards to download (default: -1), -1 = disable")
|
||||
parser = argparse.ArgumentParser(description="Download pretraining dataset shards")
|
||||
parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of train shards to download (default: -1), -1 = disable")
|
||||
parser.add_argument("-w", "--num-workers", type=int, default=4, help="Number of parallel download workers (default: 4)")
|
||||
args = parser.parse_args()
|
||||
|
||||
num = MAX_SHARD + 1 if args.num_files == -1 else min(args.num_files, MAX_SHARD + 1)
|
||||
ids_to_download = list(range(num))
|
||||
# Prepare the output directory
|
||||
os.makedirs(DATA_DIR, exist_ok=True)
|
||||
|
||||
# The way this works is that the user specifies the number of train shards to download via the -n flag.
|
||||
# In addition to that, the validation shard is *always* downloaded and is pinned to be the last shard.
|
||||
num_train_shards = MAX_SHARD if args.num_files == -1 else min(args.num_files, MAX_SHARD)
|
||||
ids_to_download = list(range(num_train_shards))
|
||||
ids_to_download.append(MAX_SHARD) # always download the validation shard
|
||||
|
||||
# Download the shards
|
||||
print(f"Downloading {len(ids_to_download)} shards using {args.num_workers} workers...")
|
||||
print(f"Target directory: {DATA_DIR}")
|
||||
print()
|
||||
|
|
|
|||
|
|
@ -55,9 +55,9 @@ python -m nanochat.report reset
|
|||
# look at dev/repackage_data_reference.py for details on how this data was prepared
|
||||
python -m nanochat.dataset -n 8
|
||||
# Immediately also kick off downloading more shards in the background while tokenizer trains
|
||||
# Approximately 350 shards are needed for 10B tokens of data for pretraining.
|
||||
# The maximum total number of shards available in the entire dataset is 1822.
|
||||
python -m nanochat.dataset -n 370 &
|
||||
# Approximately 150 shards are needed for GPT-2 capability pretraining, add 20 for padding.
|
||||
# The maximum total number of shards available in the entire dataset is 6542.
|
||||
python -m nanochat.dataset -n 170 &
|
||||
DATASET_DOWNLOAD_PID=$!
|
||||
# train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data
|
||||
python -m scripts.tok_train
|
||||
|
|
@ -69,8 +69,8 @@ python -m scripts.tok_eval
|
|||
echo "Waiting for dataset download to complete..."
|
||||
wait $DATASET_DOWNLOAD_PID
|
||||
|
||||
# d26 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8.25)
|
||||
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.25 --device-batch-size=16 --fp8 --run=$WANDB_RUN
|
||||
# d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 9.5)
|
||||
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=9.5 --device-batch-size=16 --fp8 --run=$WANDB_RUN
|
||||
# evaluate the model: CORE metric, BPB on train/val, and draw samples
|
||||
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16
|
||||
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR a
|
|||
parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)")
|
||||
# Evaluation
|
||||
parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)")
|
||||
parser.add_argument("--eval-tokens", type=int, default=40*524288, help="number of tokens to evaluate val loss on")
|
||||
parser.add_argument("--eval-tokens", type=int, default=80*524288, help="number of tokens to evaluate val loss on")
|
||||
parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)")
|
||||
parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric")
|
||||
parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)")
|
||||
|
|
@ -533,7 +533,7 @@ while True:
|
|||
eta_str = f" | eta: {eta_seconds/60:.1f}m"
|
||||
else:
|
||||
eta_str = ""
|
||||
epoch = dataloader_state_dict["epoch"]
|
||||
epoch = f"{dataloader_state_dict['epoch']} pq: {dataloader_state_dict['pq_idx']} rg: {dataloader_state_dict['rg_idx']}"
|
||||
print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | bf16_mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}")
|
||||
if step % 100 == 0:
|
||||
log_data = {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user