diff --git a/dev/LOG.md b/dev/LOG.md index b6e83ef..b4b3757 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,29 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-03-04: Dataset upgrade: FineWeb-EDU 100B → ClimbMix 400B + +Switched the pretraining dataset from FineWeb-EDU 100B to ClimbMix 400B. This is by far the single biggest improvement to nanochat's GPT-2 speedrun time, bringing it down from **2 hours 46 minutes to 2 hours 1 minute** — a 27% reduction. + +### What is ClimbMix? + +ClimbMix 400B is a curated 400B-token pretraining mixture hosted at `karpathy/climbmix-400b-shuffle` on HuggingFace. It comes from [NVIDIA](https://huggingface.co/datasets/nvidia/Nemotron-ClimbMix). It is a blend of high-quality web text, code, math, and other sources, designed to be a better general-purpose pretraining dataset than FineWeb-EDU alone. + +### What changed + +- **Dataset**: `karpathy/fineweb-edu-100b-shuffle` → `karpathy/climbmix-400b-shuffle` (up to 6543 shards available vs the previous 1823 data shards, allowing for longer training in the future) +- **Data directory**: `base_data/` → `base_data_climbmix/` (clean separation from legacy data) +- **Model depth**: d26 → d24. ClimbMix trains more efficiently, so a smaller model reaches GPT-2 capability +- **Shard count**: Only approx 150 data shards (~7B tokens) are now needed for GPT-2 capability +- **Eval tokens**: doubled from 40 to 80 batches for more stable validation loss estimates +- **Legacy fallback**: added a migration warning in `list_parquet_files()` that detects the old `base_data/` directory and falls back gracefully, so existing users see clear upgrade instructions on `git pull` + +### Context + +This is the sixth attempt at beating FineWeb-EDU on CORE score — the previous five all failed (see entries on 2026-02-17, 2026-02-10, 2026-01-12 below). 
ClimbMix is the first dataset to convincingly surpass it, and the margin is large enough to also shrink the model from d26 to d24. + +--- + ## 2026-03-02: SoftCap tuning Quick experiment to tune logit softcap on d24 scale. Tried 5..30. 5 was terrible, the rest of them were all about equal with the exception of 20, which was the best. Minor but solid improvement: val loss improved by ~1e-3 (0.716 -> 0.715). Setting as default. diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 125625f..4cb2279 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -32,7 +32,8 @@ def _document_batches(split, resume_state_dict, tokenizer_batch_size): """ ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() - parquet_paths = list_parquet_files() + warn_on_legacy = ddp_rank == 0 and split == "train" # rank 0 on train split will warn on legacy + parquet_paths = list_parquet_files(warn_on_legacy=warn_on_legacy) assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?" 
parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:] diff --git a/nanochat/dataset.py b/nanochat/dataset.py index 602daed..fffe722 100644 --- a/nanochat/dataset.py +++ b/nanochat/dataset.py @@ -20,19 +20,43 @@ from nanochat.common import get_base_dir # The specifics of the current pretraining dataset # The URL on the internet where the data is hosted and downloaded from on demand -BASE_URL = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main" -MAX_SHARD = 1822 # the last datashard is shard_01822.parquet +BASE_URL = "https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle/resolve/main" +MAX_SHARD = 6542 # the last datashard is shard_06542.parquet index_to_filename = lambda index: f"shard_{index:05d}.parquet" # format of the filenames base_dir = get_base_dir() -DATA_DIR = os.path.join(base_dir, "base_data") -os.makedirs(DATA_DIR, exist_ok=True) +DATA_DIR = os.path.join(base_dir, "base_data_climbmix") # ----------------------------------------------------------------------------- # These functions are useful utilities to other modules, can/should be imported -def list_parquet_files(data_dir=None): +def list_parquet_files(data_dir=None, warn_on_legacy=False): """ Looks into a data dir and returns full paths to all parquet files. """ data_dir = DATA_DIR if data_dir is None else data_dir + + # Legacy-supporting code due to the upgrade from FinewebEdu-100B to ClimbMix-400B + # This code will eventually be deleted. 
+ if not os.path.exists(data_dir): + if warn_on_legacy: + print() + print("=" * 80) + print(" WARNING: DATASET UPGRADE REQUIRED") + print("=" * 80) + print() + print(f" Could not find: {data_dir}") + print() + print(" nanochat recently switched from FinewebEdu-100B to ClimbMix-400B.") + print(" Everyone who does `git pull` as of March 4, 2026 is expected to see this message.") + print(" To upgrade to the new ClimbMix-400B dataset, run these two commands:") + print() + print(" python -m nanochat.dataset -n 170 # download ~170 shards, enough for GPT-2, adjust as desired") + print(" python -m scripts.tok_train # re-train tokenizer on new ClimbMix data") + print() + print(" For now, falling back to your old FinewebEdu-100B dataset...") + print("=" * 80) + print() + # attempt a fallback to the legacy data directory + data_dir = os.path.join(base_dir, "base_data") + parquet_files = sorted([ f for f in os.listdir(data_dir) if f.endswith('.parquet') and not f.endswith('.tmp') @@ -110,13 +134,21 @@ def download_single_file(index): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download FineWeb-Edu 100BT dataset shards") - parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of shards to download (default: -1), -1 = disable") + parser = argparse.ArgumentParser(description="Download pretraining dataset shards") + parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of train shards to download (default: -1), -1 = disable") parser.add_argument("-w", "--num-workers", type=int, default=4, help="Number of parallel download workers (default: 4)") args = parser.parse_args() - num = MAX_SHARD + 1 if args.num_files == -1 else min(args.num_files, MAX_SHARD + 1) - ids_to_download = list(range(num)) + # Prepare the output directory + os.makedirs(DATA_DIR, exist_ok=True) + + # The way this works is that the user specifies the number of train shards to download via the -n flag. 
+ # In addition to that, the validation shard is *always* downloaded and is pinned to be the last shard. + num_train_shards = MAX_SHARD if args.num_files == -1 else min(args.num_files, MAX_SHARD) + ids_to_download = list(range(num_train_shards)) + ids_to_download.append(MAX_SHARD) # always download the validation shard + + # Download the shards print(f"Downloading {len(ids_to_download)} shards using {args.num_workers} workers...") print(f"Target directory: {DATA_DIR}") print() diff --git a/runs/speedrun.sh b/runs/speedrun.sh index c757253..fa50694 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -55,9 +55,9 @@ python -m nanochat.report reset # look at dev/repackage_data_reference.py for details on how this data was prepared python -m nanochat.dataset -n 8 # Immediately also kick off downloading more shards in the background while tokenizer trains -# Approximately 350 shards are needed for 10B tokens of data for pretraining. -# The maximum total number of shards available in the entire dataset is 1822. -python -m nanochat.dataset -n 370 & +# Approximately 150 shards are needed for GPT-2 capability pretraining, add 20 for padding. +# The maximum total number of shards available in the entire dataset is 6542. +python -m nanochat.dataset -n 170 & DATASET_DOWNLOAD_PID=$! # train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data python -m scripts.tok_train @@ -69,8 +69,8 @@ python -m scripts.tok_eval echo "Waiting for dataset download to complete..." 
wait $DATASET_DOWNLOAD_PID -# d26 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8.25) -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.25 --device-batch-size=16 --fp8 --run=$WANDB_RUN +# d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 9.5) +torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=9.5 --device-batch-size=16 --fp8 --run=$WANDB_RUN # evaluate the model: CORE metric, BPB on train/val, and draw samples torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16 diff --git a/scripts/base_train.py b/scripts/base_train.py index 24091b6..9461e88 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -71,7 +71,7 @@ parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR a parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)") # Evaluation parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)") -parser.add_argument("--eval-tokens", type=int, default=40*524288, help="number of tokens to evaluate val loss on") +parser.add_argument("--eval-tokens", type=int, default=80*524288, help="number of tokens to evaluate val loss on") parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)") parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric") parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)") @@ -533,7 +533,7 @@ while True: eta_str = f" | eta: {eta_seconds/60:.1f}m" else: eta_str = "" - epoch = dataloader_state_dict["epoch"] + epoch = f"{dataloader_state_dict['epoch']} 
pq: {dataloader_state_dict['pq_idx']} rg: {dataloader_state_dict['rg_idx']}" print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | bf16_mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}") if step % 100 == 0: log_data = {