From 3b50b77ed38c77be46406f925fccd79adca1fcda Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Tue, 13 Jan 2026 22:09:36 +0000
Subject: [PATCH] fix base_loss to report correct loss by switching the
 dataloader to the new default

---
 nanochat/checkpoint_manager.py | 3 +++
 scripts/base_loss.py           | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index cca6294..c008ec2 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -25,6 +25,7 @@ def _patch_missing_config_keys(model_config_kwargs):
     # Old models were trained with full context (no sliding window)
     if "window_pattern" not in model_config_kwargs:
         model_config_kwargs["window_pattern"] = "L"
+        log0(f"Patching missing window_pattern in model config to 'L'")
 
 def _patch_missing_keys(model_data, model_config):
     """Add default values for new parameters that may be missing in old checkpoints."""
@@ -32,9 +33,11 @@ def _patch_missing_keys(model_data, model_config):
     # resid_lambdas defaults to 1.0 (identity scaling)
     if "resid_lambdas" not in model_data:
         model_data["resid_lambdas"] = torch.ones(n_layer)
+        log0(f"Patching missing resid_lambdas in model data to 1.0")
     # x0_lambdas defaults to 0.0 (disabled)
     if "x0_lambdas" not in model_data:
         model_data["x0_lambdas"] = torch.zeros(n_layer)
+        log0(f"Patching missing x0_lambdas in model data to 0.0")
 
 def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
     if rank == 0:
diff --git a/scripts/base_loss.py b/scripts/base_loss.py
index 094299a..46544d4 100644
--- a/scripts/base_loss.py
+++ b/scripts/base_loss.py
@@ -14,7 +14,7 @@ from contextlib import nullcontext
 import torch
 from nanochat.checkpoint_manager import load_model
 from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type
-from nanochat.dataloader import tokenizing_distributed_data_loader
+from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit
 from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer
 from nanochat.loss_eval import evaluate_bpb
 from nanochat.engine import Engine
@@ -97,7 +97,7 @@ assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible
 steps = args.split_tokens // tokens_per_step
 bpb_results = {}
 for split_name in ["train", "val"]:
-    loader = tokenizing_distributed_data_loader(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
+    loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
     with autocast_ctx:
         bpb = evaluate_bpb(model, loader, steps, token_bytes)
    print0(f"{split_name} bpb: {bpb:.4f}")