diff --git a/README.md b/README.md index 800c5d9..6283437 100644 --- a/README.md +++ b/README.md @@ -142,8 +142,7 @@ I've published a number of guides that might contain helpful information: │ ├── scaling_laws.sh # Scaling laws experiments │ └── speedrun.sh # Train the ~$100 nanochat d20 ├── scripts -│ ├── base_eval.py # Base model: calculate CORE score -│ ├── base_loss.py # Base model: calculate bits per byte, sample +│ ├── base_eval.py # Base model: CORE score, bits per byte, samples │ ├── base_train.py # Base model: train │ ├── chat_cli.py # Chat model: talk to over CLI │ ├── chat_eval.py # Chat model: eval tasks diff --git a/runs/runcpu.sh b/runs/runcpu.sh index f383726..853fa1f 100755 --- a/runs/runcpu.sh +++ b/runs/runcpu.sh @@ -42,8 +42,7 @@ python -m scripts.base_train \ --sample-every=100 \ --num-iterations=5000 \ --run=$WANDB_RUN -python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384 -python -m scripts.base_eval --max-per-task=16 +python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16 # SFT (~10 minutes on my MacBook Pro M3 Max) curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl diff --git a/runs/speedrun.sh b/runs/speedrun.sh index a9612c0..a709462 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -74,9 +74,7 @@ NPROC_PER_NODE=8 # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12) torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN -# evaluate the model on a larger chunk of train/val data and draw some samples -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss -# evaluate the model on CORE tasks +# evaluate the model: CORE metric, BPB on train/val, and draw samples torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # ----------------------------------------------------------------------------- diff --git a/scripts/base_eval.py b/scripts/base_eval.py index bd83ff3..57f9fd4 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -1,13 +1,23 @@ """ -Evaluate the CORE metric for a given model. +Unified evaluation script for base models. -Run on a single GPU: -python -m scripts.base_eval +Supports three evaluation modes (comma-separated): + --eval core : CORE metric (accuracy on ICL tasks) + --eval bpb : Bits per byte on train/val splits + --eval sample : Generate samples from the model -Run with torchrun on e.g. 8 GPUs: -torchrun --nproc_per_node=8 -m scripts.base_eval +Default is all three: --eval core,bpb,sample -The script will print the CORE metric to the console. +Examples: + + # Evaluate a HuggingFace model (e.g. GPT-2 124M) using 8 GPUs + torchrun --nproc_per_node=8 -m scripts.base_eval --hf-path openai-community/gpt2 + + # Evaluate a nanochat model (e.g. d24) using 8 GPUs + torchrun --nproc_per_node=8 -m scripts.base_eval --model-tag d24 --device-batch-size=16 + + # Quick/approximate evaluation using a single GPU + python -m scripts.base_eval --model-tag d24 --device-batch-size=16 --max-per-task=100 --split-tokens=524288 """ import os import csv @@ -18,24 +28,74 @@ import shutil import random import zipfile import tempfile +import argparse from contextlib import nullcontext import torch from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock -from nanochat.tokenizer import HuggingFaceTokenizer +from nanochat.tokenizer import HuggingFaceTokenizer, get_token_bytes from nanochat.checkpoint_manager import load_model from nanochat.core_eval import evaluate_task +from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit +from nanochat.loss_eval import evaluate_bpb +from nanochat.engine import Engine # ----------------------------------------------------------------------------- -# nanochat specific function dealing with I/O etc. +# HuggingFace loading utilities + +class ModelWrapper: + """Lightweight wrapper to give HuggingFace models a nanochat-compatible interface.""" + def __init__(self, model, max_seq_len=None): + self.model = model + self.max_seq_len = max_seq_len + + def __call__(self, input_ids, targets=None, loss_reduction='mean'): + logits = self.model(input_ids).logits + if targets is None: + return logits + loss = torch.nn.functional.cross_entropy( + logits.view(-1, logits.size(-1)), + targets.view(-1), + ignore_index=-1, + reduction=loss_reduction + ) + return loss + + def get_device(self): + return next(self.model.parameters()).device + + +def load_hf_model(hf_path: str, device): + """Load a HuggingFace model and tokenizer.""" + print0(f"Loading HuggingFace model from: {hf_path}") + from transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(hf_path) + model.to(device) + model.eval() + max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None + model = ModelWrapper(model, max_seq_len=max_seq_len) + tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) + return model, tokenizer + + +def get_hf_token_bytes(tokenizer, device="cpu"): + """Compute token_bytes tensor for a HuggingFace tokenizer.""" + vocab_size = tokenizer.tokenizer.get_vocab_size() + token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device) + for token_id in range(vocab_size): + token_str = tokenizer.tokenizer.decode([token_id]) + token_bytes[token_id] = len(token_str.encode('utf-8')) + return token_bytes + +# ----------------------------------------------------------------------------- +# CORE evaluation -# ~162MB of data needed to evaluate the CORE metric EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + def place_eval_bundle(file_path): - # here file_path is the path to the eval_bundle.zip file - # we need to unzip it and place it in the base directory + """Unzip eval_bundle.zip and place it in the base directory.""" base_dir = get_base_dir() eval_bundle_dir = os.path.join(base_dir, "eval_bundle") with tempfile.TemporaryDirectory() as tmpdir: @@ -45,25 +105,27 @@ def place_eval_bundle(file_path): shutil.move(extracted_bundle_dir, eval_bundle_dir) print0(f"Placed eval_bundle directory at {eval_bundle_dir}") -def evaluate_model(model, tokenizer, device, max_per_task=-1): + +def evaluate_core(model, tokenizer, device, max_per_task=-1): """ Evaluate a base model on the CORE benchmark. - - max_per_task: crop the data to this many examples per task for testing (-1 = disable) + Returns dict with results, centered_results, and core_metric. """ - # Load config and task metadata base_dir = get_base_dir() eval_bundle_dir = os.path.join(base_dir, "eval_bundle") - # Download the eval bundle to disk (and unzip if needed) + # Download the eval bundle if needed if not os.path.exists(eval_bundle_dir): download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle) + config_path = os.path.join(eval_bundle_dir, "core.yaml") data_base_path = os.path.join(eval_bundle_dir, "eval_data") eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") + with open(config_path, 'r', encoding='utf-8') as f: config = yaml.safe_load(f) tasks = config['icl_tasks'] - # Load random baseline values from eval metadata + # Load random baseline values random_baselines = {} with open(eval_meta_data, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) @@ -86,27 +148,23 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): } print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='') - # Load data for this task data_path = os.path.join(data_base_path, task_meta['dataset_uri']) with open(data_path, 'r', encoding='utf-8') as f: data = [json.loads(line.strip()) for line in f] - # shuffle the data because in many cases it appears ordered but we want - # the ability to only run a subset of the data for debugging purposes etc. + # Shuffle for consistent subsampling when using max_per_task shuffle_rng = random.Random(1337) shuffle_rng.shuffle(data) if max_per_task > 0: data = data[:max_per_task] - # run the evaluation for this task accuracy = evaluate_task(model, tokenizer, data, device, task_meta) - results[label] = accuracy random_baseline = random_baselines[label] centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline) centered_results[label] = centered_result - end_time = time.time() - print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s") + elapsed = time.time() - start_time + print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {elapsed:.2f}s") core_metric = sum(centered_results.values()) / len(centered_results) out = { @@ -117,98 +175,157 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): return out # ----------------------------------------------------------------------------- -# HuggingFace loading utilities and light wrappers for a model +# Main -class ModelWrapper: - """Lightweight wrapper for a HuggingFace model""" - def __init__(self, model, max_seq_len=None): - self.model = model - self.max_seq_len = max_seq_len - - def __call__(self, input_ids): - outputs = self.model(input_ids) - logits = outputs.logits - return logits - -def load_hf_model(hf_path: str, device): - print0(f"Loading model from: {hf_path}") - # Load the model - from transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_pretrained(hf_path) - model.to(device) - model.eval() - max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None - model = ModelWrapper(model, max_seq_len=max_seq_len) - # Load the tokenizer - tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) - return model, tokenizer - -# ----------------------------------------------------------------------------- def main(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate') - parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)') - parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name') - parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name') + parser = argparse.ArgumentParser(description="Base model evaluation") + parser.add_argument('--eval', type=str, default='core,bpb,sample', help='Comma-separated evaluations to run: core,bpb,sample (default: all)') + parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2)') + parser.add_argument('--model-tag', type=str, default=None, help='nanochat model tag to identify the checkpoint directory') + parser.add_argument('--step', type=int, default=None, help='Model step to load (default = last)') + parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per CORE task (-1 = all)') + parser.add_argument('--device-batch-size', type=int, default=32, help='Per-device batch size for BPB evaluation') + parser.add_argument('--split-tokens', type=int, default=40*524288, help='Number of tokens to evaluate per split for BPB') + parser.add_argument('--device-type', type=str, default='', help='cuda|cpu|mps (empty = autodetect)') args = parser.parse_args() - # distributed / precision setup - device_type = autodetect_device_type() + # Parse evaluation modes + eval_modes = set(mode.strip() for mode in args.eval.split(',')) + valid_modes = {'core', 'bpb', 'sample'} + invalid = eval_modes - valid_modes + if invalid: + parser.error(f"Invalid eval modes: {invalid}. Valid: {valid_modes}") + + # Distributed / precision setup + device_type = autodetect_device_type() if args.device_type == '' else args.device_type ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() - # Load model and tokenizer from command line or from file system - if args.hf_path is not None: - # atm assume that if a path is given, it's a huggingface model path - hf_path = args.hf_path - print0(f"Loading huggingface model from: {hf_path}") - model, tokenizer = load_hf_model(hf_path, device) - model_name = hf_path # just for logging - model_slug = hf_path.replace("/", "-") # for the output csv file + # Load model and tokenizer + is_hf_model = args.hf_path is not None + if is_hf_model: + model, tokenizer = load_hf_model(args.hf_path, device) + sequence_len = model.max_seq_len or 1024 + token_bytes = get_hf_token_bytes(tokenizer, device=device) + model_name = args.hf_path + model_slug = args.hf_path.replace("/", "-") else: - # load a local model from the file system model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step) - model_name = f"base_model (step {meta['step']})" # just for logging - model_slug = f"base_model_{meta['step']:06d}" # for the output csv file + sequence_len = meta["model_config"]["sequence_len"] + token_bytes = get_token_bytes(device=device) + model_name = f"base_model (step {meta['step']})" + model_slug = f"base_model_{meta['step']:06d}" - # Evaluate the model - with autocast_ctx: - out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task) + print0(f"Evaluating model: {model_name}") + print0(f"Eval modes: {', '.join(sorted(eval_modes))}") - # Write out the results to a csv file - core_metric = None - centered_results = {} - if ddp_rank == 0: - base_dir = get_base_dir() - output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") - os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) - results = out["results"] - centered_results = out["centered_results"] - core_metric = out["core_metric"] - with open(output_csv_path, 'w', encoding='utf-8', newline='') as f: - f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") - for label in results: - f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n") - f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n") - # Print the content of the csv file to console too + # Results to log + core_results = None + bpb_results = {} + samples = [] + unconditioned_samples = [] + + # --- CORE evaluation --- + if 'core' in eval_modes: + print0("\n" + "="*80) + print0("CORE Evaluation") print0("="*80) - print0(f"Model: {model_name}") - print0("="*80) - with open(output_csv_path, 'r', encoding='utf-8') as f: - print0(f.read()) + with autocast_ctx: + core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task) - # Log to report + # Write CSV output + if ddp_rank == 0: + base_dir = get_base_dir() + output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") + os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) + with open(output_csv_path, 'w', encoding='utf-8', newline='') as f: + f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") + for label in core_results["results"]: + acc = core_results["results"][label] + centered = core_results["centered_results"][label] + f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n") + f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n") + print0(f"\nResults written to: {output_csv_path}") + print0(f"CORE metric: {core_results['core_metric']:.4f}") + + # --- BPB evaluation --- + if 'bpb' in eval_modes: + print0("\n" + "="*80) + print0("BPB Evaluation") + print0("="*80) + tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size + if args.split_tokens % tokens_per_step != 0: + # Adjust to nearest multiple + args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step + print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})") + steps = args.split_tokens // tokens_per_step + + for split_name in ["train", "val"]: + loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) + with autocast_ctx: + bpb = evaluate_bpb(model, loader, steps, token_bytes) + bpb_results[split_name] = bpb + print0(f"{split_name} bpb: {bpb:.6f}") + + # --- Sampling --- + if 'sample' in eval_modes and not is_hf_model: + print0("\n" + "="*80) + print0("Model Samples") + print0("="*80) + if ddp_rank == 0: + prompts = [ + "The capital of France is", + "The chemical symbol of gold is", + "If yesterday was Friday, then tomorrow will be", + "The opposite of hot is", + "The planets of the solar system are:", + "My favorite color is", + "If 5*x + 3 = 13, then x is", + ] + engine = Engine(model, tokenizer) + print0("\nConditioned samples:") + for prompt in prompts: + tokens = tokenizer(prompt, prepend="<|bos|>") + with autocast_ctx: + sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0) + sample_str = tokenizer.decode(sample[0]) + print0("-" * 80) + print0(sample_str) + samples.append(sample_str) + + print0("\nUnconditioned samples:") + tokens = tokenizer("", prepend="<|bos|>") + with autocast_ctx: + uncond, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0) + for sample in uncond: + sample_str = tokenizer.decode(sample) + print0("-" * 80) + print0(sample_str) + unconditioned_samples.append(sample_str) + elif 'sample' in eval_modes and is_hf_model: + print0("\nSkipping sampling for HuggingFace models (not supported)") + + # --- Log to report --- from nanochat.report import get_report - get_report().log(section="Base model evaluation", data=[ - { - "Model": model_name, - "CORE metric": core_metric, - }, - centered_results, # the full table - ]) + report_data = [{"model": model_name}] + + if core_results: + report_data[0]["CORE metric"] = core_results["core_metric"] + report_data.append(core_results["centered_results"]) + + if bpb_results: + report_data[0]["train bpb"] = bpb_results.get("train") + report_data[0]["val bpb"] = bpb_results.get("val") + + if samples: + report_data.append({f"sample {i}": s for i, s in enumerate(samples)}) + if unconditioned_samples: + report_data.append({f"unconditioned {i}": s for i, s in enumerate(unconditioned_samples)}) + + get_report().log(section="Base model evaluation", data=report_data) compute_cleanup() + if __name__ == "__main__": main() diff --git a/scripts/base_loss.py b/scripts/base_loss.py deleted file mode 100644 index fb8cf59..0000000 --- a/scripts/base_loss.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Loads a checkpoint, and: -- Evaluates the loss on a larger chunk of train/val splits -- Samples from the model - -Example run as: -torchrun --standalone --nproc_per_node=8 -m scripts.base_loss - -To evaluate a HuggingFace model: -python -m scripts.base_loss --hf-path openai-community/gpt2 -""" -import argparse -from contextlib import nullcontext -import torch -from nanochat.checkpoint_manager import load_model -from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type -from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit -from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer -from nanochat.loss_eval import evaluate_bpb -from nanochat.engine import Engine - -# ----------------------------------------------------------------------------- -# HuggingFace loading utilities, making the APIs match up to those of nanochat - -class ModelWrapper: - """Lightweight wrapper for a HuggingFace model""" - def __init__(self, model, max_seq_len=None): - self.model = model - self.max_seq_len = max_seq_len - - def __call__(self, input_ids, targets=None, loss_reduction='mean'): - logits = self.model(input_ids).logits - if targets is None: - return logits - else: - loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction) - return loss - - def get_device(self): - return next(self.model.parameters()).device - -def load_hf_model(hf_path: str, device): - print0(f"Loading model from: {hf_path}") - from transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_pretrained(hf_path) - model.to(device) - model.eval() - max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None - model = ModelWrapper(model, max_seq_len=max_seq_len) - tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) - return model, tokenizer - -def get_hf_token_bytes(tokenizer, device="cpu"): - """Compute token_bytes tensor for a HuggingFace tokenizer.""" - vocab_size = tokenizer.tokenizer.get_vocab_size() - token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device) - for token_id in range(vocab_size): - token_str = tokenizer.tokenizer.decode([token_id]) - token_bytes[token_id] = len(token_str.encode('utf-8')) # Count UTF-8 bytes - return token_bytes - -# CLI arguments -parser = argparse.ArgumentParser(description="Evaluate loss on train/val splits and sample from model") -parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") -parser.add_argument("--split-tokens", type=int, default=40*524288, help="number of tokens to evaluate per split") -parser.add_argument("--model-tag", type=str, default=None, help="model tag for checkpoint directory") -parser.add_argument("--model-step", type=int, default=None, help="model step to load") -parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") -parser.add_argument("--hf-path", type=str, default=None, help="HuggingFace model path (e.g. openai-community/gpt2)") -args = parser.parse_args() - -# Load the base model and the tokenizer -device_type = autodetect_device_type() if args.device_type == "" else args.device_type -ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) -print0(f"Device: {device} | DDP rank: {ddp_rank} | DDP local rank: {ddp_local_rank} | DDP world size: {ddp_world_size}") - -if args.hf_path is not None: - # Load HuggingFace model - model, tokenizer = load_hf_model(args.hf_path, device) - sequence_len = model.max_seq_len if model.max_seq_len else 1024 - token_bytes = get_hf_token_bytes(tokenizer, device=device) - model_name = args.hf_path -else: - # Load local nanochat model - model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step) - sequence_len = meta["model_config"]["sequence_len"] - token_bytes = get_token_bytes(device=device) - model_name = f"base_model (step {meta['step']})" - -autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() - -print0(f"Evaluating model: {model_name}") - -# Evaluate the loss on each split -tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size -assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step" -steps = args.split_tokens // tokens_per_step -bpb_results = {} -for split_name in ["train", "val"]: - loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) - with autocast_ctx: - bpb = evaluate_bpb(model, loader, steps, token_bytes) - print0(f"{split_name} bpb: {bpb:.4f}") - bpb_results[split_name] = bpb - print0(f"Model: {model_name}, {split_name} bpb: {bpb:.6f}") - -# Master process also samples from the model for some basic knowledge-eliciting prompts (only for nanochat models) -samples = [] -if ddp_rank == 0 and args.hf_path is None: - prompts = [ - "The capital of France is", - "The chemical symbol of gold is", - "If yesterday was Friday, then tomorrow will be", - "The opposite of hot is", - "The planets of the solar system are:", - "My favorite color is", - "If 5*x + 3 = 13, then x is", - ] - engine = Engine(model, tokenizer) - for prompt in prompts: - tokens = tokenizer(prompt, prepend="<|bos|>") - with autocast_ctx: - sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0) - sample_str = tokenizer.decode(sample[0]) - print0("-" * 80) - print0(sample_str) - samples.append(sample_str) - -# Draw some unconditioned samples from the model (only for nanochat models) -unconditioned_samples = [] -if ddp_rank == 0 and args.hf_path is None: - engine = Engine(model, tokenizer) - tokens = tokenizer("", prepend="<|bos|>") - with autocast_ctx: - samples, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0) - for sample in samples: - sample_str = tokenizer.decode(sample) - print0("-" * 80) - print0(sample_str) - unconditioned_samples.append(sample_str) - -# Log to report -from nanochat.report import get_report -get_report().log(section="Base model loss", data=[ - { - "model": model_name, - "train bpb": bpb_results["train"], - "val bpb": bpb_results["val"], - }, - {f"sample {i}": sample for i, sample in enumerate(samples)}, - {f"unconditioned sample {i}": sample for i, sample in enumerate(unconditioned_samples)}, -]) - -# Cleanup -compute_cleanup()