diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 57f9fd4..e45ae43 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -73,7 +73,7 @@ def load_hf_model(hf_path: str, device):
     model = AutoModelForCausalLM.from_pretrained(hf_path)
     model.to(device)
     model.eval()
-    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
+    max_seq_len = 1024 if "gpt2" in hf_path else None
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
     return model, tokenizer
@@ -180,7 +180,7 @@ def evaluate_core(model, tokenizer, device, max_per_task=-1):
 def main():
     parser = argparse.ArgumentParser(description="Base model evaluation")
     parser.add_argument('--eval', type=str, default='core,bpb,sample', help='Comma-separated evaluations to run: core,bpb,sample (default: all)')
-    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2)')
+    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2-xl)')
     parser.add_argument('--model-tag', type=str, default=None, help='nanochat model tag to identify the checkpoint directory')
     parser.add_argument('--step', type=int, default=None, help='Model step to load (default = last)')
     parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per CORE task (-1 = all)')
@@ -225,48 +225,6 @@ def main():
     samples = []
     unconditioned_samples = []

-    # --- CORE evaluation ---
-    if 'core' in eval_modes:
-        print0("\n" + "="*80)
-        print0("CORE Evaluation")
-        print0("="*80)
-        with autocast_ctx:
-            core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task)
-
-        # Write CSV output
-        if ddp_rank == 0:
-            base_dir = get_base_dir()
-            output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
-            os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
-            with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
-                f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
-                for label in core_results["results"]:
-                    acc = core_results["results"][label]
-                    centered = core_results["centered_results"][label]
-                    f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n")
-                f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n")
-            print0(f"\nResults written to: {output_csv_path}")
-            print0(f"CORE metric: {core_results['core_metric']:.4f}")
-
-    # --- BPB evaluation ---
-    if 'bpb' in eval_modes:
-        print0("\n" + "="*80)
-        print0("BPB Evaluation")
-        print0("="*80)
-        tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size
-        if args.split_tokens % tokens_per_step != 0:
-            # Adjust to nearest multiple
-            args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step
-            print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})")
-        steps = args.split_tokens // tokens_per_step
-
-        for split_name in ["train", "val"]:
-            loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
-            with autocast_ctx:
-                bpb = evaluate_bpb(model, loader, steps, token_bytes)
-            bpb_results[split_name] = bpb
-            print0(f"{split_name} bpb: {bpb:.6f}")
-
     # --- Sampling ---
     if 'sample' in eval_modes and not is_hf_model:
         print0("\n" + "="*80)
@@ -305,6 +263,48 @@ def main():
     elif 'sample' in eval_modes and is_hf_model:
         print0("\nSkipping sampling for HuggingFace models (not supported)")

+    # --- BPB evaluation ---
+    if 'bpb' in eval_modes:
+        print0("\n" + "="*80)
+        print0("BPB Evaluation")
+        print0("="*80)
+        tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size
+        if args.split_tokens % tokens_per_step != 0:
+            # Adjust to nearest multiple
+            args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step
+            print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})")
+        steps = args.split_tokens // tokens_per_step
+
+        for split_name in ["train", "val"]:
+            loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
+            with autocast_ctx:
+                bpb = evaluate_bpb(model, loader, steps, token_bytes)
+            bpb_results[split_name] = bpb
+            print0(f"{split_name} bpb: {bpb:.6f}")
+
+    # --- CORE evaluation ---
+    if 'core' in eval_modes:
+        print0("\n" + "="*80)
+        print0("CORE Evaluation")
+        print0("="*80)
+        with autocast_ctx:
+            core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task)
+
+        # Write CSV output
+        if ddp_rank == 0:
+            base_dir = get_base_dir()
+            output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
+            os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
+            with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
+                f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
+                for label in core_results["results"]:
+                    acc = core_results["results"][label]
+                    centered = core_results["centered_results"][label]
+                    f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n")
+                f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n")
+            print0(f"\nResults written to: {output_csv_path}")
+            print0(f"CORE metric: {core_results['core_metric']:.4f}")
+
     # --- Log to report ---
     from nanochat.report import get_report
     report_data = [{"model": model_name}]
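Note on the token-budget arithmetic in the BPB hunk above: split_tokens is rounded down to a multiple of tokens_per_step so that every rank runs the same whole number of evaluation steps. A minimal standalone sketch of that rounding follows; the concrete values are illustrative assumptions, not the script's defaults.

# Sketch of the split_tokens rounding from the BPB block above.
# The values here are illustrative assumptions, not the script's defaults.
device_batch_size = 32
sequence_len = 2048
ddp_world_size = 8
split_tokens = 40_000_000  # requested per-split token budget

tokens_per_step = device_batch_size * sequence_len * ddp_world_size  # 524288
if split_tokens % tokens_per_step != 0:
    # Round down to the nearest multiple, mirroring the adjustment in the diff
    split_tokens = (split_tokens // tokens_per_step) * tokens_per_step
steps = split_tokens // tokens_per_step
print(tokens_per_step, split_tokens, steps)  # 524288 39845888 76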