diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 8efde4f..748fc54 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -1,5 +1,5 @@
 """
-Evlauate the CORE metric for a given model.
+Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
 python base_eval.py
@@ -17,7 +17,7 @@
 import random
 import yaml
 from contextlib import nullcontext
-import pandas as pd
+import csv
 
 import torch
 from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
@@ -28,6 +28,7 @@ from nanochat.core_eval import evaluate_task
 
 # -----------------------------------------------------------------------------
 # nanoChat specific function dealing with I/O etc.
+
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
@@ -43,7 +44,12 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     with open(config_path, 'r') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
-    eval_metadata = pd.read_csv(eval_meta_data)
+
+    eval_metadata = {}
+    with open(eval_meta_data, "r", newline="", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            eval_metadata[row["Eval Task"]] = row
 
     # Evaluate each task
     results = {}
@@ -57,7 +63,8 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
             'num_fewshot': task['num_fewshot'][0],
             'continuation_delimiter': task.get('continuation_delimiter', ' ')
         }
-        print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
+        print0(
+            f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
 
         # Load data for this task
         data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
@@ -72,15 +79,24 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
             data = data[:max_per_task]
 
         # run the evaluation for this task
-        accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
+        # eval should be grad-free (saves memory and autograd overhead)
+        with torch.inference_mode():
+            accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
         results[label] = accuracy
 
-        row = eval_metadata[eval_metadata["Eval Task"] == label]
-        random_baseline = row["Random baseline"].values[0]
-        centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
+        # row = eval_metadata[eval_metadata["Eval Task"] == label]
+        # random_baseline = row["Random baseline"].values[0]
+        row = eval_metadata.get(label)
+        if row is None or "Random baseline" not in row:
+            raise KeyError(
+                f"Missing 'Random baseline' for task '{label}' in {eval_meta_data}")
+        random_baseline = float(row["Random baseline"])
+        centered_result = (accuracy - 0.01 * random_baseline) / \
+            (1.0 - 0.01 * random_baseline)
         centered_results[label] = centered_result
         end_time = time.time()
-        print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
+        print0(
+            f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
 
     core_metric = sum(centered_results.values()) / len(centered_results)
     out = {
@@ -93,8 +109,10 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
 
 # -----------------------------------------------------------------------------
 # HuggingFace loading utilities and light wrappers for a model
+
 class ModelWrapper:
     """Lightweight wrapper for a HuggingFace model"""
+
     def __init__(self, model, max_seq_len=None):
         self.model = model
         self.max_seq_len = max_seq_len
@@ -104,6 +122,7 @@ class ModelWrapper:
         logits = outputs.logits
         return logits
 
+
 def load_hf_model(hf_path: str, device):
     print0(f"Loading model from: {hf_path}")
     # Load the model
@@ -118,17 +137,23 @@ def load_hf_model(hf_path: str, device):
     return model, tokenizer
 
 # -----------------------------------------------------------------------------
+
+
 def main():
     import argparse
     parser = argparse.ArgumentParser()
-    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
-    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
+    parser.add_argument('--hf-path', type=str, default=None,
+                        help='HuggingFace model path to evaluate')
+    parser.add_argument('--max-per-task', type=int, default=-1,
+                        help='Max examples per task to evaluate (-1 = disable)')
     args = parser.parse_args()
 
     # distributed / precision setup
     device_type = autodetect_device_type()
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(
+        device_type)
+    autocast_ctx = torch.amp.autocast(
+        device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
 
     # Load model and tokenizer from command line or from file system
     if args.hf_path is not None:
@@ -136,24 +161,27 @@ def main():
         hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
         model, tokenizer = load_hf_model(hf_path, device)
-        model_name = hf_path # just for logging
-        model_slug = hf_path.replace("/", "-") # for the output csv file
+        model_name = hf_path  # just for logging
+        model_slug = hf_path.replace("/", "-")  # for the output csv file
     else:
         # load a local model from the file system
        model, tokenizer, meta = load_model("base", device, phase="eval")
-        model_name = f"base_model (step {meta['step']})" # just for logging
-        model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
+        model_name = f"base_model (step {meta['step']})"  # just for logging
+        # for the output csv file
+        model_slug = f"base_model_{meta['step']:06d}"
 
     # Evaluate the model
     with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
+        out = evaluate_model(model, tokenizer, device,
+                             max_per_task=args.max_per_task)
 
     # Write out the results to a csv file
     core_metric = None
     centered_results = {}
     if ddp_rank == 0:
         base_dir = get_base_dir()
-        output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
+        output_csv_path = os.path.join(
+            base_dir, "base_eval", f"{model_slug}.csv")
         os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
         results = out["results"]
         centered_results = out["centered_results"]
@@ -161,7 +189,8 @@ def main():
         with open(output_csv_path, 'w') as f:
             f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
             for label in results:
-                f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
+                f.write(
+                    f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
             f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
         # Print the content of the csv file to console too
         print0("="*80)
@@ -177,10 +206,11 @@ def main():
             "Model": model_name,
             "CORE metric": core_metric,
         },
-        centered_results, # the full table
+        centered_results,  # the full table
     ])
 
     compute_cleanup()
 
+
 if __name__ == "__main__":
     main()
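
For reference, a minimal standalone sketch of the new metadata lookup and score centering, assuming a CSV with "Eval Task" and "Random baseline" columns as read in the patch above; the file path and task label in the usage lines are hypothetical placeholders, not values from the repo:

    import csv

    def load_eval_metadata(path):
        """Index the eval metadata CSV by task name (same DictReader pattern as the patch)."""
        metadata = {}
        with open(path, "r", newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                metadata[row["Eval Task"]] = row
        return metadata

    def center_accuracy(accuracy, random_baseline_pct):
        """Rescale accuracy so the random baseline maps to 0 and a perfect score to 1."""
        p = 0.01 * random_baseline_pct  # the "Random baseline" column is stored as a percentage
        return (accuracy - p) / (1.0 - p)

    # Hypothetical usage (path and task label are illustrative only):
    # meta = load_eval_metadata("eval_meta_data.csv")
    # centered = center_accuracy(0.62, float(meta["hellaswag"]["Random baseline"]))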