From d60b7202139c04f2ee381261befa610e31be9565 Mon Sep 17 00:00:00 2001
From: Dipesh Babu <59379458+dipeshbabu@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:37:58 -0400
Subject: [PATCH] fix(eval): drop pandas in base_eval, harden metadata lookup, fix typo

---
 scripts/base_eval.py | 59 +++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 39 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 748fc54..c981c2d 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -28,7 +28,6 @@ from nanochat.core_eval import evaluate_task
 
 # -----------------------------------------------------------------------------
 # nanoChat specific function dealing with I/O etc.
-
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
@@ -44,7 +43,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     with open(config_path, 'r') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
-    eval_metadata = pd.read_csv(eval_meta_data)
+
     eval_metadata = {}
     with open(eval_meta_data, "r", newline="", encoding="utf-8") as f:
         reader = csv.DictReader(f)
@@ -63,8 +62,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
             'num_fewshot': task['num_fewshot'][0],
             'continuation_delimiter': task.get('continuation_delimiter', ' ')
         }
-        print0(
-            f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
+        print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
 
         # Load data for this task
         data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
@@ -79,24 +77,21 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
             data = data[:max_per_task]
 
         # run the evaluation for this task
-        # eval should be grad-free for stability/perf
-        with torch.inference_mode():
-            accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
+        accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
         results[label] = accuracy
 
         # row = eval_metadata[eval_metadata["Eval Task"] == label]
         # random_baseline = row["Random baseline"].values[0]
-        row = eval_meta_data.get(label)
+        row = eval_metadata.get(label)
         if row is None or "Random baseline" not in row:
             raise KeyError(
-                f"Missing 'Random baseline' for task '{label}' in {eval_meta_data}")
+                f"Missing 'Random baseline' for task '{label}' in {eval_meta_data}"
+            )
         random_baseline = float(row["Random baseline"])
-        centered_result = (accuracy - 0.01 * random_baseline) / \
-            (1.0 - 0.01 * random_baseline)
+        centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
         centered_results[label] = centered_result
         end_time = time.time()
-        print0(
-            f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
+        print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
 
     core_metric = sum(centered_results.values()) / len(centered_results)
     out = {
@@ -109,10 +104,8 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
 
 # -----------------------------------------------------------------------------
 # HuggingFace loading utilities and light wrappers for a model
-
 class ModelWrapper:
     """Lightweight wrapper for a HuggingFace model"""
-
     def __init__(self, model, max_seq_len=None):
         self.model = model
         self.max_seq_len = max_seq_len
@@ -122,7 +115,6 @@ class ModelWrapper:
         logits = outputs.logits
         return logits
 
-
 def load_hf_model(hf_path: str, device):
     print0(f"Loading model from: {hf_path}")
     # Load the model
@@ -137,23 +129,17 @@ def load_hf_model(hf_path: str, device):
     return model, tokenizer
 
 # -----------------------------------------------------------------------------
-
-
 def main():
     import argparse
     parser = argparse.ArgumentParser()
-    parser.add_argument('--hf-path', type=str, default=None,
-                        help='HuggingFace model path to evaluate')
-    parser.add_argument('--max-per-task', type=int, default=-1,
-                        help='Max examples per task to evaluate (-1 = disable)')
+    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
+    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
     args = parser.parse_args()
 
     # distributed / precision setup
     device_type = autodetect_device_type()
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(
-        device_type)
-    autocast_ctx = torch.amp.autocast(
-        device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
+    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
 
     # Load model and tokenizer from command line or from file system
     if args.hf_path is not None:
@@ -161,27 +147,24 @@ def main():
         hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
         model, tokenizer = load_hf_model(hf_path, device)
-        model_name = hf_path  # just for logging
-        model_slug = hf_path.replace("/", "-")  # for the output csv file
+        model_name = hf_path # just for logging
+        model_slug = hf_path.replace("/", "-") # for the output csv file
     else:
         # load a local model from the file system
         model, tokenizer, meta = load_model("base", device, phase="eval")
-        model_name = f"base_model (step {meta['step']})"  # just for logging
-        # for the output csv file
-        model_slug = f"base_model_{meta['step']:06d}"
+        model_name = f"base_model (step {meta['step']})" # just for logging
+        model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
 
     # Evaluate the model
     with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device,
-                             max_per_task=args.max_per_task)
+        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
 
     # Write out the results to a csv file
     core_metric = None
     centered_results = {}
     if ddp_rank == 0:
         base_dir = get_base_dir()
-        output_csv_path = os.path.join(
-            base_dir, "base_eval", f"{model_slug}.csv")
+        output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
         os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
         results = out["results"]
         centered_results = out["centered_results"]
@@ -189,8 +172,7 @@ def main():
         with open(output_csv_path, 'w') as f:
             f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
             for label in results:
-                f.write(
-                    f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
+                f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
             f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
         # Print the content of the csv file to console too
         print0("="*80)
@@ -206,11 +188,10 @@ def main():
             "Model": model_name,
             "CORE metric": core_metric,
         },
-        centered_results,  # the full table
+        centered_results, # the full table
     ])
 
     compute_cleanup()
 
-
 if __name__ == "__main__":
     main()