commit 0a784e25de
Author: Abdulaziz Gabitov
Date: 2025-10-29 21:41:29 +05:00 (committed by GitHub)

3 changed files with 17 additions and 8 deletions


@@ -116,7 +116,8 @@ def print_banner():
     print0(banner)
 
 def is_ddp():
     # TODO is there a proper way
+    if dist.is_initialized():
+        return True
     return int(os.environ.get('RANK', -1)) != -1
 
 def get_dist_info():
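For context, a minimal sketch of the two detection paths the updated is_ddp() now covers. The function body mirrors the hunk above; the demo calls at the bottom are illustrative and assume a plain single-process run.

import os
import torch.distributed as dist

def is_ddp():
    # Path 1: a process group has already been initialized (e.g. via dist.init_process_group),
    # which covers launchers that do not export RANK into the environment.
    if dist.is_initialized():
        return True
    # Path 2: fall back to the torchrun convention of exporting RANK per process.
    return int(os.environ.get('RANK', -1)) != -1

print(is_ddp())           # False in a plain single-process run
os.environ['RANK'] = '0'  # simulate a torchrun-style launch
print(is_ddp())           # True via the environment-variable fallback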


@@ -213,7 +213,7 @@ class Engine:
             seq_len=len(tokens),
             **kv_model_kwargs,
         )
-        ids = torch.tensor([tokens], dtype=torch.long, device=device)
+        ids = torch.tensor([tokens.copy() for _ in range(num_samples)], dtype=torch.long, device=device)
         logits = self.model.forward(ids, kv_cache=kv_cache_prefill)
         logits = logits[:, -1, :]
         next_ids = sample_next_token(logits, rng, temperature, top_k) # (B, 1)
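A rough sketch of what the batched prefill buys: replicating the prompt num_samples times gives the model one logits row per requested sample, so the very first token can be drawn independently per row. The stand-in sampler below is illustrative; the Engine's real sample_next_token is not reproduced here.

import torch

def sample_stub(logits, rng, temperature=1.0):
    # Stand-in for sample_next_token: one independent multinomial draw per row.
    probs = torch.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1, generator=rng)  # (B, 1)

tokens = [5, 17, 42]   # toy prompt token ids
num_samples = 3
device = "cpu"

# New behaviour: one identical prompt row per requested sample.
ids = torch.tensor([tokens.copy() for _ in range(num_samples)], dtype=torch.long, device=device)
print(ids.shape)       # torch.Size([3, 3])

rng = torch.Generator().manual_seed(0)
fake_logits = torch.randn(num_samples, 50, generator=rng)  # pretend vocab of 50
print(sample_stub(fake_logits, rng).shape)                 # torch.Size([3, 1]) -> one token per row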
@@ -246,7 +246,7 @@ class Engine:
             # Get sampled tokens - either from prefill or from forward pass
             if first_iteration:
                 # Use the tokens we already sampled from prefill
-                sampled_tokens = [sampled_tokens[0]] * num_samples # Broadcast first token to all rows
+                # sampled_tokens = [sampled_tokens[0]] * num_samples # Broadcast first token to all rows
+                # TODO: we should sample a token for each row instead of broadcasting
                 first_iteration = False
             else:
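To make the second hunk concrete, here is a toy before/after of the first-iteration handling; the variable names are illustrative rather than the Engine's actual state.

num_samples = 3

# Old flow: the single-row prefill produced one sampled token,
# which was copied to every row, so all samples shared their first token.
sampled_tokens_old = [314]
broadcast = [sampled_tokens_old[0]] * num_samples    # [314, 314, 314]

# New flow: the batched prefill already returned one token per row,
# so the samples can diverge from the very first generated token.
next_ids = [[314], [91], [27]]                       # e.g. next_ids.tolist() from the prefill
sampled_tokens_new = [row[0] for row in next_ids]    # [314, 91, 27]
print(broadcast, sampled_tokens_new)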


@@ -17,7 +17,7 @@ import random
 import yaml
 from contextlib import nullcontext
-import pandas as pd
+import csv
 import torch
 from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
@@ -39,11 +39,20 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
     config_path = os.path.join(eval_bundle_dir, "core.yaml")
     data_base_path = os.path.join(eval_bundle_dir, "eval_data")
-    eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
+    eval_meta_data_path = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
     with open(config_path, 'r') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
-    eval_metadata = pd.read_csv(eval_meta_data)
+    # Load eval metadata
+    eval_metadata = {}
+    with open(eval_meta_data_path, 'r') as f:
+        reader = csv.reader(f)
+        header = next(reader) # Skip header
+        for row in reader:
+            task_name = row[0]
+            random_baseline = float(row[1])
+            eval_metadata[task_name] = {"Random baseline": random_baseline}
 
     # Evaluate each task
     results = {}
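A self-contained sketch of the new csv-based metadata loading, assuming eval_meta_data.csv keeps a task name in its first column and a random baseline (in percent) in its second; the sample rows below are made up for illustration.

import csv
import io

# Made-up rows standing in for eval_bundle/eval_meta_data.csv.
sample_csv = io.StringIO(
    "Eval Task,Random baseline\n"
    "hellaswag,25.0\n"
    "arc_easy,25.0\n"
)

eval_metadata = {}
reader = csv.reader(sample_csv)
header = next(reader)  # skip the header row
for row in reader:
    task_name = row[0]
    random_baseline = float(row[1])
    eval_metadata[task_name] = {"Random baseline": random_baseline}

print(eval_metadata["hellaswag"]["Random baseline"])  # 25.0, the same lookup the script now does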
@@ -75,8 +84,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
         accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
         results[label] = accuracy
-        row = eval_metadata[eval_metadata["Eval Task"] == label]
-        random_baseline = row["Random baseline"].values[0]
+        random_baseline = eval_metadata[label]["Random baseline"]
         centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
         centered_results[label] = centered_result
     end_time = time.time()
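For reference, a quick worked example of the centering formula used above (baselines are stored in percent): an accuracy of 0.40 on a task with a 25% random baseline centers to 0.20, i.e. 0 at chance level and 1 at perfect accuracy. The numbers are made up.

accuracy = 0.40          # raw accuracy (made-up)
random_baseline = 25.0   # percent, as stored in eval_meta_data.csv

centered = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
print(round(centered, 3))  # 0.2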