From bc81d6a460b170d097c2fa28cc93e224093273a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?=
Date: Mon, 29 Dec 2025 13:41:04 +0300
Subject: [PATCH 1/5] test: add engine generation tests for expected invariants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- test_seed_reproducibility
- test_temperature_zero_determinism
- test_max_tokens_respected
- test_num_samples_count

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 tests/test_engine.py | 49 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index 683f89b..01a30ee 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -185,3 +185,52 @@ def test_multi_sample_first_token_diversity():
         f"With uniform logits, this is statistically impossible (~10^-36 probability) "
         f"unless tokens are being broadcast instead of independently sampled."
     )
+
+
+def test_seed_reproducibility():
+    """Same seed must produce identical output."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]  # <bos> + "Hello"
+
+    for seed in [1, 42, 123, 999]:
+        r1, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed)
+        r2, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed)
+        r3, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed)
+        assert r1 == r2 == r3, "Same seed must produce identical output for the same prompt."
+
+
+def test_temperature_zero_determinism():
+    """Temperature=0 is deterministic regardless of seed."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]
+
+    for seed in [1, 42, 123, 999]:
+        r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
+        r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
+        r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
+        assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed."
+
+
+def test_max_tokens_respected():
+    """Generation stops at max_tokens limit."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]
+
+    for max_tokens in [1, 4, 16, 64]:
+        results, _ = engine.generate_batch(prompt, max_tokens=max_tokens)
+        num_generated_tokens = len(results[0]) - len(prompt)
+        assert num_generated_tokens <= max_tokens, f"Generated {num_generated_tokens} tokens, expected max_tokens={max_tokens} or less."
+
+
+def test_num_samples_count():
+    """num_samples=N produces exactly N sequences."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]
+
+    for num_samples in [1, 4, 16, 64]:
+        results, _ = engine.generate_batch(prompt, num_samples=num_samples, max_tokens=3)
+        assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}"

From 31aeda19d10e4686cf6fcdcc079676909f2a9910 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 31 Dec 2025 11:49:46 +0100
Subject: [PATCH 2/5] Fix temperature test

---
 tests/test_engine.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index 01a30ee..75ad7b8 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -206,11 +206,10 @@ def test_temperature_zero_determinism():
     engine = Engine(model, ByteTokenizer())
     prompt = [261, 72, 101, 108, 108, 111]
 
-    for seed in [1, 42, 123, 999]:
-        r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
-        r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
-        r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
-        assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed."
+    r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=1)
+    r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=42)
+    r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=123)
+    assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed."
 
 
 def test_max_tokens_respected():

From 57ffd35e0a98c4f597f7216bbf37afe38840ac80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?=
Date: Wed, 31 Dec 2025 15:43:42 +0300
Subject: [PATCH 3/5] Add test for seed variation in sampling

Add test for seed variation in sampling with temperature > 0.
---
 tests/test_engine.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index 75ad7b8..e348df1 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -233,3 +233,28 @@ def test_num_samples_count():
     for num_samples in [1, 4, 16, 64]:
         results, _ = engine.generate_batch(prompt, num_samples=num_samples, max_tokens=3)
         assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}"
+
+
+def test_seed_variation_in_sampling():
+    """With temperature > 0, different seeds should introduce sampling variation."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]  # <bos> + "Hello"
+
+    outputs = set()
+
+    for seed in [1, 42, 123, 999, 1000, 1001, 1002, 1003, 1004, 1005]:
+        results, _ = engine.generate_batch(
+            prompt,
+            temperature=1.0,
+            max_tokens=5,
+            seed=seed,
+        )
+        outputs.add(tuple(results[0]))
+
+    # Sanity check: sampling actually introduces variation
+    assert len(outputs) > 1, (
+        f"All seeds produced the same output: {outputs}"
+        f"with temperature > 0 and different seeds, this is statistically impossible."
+        f"implies an issue within engine."
+    )

From 07d4bf7161c7c4b061a4affd592cc02cd52a6801 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?=
Date: Wed, 31 Dec 2025 15:49:53 +0300
Subject: [PATCH 4/5] Rename test for clarity

---
 tests/test_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index e348df1..de745c4 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -235,7 +235,7 @@ def test_num_samples_count():
         assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}"
 
 
-def test_seed_variation_in_sampling():
+def test_different_seeds_introduce_variation_when_temperature_nonzero():
     """With temperature > 0, different seeds should introduce sampling variation."""
     model = MockModel()
     engine = Engine(model, ByteTokenizer())
@@ -256,5 +256,5 @@ def test_seed_variation_in_sampling():
     assert len(outputs) > 1, (
         f"All seeds produced the same output: {outputs}"
         f"with temperature > 0 and different seeds, this is statistically impossible."
-        f"implies an issue within engine."
+        f"this implies an issue within the engine."
     )

From 7f6219e092ab7365f66a94e69d89c5a3845043a8 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 31 Dec 2025 14:02:19 +0100
Subject: [PATCH 5/5] Shorten assert msg

---
 tests/test_engine.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index de745c4..6555d55 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -253,8 +253,4 @@ def test_different_seeds_introduce_variation_when_temperature_nonzero():
     outputs.add(tuple(results[0]))
 
     # Sanity check: sampling actually introduces variation
-    assert len(outputs) > 1, (
-        f"All seeds produced the same output: {outputs}"
-        f"with temperature > 0 and different seeds, this is statistically impossible."
-        f"this implies an issue within the engine."
-    )
+    assert len(outputs) > 1, "All seeds produced the same output, which is statistically highly improbable."