From bc81d6a460b170d097c2fa28cc93e224093273a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?=
Date: Mon, 29 Dec 2025 13:41:04 +0300
Subject: [PATCH 1/5] test: add engine generation tests for expected invariants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- test_seed_reproducibility
- test_temperature_zero_determinism
- test_max_tokens_respected
- test_num_samples_count

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 tests/test_engine.py | 49 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index 683f89b..01a30ee 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -185,3 +185,52 @@ def test_multi_sample_first_token_diversity():
         f"With uniform logits, this is statistically impossible (~10^-36 probability) "
         f"unless tokens are being broadcast instead of independently sampled."
     )
+
+
+def test_seed_reproducibility():
+    """Same seed must produce identical output."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]  # <bos> + "Hello"
+
+    for seed in [1, 42, 123, 999]:
+        r1, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed)
+        r2, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed)
+        r3, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed)
+        assert r1 == r2 == r3, "Same seed must produce identical output for the same prompt."
+
+
+def test_temperature_zero_determinism():
+    """Temperature=0 is deterministic regardless of seed."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]
+
+    for seed in [1, 42, 123, 999]:
+        r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
+        r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
+        r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
+        assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed."
+
+
+def test_max_tokens_respected():
+    """Generation stops at max_tokens limit."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]
+
+    for max_tokens in [1, 4, 16, 64]:
+        results, _ = engine.generate_batch(prompt, max_tokens=max_tokens)
+        num_generated_tokens = len(results[0]) - len(prompt)
+        assert num_generated_tokens <= max_tokens, f"Generated {num_generated_tokens} tokens, expected max_tokens={max_tokens} or less."
+
+
+def test_num_samples_count():
+    """num_samples=N produces exactly N sequences."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]
+
+    for num_samples in [1, 4, 16, 64]:
+        results, _ = engine.generate_batch(prompt, num_samples=num_samples, max_tokens=3)
+        assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}"

From 31aeda19d10e4686cf6fcdcc079676909f2a9910 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 31 Dec 2025 11:49:46 +0100
Subject: [PATCH 2/5] Fix temperature test

---
 tests/test_engine.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index 01a30ee..75ad7b8 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -206,11 +206,10 @@ def test_temperature_zero_determinism():
     engine = Engine(model, ByteTokenizer())
     prompt = [261, 72, 101, 108, 108, 111]
 
-    for seed in [1, 42, 123, 999]:
-        r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
-        r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
-        r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=seed)
-        assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed."
+    r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=1)
+    r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=42)
+    r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=123)
+    assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed."
 
 
 def test_max_tokens_respected():

From 57ffd35e0a98c4f597f7216bbf37afe38840ac80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?=
Date: Wed, 31 Dec 2025 15:43:42 +0300
Subject: [PATCH 3/5] Add test for seed variation in sampling

Add test for seed variation in sampling with temperature > 0.
---
 tests/test_engine.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index 75ad7b8..e348df1 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -233,3 +233,28 @@ def test_num_samples_count():
     for num_samples in [1, 4, 16, 64]:
         results, _ = engine.generate_batch(prompt, num_samples=num_samples, max_tokens=3)
         assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}"
+
+
+def test_seed_variation_in_sampling():
+    """With temperature > 0, different seeds should introduce sampling variation."""
+    model = MockModel()
+    engine = Engine(model, ByteTokenizer())
+    prompt = [261, 72, 101, 108, 108, 111]  # <bos> + "Hello"
+
+    outputs = set()
+
+    for seed in [1, 42, 123, 999, 1000, 1001, 1002, 1003, 1004, 1005]:
+        results, _ = engine.generate_batch(
+            prompt,
+            temperature=1.0,
+            max_tokens=5,
+            seed=seed,
+        )
+        outputs.add(tuple(results[0]))
+
+    # Sanity check: sampling actually introduces variation
+    assert len(outputs) > 1, (
+        f"All seeds produced the same output: {outputs}"
+        f"with temperature > 0 and different seeds, this is statistically impossible."
+        f"implies an issue within engine."
+    )

From 07d4bf7161c7c4b061a4affd592cc02cd52a6801 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?=
Date: Wed, 31 Dec 2025 15:49:53 +0300
Subject: [PATCH 4/5] Rename test for clarity

---
 tests/test_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index e348df1..de745c4 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -235,7 +235,7 @@ def test_num_samples_count():
         assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}"
 
 
-def test_seed_variation_in_sampling():
+def test_different_seeds_introduce_variation_when_temperature_nonzero():
     """With temperature > 0, different seeds should introduce sampling variation."""
     model = MockModel()
     engine = Engine(model, ByteTokenizer())
@@ -256,5 +256,5 @@ def test_seed_variation_in_sampling():
     assert len(outputs) > 1, (
         f"All seeds produced the same output: {outputs}"
         f"with temperature > 0 and different seeds, this is statistically impossible."
-        f"implies an issue within engine."
+        f"this implies an issue within the engine."
     )

From 7f6219e092ab7365f66a94e69d89c5a3845043a8 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 31 Dec 2025 14:02:19 +0100
Subject: [PATCH 5/5] Shorten assert msg

---
 tests/test_engine.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/test_engine.py b/tests/test_engine.py
index de745c4..6555d55 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -253,8 +253,4 @@ def test_different_seeds_introduce_variation_when_temperature_nonzero():
     outputs.add(tuple(results[0]))
 
     # Sanity check: sampling actually introduces variation
-    assert len(outputs) > 1, (
-        f"All seeds produced the same output: {outputs}"
-        f"with temperature > 0 and different seeds, this is statistically impossible."
-        f"this implies an issue within the engine."
-    )
+    assert len(outputs) > 1, "All seeds produced the same output, which is statistically highly improbable."