From 8b2ccb65b3d8f22ec39ee65490de343fb8679899 Mon Sep 17 00:00:00 2001 From: Sermet Pekin <96650846+SermetPekin@users.noreply.github.com> Date: Tue, 21 Oct 2025 09:08:54 +0300 Subject: [PATCH] Specify UTF-8 encoding for enwik8 file reads (especially for Windows) Added UTF-8 encoding to file reading in enwik8 fixtures. --- tests/test_rustbpe.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py index 5f95721..524cab2 100644 --- a/tests/test_rustbpe.py +++ b/tests/test_rustbpe.py @@ -451,19 +451,18 @@ def enwik8_path(): print(f"Using existing enwik8 at {enwik8_local_path}") return enwik8_local_path - @pytest.fixture(scope="module") def enwik8_small(enwik8_path): """Fixture providing 100KB of enwik8 for quick tests.""" - with open(enwik8_path, "r") as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(100_000) @pytest.fixture(scope="module") def enwik8_large(enwik8_path): """Fixture providing 10MB of enwik8 for performance tests.""" - with open(enwik8_path, "r") as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(10**7) - + def time_function(func, *args, **kwargs): """Time a function call and return the result and elapsed time""" start_time = time.time()