Use non-deprecated thread/GIL APIs from PyO3

This commit is contained in:
Qubitium 2025-10-21 02:58:12 +00:00
parent f696e9ce4c
commit 4040d31aab
2 changed files with 10 additions and 3 deletions

View File

@ -307,7 +307,7 @@ impl Tokenizer {
// Helper: refill `buf` with up to `buffer_size` strings from the Python iterator.
// Returns Ok(true) if the iterator is exhausted, Ok(false) otherwise.
let refill = |buf: &mut Vec<String>| -> PyResult<bool> {
pyo3::Python::with_gil(|py| {
pyo3::Python::attach(|py| {
buf.clear();
let it = py_iter.bind(py);
loop {
@ -345,7 +345,7 @@ impl Tokenizer {
total_sequences += buf.len() as u64;
let pattern = self.compiled_pattern.clone();
let local: AHashMap<CompactString, i32> = py.allow_threads(|| {
let local: AHashMap<CompactString, i32> = py.detach(|| {
buf.par_iter()
.map(|s| {
let mut m: AHashMap<CompactString, i32> = AHashMap::new();

View File

@ -1,6 +1,13 @@
import sys
from pathlib import Path
import pytest
from tasks.gsm8k import DATASET_CONFIGS, GSM8K
# Ensure the repository root (which contains the ``nanochat`` package) is on sys.path
if "nanochat" not in sys.modules:
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from nanochat.tasks.gsm8k import DATASET_CONFIGS, GSM8K
# Simple test to check we are getting the correct rows from the gsm8k datasets.
# It does not verify the actual content of the dataset itself.