From 851810c7d57b0dd94b97465f9c1ccc1d6cc3cf50 Mon Sep 17 00:00:00 2001
From: MadMax129
Date: Fri, 24 Oct 2025 17:06:06 -0400
Subject: [PATCH] remove string allocations

Replace the per-token malloc/memdup in the C tokenizer with (start, end)
spans into the caller's input buffer. The Python bindings now pass a raw
pointer into the original bytes object and recover each token by slicing
that same buffer, so no C-side string allocations (or frees) remain on
the hot path. The benchmark harness is updated accordingly (raw pointer,
ns timer, warm-up, GC disabled during the timed section).
---
 fregex/bench.py   | 91 +++++++++++++++++++++++++++++++----------------
 fregex/cload.py   | 51 ++++++++++++++++++++------
 fregex/compare.py | 15 --------
 fregex/fregex.c   | 43 +++++++---------------
 fregex/fregex.h   | 12 +++----
 fregex/fuzz.py    | 15 --------
 6 files changed, 118 insertions(+), 109 deletions(-)

diff --git a/fregex/bench.py b/fregex/bench.py
index 64dcda0..7231e18 100755
--- a/fregex/bench.py
+++ b/fregex/bench.py
@@ -3,37 +3,45 @@
 import ctypes
 import random
 import time
 import statistics
+import os
+import gc
 from pathlib import Path
 from nanochat.tokenizer import SPLIT_PATTERN
+
+os.environ.update({
+    'OMP_NUM_THREADS': '1',
+    'OPENBLAS_NUM_THREADS': '1',
+    'MKL_NUM_THREADS': '1',
+    'VECLIB_MAXIMUM_THREADS': '1',
+    'NUMEXPR_NUM_THREADS': '1',
+    'RAYON_NUM_THREADS': '1',
+})
+
+os.setpriority(os.PRIO_PROCESS, 0, -10)
+
 from rustbpe import split_text as rust_split_text
 from fregex.fuzz import gen_valid_unicode_string, compare_pair_text
 from fregex.cload import *
 
-def bench_c_regex(data: bytes, iterations: int) -> list:
-    times = []
-    for _ in range(iterations):
-        token_list = TokenList()
-        c_lib.tokenlist_init(ctypes.byref(token_list))
-
-        start = time.perf_counter()
-        c_lib.tokenize_fast(data, len(data), ctypes.byref(token_list))
-        elapsed = time.perf_counter() - start
-
-        c_lib.tokenlist_free(ctypes.byref(token_list))
-        times.append(elapsed * 1000)
-
-    return times
+PyBytes_AsString = ctypes.pythonapi.PyBytes_AsString
+PyBytes_AsString.restype = ctypes.c_void_p
+PyBytes_AsString.argtypes = [ctypes.py_object]
 
-def bench_rust_regex(text: str, iterations: int) -> list:
-    times = []
-    for _ in range(iterations):
-        start = time.perf_counter()
-        rust_split_text(SPLIT_PATTERN, text)
-        elapsed = time.perf_counter() - start
-        times.append(elapsed * 1000)
-
-    return times
+def _run_once_c(data: bytes) -> float:
+    token_list = TokenList()
+    c_lib.tokenlist_init(ctypes.byref(token_list))
+    base_ptr = PyBytes_AsString(data)
+    t0 = time.perf_counter_ns()
+    c_lib.tokenize_fast(base_ptr, len(data), ctypes.byref(token_list))
+    dt_ms = (time.perf_counter_ns() - t0) / 1e6
+    c_lib.tokenlist_free(ctypes.byref(token_list))
+    return dt_ms
+
+def _run_once_rust(text: str) -> float:
+    t0 = time.perf_counter_ns()
+    rust_split_text(SPLIT_PATTERN, text)
+    return (time.perf_counter_ns() - t0) / 1e6
 
 def stats_summary(times: list) -> dict:
     """Compute statistics from timing list."""
@@ -65,11 +73,33 @@ def benchmark_dataset(name: str, data_bytes: bytes, iterations: int) -> None:
     print(f"\n--- Dataset: {name} ({len(data_bytes)} bytes, {iterations} iterations) ---")
     print()
-
-    c_times = bench_c_regex(data_bytes, iterations)
+
+    # Pre-touch data to avoid first-touch/page-fault skew
+    if data_bytes:
+        _ = data_bytes[0]
+        for i in range(0, len(data_bytes), 4096):
+            _ = data_bytes[i]
+
+    # Warm-up
+    for _ in range(20):
+        _run_once_c(data_bytes)
+        _run_once_rust(test_text)
+
+    # Disable GC during timed section
+    gc_was_enabled = gc.isenabled()
+    if gc_was_enabled:
+        gc.disable()
+
+    c_times = []
+    rust_times = []
+    for _ in range(iterations):
+        c_times.append(_run_once_c(data_bytes))
+        rust_times.append(_run_once_rust(test_text))
+
+    if gc_was_enabled:
+        gc.enable()
+
     print(format_stats("C tokenizer", len(data_bytes), c_times), end='')
-
-    rust_times = bench_rust_regex(test_text, iterations)
     print(format_stats("Rust split", len(data_bytes), rust_times), end='')
 
     if c_times and rust_times:
@@ -115,7 +145,7 @@ def main():
         try:
             data = path.read_bytes()
-            benchmark_dataset(path.name, data, 10_000)
+            benchmark_dataset(path.name, data, 1_000)
         except Exception as e:
             print(f"❌ Error reading {file_path}: {e}")
     else:
@@ -124,8 +154,8 @@ def main():
             ("tiny", 100, 1000),
             ("small", 1024, 500),
             ("medium", 10 * 1024, 100),
-            ("large", 100 * 1024, 30),
-            ("xlarge", 1024 * 1024, 10),
+            ("large", 100 * 1024, 100),
+            ("xlarge", 1024 * 1024, 100),
         ]
 
         for name, size_bytes, iterations in configs:
@@ -140,6 +170,5 @@ def main():
     print("=" * 140)
 
-
 if __name__ == "__main__":
     main()
diff --git a/fregex/cload.py b/fregex/cload.py
index 2a17c94..3def82d 100644
--- a/fregex/cload.py
+++ b/fregex/cload.py
@@ -2,19 +2,50 @@ import ctypes
 
 c_lib = ctypes.CDLL("fregex/libfregex.dylib")
 
-class TokenList(ctypes.Structure):
-    pass
-TokenList._fields_ = [
-    ("tokens", ctypes.POINTER(ctypes.POINTER(ctypes.c_char))),
-    ("lengths", ctypes.POINTER(ctypes.c_size_t)),
-    ("count", ctypes.c_size_t),
-    ("capacity", ctypes.c_size_t),
-]
+class TokenPos(ctypes.Structure):
+    _fields_ = [
+        ("start", ctypes.c_size_t),
+        ("end", ctypes.c_size_t),
+    ]
+
+
+class TokenList(ctypes.Structure):
+    _fields_ = [
+        ("splits", ctypes.POINTER(TokenPos)),
+        ("count", ctypes.c_size_t),
+        ("capacity", ctypes.c_size_t),
+    ]
+
 
 c_lib.tokenlist_init.argtypes = [ctypes.POINTER(TokenList)]
 c_lib.tokenlist_init.restype = None
 
 c_lib.tokenlist_free.argtypes = [ctypes.POINTER(TokenList)]
 c_lib.tokenlist_free.restype = None
 
-c_lib.tokenize_fast.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.POINTER(TokenList)]
-c_lib.tokenize_fast.restype = None
\ No newline at end of file
+# Accept a raw pointer to the input buffer rather than a Python bytes object
+c_lib.tokenize_fast.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.POINTER(TokenList)]
+c_lib.tokenize_fast.restype = None
+
+def tokenize_c_bytes(data: bytes) -> list[bytes]:
+    # Use a C char* view of the original bytes; offsets computed from this base
+    c_data = ctypes.c_char_p(data)
+    tl = TokenList()
+    c_lib.tokenlist_init(ctypes.byref(tl))
+    try:
+        base_addr = ctypes.cast(c_data, ctypes.c_void_p).value
+        # Pass the same pointer to C
+        c_lib.tokenize_fast(ctypes.cast(c_data, ctypes.c_void_p), len(data), ctypes.byref(tl))
+        out: list[bytes] = []
+        count = int(tl.count)
+        for i in range(count):
+            start_addr = int(tl.splits[i].start)
+            end_addr = int(tl.splits[i].end)
+            # Compute offsets into our local buffer
+            off_start = start_addr - base_addr
+            off_end = end_addr - base_addr
+            if off_start < 0 or off_end < off_start or off_end > len(data):
+                raise RuntimeError(f"Invalid span [{start_addr}:{end_addr}] for buffer base {base_addr}")
+            out.append(data[off_start:off_end])
+        return out
+    finally:
+        c_lib.tokenlist_free(ctypes.byref(tl))
\ No newline at end of file
diff --git a/fregex/compare.py b/fregex/compare.py
index ffdcb50..380c2d1 100644
--- a/fregex/compare.py
+++ b/fregex/compare.py
@@ -33,21 +33,6 @@ def escape_bytes(b: bytes) -> str:
 def dump_tokens(tokens: list[bytes]) -> str:
     return "\n".join(f"{len(b)}\t{escape_bytes(b)}" for b in tokens)
 
-def tokenize_c_bytes(data: bytes) -> list[bytes]:
-    tl = TokenList()
-    c_lib.tokenlist_init(ctypes.byref(tl))
-    try:
-        c_lib.tokenize_fast(data, len(data), ctypes.byref(tl))
-        out: list[bytes] = []
-        count = int(tl.count)
-        for i in range(count):
-            ptr = tl.tokens[i]
-            ln = int(tl.lengths[i])
-            out.append(ctypes.string_at(ptr, ln))
-        return out
-    finally:
-        c_lib.tokenlist_free(ctypes.byref(tl))
-
 def tokenize_py_bytes(data: bytes) -> list[bytes]:
     text = data.decode('utf-8', errors='surrogatepass')
     toks = py_tokenize_str(text)
diff --git a/fregex/fregex.c b/fregex/fregex.c
index c91aa35..466d41c 100644
--- a/fregex/fregex.c
+++ b/fregex/fregex.c
@@ -1,4 +1,4 @@
-#include "fregex.h"
+#include "fregex.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -136,34 +136,29 @@ static char *xmemdup(const char *src, size_t len) {
 }
 
 void tokenlist_init(TokenList *list) {
-    list->tokens = NULL;
-    list->lengths = NULL;
+    list->splits = NULL;
     list->count = 0;
     list->capacity = 0;
 }
 
 void tokenlist_free(TokenList *list) {
-    if (!list)
+    if (!list)
         return;
-    for (size_t i = 0; i < list->count; ++i)
-        free(list->tokens[i]);
-    free(list->tokens);
-    free(list->lengths);
-    list->tokens = NULL;
-    list->lengths = NULL;
-    list->count = 0;
+    free(list->splits);
+    list->splits = NULL;
+    list->count = 0;
     list->capacity = 0;
 }
 
 static void tokenlist_push(TokenList *list, const char *start, size_t len) {
     if (list->count == list->capacity) {
-        const size_t new_cap = list->capacity ? (list->capacity * 2) : 64;
-        list->tokens = (char**)xrealloc(list->tokens, new_cap * sizeof(char*));
-        list->lengths = (size_t*)xrealloc(list->lengths, new_cap * sizeof(size_t));
+        const size_t new_cap = list->capacity ? (list->capacity * 2) : 128;
+        list->splits = (TokenPos*)xrealloc(list->splits, new_cap * sizeof(TokenPos));
         list->capacity = new_cap;
     }
+    /* Record the token's [start, end) span; end is exclusive (start + len),
+     * matching the Python side's data[off_start:off_end] slicing. */
+    list->splits[list->count].start = (size_t)start;
+    list->splits[list->count].end = (size_t)(start + len);
-    list->tokens[list->count] = xmemdup(start, len);
-    list->lengths[list->count] = len;
     list->count++;
 }
@@ -185,20 +180,6 @@ static void fput_escaped_char(unsigned char c, FILE *out) {
     }
 }
 
-void print_token_escaped(const char *s, size_t len, FILE *out) {
-    fprintf(out, "%zu\t", len);
-    for (size_t i = 0; i < len; ++i) fput_escaped_char((unsigned char)s[i], out);
-    fputc('\n', out);
-}
-
-void print_tokens_escaped(const TokenList *list, FILE *out) {
-    for (size_t i = 0; i < list->count; ++i) {
-        const char *tok = list->tokens[i];
-        size_t len = list->lengths[i];
-        print_token_escaped(tok, len, out);
-    }
-}
-
 /* A) '(?i:[sdmt]|ll|ve|re) */
 static size_t match_contraction(const char *p, const char *end) {
     if (p >= end || *p != '\'' || (p + 1) >= end)
@@ -470,7 +451,7 @@ static size_t match_ws_run(const char *p, const char *end) {
 
 void tokenize_fast(const char *input, size_t input_len, TokenList *out) {
     if (!input) {
-        out->tokens = NULL;
+        out->splits = NULL;
         out->count = 0;
         out->capacity = 0;
         return;
diff --git a/fregex/fregex.h b/fregex/fregex.h
index f41e3f1..e332115 100644
--- a/fregex/fregex.h
+++ b/fregex/fregex.h
@@ -5,19 +5,17 @@
 #include <stdio.h>
 
 typedef struct {
-    char **tokens;
-    size_t *lengths;
+    size_t start, end;
+} TokenPos;
+
+typedef struct {
+    TokenPos *splits;
     size_t count;
     size_t capacity;
 } TokenList;
 
 void tokenlist_init(TokenList *list);
 void tokenlist_free(TokenList *list);
-
 void tokenize_fast(const char *input, size_t input_len, TokenList *out);
-void print_token_escaped(const char *s, size_t len, FILE *out);
-void print_tokens_escaped(const TokenList *list, FILE *out);
 
 #endif // FAST_REGEX_H
-
-
diff --git a/fregex/fuzz.py b/fregex/fuzz.py
index b0cd6d5..2c29fbf 100644
--- a/fregex/fuzz.py
+++ b/fregex/fuzz.py
@@ -242,21 +242,6 @@ def _format_tokens_dump(tokens: list[bytes]) -> str:
         lines.append(f"{len(b)}\t{escape_bytes(b)}")
     return "\n".join(lines)
 
-def tokenize_c_bytes(data: bytes) -> list[bytes]:
-    tl = TokenList()
-    c_lib.tokenlist_init(ctypes.byref(tl))
-    try:
-        c_lib.tokenize_fast(data, len(data), ctypes.byref(tl))
-        out: list[bytes] = []
-        count = int(tl.count)
-        for i in range(count):
-            ptr = tl.tokens[i]
-            ln = int(tl.lengths[i])
-            out.append(ctypes.string_at(ptr, ln))
-        return out
-    finally:
-        c_lib.tokenlist_free(ctypes.byref(tl))
-
 def tokenize_py_bytes(data: bytes) -> list[bytes]:
     if py_tokenize_str is None:
         raise RuntimeError("py_tokenizer not available")