MadMax129 2025-10-23 17:55:33 -04:00
parent 12f418f0a1
commit e02938c0aa
3 changed files with 4 additions and 22 deletions

View File

@@ -1,15 +1,3 @@
-"""
-Benchmarker for comparing tok.c tokenize_fast() vs rust split_text()
-Measures speed WITHOUT subprocess overhead - direct function calls only.
-Usage:
-    cd pytok
-    source ../.venv/bin/activate
-    python3 bench.py                          # Run synthetic data benchmarks
-    python3 bench.py /path/to/file.txt        # Benchmark a specific file
-    python3 bench.py file1.txt file2.txt ...  # Benchmark multiple files
-"""
 import sys
 import ctypes
 import random

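The deleted docstring was the only place the direct-call setup was described, so here is a minimal sketch of that approach for reference: load the C tokenizer as a shared library with ctypes and time tokenize_fast() in-process, with no subprocess in the loop. The TokenList layout and function signatures follow the header diff further down; the library name ./libtok.so, the input path, and the explicit tokenlist_init() call before tokenizing are assumptions, not confirmed by this commit.

import ctypes
import time

# Mirror of the TokenList struct declared in the C header below.
class TokenList(ctypes.Structure):
    _fields_ = [
        ("tokens", ctypes.POINTER(ctypes.c_char_p)),
        ("lengths", ctypes.POINTER(ctypes.c_size_t)),
        ("count", ctypes.c_size_t),
        ("capacity", ctypes.c_size_t),
    ]

lib = ctypes.CDLL("./libtok.so")  # assumed library name/path
lib.tokenlist_init.argtypes = [ctypes.POINTER(TokenList)]
lib.tokenlist_free.argtypes = [ctypes.POINTER(TokenList)]
lib.tokenize_fast.argtypes = [ctypes.c_char_p, ctypes.c_size_t,
                              ctypes.POINTER(TokenList)]
for f in (lib.tokenlist_init, lib.tokenlist_free, lib.tokenize_fast):
    f.restype = None

data = open("/path/to/file.txt", "rb").read()  # or synthetic data
out = TokenList()
lib.tokenlist_init(ctypes.byref(out))  # assumed: caller initializes the list

start = time.perf_counter()
lib.tokenize_fast(data, len(data), ctypes.byref(out))
elapsed = time.perf_counter() - start

print(f"{out.count} tokens, {len(data) / elapsed / 1e6:.1f} MB/s")
lib.tokenlist_free(ctypes.byref(out))

Timing the call this way measures only the tokenizer itself, which is the point of the "WITHOUT subprocess overhead" note in the removed docstring.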
View File

@@ -119,7 +119,6 @@ def compare_one(path: Path) -> int:
     p_offs = byte_offsets(p_parsed)
     r_offs = byte_offsets(r_parsed)
-    # Load original input bytes so we can show precise substrings and code points
     data_bytes = Path(path).read_bytes()
     def print_unicode_debug(label, offs_list, idx):

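The removed comment explained why data_bytes is loaded: with per-token byte offsets, the original input bytes can be sliced to show the exact substring and its code points wherever the two tokenizations disagree. Below is a hypothetical sketch of that debug print. It assumes offs_list holds (start, end) byte ranges per token and takes data_bytes as a parameter rather than a closure, so it does not match the real byte_offsets() shape or the signature in the diff.

def print_unicode_debug(label, data_bytes, offs_list, idx):
    start, end = offs_list[idx]  # assumed (start, end) byte range
    chunk = data_bytes[start:end]
    text = chunk.decode("utf-8", errors="replace")
    cps = " ".join(f"U+{ord(c):04X}" for c in text)
    print(f"{label}[{idx}]: bytes={chunk!r} chars={text!r} codepoints={cps}")

print_unicode_debug("rust", "héllo".encode(), [(0, 6)], 0)
# rust[0]: bytes=b'h\xc3\xa9llo' chars='héllo' codepoints=U+0068 U+00E9 U+006C U+006C U+006F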
View File

@@ -1,12 +1,12 @@
-#ifndef FAST_TOKENIZER_H
-#define FAST_TOKENIZER_H
+#ifndef FAST_REGEX_H
+#define FAST_REGEX_H
 #include <stddef.h>
 #include <stdio.h>
 typedef struct {
     char **tokens;
-    size_t *lengths; // Store length of each token to handle null bytes
+    size_t *lengths;
     size_t count;
     size_t capacity;
 } TokenList;
@@ -14,15 +14,10 @@ typedef struct {
 void tokenlist_init(TokenList *list);
 void tokenlist_free(TokenList *list);
-// Tokenize input according to the GPT-like regex split semantics
-// r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
 void tokenize_fast(const char *input, size_t input_len, TokenList *out);
-// Utility to print tokens with C-like escaping (one per line):
-// <length>\t<escaped-bytes>\n
 void print_token_escaped(const char *s, size_t len, FILE *out);
 void print_tokens_escaped(const TokenList *list, FILE *out);
-#endif // FAST_TOKENIZER_H
+#endif // FAST_REGEX_H
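For readers who want to see what the regex comment removed in the hunk above describes, here is a small illustration of the split semantics using Python's third-party regex module (the stdlib re lacks \p{...} classes and possessive quantifiers). The pattern is copied verbatim from the removed comment; using the regex module to demo it is this note's assumption, not something the C code does.

import regex  # third-party: pip install regex

PAT = regex.compile(
    r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
)

print(PAT.findall("Hello world, it's 2025!"))
# ['Hello', ' world', ',', ' it', "'s", ' ', '20', '25', '!']

Contractions split off via the '(?i:[sdmt]|ll|ve|re) branch, a word keeps one leading non-letter, non-digit character, and numbers chunk into at most two digits (\p{N}{1,2}), which is why "2025" splits into "20" and "25".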