MadMax129 2025-10-23 17:55:33 -04:00
parent 12f418f0a1
commit e02938c0aa
3 changed files with 4 additions and 22 deletions

View File

@@ -1,15 +1,3 @@
-"""
-Benchmarker for comparing tok.c tokenize_fast() vs rust split_text()
-Measures speed WITHOUT subprocess overhead - direct function calls only.
-Usage:
-    cd pytok
-    source ../.venv/bin/activate
-    python3 bench.py                          # Run synthetic data benchmarks
-    python3 bench.py /path/to/file.txt        # Benchmark a specific file
-    python3 bench.py file1.txt file2.txt ...  # Benchmark multiple files
-"""
 import sys
 import ctypes
 import random

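The deleted docstring was the only place the direct-call setup was described, so here is a minimal sketch of that approach for reference: load the C tokenizer as a shared library with ctypes and time tokenize_fast() in-process, with no subprocess in the loop. The TokenList layout and function signatures follow the header diff further down; the library name ./libtok.so, the input path, and the explicit tokenlist_init() call before tokenizing are assumptions, not confirmed by this commit.

import ctypes
import time

# Mirror of the TokenList struct declared in the C header below.
class TokenList(ctypes.Structure):
    _fields_ = [
        ("tokens", ctypes.POINTER(ctypes.c_char_p)),
        ("lengths", ctypes.POINTER(ctypes.c_size_t)),
        ("count", ctypes.c_size_t),
        ("capacity", ctypes.c_size_t),
    ]

lib = ctypes.CDLL("./libtok.so")  # assumed library name/path
lib.tokenlist_init.argtypes = [ctypes.POINTER(TokenList)]
lib.tokenlist_free.argtypes = [ctypes.POINTER(TokenList)]
lib.tokenize_fast.argtypes = [ctypes.c_char_p, ctypes.c_size_t,
                              ctypes.POINTER(TokenList)]
for f in (lib.tokenlist_init, lib.tokenlist_free, lib.tokenize_fast):
    f.restype = None

data = open("/path/to/file.txt", "rb").read()  # or synthetic data
out = TokenList()
lib.tokenlist_init(ctypes.byref(out))  # assumed: caller initializes the list

start = time.perf_counter()
lib.tokenize_fast(data, len(data), ctypes.byref(out))
elapsed = time.perf_counter() - start

print(f"{out.count} tokens, {len(data) / elapsed / 1e6:.1f} MB/s")
lib.tokenlist_free(ctypes.byref(out))

Timing the call this way measures only the tokenizer itself, which is the point of the "WITHOUT subprocess overhead" note in the removed docstring.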
View File

@@ -119,7 +119,6 @@ def compare_one(path: Path) -> int:
     p_offs = byte_offsets(p_parsed)
     r_offs = byte_offsets(r_parsed)
-    # Load original input bytes so we can show precise substrings and code points
     data_bytes = Path(path).read_bytes()
     def print_unicode_debug(label, offs_list, idx):

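The removed comment explained why data_bytes is loaded: with per-token byte offsets, the original input bytes can be sliced to show the exact substring and its code points wherever the two tokenizations disagree. Below is a hypothetical sketch of that debug print. It assumes offs_list holds (start, end) byte ranges per token and takes data_bytes as a parameter rather than a closure, so it does not match the real byte_offsets() shape or the signature in the diff.

def print_unicode_debug(label, data_bytes, offs_list, idx):
    start, end = offs_list[idx]  # assumed (start, end) byte range
    chunk = data_bytes[start:end]
    text = chunk.decode("utf-8", errors="replace")
    cps = " ".join(f"U+{ord(c):04X}" for c in text)
    print(f"{label}[{idx}]: bytes={chunk!r} chars={text!r} codepoints={cps}")

print_unicode_debug("rust", "héllo".encode(), [(0, 6)], 0)
# rust[0]: bytes=b'h\xc3\xa9llo' chars='héllo' codepoints=U+0068 U+00E9 U+006C U+006C U+006F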
View File

@@ -1,12 +1,12 @@
-#ifndef FAST_TOKENIZER_H
-#define FAST_TOKENIZER_H
+#ifndef FAST_REGEX_H
+#define FAST_REGEX_H
 #include <stddef.h>
 #include <stdio.h>
 typedef struct {
     char **tokens;
-    size_t *lengths; // Store length of each token to handle null bytes
+    size_t *lengths;
     size_t count;
     size_t capacity;
 } TokenList;
@@ -14,15 +14,10 @@ typedef struct {
 void tokenlist_init(TokenList *list);
 void tokenlist_free(TokenList *list);
-// Tokenize input according to the GPT-like regex split semantics
-// r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
 void tokenize_fast(const char *input, size_t input_len, TokenList *out);
-// Utility to print tokens with C-like escaping (one per line):
-// <length>\t<escaped-bytes>\n
 void print_token_escaped(const char *s, size_t len, FILE *out);
 void print_tokens_escaped(const TokenList *list, FILE *out);
-#endif // FAST_TOKENIZER_H
+#endif // FAST_REGEX_H
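For readers who want to see what the regex comment removed in the hunk above describes, here is a small illustration of the split semantics using Python's third-party regex module (the stdlib re lacks \p{...} classes and possessive quantifiers). The pattern is copied verbatim from the removed comment; using the regex module to demo it is this note's assumption, not something the C code does.

import regex  # third-party: pip install regex

PAT = regex.compile(
    r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
)

print(PAT.findall("Hello world, it's 2025!"))
# ['Hello', ' world', ',', ' it', "'s", ' ', '20', '25', '!']

Contractions split off via the '(?i:[sdmt]|ll|ve|re) branch, a word keeps one leading non-letter, non-digit character, and numbers chunk into at most two digits (\p{N}{1,2}), which is why "2025" splits into "20" and "25".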