mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00
cleanup
This commit is contained in:
parent
12f418f0a1
commit
e02938c0aa
|
|
@ -1,15 +1,3 @@
|
|||
"""
|
||||
Benchmarker for comparing tok.c tokenize_fast() vs rust split_text()
|
||||
Measures speed WITHOUT subprocess overhead - direct function calls only.
|
||||
|
||||
Usage:
|
||||
cd pytok
|
||||
source ../.venv/bin/activate
|
||||
python3 bench.py # Run synthetic data benchmarks
|
||||
python3 bench.py /path/to/file.txt # Benchmark a specific file
|
||||
python3 bench.py file1.txt file2.txt ... # Benchmark multiple files
|
||||
"""
|
||||
|
||||
import sys
|
||||
import ctypes
|
||||
import random
|
||||
|
|
|
|||
|
|
@ -119,7 +119,6 @@ def compare_one(path: Path) -> int:
|
|||
p_offs = byte_offsets(p_parsed)
|
||||
r_offs = byte_offsets(r_parsed)
|
||||
|
||||
# Load original input bytes so we can show precise substrings and code points
|
||||
data_bytes = Path(path).read_bytes()
|
||||
|
||||
def print_unicode_debug(label, offs_list, idx):
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
#ifndef FAST_TOKENIZER_H
|
||||
#define FAST_TOKENIZER_H
|
||||
#ifndef FAST_REGEX_H
|
||||
#define FAST_REGEX_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
typedef struct {
|
||||
char **tokens;
|
||||
size_t *lengths; // Store length of each token to handle null bytes
|
||||
size_t *lengths;
|
||||
size_t count;
|
||||
size_t capacity;
|
||||
} TokenList;
|
||||
|
|
@ -14,15 +14,10 @@ typedef struct {
|
|||
void tokenlist_init(TokenList *list);
|
||||
void tokenlist_free(TokenList *list);
|
||||
|
||||
// Tokenize input according to the GPT-like regex split semantics
|
||||
// r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
|
||||
void tokenize_fast(const char *input, size_t input_len, TokenList *out);
|
||||
|
||||
// Utility to print tokens with C-like escaping (one per line):
|
||||
// <length>\t<escaped-bytes>\n
|
||||
void print_token_escaped(const char *s, size_t len, FILE *out);
|
||||
void print_tokens_escaped(const TokenList *list, FILE *out);
|
||||
|
||||
#endif // FAST_TOKENIZER_H
|
||||
#endif // FAST_REGEX_H
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user