mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 12:22:18 +00:00
removed buffer approach
This commit is contained in:
parent
e02938c0aa
commit
41c8b8dbde
|
|
@ -115,7 +115,7 @@ def main():
|
|||
|
||||
try:
|
||||
data = path.read_bytes()
|
||||
benchmark_dataset(path.name, data, 10)
|
||||
benchmark_dataset(path.name, data, 10_000)
|
||||
except Exception as e:
|
||||
print(f"❌ Error reading {file_path}: {e}")
|
||||
else:
|
||||
|
|
|
|||
133
fregex/fregex.c
133
fregex/fregex.c
|
|
@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp(
|
|||
return (size_t)ret;
|
||||
}
|
||||
|
||||
/* Returns true when byte b has its top two bits set to 10 (0x80..0xBF),
 * i.e. it has the form of a UTF-8 continuation byte rather than a lead byte. */
static inline bool is_utf8_cont_byte(unsigned char b) {
|
||||
return (b & 0xC0) == 0x80;
|
||||
}
|
||||
|
||||
/* Returns true when code point cp equals UNICODE_LF or UNICODE_CR
 * (constants defined elsewhere in this file); false for any other value. */
static inline bool is_cr_or_lf(unsigned int cp) {
|
||||
return cp == UNICODE_LF || cp == UNICODE_CR;
|
||||
}
|
||||
|
|
@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) {
|
|||
}
|
||||
|
||||
/* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
|
||||
// Optional single ASCII space, then 1+ of (not whitespace, not letter, not number),
|
||||
static size_t match_punct_run(const char *p, const char *end) {
|
||||
const char *q = p;
|
||||
|
||||
|
|
@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) {
|
|||
/* E) \s*[\r\n] */
|
||||
static size_t match_ws_then_linebreak(const char *p, const char *end) {
|
||||
const char *q = p;
|
||||
const char *best = NULL;
|
||||
|
||||
// Collect all positions while consuming whitespace
|
||||
// TODO: ? Could we hit the limit
|
||||
const char *positions[256];
|
||||
int pos_count = 0;
|
||||
|
||||
// Store initial position (zero whitespace consumed)
|
||||
positions[pos_count++] = q;
|
||||
|
||||
while (q < end && pos_count < 255) {
|
||||
unsigned int cp;
|
||||
size_t n = utf8_decode_cp(q, end, &cp);
|
||||
if (n == 0 || !is_space(cp))
|
||||
break;
|
||||
q += n;
|
||||
positions[pos_count++] = q;
|
||||
}
|
||||
|
||||
// Try positions from longest to shortest (backtracking)
|
||||
// We need to find a position where the next character is a linebreak
|
||||
for (int i = pos_count - 1; i >= 0; i--) {
|
||||
q = positions[i];
|
||||
|
||||
// Check if next character is a linebreak
|
||||
if (q < end) {
|
||||
unsigned int br;
|
||||
size_t nb = utf8_decode_cp(q, end, &br);
|
||||
if (nb > 0 && is_cr_or_lf(br)) {
|
||||
// Found a linebreak, include it and return
|
||||
return (size_t)(q + nb - p);
|
||||
}
|
||||
} else {
|
||||
// EOF reached, rule requires a linebreak so fail
|
||||
continue;
|
||||
// Check boundary before consuming any whitespace, too (zero-length \s*)
|
||||
if (q < end) {
|
||||
unsigned int nx;
|
||||
size_t nn = utf8_decode_cp(q, end, &nx);
|
||||
if (nn > 0 && is_cr_or_lf(nx)) {
|
||||
best = q; // \s* = 0, [\r\n] = this char
|
||||
}
|
||||
}
|
||||
|
||||
// No position found where next char is a linebreak
|
||||
return 0;
|
||||
// Scan whitespace; at each boundary, test the next cp
|
||||
while (q < end) {
|
||||
unsigned int cp;
|
||||
size_t n = utf8_decode_cp(q, end, &cp);
|
||||
if (n == 0 || !is_space(cp))
|
||||
break;
|
||||
q += n; // we consumed one whitespace cp; boundary is at q now
|
||||
|
||||
if (q < end) {
|
||||
unsigned int nx;
|
||||
size_t nn = utf8_decode_cp(q, end, &nx);
|
||||
if (nn > 0 && is_cr_or_lf(nx)) {
|
||||
best = q; // prefer the rightmost usable boundary
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!best) return 0;
|
||||
|
||||
// At 'best' the next cp is the CR/LF to include
|
||||
unsigned int br;
|
||||
size_t nb = utf8_decode_cp(best, end, &br);
|
||||
return (size_t)((best + nb) - p);
|
||||
}
|
||||
|
||||
/* F) \s+(?!\S) */
|
||||
static size_t match_trailing_ws(const char *p, const char *end) {
|
||||
if (p >= end) return 0;
|
||||
if (p >= end)
|
||||
return 0;
|
||||
|
||||
/* Must start with at least one whitespace */
|
||||
const char *q = p;
|
||||
unsigned int cp;
|
||||
size_t n = utf8_decode_cp(q, end, &cp);
|
||||
// First cp must be whitespace
|
||||
unsigned int cp;
|
||||
size_t n = utf8_decode_cp(p, end, &cp);
|
||||
if (n == 0 || !is_space(cp))
|
||||
return 0;
|
||||
|
||||
/* Collect all whitespace positions */
|
||||
// TODO: ? Could we hit the limit
|
||||
const char *positions[256];
|
||||
positions[0] = q + n; // Position after first whitespace
|
||||
int pos_count = 1;
|
||||
|
||||
q += n;
|
||||
|
||||
while (q < end && pos_count < 255) {
|
||||
size_t m = utf8_decode_cp(q, end, &cp);
|
||||
// Consume full whitespace run [p, r)
|
||||
const char *r = p + n;
|
||||
while (r < end) {
|
||||
size_t m = utf8_decode_cp(r, end, &cp);
|
||||
if (m == 0 || !is_space(cp))
|
||||
break;
|
||||
q += m;
|
||||
positions[pos_count++] = q;
|
||||
r += m;
|
||||
}
|
||||
|
||||
/* Try positions from longest to shortest (backtracking) */
|
||||
for (int i = pos_count - 1; i >= 0; i--) {
|
||||
q = positions[i];
|
||||
|
||||
/* Check negative lookahead: (?!\S) at this position */
|
||||
if (q < end) {
|
||||
size_t k = utf8_decode_cp(q, end, &cp);
|
||||
if (k > 0 && !is_space(cp)) {
|
||||
continue; /* Next char is non-space, try shorter match */
|
||||
}
|
||||
}
|
||||
|
||||
/* Lookahead succeeded at this position */
|
||||
return (size_t)(q - p);
|
||||
if (r == end) {
|
||||
// Only whitespace to EOF -> take all of it
|
||||
return (size_t)(r - p);
|
||||
}
|
||||
|
||||
/* All positions failed lookahead */
|
||||
return 0;
|
||||
|
||||
// Backtrack by exactly one whitespace cp
|
||||
// If the run length is only 1 cp, F must fail.
|
||||
// Find the start of the last whitespace cp in [p, r)
|
||||
const char *t = r;
|
||||
// step back to beginning of previous UTF-8 cp
|
||||
do {
|
||||
--t;
|
||||
} while (t > p && is_utf8_cont_byte(*t));
|
||||
|
||||
if (t == p) {
|
||||
// run had length 1 cp -> cannot backtrack to keep \s+ >= 1
|
||||
return 0;
|
||||
}
|
||||
// Now [p, t) is k-1 whitespace cps
|
||||
return (size_t)(t - p);
|
||||
}
|
||||
|
||||
/* G) \s+ */
|
||||
|
|
|
|||
|
|
@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
|||
continue
|
||||
return cp
|
||||
|
||||
MAX_WS_RUN = 255
|
||||
|
||||
def is_ws_char(ch: str) -> bool:
|
||||
cp = ord(ch)
|
||||
return ch.isspace() or (cp in ws_cps)
|
||||
|
|
@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
|||
seqs = ["\n", "\r", "\r\n"]
|
||||
unit = rng.choice(seqs)
|
||||
unit_len = len(unit)
|
||||
max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len))
|
||||
max_reps = max(1, max_run // unit_len)
|
||||
seg = unit * rng.randint(1, max_reps)
|
||||
return seg
|
||||
run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run)))
|
||||
run = rng.randint(1, max(1, max_run))
|
||||
buf = []
|
||||
for _ in range(run):
|
||||
cp = rng.choice(ws_cps)
|
||||
|
|
@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
|||
|
||||
buf: list[str] = []
|
||||
curr_len = 0
|
||||
curr_ws_run = 0
|
||||
# Build by segments until target_len
|
||||
while curr_len < target_len:
|
||||
remain = target_len - curr_len
|
||||
|
|
@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
|||
if not seg:
|
||||
continue
|
||||
# Trim if needed
|
||||
# Append with whitespace-run capping
|
||||
# Append
|
||||
for ch in seg:
|
||||
if curr_len >= target_len:
|
||||
break
|
||||
if is_ws_char(ch):
|
||||
if curr_ws_run >= MAX_WS_RUN:
|
||||
# insert a non-whitespace breaker
|
||||
breaker = '.'
|
||||
buf.append(breaker)
|
||||
curr_len += 1
|
||||
curr_ws_run = 0
|
||||
if curr_len >= target_len:
|
||||
break
|
||||
buf.append(ch)
|
||||
curr_len += 1
|
||||
curr_ws_run += 1
|
||||
else:
|
||||
buf.append(ch)
|
||||
curr_len += 1
|
||||
curr_ws_run = 0
|
||||
|
||||
# Occasionally end with trailing spaces to stress \s+(?!\S)
|
||||
if curr_len < max_len and rng.random() < 0.3:
|
||||
trail = gen_ws_segment(max_len - curr_len)
|
||||
if rng.random() < 0.7:
|
||||
trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
|
||||
# Append trailing with cap as well
|
||||
# Append trailing
|
||||
for ch in trail:
|
||||
if curr_len >= max_len:
|
||||
break
|
||||
if is_ws_char(ch):
|
||||
if curr_ws_run >= MAX_WS_RUN:
|
||||
buf.append('.')
|
||||
curr_len += 1
|
||||
curr_ws_run = 0
|
||||
if curr_len >= max_len:
|
||||
break
|
||||
buf.append(ch)
|
||||
curr_len += 1
|
||||
curr_ws_run += 1
|
||||
else:
|
||||
buf.append(ch)
|
||||
curr_len += 1
|
||||
curr_ws_run = 0
|
||||
|
||||
return ''.join(buf)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user