mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 12:22:18 +00:00
removed buffer approach
This commit is contained in:
parent
e02938c0aa
commit
41c8b8dbde
|
|
@ -115,7 +115,7 @@ def main():
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = path.read_bytes()
|
data = path.read_bytes()
|
||||||
benchmark_dataset(path.name, data, 10)
|
benchmark_dataset(path.name, data, 10_000)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Error reading {file_path}: {e}")
|
print(f"❌ Error reading {file_path}: {e}")
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
133
fregex/fregex.c
133
fregex/fregex.c
|
|
@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp(
|
||||||
return (size_t)ret;
|
return (size_t)ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool is_utf8_cont_byte(unsigned char b) {
|
||||||
|
return (b & 0xC0) == 0x80;
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool is_cr_or_lf(unsigned int cp) {
|
static inline bool is_cr_or_lf(unsigned int cp) {
|
||||||
return cp == UNICODE_LF || cp == UNICODE_CR;
|
return cp == UNICODE_LF || cp == UNICODE_CR;
|
||||||
}
|
}
|
||||||
|
|
@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
|
/* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
|
||||||
// Optional single ASCII space, then 1+ of (not whitespace, not letter, not number),
|
|
||||||
static size_t match_punct_run(const char *p, const char *end) {
|
static size_t match_punct_run(const char *p, const char *end) {
|
||||||
const char *q = p;
|
const char *q = p;
|
||||||
|
|
||||||
|
|
@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) {
|
||||||
/* E) \s*[\r\n] */
|
/* E) \s*[\r\n] */
|
||||||
static size_t match_ws_then_linebreak(const char *p, const char *end) {
|
static size_t match_ws_then_linebreak(const char *p, const char *end) {
|
||||||
const char *q = p;
|
const char *q = p;
|
||||||
|
const char *best = NULL;
|
||||||
|
|
||||||
// Collect all positions while consuming whitespace
|
// Check boundary before consuming any whitespace, too (zero-length \s*)
|
||||||
// TODO: ? Could we hit the limit
|
if (q < end) {
|
||||||
const char *positions[256];
|
unsigned int nx;
|
||||||
int pos_count = 0;
|
size_t nn = utf8_decode_cp(q, end, &nx);
|
||||||
|
if (nn > 0 && is_cr_or_lf(nx)) {
|
||||||
// Store initial position (zero whitespace consumed)
|
best = q; // \s* = 0, [\r\n] = this char
|
||||||
positions[pos_count++] = q;
|
|
||||||
|
|
||||||
while (q < end && pos_count < 255) {
|
|
||||||
unsigned int cp;
|
|
||||||
size_t n = utf8_decode_cp(q, end, &cp);
|
|
||||||
if (n == 0 || !is_space(cp))
|
|
||||||
break;
|
|
||||||
q += n;
|
|
||||||
positions[pos_count++] = q;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try positions from longest to shortest (backtracking)
|
|
||||||
// We need to find a position where the next character is a linebreak
|
|
||||||
for (int i = pos_count - 1; i >= 0; i--) {
|
|
||||||
q = positions[i];
|
|
||||||
|
|
||||||
// Check if next character is a linebreak
|
|
||||||
if (q < end) {
|
|
||||||
unsigned int br;
|
|
||||||
size_t nb = utf8_decode_cp(q, end, &br);
|
|
||||||
if (nb > 0 && is_cr_or_lf(br)) {
|
|
||||||
// Found a linebreak, include it and return
|
|
||||||
return (size_t)(q + nb - p);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// EOF reached, rule requires a linebreak so fail
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// No position found where next char is a linebreak
|
// Scan whitespace; at each boundary, test the next cp
|
||||||
return 0;
|
while (q < end) {
|
||||||
|
unsigned int cp;
|
||||||
|
size_t n = utf8_decode_cp(q, end, &cp);
|
||||||
|
if (n == 0 || !is_space(cp))
|
||||||
|
break;
|
||||||
|
q += n; // we consumed one whitespace cp; boundary is at q now
|
||||||
|
|
||||||
|
if (q < end) {
|
||||||
|
unsigned int nx;
|
||||||
|
size_t nn = utf8_decode_cp(q, end, &nx);
|
||||||
|
if (nn > 0 && is_cr_or_lf(nx)) {
|
||||||
|
best = q; // prefer the rightmost usable boundary
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!best) return 0;
|
||||||
|
|
||||||
|
// At 'best' the next cp is the CR/LF to include
|
||||||
|
unsigned int br;
|
||||||
|
size_t nb = utf8_decode_cp(best, end, &br);
|
||||||
|
return (size_t)((best + nb) - p);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* F) \s+(?!\S) */
|
/* F) \s+(?!\S) */
|
||||||
static size_t match_trailing_ws(const char *p, const char *end) {
|
static size_t match_trailing_ws(const char *p, const char *end) {
|
||||||
if (p >= end) return 0;
|
if (p >= end)
|
||||||
|
return 0;
|
||||||
|
|
||||||
/* Must start with at least one whitespace */
|
// First cp must be whitespace
|
||||||
const char *q = p;
|
unsigned int cp;
|
||||||
unsigned int cp;
|
size_t n = utf8_decode_cp(p, end, &cp);
|
||||||
size_t n = utf8_decode_cp(q, end, &cp);
|
|
||||||
if (n == 0 || !is_space(cp))
|
if (n == 0 || !is_space(cp))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* Collect all whitespace positions */
|
// Consume full whitespace run [p, r)
|
||||||
// TODO: ? Could we hit the limit
|
const char *r = p + n;
|
||||||
const char *positions[256];
|
while (r < end) {
|
||||||
positions[0] = q + n; // Position after first whitespace
|
size_t m = utf8_decode_cp(r, end, &cp);
|
||||||
int pos_count = 1;
|
|
||||||
|
|
||||||
q += n;
|
|
||||||
|
|
||||||
while (q < end && pos_count < 255) {
|
|
||||||
size_t m = utf8_decode_cp(q, end, &cp);
|
|
||||||
if (m == 0 || !is_space(cp))
|
if (m == 0 || !is_space(cp))
|
||||||
break;
|
break;
|
||||||
q += m;
|
r += m;
|
||||||
positions[pos_count++] = q;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Try positions from longest to shortest (backtracking) */
|
if (r == end) {
|
||||||
for (int i = pos_count - 1; i >= 0; i--) {
|
// Only whitespace to EOF -> take all of it
|
||||||
q = positions[i];
|
return (size_t)(r - p);
|
||||||
|
|
||||||
/* Check negative lookahead: (?!\S) at this position */
|
|
||||||
if (q < end) {
|
|
||||||
size_t k = utf8_decode_cp(q, end, &cp);
|
|
||||||
if (k > 0 && !is_space(cp)) {
|
|
||||||
continue; /* Next char is non-space, try shorter match */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Lookahead succeeded at this position */
|
|
||||||
return (size_t)(q - p);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* All positions failed lookahead */
|
// Backtrack by exactly one whitespace cp
|
||||||
return 0;
|
// If the run length is only 1 cp, F must fail.
|
||||||
|
// Find the start of the last whitespace cp in [p, r)
|
||||||
|
const char *t = r;
|
||||||
|
// step back to beginning of previous UTF-8 cp
|
||||||
|
do {
|
||||||
|
--t;
|
||||||
|
} while (t > p && is_utf8_cont_byte(*t));
|
||||||
|
|
||||||
|
if (t == p) {
|
||||||
|
// run had length 1 cp -> cannot backtrack to keep \s+ >= 1
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
// Now [p, t) is k-1 whitespace cps
|
||||||
|
return (size_t)(t - p);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* G) \s+ */
|
/* G) \s+ */
|
||||||
|
|
|
||||||
|
|
@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
||||||
continue
|
continue
|
||||||
return cp
|
return cp
|
||||||
|
|
||||||
MAX_WS_RUN = 255
|
|
||||||
|
|
||||||
def is_ws_char(ch: str) -> bool:
|
def is_ws_char(ch: str) -> bool:
|
||||||
cp = ord(ch)
|
cp = ord(ch)
|
||||||
return ch.isspace() or (cp in ws_cps)
|
return ch.isspace() or (cp in ws_cps)
|
||||||
|
|
@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
||||||
seqs = ["\n", "\r", "\r\n"]
|
seqs = ["\n", "\r", "\r\n"]
|
||||||
unit = rng.choice(seqs)
|
unit = rng.choice(seqs)
|
||||||
unit_len = len(unit)
|
unit_len = len(unit)
|
||||||
max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len))
|
max_reps = max(1, max_run // unit_len)
|
||||||
seg = unit * rng.randint(1, max_reps)
|
seg = unit * rng.randint(1, max_reps)
|
||||||
return seg
|
return seg
|
||||||
run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run)))
|
run = rng.randint(1, max(1, max_run))
|
||||||
buf = []
|
buf = []
|
||||||
for _ in range(run):
|
for _ in range(run):
|
||||||
cp = rng.choice(ws_cps)
|
cp = rng.choice(ws_cps)
|
||||||
|
|
@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
||||||
|
|
||||||
buf: list[str] = []
|
buf: list[str] = []
|
||||||
curr_len = 0
|
curr_len = 0
|
||||||
curr_ws_run = 0
|
|
||||||
# Build by segments until target_len
|
# Build by segments until target_len
|
||||||
while curr_len < target_len:
|
while curr_len < target_len:
|
||||||
remain = target_len - curr_len
|
remain = target_len - curr_len
|
||||||
|
|
@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
|
||||||
if not seg:
|
if not seg:
|
||||||
continue
|
continue
|
||||||
# Trim if needed
|
# Trim if needed
|
||||||
# Append with whitespace-run capping
|
# Append
|
||||||
for ch in seg:
|
for ch in seg:
|
||||||
if curr_len >= target_len:
|
if curr_len >= target_len:
|
||||||
break
|
break
|
||||||
if is_ws_char(ch):
|
if is_ws_char(ch):
|
||||||
if curr_ws_run >= MAX_WS_RUN:
|
|
||||||
# insert a non-whitespace breaker
|
|
||||||
breaker = '.'
|
|
||||||
buf.append(breaker)
|
|
||||||
curr_len += 1
|
|
||||||
curr_ws_run = 0
|
|
||||||
if curr_len >= target_len:
|
|
||||||
break
|
|
||||||
buf.append(ch)
|
buf.append(ch)
|
||||||
curr_len += 1
|
curr_len += 1
|
||||||
curr_ws_run += 1
|
|
||||||
else:
|
else:
|
||||||
buf.append(ch)
|
buf.append(ch)
|
||||||
curr_len += 1
|
curr_len += 1
|
||||||
curr_ws_run = 0
|
|
||||||
|
|
||||||
# Occasionally end with trailing spaces to stress \s+(?!\S)
|
# Occasionally end with trailing spaces to stress \s+(?!\S)
|
||||||
if curr_len < max_len and rng.random() < 0.3:
|
if curr_len < max_len and rng.random() < 0.3:
|
||||||
trail = gen_ws_segment(max_len - curr_len)
|
trail = gen_ws_segment(max_len - curr_len)
|
||||||
if rng.random() < 0.7:
|
if rng.random() < 0.7:
|
||||||
trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
|
trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
|
||||||
# Append trailing with cap as well
|
# Append trailing
|
||||||
for ch in trail:
|
for ch in trail:
|
||||||
if curr_len >= max_len:
|
if curr_len >= max_len:
|
||||||
break
|
break
|
||||||
if is_ws_char(ch):
|
if is_ws_char(ch):
|
||||||
if curr_ws_run >= MAX_WS_RUN:
|
|
||||||
buf.append('.')
|
|
||||||
curr_len += 1
|
|
||||||
curr_ws_run = 0
|
|
||||||
if curr_len >= max_len:
|
|
||||||
break
|
|
||||||
buf.append(ch)
|
buf.append(ch)
|
||||||
curr_len += 1
|
curr_len += 1
|
||||||
curr_ws_run += 1
|
|
||||||
else:
|
else:
|
||||||
buf.append(ch)
|
buf.append(ch)
|
||||||
curr_len += 1
|
curr_len += 1
|
||||||
curr_ws_run = 0
|
|
||||||
|
|
||||||
return ''.join(buf)
|
return ''.join(buf)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user