removed buffer approach

This commit is contained in:
MadMax129 2025-10-23 20:23:59 -04:00
parent e02938c0aa
commit 41c8b8dbde
3 changed files with 68 additions and 96 deletions

View File

@ -115,7 +115,7 @@ def main():
try:
data = path.read_bytes()
benchmark_dataset(path.name, data, 10)
benchmark_dataset(path.name, data, 10_000)
except Exception as e:
print(f"❌ Error reading {file_path}: {e}")
else:

View File

@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp(
return (size_t)ret;
}
/* Returns true when b has the UTF-8 continuation-byte bit pattern 10xxxxxx. */
static inline bool is_utf8_cont_byte(unsigned char b) {
    /* Only the two most significant bits decide; they must be exactly 1,0. */
    const unsigned top_two_bits = (unsigned)b >> 6;
    return top_two_bits == 0x2u;
}
/* Returns true when the code point cp equals UNICODE_CR or UNICODE_LF. */
static inline bool is_cr_or_lf(unsigned int cp) {
    if (cp == UNICODE_CR) {
        return true;
    }
    return cp == UNICODE_LF;
}
@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) {
}
/* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
// Optional single ASCII space, then 1+ of (not whitespace, not letter, not number),
static size_t match_punct_run(const char *p, const char *end) {
const char *q = p;
@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) {
/* E) \s*[\r\n] */
static size_t match_ws_then_linebreak(const char *p, const char *end) {
const char *q = p;
const char *best = NULL;
// Collect all positions while consuming whitespace
// TODO: ? Could we hit the limit
const char *positions[256];
int pos_count = 0;
// Check boundary before consuming any whitespace, too (zero-length \s*)
if (q < end) {
unsigned int nx;
size_t nn = utf8_decode_cp(q, end, &nx);
if (nn > 0 && is_cr_or_lf(nx)) {
best = q; // \s* = 0, [\r\n] = this char
}
}
// Store initial position (zero whitespace consumed)
positions[pos_count++] = q;
while (q < end && pos_count < 255) {
// Scan whitespace; at each boundary, test the next cp
while (q < end) {
unsigned int cp;
size_t n = utf8_decode_cp(q, end, &cp);
if (n == 0 || !is_space(cp))
break;
q += n;
positions[pos_count++] = q;
}
q += n; // we consumed one whitespace cp; boundary is at q now
// Try positions from longest to shortest (backtracking)
// We need to find a position where the next character is a linebreak
for (int i = pos_count - 1; i >= 0; i--) {
q = positions[i];
// Check if next character is a linebreak
if (q < end) {
unsigned int br;
size_t nb = utf8_decode_cp(q, end, &br);
if (nb > 0 && is_cr_or_lf(br)) {
// Found a linebreak, include it and return
return (size_t)(q + nb - p);
unsigned int nx;
size_t nn = utf8_decode_cp(q, end, &nx);
if (nn > 0 && is_cr_or_lf(nx)) {
best = q; // prefer the rightmost usable boundary
}
} else {
// EOF reached, rule requires a linebreak so fail
continue;
}
}
// No position found where next char is a linebreak
return 0;
if (!best) return 0;
// At 'best' the next cp is the CR/LF to include
unsigned int br;
size_t nb = utf8_decode_cp(best, end, &br);
return (size_t)((best + nb) - p);
}
/* F) \s+(?!\S) */
static size_t match_trailing_ws(const char *p, const char *end) {
if (p >= end) return 0;
if (p >= end)
return 0;
/* Must start with at least one whitespace */
const char *q = p;
// First cp must be whitespace
unsigned int cp;
size_t n = utf8_decode_cp(q, end, &cp);
size_t n = utf8_decode_cp(p, end, &cp);
if (n == 0 || !is_space(cp))
return 0;
/* Collect all whitespace positions */
// TODO: ? Could we hit the limit
const char *positions[256];
positions[0] = q + n; // Position after first whitespace
int pos_count = 1;
q += n;
while (q < end && pos_count < 255) {
size_t m = utf8_decode_cp(q, end, &cp);
// Consume full whitespace run [p, r)
const char *r = p + n;
while (r < end) {
size_t m = utf8_decode_cp(r, end, &cp);
if (m == 0 || !is_space(cp))
break;
q += m;
positions[pos_count++] = q;
r += m;
}
/* Try positions from longest to shortest (backtracking) */
for (int i = pos_count - 1; i >= 0; i--) {
q = positions[i];
/* Check negative lookahead: (?!\S) at this position */
if (q < end) {
size_t k = utf8_decode_cp(q, end, &cp);
if (k > 0 && !is_space(cp)) {
continue; /* Next char is non-space, try shorter match */
}
}
/* Lookahead succeeded at this position */
return (size_t)(q - p);
if (r == end) {
// Only whitespace to EOF -> take all of it
return (size_t)(r - p);
}
/* All positions failed lookahead */
return 0;
// Backtrack by exactly one whitespace cp
// If the run length is only 1 cp, F must fail.
// Find the start of the last whitespace cp in [p, r)
const char *t = r;
// step back to beginning of previous UTF-8 cp
do {
--t;
} while (t > p && is_utf8_cont_byte(*t));
if (t == p) {
// run had length 1 cp -> cannot backtrack to keep \s+ >= 1
return 0;
}
// Now [p, t) is k-1 whitespace cps
return (size_t)(t - p);
}
/* G) \s+ */

View File

@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
continue
return cp
MAX_WS_RUN = 255
def is_ws_char(ch: str) -> bool:
cp = ord(ch)
return ch.isspace() or (cp in ws_cps)
@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
seqs = ["\n", "\r", "\r\n"]
unit = rng.choice(seqs)
unit_len = len(unit)
max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len))
max_reps = max(1, max_run // unit_len)
seg = unit * rng.randint(1, max_reps)
return seg
run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run)))
run = rng.randint(1, max(1, max_run))
buf = []
for _ in range(run):
cp = rng.choice(ws_cps)
@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
buf: list[str] = []
curr_len = 0
curr_ws_run = 0
# Build by segments until target_len
while curr_len < target_len:
remain = target_len - curr_len
@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
if not seg:
continue
# Trim if needed
# Append with whitespace-run capping
# Append
for ch in seg:
if curr_len >= target_len:
break
if is_ws_char(ch):
if curr_ws_run >= MAX_WS_RUN:
# insert a non-whitespace breaker
breaker = '.'
buf.append(breaker)
curr_len += 1
curr_ws_run = 0
if curr_len >= target_len:
break
buf.append(ch)
curr_len += 1
curr_ws_run += 1
else:
buf.append(ch)
curr_len += 1
curr_ws_run = 0
# Occasionally end with trailing spaces to stress \s+(?!\S)
if curr_len < max_len and rng.random() < 0.3:
trail = gen_ws_segment(max_len - curr_len)
if rng.random() < 0.7:
trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
# Append trailing with cap as well
# Append trailing
for ch in trail:
if curr_len >= max_len:
break
if is_ws_char(ch):
if curr_ws_run >= MAX_WS_RUN:
buf.append('.')
curr_len += 1
curr_ws_run = 0
if curr_len >= max_len:
break
buf.append(ch)
curr_len += 1
curr_ws_run += 1
else:
buf.append(ch)
curr_len += 1
curr_ws_run = 0
return ''.join(buf)