removed buffer approach

This commit is contained in:
MadMax129 2025-10-23 20:23:59 -04:00
parent e02938c0aa
commit 41c8b8dbde
3 changed files with 68 additions and 96 deletions

View File

@ -115,7 +115,7 @@ def main():
try: try:
data = path.read_bytes() data = path.read_bytes()
benchmark_dataset(path.name, data, 10) benchmark_dataset(path.name, data, 10_000)
except Exception as e: except Exception as e:
print(f"❌ Error reading {file_path}: {e}") print(f"❌ Error reading {file_path}: {e}")
else: else:

View File

@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp(
return (size_t)ret; return (size_t)ret;
} }
static inline bool is_utf8_cont_byte(unsigned char b) {
return (b & 0xC0) == 0x80;
}
static inline bool is_cr_or_lf(unsigned int cp) { static inline bool is_cr_or_lf(unsigned int cp) {
return cp == UNICODE_LF || cp == UNICODE_CR; return cp == UNICODE_LF || cp == UNICODE_CR;
} }
@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) {
} }
/* D) ?[^\s\p{L}\p{N}]++[\r\n]* */ /* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
// Optional single ASCII space, then 1+ of (not whitespace, not letter, not number),
static size_t match_punct_run(const char *p, const char *end) { static size_t match_punct_run(const char *p, const char *end) {
const char *q = p; const char *q = p;
@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) {
/* E) \s*[\r\n] */ /* E) \s*[\r\n] */
static size_t match_ws_then_linebreak(const char *p, const char *end) { static size_t match_ws_then_linebreak(const char *p, const char *end) {
const char *q = p; const char *q = p;
const char *best = NULL;
// Collect all positions while consuming whitespace // Check boundary before consuming any whitespace, too (zero-length \s*)
// TODO: ? Could we hit the limit if (q < end) {
const char *positions[256]; unsigned int nx;
int pos_count = 0; size_t nn = utf8_decode_cp(q, end, &nx);
if (nn > 0 && is_cr_or_lf(nx)) {
// Store initial position (zero whitespace consumed) best = q; // \s* = 0, [\r\n] = this char
positions[pos_count++] = q;
while (q < end && pos_count < 255) {
unsigned int cp;
size_t n = utf8_decode_cp(q, end, &cp);
if (n == 0 || !is_space(cp))
break;
q += n;
positions[pos_count++] = q;
}
// Try positions from longest to shortest (backtracking)
// We need to find a position where the next character is a linebreak
for (int i = pos_count - 1; i >= 0; i--) {
q = positions[i];
// Check if next character is a linebreak
if (q < end) {
unsigned int br;
size_t nb = utf8_decode_cp(q, end, &br);
if (nb > 0 && is_cr_or_lf(br)) {
// Found a linebreak, include it and return
return (size_t)(q + nb - p);
}
} else {
// EOF reached, rule requires a linebreak so fail
continue;
} }
} }
// No position found where next char is a linebreak // Scan whitespace; at each boundary, test the next cp
return 0; while (q < end) {
unsigned int cp;
size_t n = utf8_decode_cp(q, end, &cp);
if (n == 0 || !is_space(cp))
break;
q += n; // we consumed one whitespace cp; boundary is at q now
if (q < end) {
unsigned int nx;
size_t nn = utf8_decode_cp(q, end, &nx);
if (nn > 0 && is_cr_or_lf(nx)) {
best = q; // prefer the rightmost usable boundary
}
}
}
if (!best) return 0;
// At 'best' the next cp is the CR/LF to include
unsigned int br;
size_t nb = utf8_decode_cp(best, end, &br);
return (size_t)((best + nb) - p);
} }
/* F) \s+(?!\S) */ /* F) \s+(?!\S) */
static size_t match_trailing_ws(const char *p, const char *end) { static size_t match_trailing_ws(const char *p, const char *end) {
if (p >= end) return 0; if (p >= end)
return 0;
/* Must start with at least one whitespace */ // First cp must be whitespace
const char *q = p; unsigned int cp;
unsigned int cp; size_t n = utf8_decode_cp(p, end, &cp);
size_t n = utf8_decode_cp(q, end, &cp);
if (n == 0 || !is_space(cp)) if (n == 0 || !is_space(cp))
return 0; return 0;
/* Collect all whitespace positions */ // Consume full whitespace run [p, r)
// TODO: ? Could we hit the limit const char *r = p + n;
const char *positions[256]; while (r < end) {
positions[0] = q + n; // Position after first whitespace size_t m = utf8_decode_cp(r, end, &cp);
int pos_count = 1;
q += n;
while (q < end && pos_count < 255) {
size_t m = utf8_decode_cp(q, end, &cp);
if (m == 0 || !is_space(cp)) if (m == 0 || !is_space(cp))
break; break;
q += m; r += m;
positions[pos_count++] = q;
} }
/* Try positions from longest to shortest (backtracking) */ if (r == end) {
for (int i = pos_count - 1; i >= 0; i--) { // Only whitespace to EOF -> take all of it
q = positions[i]; return (size_t)(r - p);
/* Check negative lookahead: (?!\S) at this position */
if (q < end) {
size_t k = utf8_decode_cp(q, end, &cp);
if (k > 0 && !is_space(cp)) {
continue; /* Next char is non-space, try shorter match */
}
}
/* Lookahead succeeded at this position */
return (size_t)(q - p);
} }
/* All positions failed lookahead */ // Backtrack by exactly one whitespace cp
return 0; // If the run length is only 1 cp, F must fail.
// Find the start of the last whitespace cp in [p, r)
const char *t = r;
// step back to beginning of previous UTF-8 cp
do {
--t;
} while (t > p && is_utf8_cont_byte(*t));
if (t == p) {
// run had length 1 cp -> cannot backtrack to keep \s+ >= 1
return 0;
}
// Now [p, t) is k-1 whitespace cps
return (size_t)(t - p);
} }
/* G) \s+ */ /* G) \s+ */

View File

@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
continue continue
return cp return cp
MAX_WS_RUN = 255
def is_ws_char(ch: str) -> bool: def is_ws_char(ch: str) -> bool:
cp = ord(ch) cp = ord(ch)
return ch.isspace() or (cp in ws_cps) return ch.isspace() or (cp in ws_cps)
@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
seqs = ["\n", "\r", "\r\n"] seqs = ["\n", "\r", "\r\n"]
unit = rng.choice(seqs) unit = rng.choice(seqs)
unit_len = len(unit) unit_len = len(unit)
max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len)) max_reps = max(1, max_run // unit_len)
seg = unit * rng.randint(1, max_reps) seg = unit * rng.randint(1, max_reps)
return seg return seg
run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run))) run = rng.randint(1, max(1, max_run))
buf = [] buf = []
for _ in range(run): for _ in range(run):
cp = rng.choice(ws_cps) cp = rng.choice(ws_cps)
@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
buf: list[str] = [] buf: list[str] = []
curr_len = 0 curr_len = 0
curr_ws_run = 0
# Build by segments until target_len # Build by segments until target_len
while curr_len < target_len: while curr_len < target_len:
remain = target_len - curr_len remain = target_len - curr_len
@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
if not seg: if not seg:
continue continue
# Trim if needed # Trim if needed
# Append with whitespace-run capping # Append
for ch in seg: for ch in seg:
if curr_len >= target_len: if curr_len >= target_len:
break break
if is_ws_char(ch): if is_ws_char(ch):
if curr_ws_run >= MAX_WS_RUN:
# insert a non-whitespace breaker
breaker = '.'
buf.append(breaker)
curr_len += 1
curr_ws_run = 0
if curr_len >= target_len:
break
buf.append(ch) buf.append(ch)
curr_len += 1 curr_len += 1
curr_ws_run += 1
else: else:
buf.append(ch) buf.append(ch)
curr_len += 1 curr_len += 1
curr_ws_run = 0
# Occasionally end with trailing spaces to stress \s+(?!\S) # Occasionally end with trailing spaces to stress \s+(?!\S)
if curr_len < max_len and rng.random() < 0.3: if curr_len < max_len and rng.random() < 0.3:
trail = gen_ws_segment(max_len - curr_len) trail = gen_ws_segment(max_len - curr_len)
if rng.random() < 0.7: if rng.random() < 0.7:
trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len)) trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
# Append trailing with cap as well # Append trailing
for ch in trail: for ch in trail:
if curr_len >= max_len: if curr_len >= max_len:
break break
if is_ws_char(ch): if is_ws_char(ch):
if curr_ws_run >= MAX_WS_RUN:
buf.append('.')
curr_len += 1
curr_ws_run = 0
if curr_len >= max_len:
break
buf.append(ch) buf.append(ch)
curr_len += 1 curr_len += 1
curr_ws_run += 1
else: else:
buf.append(ch) buf.append(ch)
curr_len += 1 curr_len += 1
curr_ws_run = 0
return ''.join(buf) return ''.join(buf)