From 41c8b8dbde74d162f9b93fb376870a185ef59d0b Mon Sep 17 00:00:00 2001 From: MadMax129 Date: Thu, 23 Oct 2025 20:23:59 -0400 Subject: [PATCH] removed buffer approuch --- fregex/bench.py | 2 +- fregex/fregex.c | 133 +++++++++++++++++++++++------------------------- fregex/fuzz.py | 29 ++--------- 3 files changed, 68 insertions(+), 96 deletions(-) diff --git a/fregex/bench.py b/fregex/bench.py index f707839..64dcda0 100755 --- a/fregex/bench.py +++ b/fregex/bench.py @@ -115,7 +115,7 @@ def main(): try: data = path.read_bytes() - benchmark_dataset(path.name, data, 10) + benchmark_dataset(path.name, data, 10_000) except Exception as e: print(f"❌ Error reading {file_path}: {e}") else: diff --git a/fregex/fregex.c b/fregex/fregex.c index 538430b..c91aa35 100644 --- a/fregex/fregex.c +++ b/fregex/fregex.c @@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp( return (size_t)ret; } +static inline bool is_utf8_cont_byte(unsigned char b) { + return (b & 0xC0) == 0x80; +} + static inline bool is_cr_or_lf(unsigned int cp) { return cp == UNICODE_LF || cp == UNICODE_CR; } @@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) { } /* D) ?[^\s\p{L}\p{N}]++[\r\n]* */ -// Optional single ASCII space, then 1+ of (not whitespace, not letter, not number), static size_t match_punct_run(const char *p, const char *end) { const char *q = p; @@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) { /* E) \s*[\r\n] */ static size_t match_ws_then_linebreak(const char *p, const char *end) { const char *q = p; + const char *best = NULL; - // Collect all positions while consuming whitespace - // TODO: ? Could we hit the limit - const char *positions[256]; - int pos_count = 0; - - // Store initial position (zero whitespace consumed) - positions[pos_count++] = q; - - while (q < end && pos_count < 255) { - unsigned int cp; - size_t n = utf8_decode_cp(q, end, &cp); - if (n == 0 || !is_space(cp)) - break; - q += n; - positions[pos_count++] = q; - } - - // Try positions from longest to shortest (backtracking) - // We need to find a position where the next character is a linebreak - for (int i = pos_count - 1; i >= 0; i--) { - q = positions[i]; - - // Check if next character is a linebreak - if (q < end) { - unsigned int br; - size_t nb = utf8_decode_cp(q, end, &br); - if (nb > 0 && is_cr_or_lf(br)) { - // Found a linebreak, include it and return - return (size_t)(q + nb - p); - } - } else { - // EOF reached, rule requires a linebreak so fail - continue; + // Check boundary before consuming any whitespace, too (zero-length \s*) + if (q < end) { + unsigned int nx; + size_t nn = utf8_decode_cp(q, end, &nx); + if (nn > 0 && is_cr_or_lf(nx)) { + best = q; // \s* = 0, [\r\n] = this char } } - // No position found where next char is a linebreak - return 0; + // Scan whitespace; at each boundary, test the next cp + while (q < end) { + unsigned int cp; + size_t n = utf8_decode_cp(q, end, &cp); + if (n == 0 || !is_space(cp)) + break; + q += n; // we consumed one whitespace cp; boundary is at q now + + if (q < end) { + unsigned int nx; + size_t nn = utf8_decode_cp(q, end, &nx); + if (nn > 0 && is_cr_or_lf(nx)) { + best = q; // prefer the rightmost usable boundary + } + } + } + + if (!best) return 0; + + // At 'best' the next cp is the CR/LF to include + unsigned int br; + size_t nb = utf8_decode_cp(best, end, &br); + return (size_t)((best + nb) - p); } /* F) \s+(?!\S) */ static size_t match_trailing_ws(const char *p, const char *end) { - if (p >= end) return 0; + if (p >= end) + return 0; - /* Must start with at least one whitespace */ - const char *q = p; - unsigned int cp; - size_t n = utf8_decode_cp(q, end, &cp); + // First cp must be whitespace + unsigned int cp; + size_t n = utf8_decode_cp(p, end, &cp); if (n == 0 || !is_space(cp)) return 0; - /* Collect all whitespace positions */ - // TODO: ? Could we hit the limit - const char *positions[256]; - positions[0] = q + n; // Position after first whitespace - int pos_count = 1; - - q += n; - - while (q < end && pos_count < 255) { - size_t m = utf8_decode_cp(q, end, &cp); + // Consume full whitespace run [p, r) + const char *r = p + n; + while (r < end) { + size_t m = utf8_decode_cp(r, end, &cp); if (m == 0 || !is_space(cp)) break; - q += m; - positions[pos_count++] = q; + r += m; } - /* Try positions from longest to shortest (backtracking) */ - for (int i = pos_count - 1; i >= 0; i--) { - q = positions[i]; - - /* Check negative lookahead: (?!\S) at this position */ - if (q < end) { - size_t k = utf8_decode_cp(q, end, &cp); - if (k > 0 && !is_space(cp)) { - continue; /* Next char is non-space, try shorter match */ - } - } - - /* Lookahead succeeded at this position */ - return (size_t)(q - p); + if (r == end) { + // Only whitespace to EOF -> take all of it + return (size_t)(r - p); } - - /* All positions failed lookahead */ - return 0; + + // Backtrack by exactly one whitespace cp + // If the run length is only 1 cp, F must fail. + // Find the start of the last whitespace cp in [p, r) + const char *t = r; + // step back to beginning of previous UTF-8 cp + do { + --t; + } while (t > p && is_utf8_cont_byte(*t)); + + if (t == p) { + // run had length 1 cp -> cannot backtrack to keep \s+ >= 1 + return 0; + } + // Now [p, t) is k-1 whitespace cps + return (size_t)(t - p); } /* G) \s+ */ diff --git a/fregex/fuzz.py b/fregex/fuzz.py index 0b22aaf..b0cd6d5 100644 --- a/fregex/fuzz.py +++ b/fregex/fuzz.py @@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str: continue return cp - MAX_WS_RUN = 255 - def is_ws_char(ch: str) -> bool: cp = ord(ch) return ch.isspace() or (cp in ws_cps) @@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str: seqs = ["\n", "\r", "\r\n"] unit = rng.choice(seqs) unit_len = len(unit) - max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len)) + max_reps = max(1, max_run // unit_len) seg = unit * rng.randint(1, max_reps) return seg - run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run))) + run = rng.randint(1, max(1, max_run)) buf = [] for _ in range(run): cp = rng.choice(ws_cps) @@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str: buf: list[str] = [] curr_len = 0 - curr_ws_run = 0 # Build by segments until target_len while curr_len < target_len: remain = target_len - curr_len @@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str: if not seg: continue # Trim if needed - # Append with whitespace-run capping + # Append for ch in seg: if curr_len >= target_len: break if is_ws_char(ch): - if curr_ws_run >= MAX_WS_RUN: - # insert a non-whitespace breaker - breaker = '.' - buf.append(breaker) - curr_len += 1 - curr_ws_run = 0 - if curr_len >= target_len: - break buf.append(ch) curr_len += 1 - curr_ws_run += 1 else: buf.append(ch) curr_len += 1 - curr_ws_run = 0 # Occasionally end with trailing spaces to stress \s+(?!\S) if curr_len < max_len and rng.random() < 0.3: trail = gen_ws_segment(max_len - curr_len) if rng.random() < 0.7: trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len)) - # Append trailing with cap as well + # Append trailing for ch in trail: if curr_len >= max_len: break if is_ws_char(ch): - if curr_ws_run >= MAX_WS_RUN: - buf.append('.') - curr_len += 1 - curr_ws_run = 0 - if curr_len >= max_len: - break buf.append(ch) curr_len += 1 - curr_ws_run += 1 else: buf.append(ch) curr_len += 1 - curr_ws_run = 0 return ''.join(buf)