removed buffer approach

2025-12-06 12:22:18 +00:00 · 2025-10-23 20:23:59 -04:00 · 2025-10-23 20:23:59 -04:00 · 41c8b8dbde
commit 41c8b8dbde
parent e02938c0aa
3 changed files with 68 additions and 96 deletions
--- a/fregex/bench.py
+++ b/fregex/bench.py
@ -115,7 +115,7 @@ def main():
            try:
                data = path.read_bytes()                
-                benchmark_dataset(path.name, data, 10)
+                benchmark_dataset(path.name, data, 10_000)
            except Exception as e:
                print(f"❌ Error reading {file_path}: {e}")
    else:
--- a/fregex/fregex.c
+++ b/fregex/fregex.c
@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp(
    return (size_t)ret;
 }
 static inline bool is_utf8_cont_byte(unsigned char b) { 
    return (b & 0xC0) == 0x80; 
 }
 static inline bool is_cr_or_lf(unsigned int cp) { 
    return cp == UNICODE_LF || cp == UNICODE_CR; 
 }
@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) {
 }
 /* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
 // Optional single ASCII space, then 1+ of (not whitespace, not letter, not number),
 static size_t match_punct_run(const char *p, const char *end) {
    const char *q = p;
@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) {
 /* E) \s*[\r\n] */
 static size_t match_ws_then_linebreak(const char *p, const char *end) {
    const char *q = p;
    const char *best = NULL;
-    // Collect all positions while consuming whitespace
+    // Check boundary before consuming any whitespace, too (zero-length \s*)
-    // TODO: ? Could we hit the limit
+    if (q < end) {
-    const char *positions[256]; 
+        unsigned int nx; 
-    int pos_count = 0;
+        size_t nn = utf8_decode_cp(q, end, &nx);
-    
+        if (nn > 0 && is_cr_or_lf(nx)) {
-    // Store initial position (zero whitespace consumed)
+            best = q;  // \s* = 0, [\r\n] = this char
    positions[pos_count++] = q;
    while (q < end && pos_count < 255) {
        unsigned int cp;
        size_t n = utf8_decode_cp(q, end, &cp);
        if (n == 0 || !is_space(cp)) 
            break;
        q += n;
        positions[pos_count++] = q;
    }
    // Try positions from longest to shortest (backtracking)
    // We need to find a position where the next character is a linebreak
    for (int i = pos_count - 1; i >= 0; i--) {
        q = positions[i];
        // Check if next character is a linebreak
        if (q < end) {
            unsigned int br;
            size_t nb = utf8_decode_cp(q, end, &br);
            if (nb > 0 && is_cr_or_lf(br)) {
                // Found a linebreak, include it and return
                return (size_t)(q + nb - p);
            }
        } else {
            // EOF reached, rule requires a linebreak so fail
            continue;
        }
    }
-    // No position found where next char is a linebreak
+    // Scan whitespace; at each boundary, test the next cp
-    return 0;
+    while (q < end) {
        unsigned int cp; 
        size_t n = utf8_decode_cp(q, end, &cp);
        if (n == 0 || !is_space(cp)) 
            break;
        q += n; // we consumed one whitespace cp; boundary is at q now
        if (q < end) {
            unsigned int nx; 
            size_t nn = utf8_decode_cp(q, end, &nx);
            if (nn > 0 && is_cr_or_lf(nx)) {
                best = q;  // prefer the rightmost usable boundary
            }
        }
    }
    if (!best) return 0;
    // At 'best' the next cp is the CR/LF to include
    unsigned int br; 
    size_t nb = utf8_decode_cp(best, end, &br);
    return (size_t)((best + nb) - p);
 }
 /* F) \s+(?!\S) */
 static size_t match_trailing_ws(const char *p, const char *end) {
-    if (p >= end) return 0;
+    if (p >= end) 
        return 0;
-    /* Must start with at least one whitespace */
+    // First cp must be whitespace
-    const char *q = p;
+    unsigned int cp; 
-    unsigned int cp;
+    size_t n = utf8_decode_cp(p, end, &cp);
    size_t n = utf8_decode_cp(q, end, &cp);
    if (n == 0 || !is_space(cp)) 
        return 0;
-    /* Collect all whitespace positions */
+    // Consume full whitespace run [p, r)
-    // TODO: ? Could we hit the limit
+    const char *r = p + n;
-    const char *positions[256];  
+    while (r < end) {
-    positions[0] = q + n;  // Position after first whitespace
+        size_t m = utf8_decode_cp(r, end, &cp);
    int pos_count = 1;
    q += n;
    while (q < end && pos_count < 255) {
        size_t m = utf8_decode_cp(q, end, &cp);
        if (m == 0 || !is_space(cp)) 
            break;
-        q += m;
+        r += m;
        positions[pos_count++] = q;
    }
-    /* Try positions from longest to shortest (backtracking) */
+    if (r == end) {
-    for (int i = pos_count - 1; i >= 0; i--) {
+        // Only whitespace to EOF -> take all of it
-        q = positions[i];
+        return (size_t)(r - p);
        /* Check negative lookahead: (?!\S) at this position */
        if (q < end) {
            size_t k = utf8_decode_cp(q, end, &cp);
            if (k > 0 && !is_space(cp)) {
                continue;  /* Next char is non-space, try shorter match */
            }
        }
        /* Lookahead succeeded at this position */
        return (size_t)(q - p);
    }
-    
+
-    /* All positions failed lookahead */
+    // Backtrack by exactly one whitespace cp 
-    return 0;
+    // If the run length is only 1 cp, F must fail.
    // Find the start of the last whitespace cp in [p, r)
    const char *t = r;
    // step back to beginning of previous UTF-8 cp
    do { 
        --t; 
    } while (t > p && is_utf8_cont_byte(*t));
    if (t == p) {
        // run had length 1 cp -> cannot backtrack to keep \s+ >= 1
        return 0;
    }
    // Now [p, t) is k-1 whitespace cps
    return (size_t)(t - p);
 }
 /* G) \s+ */
--- a/fregex/fuzz.py
+++ b/fregex/fuzz.py
@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
                continue
            return cp
    MAX_WS_RUN = 255
    def is_ws_char(ch: str) -> bool:
        cp = ord(ch)
        return ch.isspace() or (cp in ws_cps)
@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
            seqs = ["\n", "\r", "\r\n"]
            unit = rng.choice(seqs)
            unit_len = len(unit)
-            max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len))
+            max_reps = max(1, max_run // unit_len)
            seg = unit * rng.randint(1, max_reps)
            return seg
-        run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run)))
+        run = rng.randint(1, max(1, max_run))
        buf = []
        for _ in range(run):
            cp = rng.choice(ws_cps)
@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
    buf: list[str] = []
    curr_len = 0
    curr_ws_run = 0
    # Build by segments until target_len
    while curr_len < target_len:
        remain = target_len - curr_len
@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
        if not seg:
            continue
        # Trim if needed
-        # Append with whitespace-run capping
+        # Append
        for ch in seg:
            if curr_len >= target_len:
                break
            if is_ws_char(ch):
                if curr_ws_run >= MAX_WS_RUN:
                    # insert a non-whitespace breaker
                    breaker = '.'
                    buf.append(breaker)
                    curr_len += 1
                    curr_ws_run = 0
                    if curr_len >= target_len:
                        break
                buf.append(ch)
                curr_len += 1
                curr_ws_run += 1
            else:
                buf.append(ch)
                curr_len += 1
                curr_ws_run = 0
    # Occasionally end with trailing spaces to stress \s+(?!\S)
    if curr_len < max_len and rng.random() < 0.3:
        trail = gen_ws_segment(max_len - curr_len)
        if rng.random() < 0.7:
            trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
-        # Append trailing with cap as well
+        # Append trailing
        for ch in trail:
            if curr_len >= max_len:
                break
            if is_ws_char(ch):
                if curr_ws_run >= MAX_WS_RUN:
                    buf.append('.')
                    curr_len += 1
                    curr_ws_run = 0
                    if curr_len >= max_len:
                        break
                buf.append(ch)
                curr_len += 1
                curr_ws_run += 1
            else:
                buf.append(ch)
                curr_len += 1
                curr_ws_run = 0
    return ''.join(buf)