removed buffer approach

2025-12-06 12:22:18 +00:00 · 2025-10-23 20:23:59 -04:00 · 2025-10-23 20:23:59 -04:00 · 41c8b8dbde
commit 41c8b8dbde
parent e02938c0aa
3 changed files with 68 additions and 96 deletions
--- a/fregex/bench.py
+++ b/fregex/bench.py
@@ -115,7 +115,7 @@ def main():
            
            try:
                data = path.read_bytes()                
-                benchmark_dataset(path.name, data, 10)
+                benchmark_dataset(path.name, data, 10_000)
            except Exception as e:
                print(f"❌ Error reading {file_path}: {e}")
    else:
--- a/fregex/fregex.c
+++ b/fregex/fregex.c
@@ -48,6 +48,10 @@ static inline size_t utf8_decode_cp(
    return (size_t)ret;
 }

+static inline bool is_utf8_cont_byte(unsigned char b) { 
+    return (b & 0xC0) == 0x80; 
+}
+
 static inline bool is_cr_or_lf(unsigned int cp) { 
    return cp == UNICODE_LF || cp == UNICODE_CR; 
 }
@@ -312,7 +316,6 @@ static size_t match_short_number(const char *p, const char *end) {
 }

 /* D) ?[^\s\p{L}\p{N}]++[\r\n]* */
-// Optional single ASCII space, then 1+ of (not whitespace, not letter, not number),
 static size_t match_punct_run(const char *p, const char *end) {
    const char *q = p;

@@ -365,92 +368,82 @@ static size_t match_punct_run(const char *p, const char *end) {
 /* E) \s*[\r\n] */
 static size_t match_ws_then_linebreak(const char *p, const char *end) {
    const char *q = p;
+    const char *best = NULL;

-    // Collect all positions while consuming whitespace
-    // TODO: ? Could we hit the limit
-    const char *positions[256]; 
-    int pos_count = 0;
-    
-    // Store initial position (zero whitespace consumed)
-    positions[pos_count++] = q;
-
-    while (q < end && pos_count < 255) {
-        unsigned int cp;
-        size_t n = utf8_decode_cp(q, end, &cp);
-        if (n == 0 || !is_space(cp)) 
-            break;
-        q += n;
-        positions[pos_count++] = q;
-    }
-
-    // Try positions from longest to shortest (backtracking)
-    // We need to find a position where the next character is a linebreak
-    for (int i = pos_count - 1; i >= 0; i--) {
-        q = positions[i];
-        
-        // Check if next character is a linebreak
-        if (q < end) {
-            unsigned int br;
-            size_t nb = utf8_decode_cp(q, end, &br);
-            if (nb > 0 && is_cr_or_lf(br)) {
-                // Found a linebreak, include it and return
-                return (size_t)(q + nb - p);
-            }
-        } else {
-            // EOF reached, rule requires a linebreak so fail
-            continue;
+    // Check boundary before consuming any whitespace, too (zero-length \s*)
+    if (q < end) {
+        unsigned int nx; 
+        size_t nn = utf8_decode_cp(q, end, &nx);
+        if (nn > 0 && is_cr_or_lf(nx)) {
+            best = q;  // \s* = 0, [\r\n] = this char
        }
    }

-    // No position found where next char is a linebreak
-    return 0;
+    // Scan whitespace; at each boundary, test the next cp
+    while (q < end) {
+        unsigned int cp; 
+        size_t n = utf8_decode_cp(q, end, &cp);
+        if (n == 0 || !is_space(cp)) 
+            break;
+        q += n; // we consumed one whitespace cp; boundary is at q now
+
+        if (q < end) {
+            unsigned int nx; 
+            size_t nn = utf8_decode_cp(q, end, &nx);
+            if (nn > 0 && is_cr_or_lf(nx)) {
+                best = q;  // prefer the rightmost usable boundary
+            }
+        }
+    }
+
+    if (!best) return 0;
+
+    // At 'best' the next cp is the CR/LF to include
+    unsigned int br; 
+    size_t nb = utf8_decode_cp(best, end, &br);
+    return (size_t)((best + nb) - p);
 }

 /* F) \s+(?!\S) */
 static size_t match_trailing_ws(const char *p, const char *end) {
-    if (p >= end) return 0;
+    if (p >= end) 
+        return 0;

-    /* Must start with at least one whitespace */
-    const char *q = p;
-    unsigned int cp;
-    size_t n = utf8_decode_cp(q, end, &cp);
+    // First cp must be whitespace
+    unsigned int cp; 
+    size_t n = utf8_decode_cp(p, end, &cp);
    if (n == 0 || !is_space(cp)) 
        return 0;

-    /* Collect all whitespace positions */
-    // TODO: ? Could we hit the limit
-    const char *positions[256];  
-    positions[0] = q + n;  // Position after first whitespace
-    int pos_count = 1;
-    
-    q += n;
-
-    while (q < end && pos_count < 255) {
-        size_t m = utf8_decode_cp(q, end, &cp);
+    // Consume full whitespace run [p, r)
+    const char *r = p + n;
+    while (r < end) {
+        size_t m = utf8_decode_cp(r, end, &cp);
        if (m == 0 || !is_space(cp)) 
            break;
-        q += m;
-        positions[pos_count++] = q;
+        r += m;
    }

-    /* Try positions from longest to shortest (backtracking) */
-    for (int i = pos_count - 1; i >= 0; i--) {
-        q = positions[i];
-        
-        /* Check negative lookahead: (?!\S) at this position */
-        if (q < end) {
-            size_t k = utf8_decode_cp(q, end, &cp);
-            if (k > 0 && !is_space(cp)) {
-                continue;  /* Next char is non-space, try shorter match */
-            }
-        }
-        
-        /* Lookahead succeeded at this position */
-        return (size_t)(q - p);
+    if (r == end) {
+        // Only whitespace to EOF -> take all of it
+        return (size_t)(r - p);
    }
-    
-    /* All positions failed lookahead */
-    return 0;
+
+    // Backtrack by exactly one whitespace cp 
+    // If the run length is only 1 cp, F must fail.
+    // Find the start of the last whitespace cp in [p, r)
+    const char *t = r;
+    // step back to beginning of previous UTF-8 cp
+    do { 
+        --t; 
+    } while (t > p && is_utf8_cont_byte(*t));
+
+    if (t == p) {
+        // run had length 1 cp -> cannot backtrack to keep \s+ >= 1
+        return 0;
+    }
+    // Now [p, t) is k-1 whitespace cps
+    return (size_t)(t - p);
 }

 /* G) \s+ */
--- a/fregex/fuzz.py
+++ b/fregex/fuzz.py
@@ -65,8 +65,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
                continue
            return cp

-    MAX_WS_RUN = 255
-
    def is_ws_char(ch: str) -> bool:
        cp = ord(ch)
        return ch.isspace() or (cp in ws_cps)
@@ -78,10 +76,10 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
            seqs = ["\n", "\r", "\r\n"]
            unit = rng.choice(seqs)
            unit_len = len(unit)
-            max_reps = max(1, min(max_run // unit_len, MAX_WS_RUN // unit_len))
+            max_reps = max(1, max_run // unit_len)
            seg = unit * rng.randint(1, max_reps)
            return seg
-        run = rng.randint(1, min(MAX_WS_RUN, max(1, max_run)))
+        run = rng.randint(1, max(1, max_run))
        buf = []
        for _ in range(run):
            cp = rng.choice(ws_cps)
@@ -177,7 +175,6 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:

    buf: list[str] = []
    curr_len = 0
-    curr_ws_run = 0
    # Build by segments until target_len
    while curr_len < target_len:
        remain = target_len - curr_len
@@ -201,50 +198,32 @@ def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
        if not seg:
            continue
        # Trim if needed
-        # Append with whitespace-run capping
+        # Append
        for ch in seg:
            if curr_len >= target_len:
                break
            if is_ws_char(ch):
-                if curr_ws_run >= MAX_WS_RUN:
-                    # insert a non-whitespace breaker
-                    breaker = '.'
-                    buf.append(breaker)
-                    curr_len += 1
-                    curr_ws_run = 0
-                    if curr_len >= target_len:
-                        break
                buf.append(ch)
                curr_len += 1
-                curr_ws_run += 1
            else:
                buf.append(ch)
                curr_len += 1
-                curr_ws_run = 0

    # Occasionally end with trailing spaces to stress \s+(?!\S)
    if curr_len < max_len and rng.random() < 0.3:
        trail = gen_ws_segment(max_len - curr_len)
        if rng.random() < 0.7:
            trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
-        # Append trailing with cap as well
+        # Append trailing
        for ch in trail:
            if curr_len >= max_len:
                break
            if is_ws_char(ch):
-                if curr_ws_run >= MAX_WS_RUN:
-                    buf.append('.')
-                    curr_len += 1
-                    curr_ws_run = 0
-                    if curr_len >= max_len:
-                        break
                buf.append(ch)
                curr_len += 1
-                curr_ws_run += 1
            else:
                buf.append(ch)
                curr_len += 1
-                curr_ws_run = 0

    return ''.join(buf)