import sys
import time
import random
import argparse
import unicodedata as u
import ctypes
from pathlib import Path

from fregex.cload import *  # provides the ctypes bindings used below (TokenList, c_lib, ...)

HERE = Path(__file__).resolve().parent
TESTS_DIR = HERE / "tests"

# The pure-Python reference tokenizer is optional; guard the import so the
# `py_tokenize_str is None` check in tokenize_py_bytes() is meaningful.
try:
    from fregex.py_tokenizer import tokenize_py as py_tokenize_str
except ImportError:
    py_tokenize_str = None


def escape_bytes(b: bytes) -> str:
    """Render raw bytes as a printable ASCII string for token dumps."""
    buf = []
    for code in b:
        if code == 0x5C:
            buf.append('\\\\')
        elif code == 0x0A:
            buf.append('\\n')
        elif code == 0x0D:
            buf.append('\\r')
        elif code == 0x09:
            buf.append('\\t')
        elif code == 0x0C:
            buf.append('\\f')
        elif code == 0x0B:
            buf.append('\\v')
        elif code == 0x22:
            buf.append('\\"')
        elif code < 32 or code >= 127:
            buf.append(f"\\x{code:02X}")
        else:
            buf.append(chr(code))
    return ''.join(buf)


def gen_valid_unicode_string(rng: random.Random, max_len: int) -> str:
    """Generate a random valid Unicode string biased toward tokenizer edge cases."""
    target_len = rng.randint(0, max_len)
    ws_cps = [
        0x20, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,  # space, \t, \n, \v, \f, \r
        0x00A0,  # NO-BREAK SPACE
        0x1680,  # OGHAM SPACE MARK
        0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
        0x2007, 0x2008, 0x2009, 0x200A,  # EN/EM/THIN/HAIR SPACES etc.
        0x2028, 0x2029,  # LINE SEPARATOR, PARAGRAPH SEPARATOR
        0x202F,  # NARROW NO-BREAK SPACE
        0x205F,  # MEDIUM MATHEMATICAL SPACE
        0x3000,  # IDEOGRAPHIC SPACE
        0x200B,  # ZERO WIDTH SPACE (not WS in Python, but hits tokenizer class)
        0xFEFF,  # ZERO WIDTH NO-BREAK SPACE
    ]
    ascii_punct = [ord(c) for c in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"]

    def rand_scalar_excluding_surrogates(lo: int, hi: int) -> int:
        while True:
            cp = rng.randint(lo, hi)
            if 0xD800 <= cp <= 0xDFFF:
                continue
            return cp

    def is_ws_char(ch: str) -> bool:
        cp = ord(ch)
        return ch.isspace() or (cp in ws_cps)

    def gen_ws_segment(max_run: int) -> str:
        # Mix of various spaces, often multi-length; sometimes explicit CRLFs
        if rng.random() < 0.35:
            # Build CR, LF, CRLF, or repeated newlines
            seqs = ["\n", "\r", "\r\n"]
            unit = rng.choice(seqs)
            unit_len = len(unit)
            max_reps = max(1, max_run // unit_len)
            seg = unit * rng.randint(1, max_reps)
            return seg
        run = rng.randint(1, max(1, max_run))
        buf = []
        for _ in range(run):
            cp = rng.choice(ws_cps)
            buf.append(chr(cp))
        return ''.join(buf)

    def gen_letter_run(max_run: int) -> str:
        run = rng.randint(1, max(1, max_run))
        buf = []
        for _ in range(run):
            if rng.random() < 0.6:
                # ASCII letters
                base = ord('A') if rng.random() < 0.5 else ord('a')
                buf.append(chr(base + rng.randint(0, 25)))
            else:
                # Any Unicode letter
                while True:
                    cp = rand_scalar_excluding_surrogates(0x00A0, 0x10FFFF)
                    if u.category(chr(cp)).startswith('L'):
                        buf.append(chr(cp))
                        break
        # optional prefix of single non-WS, non-letter, non-number to stress
        # the leading [^\r\n\p{L}\p{N}]?+ in the regex
        if rng.random() < 0.3:
            buf.insert(0, gen_punc_run(1, allow_space=False))
        return ''.join(buf)

    def gen_number_run(max_run: int) -> str:
        # Bias to lengths 1..2 per \p{N}{1,2}, but sometimes longer
        if rng.random() < 0.7:
            run = rng.randint(1, min(2, max_run))
        else:
            run = rng.randint(3, max(3, max_run))
        buf = []
        for _ in range(run):
            if rng.random() < 0.75:
                buf.append(chr(ord('0') + rng.randint(0, 9)))
            else:
                # Other numeric categories (Nd/Nl/No)
                while True:
                    cp = rand_scalar_excluding_surrogates(0x00A0, 0x10FFFF)
                    if u.category(chr(cp)).startswith('N'):
                        buf.append(chr(cp))
                        break
        return ''.join(buf)

    def gen_punc_run(max_run: int, allow_space: bool = True) -> str:
        run = rng.randint(1, max(1, max_run))
        buf = []
        # optional leading single space before punc block
        if allow_space and rng.random() < 0.5:
            buf.append(' ')
        for _ in range(run):
            if rng.random() < 0.6:
                cp = rng.choice(ascii_punct)
            else:
                while True:
                    cp = rand_scalar_excluding_surrogates(0, 0x10FFFF)
                    ch = chr(cp)
                    if (
                        cp != 0x00  # ensure we don't accidentally add NUL
                        and not u.category(ch).startswith('L')
                        and not u.category(ch).startswith('N')
                        and cp not in ws_cps
                        and not ch.isspace()
                    ):
                        break
            buf.append(chr(cp))
        # optional trailing newlines to stress [\r\n]*
        if rng.random() < 0.35:
            tail = gen_ws_segment(3)
            # Keep only CR/LF components in the tail for this case
            tail = ''.join(c for c in tail if c in '\r\n')
            buf.append(tail)
        return ''.join(buf)

    def gen_contraction() -> str:
        # e.g., we're, he'll, I'd, I'm, can't, they've
        prefix = gen_letter_run(rng.randint(1, 6))
        suffix = rng.choice(["s", "d", "m", "t", "ll", "ve", "re"])
        return prefix + "'" + suffix

    def gen_random_unicode(max_run: int) -> str:
        run = rng.randint(1, max(1, max_run))
        buf = []
        for _ in range(run):
            # chr() is always valid here: surrogates are already excluded
            cp = rand_scalar_excluding_surrogates(0, 0x10FFFF)
            buf.append(chr(cp))
        return ''.join(buf)

    buf: list[str] = []
    curr_len = 0
    # Build by segments until target_len
    while curr_len < target_len:
        remain = target_len - curr_len
        r = rng.random()
        if r < 0.40:
            seg = gen_ws_segment(remain)
        elif r < 0.45:
            # Explicit newline-focused segment
            seg = ("\r\n" if rng.random() < 0.5
                   else ("\n" if rng.random() < 0.5 else "\r")) * rng.randint(1, max(1, remain))
        elif r < 0.65:
            seg = gen_letter_run(remain)
        elif r < 0.75:
            seg = gen_number_run(remain)
        elif r < 0.90:
            seg = gen_punc_run(remain)
        elif r < 0.95:
            seg = gen_contraction()
        else:
            seg = gen_random_unicode(remain)
        if not seg:
            continue
        # Append, trimming to target_len (counted in Unicode scalars)
        for ch in seg:
            if curr_len >= target_len:
                break
            buf.append(ch)
            curr_len += 1

    # Occasionally end with trailing spaces to stress \s+(?!\S)
    if curr_len < max_len and rng.random() < 0.3:
        trail = gen_ws_segment(max_len - curr_len)
        if rng.random() < 0.7:
            trail = (' ' if rng.random() < 0.6 else '\t') * rng.randint(1, min(8, max_len - curr_len))
        # Append trailing
        for ch in trail:
            if curr_len >= max_len:
                break
            buf.append(ch)
            curr_len += 1

    return ''.join(buf)
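
# A small convenience sketch (not part of the original fuzz flow): preview a
# few generated cases without touching either tokenizer, which is handy when
# tuning the segment probabilities above. The helper name is ours.
def _preview_generated_cases(seed: int = 0, n: int = 5, max_len: int = 40) -> None:
    rng = random.Random(seed)
    for i in range(n):
        s = gen_valid_unicode_string(rng, max_len)
        # escape_bytes renders control characters and non-ASCII readably
        print(f"case {i}: {escape_bytes(s.encode('utf-8', errors='surrogatepass'))}")
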
def write_temp_case(text: str, tag: str = "RUN") -> Path:
    """Persist a test case under tests/ as UTF-8 and return its path."""
    TESTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time() * 1000)
    fname = f"in_fuzz_{tag}_{ts}.txt"
    path = TESTS_DIR / fname
    with open(path, 'wb') as f:
        f.write(text.encode('utf-8', errors='surrogatepass'))
    return path


def _format_tokens_dump(tokens: list[bytes]) -> str:
    lines = []
    for b in tokens:
        lines.append(f"{len(b)}\t{escape_bytes(b)}")
    return "\n".join(lines)


def tokenize_c_bytes(data: bytes) -> list[bytes]:
    tl = TokenList()
    c_lib.tokenlist_init(ctypes.byref(tl))
    try:
        c_lib.tokenize_fast(data, len(data), ctypes.byref(tl))
        out: list[bytes] = []
        count = int(tl.count)
        for i in range(count):
            ptr = tl.tokens[i]
            ln = int(tl.lengths[i])
            out.append(ctypes.string_at(ptr, ln))
        return out
    finally:
        c_lib.tokenlist_free(ctypes.byref(tl))


def tokenize_py_bytes(data: bytes) -> list[bytes]:
    if py_tokenize_str is None:
        raise RuntimeError("py_tokenizer not available")
    text = data.decode('utf-8', errors='surrogatepass')
    toks = py_tokenize_str(text)
    return [t.encode('utf-8', errors='surrogatepass') for t in toks]


def compare_pair_text(text: str):
    """Run both tokenizers on text; return (ok, error, c_dump, py_dump)."""
    data = text.encode('utf-8', errors='surrogatepass')
    try:
        toks_c = tokenize_c_bytes(data)
    except Exception as e:
        return False, f"C failed: {e}", None, None
    try:
        toks_py = tokenize_py_bytes(data)
    except Exception as e:
        return False, f"Py failed: {e}", None, None
    ok = toks_c == toks_py
    return ok, None, _format_tokens_dump(toks_c), _format_tokens_dump(toks_py)
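
# A minimal replay sketch, assuming failing cases were written by
# write_temp_case above (UTF-8 with errors='surrogatepass'). Useful for
# re-checking a saved in_fuzz_FAIL_*.txt after a tokenizer fix; the helper
# name is ours, not part of the original script.
def replay_case(path: Path) -> bool:
    data = Path(path).read_bytes()
    text = data.decode('utf-8', errors='surrogatepass')
    ok, err, out_c, out_py = compare_pair_text(text)
    if not ok:
        print(err or f"Mismatch replayed from {path}")
        if out_c is not None:
            print("--- C tokens ---")
            print(out_c)
        if out_py is not None:
            print("--- Py tokens ---")
            print(out_py)
    return ok
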
def run_fuzz(iters: int, max_len: int, seed: int, stop_on_first: bool):
    rng = random.Random(seed)
    total = 0
    mismatches = 0
    last_save = None
    for i in range(iters if iters > 0 else 1_000_000_000):
        s = gen_valid_unicode_string(rng, max_len)
        ok, err, out_c, out_py = compare_pair_text(s)
        total += 1
        if not ok:
            mismatches += 1
            fail_path = write_temp_case(s, tag="FAIL")
            last_save = fail_path
            print(f"Mismatch at iter {i}, saved to {fail_path}")
            print(f"Seed: {seed}")
            cps = [f"U+{ord(ch):04X}" for ch in s]
            cats = [u.category(ch) for ch in s]
            print(f"Text bytes len: {len(s.encode('utf-8','surrogatepass'))}, chars: {len(s)}")
            print(f"Codepoints: {' '.join(cps)}")
            print(f"Categories: {' '.join(cats)}")
            if err:
                print(err)
            if out_c is not None:
                print("--- C tokens ---")
                print(out_c)
            if out_py is not None:
                print("--- Py tokens ---")
                print(out_py)
            if stop_on_first:
                break
        if (i + 1) % 100 == 0:
            print(f"[fuzz] {i+1} cases, mismatches={mismatches}")
            # print(out_c, out_py, sep="\n")
    return total, mismatches, last_save


def main():
    ap = argparse.ArgumentParser(description="Fuzz C vs Python tokenizers on random valid UTF-8 inputs")
    ap.add_argument("--iters", type=int, default=0, help="Number of iterations (0 = very large run)")
    ap.add_argument("--max-len", type=int, default=256, help="Maximum number of Unicode scalars per case")
    ap.add_argument("--seed", type=int, default=12345, help="PRNG seed for reproducibility")
    ap.add_argument("--stop-on-first", action="store_true", help="Stop at first mismatch (default: run all)")
    args = ap.parse_args()
    total, mismatches, last = run_fuzz(args.iters, args.max_len, args.seed, args.stop_on_first)
    print(f"Completed {total} cases, mismatches={mismatches}")
    if last:
        print(f"Last failing case saved at: {last}")
    if mismatches:
        sys.exit(1)


if __name__ == "__main__":
    main()
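
# Example invocations (script filename assumed; requires the fregex package
# and the C library behind fregex.cload to be importable/built):
#
#   python fuzz_tokenizers.py --iters 1000 --max-len 128 --seed 42
#   python fuzz_tokenizers.py --stop-on-first
#
# A nonzero exit status signals at least one C/Python tokenization mismatch.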