# Optional SHA256 checksum verification for FineWeb-Edu shards.
#
# Reconstructed from a whitespace-collapsed patch against nanochat/dataset.py
# ("Add optional SHA256 checksum verification for FineWeb-Edu shards").
# The helpers below are the code that patch adds; DATA_DIR is a module-level
# constant of the target file and is not defined here.
#
# Call site the patch adds to download_single_file(), right after the temp
# file is renamed to its final location (os.rename(temp_path, filepath)):
#
#     checksums = load_checksums()
#     if checksums and filename in checksums:
#         verify_checksum(filepath, checksums[filename])
#         print(f'Checksum verified. for {filename}')
#
# NOTE(review): at that call site verification runs after the rename, so on a
# mismatch the bad file already sits at its final path -- consider verifying
# temp_path before renaming. The checksum list is also re-read from disk on
# every download; loading it once would be cheaper.
import hashlib
import os


def sha256sum(path, chunk_size=1024 * 1024):
    """Compute the SHA256 checksum of a file.

    The file is opened in binary mode and hashed `chunk_size` bytes at a time,
    so it never has to be held in memory as a whole.

    Args:
        path: Path of the file to hash.
        chunk_size: Number of bytes per read; must be positive.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If `chunk_size` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would end the loop and yield the
    # digest of empty input; a negative size would slurp the whole file.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


def verify_checksum(path, expected_sha256):
    """Check that the file at `path` has the expected SHA256 digest.

    The comparison ignores letter case and surrounding whitespace in
    `expected_sha256`, because hexdigest() always produces lowercase hex.

    Args:
        path: Path of the file to verify.
        expected_sha256: Expected digest as a hexadecimal string.

    Returns:
        None when the digest matches.

    Raises:
        ValueError: If the computed digest differs from the expected one.
    """
    actual = sha256sum(path)
    if actual != expected_sha256.strip().lower():
        raise ValueError(
            f"Checksum mismatch for {os.path.basename(path)}: "
            f"expected {expected_sha256}, got {actual}"
        )


def load_checksums():
    """Load the optional checksum list stored next to the data shards.

    Reads `checksums_sha256.txt` from DATA_DIR. Each non-blank line holds a
    file name and its SHA256 digest separated by whitespace, in that order.

    Returns:
        A dict mapping file name to digest (possibly empty), or None when the
        checksum file does not exist, meaning verification is simply skipped.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two fields.
    """
    checksums_path = os.path.join(DATA_DIR, "checksums_sha256.txt")
    try:
        f = open(checksums_path, "r", encoding="utf-8")
    except FileNotFoundError:
        # The checksum file is optional.
        return None

    checksums = {}
    with f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. trailing newlines) carry no entry.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{checksums_path}:{lineno}: expected '<filename> <sha256>', "
                    f"got {line.strip()!r}"
                )
            name, sha = fields
            checksums[name] = sha
    return checksums