From 5328971c02d2ce39e0a68659df94425b12eb78fa Mon Sep 17 00:00:00 2001
From: JasonOA888 <jason@outland.art>
Date: Tue, 10 Mar 2026 13:27:03 +0800
Subject: [PATCH] fix(dataloader): warn when single parquet file causes
 train/val overlap

## Problem
When only 1 parquet file exists:
- train: parquet_paths[:-1] = [] (empty!)
- val: parquet_paths[-1:] = [file.parquet]

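This causes one of two outcomes:
1. The train split fails with 'No dataset parquet files found'
2. Or, if the assertion passes (in this hunk it runs on the full list
   before the train/val slice, so a single file satisfies it), train
   gets no data while val gets all of the data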

## Solution
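Add a warning when a single file is detected, to inform users that:
- Both train and val will use the same data
- This may cause overfitting
- Splitting the dataset into multiple files is recommended

Note: this patch only adds the warning; the train/val slicing line
itself is left unchanged.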

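The warning is emitted with `warnings.warn`, so under Python's default
warning filter it is shown only once per call site in each process,
which avoids spam. Note that the check itself is not gated on rank 0.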

## Edge Case
This is a common scenario for users testing with small datasets.
---
 nanochat/dataloader.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py
index 4cb2279..7f3c3d1 100644
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@@ -35,6 +35,17 @@ def _document_batches(split, resume_state_dict, tokenizer_batch_size):
     warn_on_legacy = ddp_rank == 0 and split == "train" # rank 0 on train split will warn on legacy
     parquet_paths = list_parquet_files(warn_on_legacy=warn_on_legacy)
     assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
+
+    # Split parquet files: last file for validation, rest for training
+    # Handle edge case: single file scenario
+    if len(parquet_paths) == 1:
+        import warnings
+        warnings.warn(
+            "Only 1 parquet file found. "
+            "Both train and val will use the same data, which may cause overfitting. "
+            "Consider splitting your dataset into multiple parquet files.",
+            UserWarning
+        )
     parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
 
     resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0