mirror of
https://github.com/karpathy/nanochat.git
synced 2026-06-15 10:39:08 +00:00
fix(dataloader): warn when single parquet file causes train/val overlap
## Problem

When only one parquet file exists:

- train: `parquet_paths[:-1]` = `[]` (empty!)
- val: `parquet_paths[-1:]` = `[file.parquet]`

This causes either:

1. the train split to fail with 'No dataset parquet files found', or
2. if the assertion passes, train to get no data while val gets all of the data.

## Solution

Add a warning when a single file is detected, to inform users that:

- both train and val will use the same data;
- this may cause overfitting;
- splitting the dataset into multiple files is recommended.

The warning is shown only once (rank 0) to avoid spam.

## Edge Case

This is a common scenario for users testing with small datasets.
This commit is contained in:
parent
6ed7d1d82c
commit
5328971c02
|
|
@ -35,6 +35,17 @@ def _document_batches(split, resume_state_dict, tokenizer_batch_size):
|
|||
warn_on_legacy = ddp_rank == 0 and split == "train" # rank 0 on train split will warn on legacy
|
||||
parquet_paths = list_parquet_files(warn_on_legacy=warn_on_legacy)
|
||||
assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
|
||||
|
||||
# Split parquet files: last file for validation, rest for training
|
||||
# Handle edge case: single file scenario
|
||||
if len(parquet_paths) == 1:
|
||||
import warnings
|
||||
warnings.warn(
|
||||
"Only 1 parquet file found. "
|
||||
"Both train and val will use the same data, which may cause overfitting. "
|
||||
"Consider splitting your dataset into multiple parquet files.",
|
||||
UserWarning
|
||||
)
|
||||
parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
|
||||
|
||||
resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user