This commit introduces extensive documentation across the entire nanochat codebase. The goal is to make the project more accessible, educational, and easier for new contributors to understand. Key additions include:
- A new "Codebase Overview and Data Flow" section in the main README.md, providing a high-level guide to the project structure and training pipeline.
- Detailed, educational docstrings and inline comments in all Python modules within the `nanochat/`, `scripts/`, and `tasks/` directories.
- Explanations of the rationale and implementation details for key components, including Python equivalents for non-Python code where applicable.
- A new `README.md` in the `rustbpe/` directory explaining the BPE algorithm and the decision to use Rust.
- Comprehensive comments in shell scripts and development scripts in the `dev/` directory, clarifying their purpose and usage.
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
#                                                                              #
#                Dataset Preparation Reference: FineWebEdu-100B                #
#                                                                              #
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
"""
|
||
This script serves as a reference and documentation for the preparation of the
|
||
`FinewebEdu-100B` dataset.
|
||
|
||
**NOTE: This file is not intended to be executed during the project's runtime.**
|
||
|
||
Purpose of this Script:
|
||
The primary goal of this script is to transform the raw `FinewebEdu-100B` dataset into a more
|
||
efficient format for large-scale model training. The key steps are:
|
||
|
||
1. **Shuffling:** The entire dataset is shuffled to ensure that the data is presented to the
|
||
model in a random order, which is crucial for effective training.
|
||
|
||
2. **Repackaging into Shards:** The shuffled dataset is broken down into smaller chunks, or "shards."
|
||
- Each shard is saved as a Parquet file.
|
||
- The target size for each compressed shard is approximately 100MB.
|
||
- This sharding strategy is vital for performance. It allows the DataLoader to stream the
|
||
dataset from a source (like the Hugging Face Hub) and cache it locally. This "just-in-time"
|
||
data loading significantly reduces training latency, as the model doesn't have to wait for
|
||
the entire massive dataset to be downloaded.
|
||
|
||
3. **Uploading to Hugging Face:** After processing, the shards are uploaded to the Hugging Face Hub,
|
||
making them easily accessible for training runs.
|
||
|
||
This preparation process is a critical step in enabling efficient and scalable training
|
||
for the nanochat project.
|
||
"""
|
||
import os
import time

from datasets import load_dataset
import pyarrow.parquet as pq
import pyarrow as pa

# Source dataset
dataset_kwargs = {
    "path": "HuggingFaceFW/fineweb-edu",
    "split": "train",
    "name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
}
ds = load_dataset(**dataset_kwargs)

# Shuffle to scramble the order
ds = ds.shuffle(seed=42)
ndocs = len(ds) # total number of documents to process
print(f"Total number of documents: {ndocs}")

# Repackage into parquet files
output_dir = "/home/ubuntu/.cache/nanochat/base_data"
os.makedirs(output_dir, exist_ok=True)

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use multiple of 2, nicer for distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0
total_time_spent = 0
t0 = time.time()
for doc in ds:
    text = doc['text']
    shard_docs.append(text)
    shard_characters += len(text)
    collected_enough_chars = shard_characters >= chars_per_shard
    docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
    if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
        shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
        shard_table = pa.Table.from_pydict({"text": shard_docs})
        pq.write_table(
            shard_table,
            shard_path,
            row_group_size=row_group_size,
            use_dictionary=False, # this is usually used for categorical data
            compression="zstd", # Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}
            compression_level=3,
            write_statistics=False, # not needed for text
        )
        t1 = time.time()
        dt = t1 - t0 # for this shard alone
        t0 = t1
        total_docs_processed += len(shard_docs)
        total_time_spent += dt
        remaining_docs = ndocs - total_docs_processed
        avg_time_per_doc = total_time_spent / total_docs_processed
        remaining_time = remaining_docs * avg_time_per_doc
        remaining_time_hours = remaining_time / 3600
        print(f"Wrote {shard_path}. #documents: {len(shard_docs)} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
        shard_docs = []
        shard_characters = 0
        shard_index += 1
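
# Note: whatever is still buffered in shard_docs when the loop ends (a final partial
# shard that never met the size / row-group-multiple condition) is not written out.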

# Demonstration of how the data was later uploaded to HuggingFace
def upload():
    import os
    from huggingface_hub import HfApi
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    api.upload_large_folder(
        folder_path=output_dir,
        repo_id="karpathy/fineweb-edu-100b-shuffle",
        repo_type="dataset",
    )
# upload()
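
# Illustrative only: a hedged sketch (not part of the original preparation pipeline) of
# how a consumer could read one of these shards back, row group by row group. Parquet
# row groups can be read and decompressed independently, which is what allows a
# DataLoader to stream a shard instead of materializing the whole file. The function
# name and the default path below are made up for this sketch.
def peek_shard(shard_path=os.path.join(output_dir, "shard_00000.parquet")):
    pf = pq.ParquetFile(shard_path)
    print(f"{pf.metadata.num_rows} rows in {pf.num_row_groups} row groups")
    # read just the first row group: ~row_group_size documents as a pyarrow.Table
    table = pf.read_row_group(0)
    texts = table.column("text").to_pylist()  # back to plain Python strings
    print(f"first row group: {len(texts)} docs, first doc starts: {texts[0][:80]!r}")
# peek_shard()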