mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-07 09:50:28 +00:00
Speedrun submission with nvidia climbmix, add the new dataset as an option
This commit is contained in:
parent
c7ba252142
commit
8603b68f5d
|
|
@ -13,15 +13,22 @@ import time
|
|||
import requests
|
||||
import pyarrow.parquet as pq
|
||||
from multiprocessing import Pool
|
||||
from functools import partial
|
||||
|
||||
from nanochat.common import get_base_dir
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# The specifics of the current pretraining dataset
|
||||
|
||||
# The URL on the internet where the data is hosted and downloaded from on demand
|
||||
BASE_URL = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"
|
||||
MAX_SHARD = 1822 # the last datashard is shard_01822.parquet
|
||||
# The URLs on the internet where the data is hosted and downloaded from on demand
|
||||
# FineWeb-Edu dataset
|
||||
BASE_URL_FW = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"
|
||||
MAX_SHARD_FW = 1822 # the last datashard is shard_01822.parquet
|
||||
|
||||
# NVIDIA ClimbMix dataset
|
||||
BASE_URL_NVCM = "https://huggingface.co/datasets/ddudek/nanochat-climbmix-80b-shuffle/resolve/main"
|
||||
MAX_SHARD_NVCM = 1306
|
||||
|
||||
index_to_filename = lambda index: f"shard_{index:05d}.parquet" # format of the filenames
|
||||
base_dir = get_base_dir()
|
||||
DATA_DIR = os.path.join(base_dir, "base_data")
|
||||
|
|
@ -57,9 +64,11 @@ def parquets_iter_batched(split, start=0, step=1):
|
|||
yield texts
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
def download_single_file(index):
|
||||
def download_single_file(index, ds_climbmix):
|
||||
""" Downloads a single file index, with some backoff """
|
||||
|
||||
base_url = BASE_URL_NVCM if ds_climbmix else BASE_URL_FW
|
||||
|
||||
# Construct the local filepath for this file and skip if it already exists
|
||||
filename = index_to_filename(index)
|
||||
filepath = os.path.join(DATA_DIR, filename)
|
||||
|
|
@ -68,7 +77,7 @@ def download_single_file(index):
|
|||
return True
|
||||
|
||||
# Construct the remote URL for this file
|
||||
url = f"{BASE_URL}/{filename}"
|
||||
url = f"{base_url}/{filename}"
|
||||
print(f"Downloading {filename}...")
|
||||
|
||||
# Download with retries
|
||||
|
|
@ -110,18 +119,24 @@ def download_single_file(index):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Download FineWeb-Edu 100BT dataset shards")
|
||||
parser = argparse.ArgumentParser(description="Download dataset shards")
|
||||
parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of shards to download (default: -1), -1 = disable")
|
||||
parser.add_argument("-w", "--num-workers", type=int, default=4, help="Number of parallel download workers (default: 4)")
|
||||
parser.add_argument("-dscm", "--ds-climbmix", action="store_true", help="Use NVIDIA Climbmix dataset, otherwise use default Fineweb-Edu")
|
||||
args = parser.parse_args()
|
||||
|
||||
num = MAX_SHARD + 1 if args.num_files == -1 else min(args.num_files, MAX_SHARD + 1)
|
||||
num_shards = MAX_SHARD_NVCM if args.ds_climbmix else MAX_SHARD_FW
|
||||
|
||||
num = num_shards + 1 if args.num_files == -1 else min(args.num_files, num_shards + 1)
|
||||
ids_to_download = list(range(num))
|
||||
print(f"Downloading {len(ids_to_download)} shards using {args.num_workers} workers...")
|
||||
print(f"Target directory: {DATA_DIR}")
|
||||
if args.ds_climbmix:
|
||||
print("Using NVIDIA ClimbMix dataset")
|
||||
print()
|
||||
with Pool(processes=args.num_workers) as pool:
|
||||
results = pool.map(download_single_file, ids_to_download)
|
||||
func = partial(download_single_file, ds_climbmix=args.ds_climbmix)
|
||||
results = pool.map(func, ids_to_download)
|
||||
|
||||
# Report results
|
||||
successful = sum(1 for success in results if success)
|
||||
|
|
|
|||
|
|
@ -53,11 +53,11 @@ python -m nanochat.report reset
|
|||
# so we download 2e9 / 250e6 = 8 data shards at this point
|
||||
# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
|
||||
# look at dev/repackage_data_reference.py for details on how this data was prepared
|
||||
python -m nanochat.dataset -n 8
|
||||
python -m nanochat.dataset -n 8 --ds-climbmix
|
||||
# Immediately also kick off downloading more shards in the background while tokenizer trains
|
||||
# Approximately 350 shards are needed for 10B tokens of data for pretraining.
|
||||
# The maximum total number of shards available in the entire dataset is 1822.
|
||||
python -m nanochat.dataset -n 370 &
|
||||
python -m nanochat.dataset -n 370 --ds-climbmix &
|
||||
DATASET_DOWNLOAD_PID=$!
|
||||
# train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data
|
||||
python -m scripts.tok_train
|
||||
|
|
@ -69,8 +69,8 @@ python -m scripts.tok_eval
|
|||
echo "Waiting for dataset download to complete..."
|
||||
wait $DATASET_DOWNLOAD_PID
|
||||
|
||||
# d26 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8.25)
|
||||
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.25 --device-batch-size=16 --fp8 --run=$WANDB_RUN
|
||||
# d26 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8.0)
|
||||
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.0 --device-batch-size=16 --fp8 --run=$WANDB_RUN
|
||||
# evaluate the model: CORE metric, BPB on train/val, and draw samples
|
||||
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user