This commit is contained in:
Ruhollah Majdoddin 2025-11-01 07:14:41 -07:00 committed by GitHub
commit 6fe0b41dd2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 1896 additions and 1259 deletions

View File

@ -12,6 +12,8 @@
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p $NANOCHAT_BASE_DIR
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
[ -d ".venv" ] || uv venv
uv sync --extra cpu
@ -19,9 +21,6 @@ source .venv/bin/activate
if [ -z "$WANDB_RUN" ]; then
WANDB_RUN=dummy
fi
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL

View File

@ -16,21 +16,15 @@ dependencies = [
"torch>=2.8.0",
"uvicorn>=0.36.0",
"wandb>=0.21.3",
"rustbpe~=0.1.0"
]
[build-system]
requires = ["maturin>=1.7,<2.0"]
build-backend = "maturin"
[tool.maturin]
module-name = "rustbpe"
bindings = "pyo3"
python-source = "."
manifest-path = "rustbpe/Cargo.toml"
requires = ["uv_build>=0.9.5,<0.10.0"]
build-backend = "uv_build"
[dependency-groups]
dev = [
"maturin>=1.9.4",
"pytest>=8.0.0",
]
@ -45,33 +39,37 @@ python_functions = ["test_*"]
# target torch to cuda 12.8 or CPU
[tool.uv.sources]
torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu128", extra = "gpu" },
torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu128", extra = "gpu" },
]
rustbpe = { workspace = true }
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[project.optional-dependencies]
cpu = [
"torch>=2.8.0",
]
gpu = [
"torch>=2.8.0",
]
[tool.uv]
conflicts = [
[
{ extra = "cpu" },
{ extra = "gpu" },
],
]
[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true
[project.optional-dependencies]
cpu = [
"torch>=2.8.0",
]
gpu = [
"torch>=2.8.0",
]
[tool.uv]
conflicts = [
[
{ extra = "cpu" },
{ extra = "gpu" },
],
]
[tool.uv.workspace]
members = ["rustbpe"]

View File

@ -8,6 +8,8 @@
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p $NANOCHAT_BASE_DIR
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
[ -d ".venv" ] || uv venv
uv sync --extra gpu
@ -16,9 +18,6 @@ if [ -z "$WANDB_RUN" ]; then
WANDB_RUN=dummy
fi
python -m nanochat.report reset
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL

113
rustbpe/Cargo.lock generated
View File

@ -62,9 +62,9 @@ dependencies = [
[[package]]
name = "cfg-if"
version = "1.0.3"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "compact_str"
@ -107,9 +107,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "dary_heap"
version = "0.3.7"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
[[package]]
name = "either"
@ -125,9 +125,9 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "fancy-regex"
version = "0.16.1"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf04c5ec15464ace8355a7b440a33aece288993475556d461154d7a62ad9947c"
checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
dependencies = [
"bit-set",
"regex-automata",
@ -136,21 +136,21 @@ dependencies = [
[[package]]
name = "getrandom"
version = "0.3.3"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi",
"wasip2",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
[[package]]
name = "heck"
@ -160,9 +160,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "indexmap"
version = "2.11.0"
version = "2.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9"
checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
dependencies = [
"equivalent",
"hashbrown",
@ -170,9 +170,12 @@ dependencies = [
[[package]]
name = "indoc"
version = "2.0.6"
version = "2.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
dependencies = [
"rustversion",
]
[[package]]
name = "itoa"
@ -182,9 +185,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.175"
version = "0.2.177"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
[[package]]
name = "log"
@ -194,9 +197,9 @@ checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
[[package]]
name = "memchr"
version = "2.7.5"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "memoffset"
@ -221,20 +224,19 @@ checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483"
[[package]]
name = "proc-macro2"
version = "1.0.101"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "pyo3"
version = "0.23.5"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383"
dependencies = [
"cfg-if",
"indoc",
"libc",
"memoffset",
@ -248,19 +250,18 @@ dependencies = [
[[package]]
name = "pyo3-build-config"
version = "0.23.5"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f"
dependencies = [
"once_cell",
"target-lexicon",
]
[[package]]
name = "pyo3-ffi"
version = "0.23.5"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105"
dependencies = [
"libc",
"pyo3-build-config",
@ -268,9 +269,9 @@ dependencies = [
[[package]]
name = "pyo3-log"
version = "0.12.4"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45192e5e4a4d2505587e27806c7b710c231c40c56f3bfc19535d0bb25df52264"
checksum = "d359e20231345f21a3b5b6aea7e73f4dc97e1712ef3bfe2d88997ac6a308d784"
dependencies = [
"arc-swap",
"log",
@ -279,9 +280,9 @@ dependencies = [
[[package]]
name = "pyo3-macros"
version = "0.23.5"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
@ -291,9 +292,9 @@ dependencies = [
[[package]]
name = "pyo3-macros-backend"
version = "0.23.5"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf"
dependencies = [
"heck",
"proc-macro2",
@ -304,9 +305,9 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.40"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
@ -339,9 +340,9 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.4.10"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6"
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
dependencies = [
"aho-corasick",
"memchr",
@ -350,9 +351,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.8.6"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "rustbpe"
@ -389,9 +390,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "syn"
version = "2.0.106"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
@ -400,15 +401,15 @@ dependencies = [
[[package]]
name = "target-lexicon"
version = "0.12.16"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c"
[[package]]
name = "unicode-ident"
version = "1.0.18"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "unindent"
@ -423,34 +424,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
version = "0.14.4+wasi-0.2.4"
name = "wasip2"
version = "1.0.1+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88a5f4a424faf49c3c2c344f166f0662341d470ea185e939657aaff130f0ec4a"
checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
dependencies = [
"wit-bindgen",
]
[[package]]
name = "wit-bindgen"
version = "0.45.1"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c573471f125075647d03df72e026074b7203790d41351cd6edc96f46bcccd36"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
name = "zerocopy"
version = "0.8.26"
version = "0.8.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.26"
version = "0.8.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
dependencies = [
"proc-macro2",
"quote",

View File

@ -8,8 +8,8 @@ dary_heap = "0.3"
indexmap = "2.2"
fancy-regex = "0.16.1"
log = "0.4.28"
pyo3 = { version = "0.23.3", features = ["extension-module"] }
pyo3-log = "0.12.4"
pyo3 = { version = "0.26", features = ["extension-module"] }
pyo3-log = "0.13.1"
ahash = "0.8.12"
rayon = "1.11.0"
compact_str = "0.9.0"

25
rustbpe/pyproject.toml Normal file
View File

@ -0,0 +1,25 @@
[project]
name = "rustbpe"
version = "0.1.0"
description = "A very lightweight Rust library for training a GPT tokenizer"
readme = "README.md"
requires-python = ">=3.10"
# when uv/pip invoke maturin through the PEP 517 interface, it always builds in release mode by default.
[build-system]
requires = ["maturin>=1.9.6,<2.0"]
build-backend = "maturin"
[tool.maturin]
module-name = "rustbpe"
bindings = "pyo3"
manifest-path = "Cargo.toml"
features = ["pyo3/extension-module"]
[tool.uv]
cache-keys = [
{ file = "pyproject.toml" },
{ file = "src/**/*.rs" },
{ file = "Cargo.toml" },
{ file = "Cargo.lock" }
]

View File

@ -15,6 +15,10 @@ export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p $NANOCHAT_BASE_DIR
# Install Rust / Cargo -- dependencies of rustbpe
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
# -----------------------------------------------------------------------------
# Python venv setup with uv
@ -47,14 +51,6 @@ python -m nanochat.report reset
# -----------------------------------------------------------------------------
# Tokenizer
# Install Rust / Cargo
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
# Build the rustbpe Tokenizer
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
# Download the first ~2B characters of pretraining dataset
# look at dev/repackage_data_reference.py for details on how this data was prepared
# each data shard is ~250M chars

View File

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

2923
uv.lock

File diff suppressed because it is too large Load Diff