mirror of
https://github.com/karpathy/nanochat.git
synced 2026-06-15 02:29:09 +00:00
Merge 079cc226a9 into dc54a1a307
This commit is contained in:
commit
f4067021db
|
|
@ -37,6 +37,14 @@ uv sync --extra cpu # (or) Use for CPU-only / MPS
|
|||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
If you want to use HuggingFace tokenizers or models, add the extra "hf":
|
||||
|
||||
```bash
|
||||
uv sync --extra gpu --extra hf # Use for CUDA (A100/H100/etc.)
|
||||
uv sync --extra cpu --extra hf # (or) Use for CPU-only / MPS
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
For development (adds pytest, matplotlib, ipykernel, transformers, etc.):
|
||||
|
||||
```bash
|
||||
|
|
|
|||
|
|
@ -31,10 +31,6 @@ SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}|
|
|||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
|
||||
from tokenizers import Tokenizer as HFTokenizer
|
||||
from tokenizers import pre_tokenizers, decoders, Regex
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.trainers import BpeTrainer
|
||||
|
||||
class HuggingFaceTokenizer:
|
||||
"""Light wrapper around HuggingFace Tokenizer for some utilities"""
|
||||
|
|
@ -42,15 +38,26 @@ class HuggingFaceTokenizer:
|
|||
def __init__(self, tokenizer):
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
@staticmethod
|
||||
def _try_import():
|
||||
try:
|
||||
import tokenizers
|
||||
except ImportError as exc:
|
||||
raise ImportError("Missing HF dependencies, install the extra 'hf'") from exc
|
||||
|
||||
@classmethod
def from_pretrained(cls, hf_path):
    """Build a wrapper around a pretrained HuggingFace tokenizer.

    Args:
        hf_path: Identifier passed to ``Tokenizer.from_pretrained``
            (e.g. "gpt2").

    Returns:
        A new ``cls`` instance wrapping the loaded tokenizer.

    Raises:
        ImportError: Raised by ``_try_import`` when ``tokenizers`` cannot
            be imported.
    """
    # Surface the install hint before touching the optional package.
    cls._try_import()
    import tokenizers
    return cls(tokenizers.Tokenizer.from_pretrained(hf_path))
|
||||
|
||||
@classmethod
def from_directory(cls, tokenizer_dir):
    """Build a wrapper from a tokenizer saved in a local directory.

    Args:
        tokenizer_dir: Directory (e.g. "out/tokenizer") that contains a
            ``tokenizer.json`` file.

    Returns:
        A new ``cls`` instance wrapping the tokenizer read from that file.

    Raises:
        ImportError: Raised by ``_try_import`` when ``tokenizers`` cannot
            be imported.
    """
    # Surface the install hint before touching the optional package.
    cls._try_import()
    import tokenizers
    json_file = os.path.join(tokenizer_dir, "tokenizer.json")
    return cls(tokenizers.Tokenizer.from_file(json_file))
|
||||
|
|
@ -58,6 +65,10 @@ class HuggingFaceTokenizer:
|
|||
@classmethod
|
||||
def train_from_iterator(cls, text_iterator, vocab_size):
|
||||
# train from an iterator of text
|
||||
from tokenizers import Tokenizer as HFTokenizer
|
||||
from tokenizers import pre_tokenizers, decoders, Regex
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.trainers import BpeTrainer
|
||||
# Configure the HuggingFace Tokenizer
|
||||
tokenizer = HFTokenizer(BPE(
|
||||
byte_fallback=True, # needed!
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ dependencies = [
|
|||
"psutil>=7.1.0",
|
||||
"rustbpe>=0.1.0",
|
||||
"tiktoken>=0.11.0",
|
||||
"tokenizers>=0.22.0",
|
||||
"torch==2.9.1",
|
||||
"uvicorn>=0.36.0",
|
||||
"wandb>=0.21.3",
|
||||
|
|
@ -23,7 +22,6 @@ dev = [
|
|||
"matplotlib>=3.10.8",
|
||||
"pytest>=8.0.0",
|
||||
"python-dotenv>=1.2.1",
|
||||
"transformers>=4.57.3",
|
||||
]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
|
|
@ -60,6 +58,10 @@ cpu = [
|
|||
gpu = [
|
||||
"torch==2.9.1",
|
||||
]
|
||||
hf = [
|
||||
"tokenizers>=0.22.0",
|
||||
"transformers>=4.57.3",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
default-groups = []
|
||||
|
|
|
|||
|
|
@ -67,7 +67,10 @@ class ModelWrapper:
|
|||
def load_hf_model(hf_path: str, device):
|
||||
"""Load a HuggingFace model and tokenizer."""
|
||||
print0(f"Loading HuggingFace model from: {hf_path}")
|
||||
from transformers import AutoModelForCausalLM
|
||||
try:
|
||||
from transformers import AutoModelForCausalLM
|
||||
except ImportError as exc:
|
||||
raise ImportError("Missing HF dependencies, install the extra 'hf'") from exc
|
||||
model = AutoModelForCausalLM.from_pretrained(hf_path)
|
||||
model.to(device)
|
||||
model.eval()
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user