diff --git a/README.md b/README.md
index 483f3e38..3f942145 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,14 @@ uv sync --extra cpu # (or) Use for CPU-only / MPS
 source .venv/bin/activate
 ```
 
+If you want to use HuggingFace tokenizers or models, add the extra "hf":
+
+```bash
+uv sync --extra gpu --extra hf # Use for CUDA (A100/H100/etc.)
+uv sync --extra cpu --extra hf # (or) Use for CPU-only / MPS
+source .venv/bin/activate
+```
+
-For development (adds pytest, matplotlib, ipykernel, transformers, etc.):
+For development (adds pytest, matplotlib, ipykernel, etc.):
 
 ```bash
diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py
index a2146c2e..97068a7a 100644
--- a/nanochat/tokenizer.py
+++ b/nanochat/tokenizer.py
@@ -31,10 +31,6 @@ SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}|
 
 # -----------------------------------------------------------------------------
 # Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
-from tokenizers import Tokenizer as HFTokenizer
-from tokenizers import pre_tokenizers, decoders, Regex
-from tokenizers.models import BPE
-from tokenizers.trainers import BpeTrainer
 
 class HuggingFaceTokenizer:
     """Light wrapper around HuggingFace Tokenizer for some utilities"""
@@ -42,15 +38,27 @@ class HuggingFaceTokenizer:
     def __init__(self, tokenizer):
         self.tokenizer = tokenizer
 
+    @staticmethod
+    def _try_import():
+        # fail with an actionable message when the optional 'hf' extra is not installed
+        try:
+            import tokenizers
+        except ImportError as exc:
+            raise ImportError("Missing HF dependencies, install the extra 'hf'") from exc
+
     @classmethod
     def from_pretrained(cls, hf_path):
         # init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
+        cls._try_import()
+        from tokenizers import Tokenizer as HFTokenizer
         tokenizer = HFTokenizer.from_pretrained(hf_path)
         return cls(tokenizer)
 
     @classmethod
     def from_directory(cls, tokenizer_dir):
         # init from a local directory on disk (e.g. "out/tokenizer")
+        cls._try_import()
+        from tokenizers import Tokenizer as HFTokenizer
         tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
         tokenizer = HFTokenizer.from_file(tokenizer_path)
         return cls(tokenizer)
@@ -58,6 +66,11 @@ class HuggingFaceTokenizer:
     @classmethod
     def train_from_iterator(cls, text_iterator, vocab_size):
         # train from an iterator of text
+        cls._try_import()
+        from tokenizers import Tokenizer as HFTokenizer
+        from tokenizers import pre_tokenizers, decoders, Regex
+        from tokenizers.models import BPE
+        from tokenizers.trainers import BpeTrainer
         # Configure the HuggingFace Tokenizer
         tokenizer = HFTokenizer(BPE(
             byte_fallback=True, # needed!
diff --git a/pyproject.toml b/pyproject.toml
index 0527369f..a95a0085 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,6 @@ dependencies = [
     "psutil>=7.1.0",
     "rustbpe>=0.1.0",
     "tiktoken>=0.11.0",
-    "tokenizers>=0.22.0",
     "torch==2.9.1",
     "uvicorn>=0.36.0",
     "wandb>=0.21.3",
@@ -23,7 +22,6 @@ dev = [
     "matplotlib>=3.10.8",
     "pytest>=8.0.0",
     "python-dotenv>=1.2.1",
-    "transformers>=4.57.3",
 ]
 
 [tool.pytest.ini_options]
@@ -60,6 +58,10 @@ cpu = [
 gpu = [
     "torch==2.9.1",
 ]
+hf = [
+    "tokenizers>=0.22.0",
+    "transformers>=4.57.3",
+]
 
 [tool.uv]
 default-groups = []
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index a57bbaf6..18cbc946 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -67,7 +67,10 @@ class ModelWrapper:
 def load_hf_model(hf_path: str, device):
     """Load a HuggingFace model and tokenizer."""
     print0(f"Loading HuggingFace model from: {hf_path}")
-    from transformers import AutoModelForCausalLM
+    try:
+        from transformers import AutoModelForCausalLM
+    except ImportError as exc:
+        raise ImportError("Missing HF dependencies, install the extra 'hf'") from exc
     model = AutoModelForCausalLM.from_pretrained(hf_path)
     model.to(device)
     model.eval()