This commit is contained in:
Sofie Van Landeghem 2026-05-12 16:37:37 +08:00 committed by GitHub
commit f4067021db
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 31 additions and 7 deletions

View File

@ -37,6 +37,14 @@ uv sync --extra cpu # (or) Use for CPU-only / MPS
source .venv/bin/activate
```
If you want to use HuggingFace tokenizers or models, add the extra "hf":
```bash
uv sync --extra gpu --extra hf # Use for CUDA (A100/H100/etc.)
uv sync --extra cpu --extra hf # (or) Use for CPU-only / MPS
source .venv/bin/activate
```
For development (adds pytest, matplotlib, ipykernel, etc.; HuggingFace packages such as transformers are installed via the separate "hf" extra):
```bash

View File

@ -31,10 +31,6 @@ SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}|
# -----------------------------------------------------------------------------
# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
from tokenizers import Tokenizer as HFTokenizer
from tokenizers import pre_tokenizers, decoders, Regex
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
class HuggingFaceTokenizer:
"""Light wrapper around HuggingFace Tokenizer for some utilities"""
@ -42,15 +38,26 @@ class HuggingFaceTokenizer:
def __init__(self, tokenizer):
self.tokenizer = tokenizer
@staticmethod
def _try_import():
try:
import tokenizers
except ImportError as exc:
raise ImportError("Missing HF dependencies, install the extra 'hf'") from exc
@classmethod
def from_pretrained(cls, hf_path):
# init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
cls._try_import()
from tokenizers import Tokenizer as HFTokenizer
tokenizer = HFTokenizer.from_pretrained(hf_path)
return cls(tokenizer)
@classmethod
def from_directory(cls, tokenizer_dir):
# init from a local directory on disk (e.g. "out/tokenizer")
cls._try_import()
from tokenizers import Tokenizer as HFTokenizer
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
tokenizer = HFTokenizer.from_file(tokenizer_path)
return cls(tokenizer)
@ -58,6 +65,10 @@ class HuggingFaceTokenizer:
@classmethod
def train_from_iterator(cls, text_iterator, vocab_size):
# train from an iterator of text
from tokenizers import Tokenizer as HFTokenizer
from tokenizers import pre_tokenizers, decoders, Regex
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
# Configure the HuggingFace Tokenizer
tokenizer = HFTokenizer(BPE(
byte_fallback=True, # needed!

View File

@ -11,7 +11,6 @@ dependencies = [
"psutil>=7.1.0",
"rustbpe>=0.1.0",
"tiktoken>=0.11.0",
"tokenizers>=0.22.0",
"torch==2.9.1",
"uvicorn>=0.36.0",
"wandb>=0.21.3",
@ -23,7 +22,6 @@ dev = [
"matplotlib>=3.10.8",
"pytest>=8.0.0",
"python-dotenv>=1.2.1",
"transformers>=4.57.3",
]
[tool.pytest.ini_options]
@ -60,6 +58,10 @@ cpu = [
gpu = [
"torch==2.9.1",
]
hf = [
"tokenizers>=0.22.0",
"transformers>=4.57.3",
]
[tool.uv]
default-groups = []

View File

@ -67,7 +67,10 @@ class ModelWrapper:
def load_hf_model(hf_path: str, device):
"""Load a HuggingFace model and tokenizer."""
print0(f"Loading HuggingFace model from: {hf_path}")
from transformers import AutoModelForCausalLM
try:
from transformers import AutoModelForCausalLM
except ImportError as exc:
raise ImportError("Missing HF dependencies, install the extra 'hf'") from exc
model = AutoModelForCausalLM.from_pretrained(hf_path)
model.to(device)
model.eval()