From defd1246aa80c99cf1b486ec112b1223916e21df Mon Sep 17 00:00:00 2001
From: Luke Stanley <306671+lukestanley@users.noreply.github.com>
Date: Tue, 21 Oct 2025 19:43:38 +0000
Subject: [PATCH] Fix Torch crash caused by pinning on CPU

---
 nanochat/dataloader.py | 3 ++-
 scripts/mid_train.py   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py
index 3d479a1..6c864d3 100644
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@@ -38,7 +38,8 @@ def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokeniz
         batch_index += 1
         # Move tokens from the deque into the scratch buffer
         tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
-        scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=True)
+        # CUDA supports memory pinning for faster transfers between CPU and GPU:
+        scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=(device == "cuda"))
         # Create the inputs/targets as 1D tensors
         inputs_cpu = scratch[:-1].to(dtype=torch.int32)
         targets_cpu = scratch[1:]
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index c731d57..2835ebf 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -119,7 +119,8 @@ def mid_data_generator(split):
     assert dataset_size > 0
     needed_tokens = device_batch_size * max_seq_len + 1 # to form one training batch of inputs,targets
     token_buffer = deque()
-    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=True)
+    # CUDA supports memory pinning for faster transfers between CPU and GPU:
+    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda"))
     cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents
     it = 0 # iteration counter
     while True:
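
Note (not part of the patch): the hunks above gate pin_memory on the detected device so that
CPU-only machines never request pinned host memory, which requires an accelerator. A minimal
standalone sketch of that pattern is below; the device detection and the token values here are
illustrative assumptions, not code taken from the repository.

    import torch

    # Assumption for illustration: detect the device the same way the gating intends.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokens = list(range(1025))  # hypothetical token ids forming one batch
    # Only pin host memory when a CUDA device is present; on CPU this stays a normal tensor.
    scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=(device == "cuda"))
    inputs_cpu = scratch[:-1].to(dtype=torch.int32)
    targets_cpu = scratch[1:]

    # Pinned memory enables asynchronous host-to-device copies on CUDA.
    if device == "cuda":
        inputs = inputs_cpu.to(device, non_blocking=True)
        targets = targets_cpu.to(device, non_blocking=True)
    else:
        inputs, targets = inputs_cpu, targets_cpu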