diff --git a/nanochat/engine.py b/nanochat/engine.py
index 7f05eb4..376e0ea 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -308,7 +308,7 @@ if __name__ == "__main__":
     # init compute
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
     device_type = autodetect_device_type()
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+    autocast_ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)  # CPU runs without autocast
 
     # load the model and tokenizer
     model, tokenizer, meta = load_model("base", device, phase="eval")
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index bd83ff3..706457a 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -156,7 +156,7 @@ def main():
     # distributed / precision setup
     device_type = autodetect_device_type()
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+    autocast_ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)  # CPU runs without autocast
 
     # Load model and tokenizer from command line or from file system
     if args.hf_path is not None:
diff --git a/scripts/base_loss.py b/scripts/base_loss.py
index fb8cf59..2fc10f0 100644
--- a/scripts/base_loss.py
+++ b/scripts/base_loss.py
@@ -87,7 +87,7 @@ else:
     token_bytes = get_token_bytes(device=device)
     model_name = f"base_model (step {meta['step']})"
 
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+autocast_ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)  # CPU runs without autocast
 
 print0(f"Evaluating model: {model_name}")
 
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 2d61477..4504949 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -79,9 +79,9 @@ user_config = vars(args).copy()  # for logging
 device_type = autodetect_device_type() if args.device_type == "" else args.device_type
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type != "cpu" else nullcontext()
 synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
 get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
 if device_type == "cuda":
     gpu_device_name = torch.cuda.get_device_name(0)
     gpu_peak_flops = get_peak_flops(gpu_device_name)
diff --git a/scripts/chat_cli.py b/scripts/chat_cli.py
index b14843a..c82802e 100644
--- a/scripts/chat_cli.py
+++ b/scripts/chat_cli.py
@@ -27,7 +27,7 @@ args = parser.parse_args()
 device_type = autodetect_device_type() if args.device_type == "" else args.device_type
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
+autocast_ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=ptdtype)  # CPU runs without autocast
 model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
 
 # Special tokens for the chat state machine
diff --git a/scripts/chat_eval.py b/scripts/chat_eval.py
index a558303..a92d515 100644
--- a/scripts/chat_eval.py
+++ b/scripts/chat_eval.py
@@ -200,7 +200,7 @@ if __name__ == "__main__":
     device_type = autodetect_device_type() if args.device_type == "" else args.device_type
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
     ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
+    autocast_ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=ptdtype)  # CPU runs without autocast
 
     model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
     engine = Engine(model, tokenizer)
diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py
index 9277cf9..c21c43a 100644
--- a/scripts/chat_sft.py
+++ b/scripts/chat_sft.py
@@ -69,7 +69,7 @@ device_type = autodetect_device_type() if args.device_type == "" else args.devic
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 master_process = ddp_rank == 0
 ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
+autocast_ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=ptdtype)  # CPU runs without autocast
 
 # wandb logging init
 use_dummy_wandb = args.run == "dummy" or not master_process
diff --git a/scripts/chat_web.py b/scripts/chat_web.py
index 4b67b62..d8a49fb 100644
--- a/scripts/chat_web.py
+++ b/scripts/chat_web.py
@@ -125,7 +125,7 @@ class WorkerPool:
 
             model, tokenizer, _ = load_model(source, device, phase="eval", model_tag=model_tag, step=step)
             engine = Engine(model, tokenizer)
-            autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
+            autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type != "cpu" else nullcontext()
 
             worker = Worker(
                 gpu_id=gpu_id,
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index c127c94..aab86c0 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -67,9 +67,9 @@ device_type = autodetect_device_type() if args.device_type == "" else args.devic
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 master_process = ddp_rank == 0
 ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type != "cpu" else nullcontext()
 synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
 get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
 
 # wandb logging init
 use_dummy_wandb = args.run == "dummy" or not master_process