From 5a785854d139b9f32fb76dc155b562772851df19 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 14 Oct 2025 06:48:34 +0000
Subject: [PATCH] feat: Add HSA_OVERRIDE_GFX_VERSION for newer AMD GPUs

This commit adds the `HSA_OVERRIDE_GFX_VERSION` environment variable to the `speedrun.sh` script. This is a workaround to enable support for newer AMD GPU architectures (e.g., gfx1151) that are not yet officially supported in the pre-compiled PyTorch ROCm builds.

This change also includes an update to the `README.md` to explain this workaround to users.
---
 README.md   | 2 ++
 speedrun.sh | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 60189d2..3da3c2a 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,8 @@ Before you begin, ensure you have the necessary drivers and toolkits installed f
 -   **NVIDIA GPUs:** You will need the NVIDIA driver and the CUDA Toolkit installed.
 -   **AMD GPUs:** You will need to install the ROCm platform.
 
+    **Note for newer AMD GPUs:** Some newer AMD GPU architectures (e.g., `gfx1151`) may not yet be officially supported by the pre-compiled PyTorch ROCm builds. If you encounter a `torch.AcceleratorError: HIP error: invalid device function` error, you can work around it by setting the `HSA_OVERRIDE_GFX_VERSION` environment variable to the version of a compatible, supported architecture. For example, for `gfx1151` you can use `11.0.0` (i.e., `gfx1100`): `export HSA_OVERRIDE_GFX_VERSION=11.0.0`. The `speedrun.sh` script sets this value by default; if your GPU needs a different override, or none at all, adjust that line in the script accordingly.
+
 ### Python Dependencies
 
 The Python dependencies are managed by `uv`. The project is configured to install a specific version of PyTorch that is compatible with your hardware.
diff --git a/speedrun.sh b/speedrun.sh
index 330a585..add2a01 100644
--- a/speedrun.sh
+++ b/speedrun.sh
@@ -12,6 +12,10 @@
 
 # Default intermediate artifacts directory is in ~/.cache/nanochat
 export OMP_NUM_THREADS=1
+# Workaround for newer AMD GPUs (e.g. gfx1151) that the pre-built PyTorch ROCm builds do
+# not officially support yet: override the detected architecture with gfx1100 (11.0.0).
+# A value already set in the caller's environment is kept, so other GPUs can pick their own.
+export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-11.0.0}"
 NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
 mkdir -p $NANOCHAT_BASE_DIR