mirror of https://github.com/karpathy/nanochat.git
synced 2025-12-25 21:52:26 +00:00

fix gb200 tflops

parent 4e6f5eb8b9
commit 32017e831a
@@ -4,7 +4,6 @@ import torch
 def get_promised_flops_per_gpu():
     """
     Return best-effort dense BF16 Tensor/Matrix peak FLOPs for the active GPU.
-
     Returns:
         tuple[str, float, bool]: (device_name, flops_per_gpu, is_estimated)
             device_name is the CUDA-reported name for the active device.
@@ -23,29 +22,29 @@ def get_promised_flops_per_gpu():
         return any(k in name for k in keywords)
 
     # --- NVIDIA Blackwell ---
-    if has("gb200", "grace blackwell", "b200"):
-        return result(2.25e15, False) # B200 dense BF16 ≈ 2.25 PFLOPS. (4.5 PFLOPS sparse)
+    if has("gb200", "grace blackwell"):
+        return result(2.5e15, False) # GB200 dense BF16 ≈ 2.5 PFLOPS (5.0 PFLOPS sparse)
+    if has("b200"):
+        return result(2.25e15, False) # B200 dense BF16 ≈ 2.25 PFLOPS (4.5 PFLOPS sparse)
     if has("b100"):
-        return result(1.8e15, False) # B100 dense BF16 ≈ 1.8 PFLOPS. (3.5 PFLOPS sparse)
+        return result(1.8e15, False) # B100 dense BF16 ≈ 1.8 PFLOPS (3.5 PFLOPS sparse)
 
     # --- NVIDIA Hopper (H100/H200/H800) ---
     if has("h200"):
-        # SXM ~= H100 SXM for compute; NVL/PCIe is lower clock
         if has("nvl", "pcie"):
             return result(836e12, False) # H200 NVL/PCIe dense BF16 ≈ 836 TFLOPS
         return result(989e12, False) # H200 SXM dense BF16 ≈ 989 TFLOPS
 
     if has("h100"):
-        if has("nvl"): # H100 NVL (per GPU)
-            return result(835e12, False) # 1671 TFLOPS (sparse) ⇒ ~835 dense
+        if has("nvl"):
+            return result(835e12, False) # H100 NVL dense BF16 ≈ 835 TFLOPS
         if has("pcie"):
             return result(756e12, False) # H100 PCIe dense BF16 ≈ 756 TFLOPS
         return result(989e12, False) # H100 SXM dense BF16 ≈ 989 TFLOPS
 
     if has("h800"):
-        # China-optimized Hopper; NVLink configs often quoted at H100-SXM-like numbers
         if has("nvl"):
-            return result(989e12, False) # H800 NVLink dense BF16 ≈ 989 TFLOPS (vendor configs)
+            return result(989e12, False) # H800 NVLink dense BF16 ≈ 989 TFLOPS
         return result(756e12, False) # H800 PCIe dense BF16 ≈ 756 TFLOPS
 
     # --- NVIDIA Ampere data center / export variants ---
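
A note on why the GB200 check was split out and kept first: has() matches substrings of the lowercased device name, and "b200" is itself a substring of "gb200", so a plain has("b200") would also fire for a GB200 part. Below is a minimal sketch of the post-commit Blackwell dispatch, factored to take the device name as an argument so it runs without a GPU; the has()/result() helpers, keyword strings, and FLOPs numbers mirror the diff, while the standalone signature and the fallback line are illustrative only.

def promised_flops_for(name: str):
    # Sketch only: the real function reads the name from the active CUDA device.
    name = name.lower()

    def result(flops, is_estimated):
        return name, flops, is_estimated

    def has(*keywords):
        return any(k in name for k in keywords)  # substring match

    # Order matters: "b200" is a substring of "gb200", so the GB200 /
    # Grace Blackwell check must run before the plain B200 check.
    if has("gb200", "grace blackwell"):
        return result(2.5e15, False)   # GB200 dense BF16 ≈ 2.5 PFLOPS
    if has("b200"):
        return result(2.25e15, False)  # B200 dense BF16 ≈ 2.25 PFLOPS
    if has("b100"):
        return result(1.8e15, False)   # B100 dense BF16 ≈ 1.8 PFLOPS
    return result(0.0, True)           # placeholder; the real function covers more GPUs

assert promised_flops_for("NVIDIA GB200")[1] == 2.5e15   # not 2.25e15, despite containing "b200"
assert promised_flops_for("NVIDIA B200")[1] == 2.25e15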
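
Peak numbers like these typically feed an MFU (model FLOPs utilization) readout, which is what the returned (device_name, flops_per_gpu, is_estimated) tuple supports. A hypothetical call site, assuming get_promised_flops_per_gpu is imported from wherever the module above lives (the file path is not shown in this diff); the per-step numbers are made-up illustrations, not nanochat values:

# Hypothetical MFU readout using the function from the diff.
device_name, promised_flops, is_estimated = get_promised_flops_per_gpu()

flops_per_step = 2.9e15  # e.g. roughly 6 * params * tokens for one transformer step
step_time_s = 2.0        # measured wall-clock seconds per step
achieved = flops_per_step / step_time_s  # achieved FLOPs/s on this GPU

mfu = achieved / promised_flops  # utilization vs. the promised dense BF16 peak
note = " (estimated peak)" if is_estimated else ""
print(f"{device_name}: MFU {mfu:.1%}{note}")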