nanochat/test_torchrun.py
2025-12-01 19:59:58 -05:00

36 lines
875 B
Python

#!/usr/bin/env python3
"""Test torchrun command locally"""
import subprocess
import sys
# Simulate the exact command that will run
cmd = [
    "torchrun", "--standalone", "--nproc_per_node=1",
    "-m", "scripts.base_train",
    "--depth=4",
    "--device_batch_size=1",
    "--num_iterations=2",
    "--run=test_local",
    "--vertex_experiment=",
    "--vertex_tensorboard=",
]

# Seconds the command is allowed to run before it is killed.
TIMEOUT_SECONDS = 30
# Maximum number of characters echoed from each captured stream.
MAX_OUTPUT_CHARS = 1000


def _as_text(stream):
    """Return a captured stream as ``str``.

    ``None`` (nothing captured) becomes ``""``. ``bytes`` are decoded
    leniently, because ``subprocess.TimeoutExpired`` may carry raw bytes
    even when the process was started with ``text=True``.
    """
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode(errors="replace")
    return stream


def _report(stdout, stderr):
    """Print the leading ``MAX_OUTPUT_CHARS`` characters of both streams."""
    print("STDOUT:")
    print(_as_text(stdout)[:MAX_OUTPUT_CHARS])
    print("\nSTDERR:")
    print(_as_text(stderr)[:MAX_OUTPUT_CHARS])


def main(command=None, timeout=TIMEOUT_SECONDS):
    """Run *command*, echo its captured output and return an exit status.

    Args:
        command: Argument list to execute. Defaults to the module-level
            ``cmd`` (the torchrun invocation above).
        timeout: Seconds to wait before the child process is killed.

    Returns:
        The child's return code when it finishes in time; ``0`` when it
        times out (a run that is still going after the timeout is treated
        as having started successfully); ``1`` when the command could not
        be run at all (for example, the executable is missing).
    """
    if command is None:
        command = cmd

    print("Testing command:")
    print(" ".join(command))
    print()

    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired as exc:
        print("Command timed out (expected for training)")
        # Show whatever was captured before the kill so that a hang can be
        # told apart from normal progress.
        _report(exc.stdout, exc.stderr)
        return 0
    except Exception as e:
        # Top-level boundary of a diagnostic script: report any launch
        # failure and signal it through the exit status.
        print(f"Error: {e}")
        return 1

    _report(result.stdout, result.stderr)
    print(f"\nExit code: {result.returncode}")
    return result.returncode


# Guarded so that importing this module (for example during pytest
# collection of test_*.py files) neither launches torchrun nor exits.
if __name__ == "__main__":
    sys.exit(main())