mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 12:22:18 +00:00
36 lines
875 B
Python
36 lines
875 B
Python
#!/usr/bin/env python3
"""Smoke-test the torchrun training command on the local machine.

Launches ``scripts.base_train`` through ``torchrun`` with a tiny configuration,
prints the first 1000 characters of the captured stdout and stderr, and exits
with the child's return code. If the child is still running after 30 seconds
the timeout is reported and the script exits 0; any other exception is printed
and the script exits 1.
"""
import subprocess
import sys

# Upper bound, in seconds, on how long the child process may run.
TIMEOUT_SECONDS = 30
# Number of leading characters of each captured stream that get printed.
PREVIEW_CHARS = 1000

# Simulate the exact command that will run.
launcher = ["torchrun", "--standalone", "--nproc_per_node=1"]
train_flags = [
    "--depth=4",
    "--device_batch_size=1",
    "--num_iterations=2",
    "--run=test_local",
    # Both vertex flags are passed with an empty value.
    "--vertex_experiment=",
    "--vertex_tensorboard=",
]
cmd = [*launcher, "-m", "scripts.base_train", *train_flags]

# Header, the command joined with spaces, then one blank line.
print("Testing command:", " ".join(cmd), "", sep="\n")

try:
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=TIMEOUT_SECONDS,
    )
    # Print a bounded preview of each captured stream under its heading.
    for heading, captured in (
        ("STDOUT:", completed.stdout),
        ("\nSTDERR:", completed.stderr),
    ):
        print(heading)
        print(captured[:PREVIEW_CHARS])
    print(f"\nExit code: {completed.returncode}")
    exit_status = completed.returncode
except subprocess.TimeoutExpired:
    # A run that outlives the timeout is treated as success by this script.
    print("Command timed out (expected for training)")
    exit_status = 0
except Exception as err:
    # Top-level boundary: report whatever went wrong and signal failure.
    print(f"Error: {err}")
    exit_status = 1

sys.exit(exit_status)