mirror of
https://github.com/karpathy/nanochat.git
synced 2026-06-19 12:39:10 +00:00
Fix train_loss to average all steps instead of keeping only the last
This commit is contained in:
parent
aeff095e97
commit
2ae28292aa
|
|
@@ -485,13 +485,15 @@ while True:
|
||||||
# evaluate the gradient
|
# evaluate the gradient
|
||||||
synchronize()
|
synchronize()
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
train_loss = 0.0
|
||||||
for micro_step in range(grad_accum_steps):
|
for micro_step in range(grad_accum_steps):
|
||||||
with autocast_ctx:
|
with autocast_ctx:
|
||||||
loss = model(x, y)
|
loss = model(x, y)
|
||||||
train_loss = loss.detach() # for logging
|
train_loss += loss.detach() # accumulate for logging
|
||||||
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
|
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
|
||||||
loss.backward()
|
loss.backward()
|
||||||
x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
|
x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
|
||||||
|
train_loss = train_loss / grad_accum_steps # average across micro steps
|
||||||
# step the optimizer
|
# step the optimizer
|
||||||
lrm = get_lr_multiplier(step)
|
lrm = get_lr_multiplier(step)
|
||||||
muon_momentum = get_muon_momentum(step)
|
muon_momentum = get_muon_momentum(step)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user