Mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-06 04:12:13 +00:00)
typo fixes in scripts
This commit is contained in:
Parent commit: 0a3ce7b0ff
Commit: 8c9b004c99
|
|
@@ -17,7 +17,7 @@ prompt:
|
||||||
2. You'll see that I added a large diversity of user first messages manually,
|
2. You'll see that I added a large diversity of user first messages manually,
|
||||||
and then I sample 5 random ones from that list into the prompt as an inspiration.
|
and then I sample 5 random ones from that list into the prompt as an inspiration.
|
||||||
This is really important to do because DIVERSITY CONTROL is key. If you don't
|
This is really important to do because DIVERSITY CONTROL is key. If you don't
|
||||||
manually inject diversity, the LLM might generate extrremely similar and repeptitive
|
manually inject diversity, the LLM might generate extremely similar and repetitive
|
||||||
conversations and things won't work well. Even this example below is not good enough,
|
conversations and things won't work well. Even this example below is not good enough,
|
||||||
for example you might want to actually suggest or inspire conversation topics, or questions,
|
for example you might want to actually suggest or inspire conversation topics, or questions,
|
||||||
and have a list of that. Basically, this is the KEY creative part to get right. Make sure you
|
and have a list of that. Basically, this is the KEY creative part to get right. Make sure you
|
||||||
|
|
|
||||||
|
|
@@ -65,7 +65,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
|
||||||
data = [json.loads(line.strip()) for line in f]
|
data = [json.loads(line.strip()) for line in f]
|
||||||
|
|
||||||
# shuffle the data because in many cases it appears ordered but we want
|
# shuffle the data because in many cases it appears ordered but we want
|
||||||
# the abillity to only run a subset of the data for debugging purposes etc.
|
# the ability to only run a subset of the data for debugging purposes etc.
|
||||||
shuffle_rng = random.Random(1337)
|
shuffle_rng = random.Random(1337)
|
||||||
shuffle_rng.shuffle(data)
|
shuffle_rng.shuffle(data)
|
||||||
if max_per_task > 0:
|
if max_per_task > 0:
|
||||||
|
|
|
||||||
|
|
@@ -271,7 +271,7 @@ for step in range(num_iterations + 1):
|
||||||
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
|
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
|
||||||
loss.backward()
|
loss.backward()
|
||||||
x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
|
x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
|
||||||
# gradient clipping (TODO possibly expertiment with)
|
# gradient clipping (TODO possibly experiment with)
|
||||||
if grad_clip > 0.0:
|
if grad_clip > 0.0:
|
||||||
torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
|
torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
|
||||||
# step the optimizers
|
# step the optimizers
|
||||||
|
|
|
||||||
|
|
@@ -117,7 +117,7 @@ def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems
|
||||||
logits = model(prompt_ids) # (B, T, V)
|
logits = model(prompt_ids) # (B, T, V)
|
||||||
|
|
||||||
# Focus on the available answer on just the letters corresponding to choices
|
# Focus on the available answer on just the letters corresponding to choices
|
||||||
# Note that this helps the evaluation a lot because it specifically narrows the focus to only the avilable letters
|
# Note that this helps the evaluation a lot because it specifically narrows the focus to only the available letters
|
||||||
# The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
|
# The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
|
||||||
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
|
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
|
||||||
for idx, conversation in enumerate(conversations):
|
for idx, conversation in enumerate(conversations):
|
||||||
|
|
|
||||||
|
|
@@ -206,7 +206,7 @@ def get_lr_multiplier(it):
|
||||||
lrm = 1.0 - it / num_steps
|
lrm = 1.0 - it / num_steps
|
||||||
return lrm
|
return lrm
|
||||||
|
|
||||||
# Calculate the number of examples each rank handles to achive the desired examples_per_step
|
# Calculate the number of examples each rank handles to achieve the desired examples_per_step
|
||||||
print0(f"Total sequences per step: {examples_per_step * num_samples}") # total batch size in sequences/step
|
print0(f"Total sequences per step: {examples_per_step * num_samples}") # total batch size in sequences/step
|
||||||
assert examples_per_step % ddp_world_size == 0, "Desired examples per step must be divisible by the number of ranks"
|
assert examples_per_step % ddp_world_size == 0, "Desired examples per step must be divisible by the number of ranks"
|
||||||
examples_per_rank = examples_per_step // ddp_world_size # per GPU
|
examples_per_rank = examples_per_step // ddp_world_size # per GPU
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user