Mirror of https://github.com/karpathy/nanochat.git, synced 2025-12-06 04:12:13 +00:00
typo fixes in scripts
This commit is contained in:
parent
0a3ce7b0ff
commit
8c9b004c99
@@ -17,7 +17,7 @@ prompt:
 2. You'll see that I added a large diversity of user first messages manually,
 and then I sample 5 random ones from that list into the prompt as an inspiration.
 This is really important to do because DIVERSITY CONTROL is key. If you don't
-manually inject diversity, the LLM might generate extrremely similar and repeptitive
+manually inject diversity, the LLM might generate extremely similar and repetitive
 conversations and things won't work well. Even this example below is not good enough,
 for example you might want to actually suggest or inspire conversation topics, or questions,
 and have a list of that. Basically, this is the KEY creative part to get right. Make sure you
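The idea this hunk sits in — sampling a few random, manually curated user first messages into the generation prompt so the synthetic conversations don't collapse into near-duplicates — can be sketched roughly as below. The seed list, function name, and prompt template here are illustrative stand-ins, not the script's actual contents.

```python
# Minimal sketch of the diversity-injection idea; everything here is illustrative.
import random

seed_user_messages = [
    "Can you explain how gradient clipping works?",
    "Write a haiku about debugging.",
    "What's a good way to learn linear algebra?",
    "Summarize the plot of Hamlet in two sentences.",
    "Help me plan a three-day trip to Kyoto.",
    "Why is the sky blue?",
    # ... in practice this list is much larger and manually curated
]

def build_prompt(rng: random.Random, k: int = 5) -> str:
    # sample k random first messages as inspiration, so each generated
    # conversation starts from a different corner of topic space
    inspiration = rng.sample(seed_user_messages, k)
    bullets = "\n".join(f"- {m}" for m in inspiration)
    return (
        "Generate a realistic multi-turn conversation between a user and an assistant.\n"
        "Here are some example user first messages, for inspiration only:\n"
        f"{bullets}\n"
    )

if __name__ == "__main__":
    print(build_prompt(random.Random(0)))
```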
@@ -65,7 +65,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
 data = [json.loads(line.strip()) for line in f]

 # shuffle the data because in many cases it appears ordered but we want
-# the abillity to only run a subset of the data for debugging purposes etc.
+# the ability to only run a subset of the data for debugging purposes etc.
 shuffle_rng = random.Random(1337)
 shuffle_rng.shuffle(data)
 if max_per_task > 0:
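For context, the pattern this hunk touches is a deterministic shuffle followed by truncation, so a small debug run still sees a roughly representative subset. A self-contained sketch under that assumption (the dataset and the truncation line are made up here; only the seed and the flag name come from the hunk):

```python
import random

data = [{"id": i} for i in range(1000)]  # stand-in for the rows loaded from the JSONL file
max_per_task = 50  # e.g. only run 50 examples while debugging; a non-positive value means "use everything"

shuffle_rng = random.Random(1337)  # fixed seed => the same subset every run
shuffle_rng.shuffle(data)          # de-correlate any ordering present in the file
if max_per_task > 0:
    data = data[:max_per_task]     # cheap, reproducible subset

print(len(data), data[0])
```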
@@ -271,7 +271,7 @@ for step in range(num_iterations + 1):
 loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
 loss.backward()
 x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
-# gradient clipping (TODO possibly expertiment with)
+# gradient clipping (TODO possibly experiment with)
 if grad_clip > 0.0:
 torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
 # step the optimizers
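The surrounding loop is the usual accumulate, clip, then step pattern: scale the loss by the number of accumulation steps (because each .backward() adds into the gradients), optionally clip the global gradient norm, then take the optimizer step. A toy, self-contained version of that pattern (the model, optimizer, and batch source are placeholders, not nanochat's):

```python
import torch

model = torch.nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
grad_accum_steps = 4
grad_clip = 1.0

def get_batch():
    return torch.randn(8, 16), torch.randn(8, 1)

x, y = get_batch()
for step in range(10):
    for micro_step in range(grad_accum_steps):
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss = loss / grad_accum_steps   # each .backward() sums grads, so pre-scale the loss
        loss.backward()
        x, y = get_batch()               # fetch the next batch (prefetched in the real loop)
    if grad_clip > 0.0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)
```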
@@ -117,7 +117,7 @@ def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems
 logits = model(prompt_ids) # (B, T, V)

 # Focus on the available answer on just the letters corresponding to choices
-# Note that this helps the evaluation a lot because it specifically narrows the focus to only the avilable letters
+# Note that this helps the evaluation a lot because it specifically narrows the focus to only the available letters
 # The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
 # letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
 for idx, conversation in enumerate(conversations):
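The trick described in those comments is to score only the logits of the answer-letter tokens rather than generating free-form text. A rough sketch of that narrowing step, with hypothetical token ids and shapes (nanochat's actual tokenizer calls and tensor layout may differ):

```python
import torch

def pick_choice(logits_last: torch.Tensor, letter_token_ids: list) -> int:
    # logits_last: (V,) logits at the position where the answer letter would be emitted
    letter_logits = logits_last[letter_token_ids]  # restrict to the A/B/C/D token ids
    return int(torch.argmax(letter_logits))        # index into the choices, not the vocab

# toy usage: pretend the vocab has 100 tokens and "A".."D" map to ids 10..13
logits_last = torch.randn(100)
print("predicted choice:", pick_choice(logits_last, [10, 11, 12, 13]))
```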
@@ -206,7 +206,7 @@ def get_lr_multiplier(it):
 lrm = 1.0 - it / num_steps
 return lrm

-# Calculate the number of examples each rank handles to achive the desired examples_per_step
+# Calculate the number of examples each rank handles to achieve the desired examples_per_step
 print0(f"Total sequences per step: {examples_per_step * num_samples}") # total batch size in sequences/step
 assert examples_per_step % ddp_world_size == 0, "Desired examples per step must be divisible by the number of ranks"
 examples_per_rank = examples_per_step // ddp_world_size # per GPU
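Two small calculations sit around this hunk: a linear learning-rate decay multiplier and an even split of the global examples_per_step across DDP ranks. Here they are in isolation with made-up numbers (only the variable names come from the hunk):

```python
num_steps = 1000

def get_lr_multiplier(it):
    # linearly decay the LR multiplier from 1.0 at step 0 to 0.0 at step num_steps
    return 1.0 - it / num_steps

examples_per_step = 32  # desired global batch size, in examples
ddp_world_size = 8      # number of GPUs / ranks
assert examples_per_step % ddp_world_size == 0, "examples_per_step must divide evenly across ranks"
examples_per_rank = examples_per_step // ddp_world_size  # each GPU handles 4 examples per step

print(get_lr_multiplier(500), examples_per_rank)
```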