diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py
index 17b8424..13e5f55 100644
--- a/dev/gen_synthetic_data.py
+++ b/dev/gen_synthetic_data.py
@@ -17,7 +17,7 @@ prompt:
 2. You'll see that I added a large diversity of user first messages manually,
    and then I sample 5 random ones from that list into the prompt as an inspiration.
    This is really important to do because DIVERSITY CONTROL is key. If you don't
-   manually inject diversity, the LLM might generate extrremely similar and repeptitive
+   manually inject diversity, the LLM might generate extremely similar and repetitive
    conversations and things won't work well. Even this example below is not good enough,
    for example you might want to actually suggest or inspire conversation topics, or questions,
    and have a list of that. Basically, this is the KEY creative part to get right. Make sure you
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index fc02120..8efde4f 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -65,7 +65,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
         data = [json.loads(line.strip()) for line in f]
 
     # shuffle the data because in many cases it appears ordered but we want
-    # the abillity to only run a subset of the data for debugging purposes etc.
+    # the ability to only run a subset of the data for debugging purposes etc.
     shuffle_rng = random.Random(1337)
     shuffle_rng.shuffle(data)
     if max_per_task > 0:
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 3725805..2570a72 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -271,7 +271,7 @@ for step in range(num_iterations + 1):
         loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
         loss.backward()
         x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
-    # gradient clipping (TODO possibly expertiment with)
+    # gradient clipping (TODO possibly experiment with)
     if grad_clip > 0.0:
         torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
     # step the optimizers
diff --git a/scripts/chat_eval.py b/scripts/chat_eval.py
index c77a89e..616411d 100644
--- a/scripts/chat_eval.py
+++ b/scripts/chat_eval.py
@@ -117,7 +117,7 @@ def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems
         logits = model(prompt_ids) # (B, T, V)
 
         # Focus on the available answer on just the letters corresponding to choices
-        # Note that this helps the evaluation a lot because it specifically narrows the focus to only the avilable letters
+        # Note that this helps the evaluation a lot because it specifically narrows the focus to only the available letters
         # The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
         # letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
         for idx, conversation in enumerate(conversations):
diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py
index af70bda..bc78e79 100644
--- a/scripts/chat_rl.py
+++ b/scripts/chat_rl.py
@@ -206,7 +206,7 @@ def get_lr_multiplier(it):
     lrm = 1.0 - it / num_steps
     return lrm
 
-# Calculate the number of examples each rank handles to achive the desired examples_per_step
+# Calculate the number of examples each rank handles to achieve the desired examples_per_step
 print0(f"Total sequences per step: {examples_per_step * num_samples}") # total batch size in sequences/step
 assert examples_per_step % ddp_world_size == 0, "Desired examples per step must be divisible by the number of ranks"
 examples_per_rank = examples_per_step // ddp_world_size # per GPU
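
Note on the last hunk: a minimal sketch, not part of the patch, of the per-rank split that the comment in scripts/chat_rl.py describes; the concrete values for examples_per_step and ddp_world_size below are hypothetical (in the script they come from its configuration and the DDP environment).

# sketch of the per-rank split from scripts/chat_rl.py, with hypothetical values
examples_per_step = 16  # desired total examples processed per optimization step
ddp_world_size = 4      # number of DDP ranks (GPUs) participating

# the total must divide evenly so every rank gets the same amount of work
assert examples_per_step % ddp_world_size == 0, \
    "Desired examples per step must be divisible by the number of ranks"
examples_per_rank = examples_per_step // ddp_world_size  # 16 // 4 = 4 examples per GPU
print(f"each of {ddp_world_size} ranks handles {examples_per_rank} examples per step")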