fix: cap SFT rendered conversation length

Dylan Chen 2026-02-01 23:03:47 +08:00
parent 31b61d2d17
commit d4db003661


@@ -152,7 +152,7 @@ def sft_data_generator_bos_bestfit(split, buffer_size=100):
         nonlocal cursor, epoch
         while len(conv_buffer) < buffer_size:
             conversation = dataset[cursor]
-            ids, _ = tokenizer.render_conversation(conversation)
+            ids, _ = tokenizer.render_conversation(conversation, max_tokens=row_capacity)
             conv_buffer.append(ids)
             cursor += ddp_world_size
             if cursor >= dataset_size:
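
For context, below is a rough sketch of what a max_tokens cap inside render_conversation could look like. The tokenizer's actual implementation, the chat-template tokens, and the tokenize helper are assumptions not shown in this diff; row_capacity is defined elsewhere at the call site.

# Hypothetical sketch (not the repository's actual implementation) of a
# render_conversation that truncates to max_tokens, matching the new call site.
from typing import Callable, Optional

def render_conversation(conversation: list[dict],
                        tokenize: Callable[[str], list[int]],
                        max_tokens: Optional[int] = None) -> tuple[list[int], list[int]]:
    # Flatten {role, content} messages into token ids plus a per-token
    # supervision mask (1 = train on this token, 0 = ignore).
    ids, mask = [], []
    for message in conversation:
        piece = tokenize(f"<|{message['role']}|>{message['content']}<|end|>")
        ids.extend(piece)
        mask.extend([1 if message["role"] == "assistant" else 0] * len(piece))
        if max_tokens is not None and len(ids) >= max_tokens:
            break  # stop rendering once the cap is reached
    if max_tokens is not None:
        # Cap the rendered conversation so one example never exceeds a
        # single packed row (row_capacity at the call site).
        ids, mask = ids[:max_tokens], mask[:max_tokens]
    return ids, mask

Presumably the cap matters because the best-fit packer assumes every rendered conversation fits into one row of row_capacity tokens; without it, an over-long conversation could never be placed into the buffer.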