mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 12:22:18 +00:00
Fix tokenization bug: there should be no space before the first letter. Sigh.
This commit is contained in:
parent
8892470f29
commit
05a051dbe9
|
|
@ -260,7 +260,7 @@ class SimpleSpelling(Task):
|
||||||
# return the full conversation
|
# return the full conversation
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "user", "content": f"Spell the word: {word}"},
|
{"role": "user", "content": f"Spell the word: {word}"},
|
||||||
{"role": "assistant", "content": f"{word}: {word_letters}"}
|
{"role": "assistant", "content": f"{word}:{word_letters}"}
|
||||||
]
|
]
|
||||||
conversation = {
|
conversation = {
|
||||||
"messages": messages,
|
"messages": messages,
|
||||||
|
|
@ -289,7 +289,16 @@ if __name__ == "__main__":
|
||||||
print()
|
print()
|
||||||
print("-" * 100)
|
print("-" * 100)
|
||||||
|
|
||||||
# also scrutinize the tokenization (last example only)
|
# # preview the SimpleSpelling task, first 10 examples
|
||||||
|
# task = SimpleSpelling()
|
||||||
|
# for i in range(10):
|
||||||
|
# ex = task.get_example(i)
|
||||||
|
# print("=" * 100)
|
||||||
|
# print(ex['messages'][0]['content'])
|
||||||
|
# print("-" * 100)
|
||||||
|
# print(ex['messages'][1]['content'])
|
||||||
|
|
||||||
|
# # also scrutinize the tokenization (last example only)
|
||||||
# from nanochat.tokenizer import get_tokenizer
|
# from nanochat.tokenizer import get_tokenizer
|
||||||
# tokenizer = get_tokenizer()
|
# tokenizer = get_tokenizer()
|
||||||
# ids, mask = tokenizer.render_conversation(ex)
|
# ids, mask = tokenizer.render_conversation(ex)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user