mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-24 13:53:27 +00:00
added a convert script for convert the current format of the idenitiy conversation for mid training to be compatiable with huggingface so there will be no need for the s3 one anymore
24 lines
782 B
Python
24 lines
782 B
Python
import json
|
|
import os
|
|
|
|
os.chdir(r'C:\tmp')
|
|
|
|
input_file = 'identity_conversations.jsonl'
|
|
output_file = 'identity_conversations_sharegpt.jsonl'
|
|
|
|
count = 0
|
|
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
|
|
for line in f_in:
|
|
messages = json.loads(line.strip())
|
|
converted = {'conversations': []}
|
|
for msg in messages:
|
|
role = 'human' if msg['role'] == 'user' else 'gpt'
|
|
converted['conversations'].append({
|
|
'from': role,
|
|
'value': msg['content']
|
|
})
|
|
f_out.write(json.dumps(converted, ensure_ascii=False) + '\n')
|
|
count += 1
|
|
|
|
print(f'Converted {count} conversations to ShareGPT format')
|
|
print(f'Output saved to: {output_file}') |