mirror of
https://github.com/karpathy/nanochat.git
synced 2026-06-15 02:29:09 +00:00
Create convert_to_sharegpt.py
added a convert script for convert the current format of the idenitiy conversation for mid training to be compatiable with huggingface so there will be no need for the s3 one anymore
This commit is contained in:
parent
1076f97059
commit
5b27c0c59e
24
dev/convert_to_sharegpt.py
Normal file
24
dev/convert_to_sharegpt.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
os.chdir(r'C:\tmp')
|
||||
|
||||
input_file = 'identity_conversations.jsonl'
|
||||
output_file = 'identity_conversations_sharegpt.jsonl'
|
||||
|
||||
count = 0
|
||||
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
|
||||
for line in f_in:
|
||||
messages = json.loads(line.strip())
|
||||
converted = {'conversations': []}
|
||||
for msg in messages:
|
||||
role = 'human' if msg['role'] == 'user' else 'gpt'
|
||||
converted['conversations'].append({
|
||||
'from': role,
|
||||
'value': msg['content']
|
||||
})
|
||||
f_out.write(json.dumps(converted, ensure_ascii=False) + '\n')
|
||||
count += 1
|
||||
|
||||
print(f'Converted {count} conversations to ShareGPT format')
|
||||
print(f'Output saved to: {output_file}')
|
||||
Loading…
Reference in New Issue
Block a user