From 5b27c0c59e5f926f5b8ac17ead63955bca022df3 Mon Sep 17 00:00:00 2001 From: Dustin Loring Date: Fri, 6 Mar 2026 11:20:10 -0500 Subject: [PATCH] Create convert_to_sharegpt.py added a convert script for convert the current format of the idenitiy conversation for mid training to be compatiable with huggingface so there will be no need for the s3 one anymore --- dev/convert_to_sharegpt.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 dev/convert_to_sharegpt.py diff --git a/dev/convert_to_sharegpt.py b/dev/convert_to_sharegpt.py new file mode 100644 index 0000000..365afce --- /dev/null +++ b/dev/convert_to_sharegpt.py @@ -0,0 +1,24 @@ +import json +import os + +os.chdir(r'C:\tmp') + +input_file = 'identity_conversations.jsonl' +output_file = 'identity_conversations_sharegpt.jsonl' + +count = 0 +with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out: + for line in f_in: + messages = json.loads(line.strip()) + converted = {'conversations': []} + for msg in messages: + role = 'human' if msg['role'] == 'user' else 'gpt' + converted['conversations'].append({ + 'from': role, + 'value': msg['content'] + }) + f_out.write(json.dumps(converted, ensure_ascii=False) + '\n') + count += 1 + +print(f'Converted {count} conversations to ShareGPT format') +print(f'Output saved to: {output_file}') \ No newline at end of file