Slim Orca data parsing

Mark Mueller 2024-02-08 19:32:20 +01:00
parent 6703d0546d
commit 1d3598afa1
4 changed files with 22 additions and 3 deletions

View File

@@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the
     "user_tag": "the value of the role_tag represents the user. (default: human)",
     "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
     "observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
-    "function_tag": "the value of the role_tag represents the function call. (default: function_call)"
+    "function_tag": "the value of the role_tag represents the function call. (default: function_call)",
+    "system_tag": "the value of the role_tag represents the system prompt. (default: None) incompatible with system column"
   }
 }
 ```
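For orientation (not part of the diff): assuming the default role/content keys `from` and `value`, the tags documented above describe records shaped roughly like the sketch below. Only the role strings come from the documentation; the message contents are invented.

```python
# Illustrative ShareGPT-style record, assuming the default role/content keys
# "from" and "value". The message texts are made up.
example = {
    "conversations": [
        {"from": "system", "value": "You are a helpful assistant."},  # matched by system_tag
        {"from": "human", "value": "Name three primary colors."},     # matched by user_tag
        {"from": "gpt", "value": "Red, yellow and blue."},            # matched by assistant_tag
    ]
}
```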

View File

@@ -126,6 +126,20 @@
       "system": "system_prompt"
     }
   },
+  "slimorca": {
+    "hf_hub_url": "Open-Orca/SlimOrca",
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "conversations"
+    },
+    "tags": {
+      "role_tag": "from",
+      "content_tag": "value",
+      "user_tag": "human",
+      "assistant_tag": "gpt",
+      "system_tag": "system"
+    }
+  },
   "intel_orca_dpo_pairs_de" : {
     "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
     "ranking": true

View File

@@ -47,10 +47,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
         dataset_attr.assistant_tag: Role.ASSISTANT,
         dataset_attr.observation_tag: Role.OBSERVATION,
         dataset_attr.function_tag: Role.FUNCTION,
+        dataset_attr.system_tag: Role.SYSTEM,
     }
     for i, messages in enumerate(examples[dataset_attr.messages]):
-        messages = messages[: len(messages) // 2 * 2] # should be multiples of 2
-        if len(messages) == 0:
+        if len(messages) <= 1:
             continue
 
         prompt = []
@@ -74,6 +74,9 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
                 {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
             )
 
+        if len(prompt) % 2 == 1:
+            # Last message was neither from assistant nor function
+            prompt.pop(-1)
         last_message = prompt.pop(-1)
         response.append(last_message)
         outputs["prompt"].append(prompt)

View File

@@ -21,6 +21,7 @@ class Role(str, Enum):
     ASSISTANT = "assistant"
     OBSERVATION = "observation"
     FUNCTION = "function"
+    SYSTEM = "system"
 
 
 def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
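Since `Role` mixes in `str`, the new `SYSTEM` member compares equal to the plain string "system", which is why it can be used in `tag_mapping` and in message dicts interchangeably with ordinary strings. A small illustration (not part of the commit; it reproduces the enum with a `USER` member assumed to sit above the lines shown):

```python
from enum import Enum


class Role(str, Enum):
    USER = "user"
    ASSISTANT = "assistant"
    OBSERVATION = "observation"
    FUNCTION = "function"
    SYSTEM = "system"


# The str mixin makes members interchangeable with their string values.
assert Role.SYSTEM == "system"
assert Role.SYSTEM.value == "system"
assert Role("system") is Role.SYSTEM  # lookup by value
print([role.value for role in Role])  # ['user', 'assistant', 'observation', 'function', 'system']
```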