From 1d3598afa10797ba0ce30d44f52e7994587c0ce8 Mon Sep 17 00:00:00 2001 From: Mark Mueller Date: Thu, 8 Feb 2024 19:32:20 +0100 Subject: [PATCH] Slim Orca data parsing --- data/README.md | 3 ++- data/dataset_info.json | 14 ++++++++++++++ src/llmtuner/data/aligner.py | 7 +++++-- src/llmtuner/data/utils.py | 1 + 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/data/README.md b/data/README.md index 3d950e1b..bb38935c 100644 --- a/data/README.md +++ b/data/README.md @@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the "user_tag": "the value of the role_tag represents the user. (default: human)", "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)", "observation_tag": "the value of the role_tag represents the tool results. (default: observation)", - "function_tag": "the value of the role_tag represents the function call. (default: function_call)" + "function_tag": "the value of the role_tag represents the function call. (default: function_call)", + "system_tag": "the value of the role_tag represents the system prompt. 
(default: None; incompatible with the system column)" } } ``` diff --git a/data/dataset_info.json b/data/dataset_info.json index 9b69f9a1..22dd11e6 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -126,6 +126,20 @@ "system": "system_prompt" } }, + "slimorca": { + "hf_hub_url": "Open-Orca/SlimOrca", + "formatting": "sharegpt", + "columns": { + "messages": "conversations" + }, + "tags": { + "role_tag": "from", + "content_tag": "value", + "user_tag": "human", + "assistant_tag": "gpt", + "system_tag": "system" + } + }, "intel_orca_dpo_pairs_de" : { "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de", "ranking": true diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index cd3a7ea4..34c8980d 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -47,10 +47,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" dataset_attr.assistant_tag: Role.ASSISTANT, dataset_attr.observation_tag: Role.OBSERVATION, dataset_attr.function_tag: Role.FUNCTION, + dataset_attr.system_tag: Role.SYSTEM, } for i, messages in enumerate(examples[dataset_attr.messages]): - messages = messages[: len(messages) // 2 * 2] # should be multiples of 2 - if len(messages) == 0: + if len(messages) <= 1: continue prompt = [] @@ -74,6 +74,9 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} ) + if len(prompt) % 2 == 1: + # Last message was neither from assistant nor function + prompt.pop(-1) last_message = prompt.pop(-1) response.append(last_message) outputs["prompt"].append(prompt) diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py index 062d390f..75a28c59 100644 --- a/src/llmtuner/data/utils.py +++ b/src/llmtuner/data/utils.py @@ -21,6 +21,7 @@ class Role(str, Enum): ASSISTANT = "assistant" OBSERVATION = "observation" FUNCTION = "function" + SYSTEM = "system" def 
checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: