From 1d3598afa10797ba0ce30d44f52e7994587c0ce8 Mon Sep 17 00:00:00 2001
From: Mark Mueller <mark.mueller@inf.ethz.ch>
Date: Thu, 8 Feb 2024 19:32:20 +0100
Subject: [PATCH] Slim Orca data parsing

---
 data/README.md               |  3 ++-
 data/dataset_info.json       | 14 ++++++++++++++
 src/llmtuner/data/aligner.py |  7 +++++--
 src/llmtuner/data/utils.py   |  1 +
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/data/README.md b/data/README.md
index 3d950e1b..bb38935c 100644
--- a/data/README.md
+++ b/data/README.md
@@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the
     "user_tag": "the value of the role_tag represents the user. (default: human)",
     "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
     "observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
-    "function_tag": "the value of the role_tag represents the function call. (default: function_call)"
+    "function_tag": "the value of the role_tag represents the function call. (default: function_call)",
+    "system_tag": "the value of the role_tag represents the system prompt. (default: None; incompatible with the system column)"
   }
 }
 ```
diff --git a/data/dataset_info.json b/data/dataset_info.json
index 9b69f9a1..22dd11e6 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -126,6 +126,20 @@
       "system": "system_prompt"
     }
   },
+  "slimorca": {
+    "hf_hub_url": "Open-Orca/SlimOrca",
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "conversations"
+    },
+    "tags": {
+      "role_tag": "from",
+      "content_tag": "value",
+      "user_tag": "human",
+      "assistant_tag": "gpt",
+      "system_tag": "system"
+    }
+  },
   "intel_orca_dpo_pairs_de" : {
     "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
     "ranking": true
diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index cd3a7ea4..34c8980d 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -47,10 +47,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
         dataset_attr.assistant_tag: Role.ASSISTANT,
         dataset_attr.observation_tag: Role.OBSERVATION,
         dataset_attr.function_tag: Role.FUNCTION,
+        dataset_attr.system_tag: Role.SYSTEM,
     }
     for i, messages in enumerate(examples[dataset_attr.messages]):
-        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
-        if len(messages) == 0:
+        if len(messages) <= 1:
             continue
 
         prompt = []
@@ -74,6 +74,9 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
                 {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
             )
 
+        if len(prompt) % 2 == 1:
+            # Odd message count: the last message is neither from assistant nor function, so drop it
+            prompt.pop(-1)
         last_message = prompt.pop(-1)
         response.append(last_message)
         outputs["prompt"].append(prompt)
diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py
index 062d390f..75a28c59 100644
--- a/src/llmtuner/data/utils.py
+++ b/src/llmtuner/data/utils.py
@@ -21,6 +21,7 @@ class Role(str, Enum):
     ASSISTANT = "assistant"
     OBSERVATION = "observation"
     FUNCTION = "function"
+    SYSTEM = "system"
 
 
 def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: