From 2b5e33c338e6e8b10c4cbaa68ed26ef3b38ad5f9 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Fri, 3 Nov 2023 00:15:23 +0800
Subject: [PATCH] update data readme

---
 data/README.md    | 101 +++++++++++++++++++++++++++++++++++++++-------
 data/README_zh.md |  72 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 123 insertions(+), 50 deletions(-)

diff --git a/data/README.md b/data/README.md
index 4a203f35..9010fb64 100644
--- a/data/README.md
+++ b/data/README.md
@@ -5,30 +5,103 @@ If you are using a custom dataset, please provide your dataset definition in the
   "hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore the three arguments below)",
   "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore the two arguments below)",
   "file_name": "the name of the dataset file in this directory. (required if the above are not specified)",
-  "file_sha1": "the SHA-1 hash value of the dataset file. (optional)",
-  "subset": "",
-  "ranking": "whether the examples contains ranked responses or not. (default: false)",
-  "formatting": "",
+  "file_sha1": "the SHA-1 hash value of the dataset file. (optional, does not affect training)",
+  "subset": "the name of the subset. (optional, default: None)",
+  "ranking": "whether the dataset is a preference dataset or not. (default: false)",
+  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
   "columns": {
-    "prompt": "the name of the column in the datasets containing the prompts. (default: instruction)",
-    "query": "the name of the column in the datasets containing the queries. (default: input)",
-    "response": "the name of the column in the datasets containing the responses. (default: output)",
-    "history": "the name of the column in the datasets containing the history of chat. (default: None)"
+    "prompt": "the column name in the dataset containing the prompts. (default: instruction, for alpaca)",
+    "query": "the column name in the dataset containing the queries. (default: input, for alpaca)",
+    "response": "the column name in the dataset containing the responses. (default: output, for alpaca)",
+    "history": "the column name in the dataset containing the histories. (default: None, for alpaca)",
+    "messages": "the column name in the dataset containing the messages. (default: conversations, for sharegpt)",
+    "role": "the key in the message that represents the identity of the sender. (default: from, for sharegpt)",
+    "content": "the key in the message that represents the content. (default: value, for sharegpt)"
   }
 }
 ```
 
-where the `prompt` and `response` columns should contain non-empty values. The `query` column will be concatenated with the `prompt` column and used as input for the model. The `history` column should contain a list where each element is a string tuple representing a query-response pair.
+Given the above, you can use the custom dataset by specifying `--dataset dataset_name`.
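+
+For example, a minimal sketch of a `dataset_info.json` entry for a local alpaca-format file (the dataset name and file name here are illustrative, not files shipped with the repository):
+
+```json
+"example_dataset": {
+  "file_name": "example_dataset.json",
+  "formatting": "alpaca"
+}
+```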
-For datasets used in reward modeling or DPO training, the `response` column should be a string list, with the preferred answers appearing first, for example:
+Currently we support datasets in the **alpaca** and **sharegpt** formats. A dataset in the alpaca format should be organized as follows:
+
+```json
+[
+  {
+    "instruction": "user instruction (required)",
+    "input": "user input (optional)",
+    "output": "model response (required)",
+    "history": [
+      ["user instruction in the first round (optional)", "model response in the first round (optional)"],
+      ["user instruction in the second round (optional)", "model response in the second round (optional)"]
+    ]
+  }
+]
+```
+
+For the above dataset, the `columns` in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output",
+    "history": "history"
+  }
+}
+```
+
+where the `prompt` and `response` columns should contain non-empty values, representing the user instruction and the model response respectively. The `query` column will be concatenated with the `prompt` column and used as input for the model.
+
+The `history` column is a list consisting of string tuples that represent past query-response pairs. Note that the responses **in each round will be used for training**.
+
+For pre-training datasets, only the `prompt` column will be used for training.
+
+For preference datasets, the `response` column should be a string list of length 2, with the preferred answer appearing first, for example:
 
 ```json
 {
-  "instruction": "Question",
-  "input": "",
+  "instruction": "user instruction",
+  "input": "user input",
   "output": [
-    "Chosen answer",
-    "Rejected answer"
+    "chosen answer",
+    "rejected answer"
   ]
 }
 ```
+
+A dataset in the sharegpt format should be organized as follows:
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "user instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      }
+    ]
+  }
+]
+```
+
+For the above dataset, the `columns` in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "columns": {
+    "messages": "conversations",
+    "role": "from",
+    "content": "value"
+  }
+}
+```
+
+where the `messages` column should be a list of even length whose items follow the `u/a/u/a/u/a` (user/assistant alternating) order, as in the sketch below.
+
+Pre-training datasets and preference datasets are not yet supported in the sharegpt format.
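+
+For example, a two-round conversation (all values illustrative) that satisfies the even-length, alternating-order requirement:
+
+```json
+"conversations": [
+  {"from": "human", "value": "user instruction in the first round"},
+  {"from": "gpt", "value": "model response in the first round"},
+  {"from": "human", "value": "user instruction in the second round"},
+  {"from": "gpt", "value": "model response in the second round"}
+]
+```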
diff --git a/data/README_zh.md b/data/README_zh.md
index 66e666e2..740e27db 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -2,44 +2,44 @@
 ```json
 "dataset_name": {
-    "hf_hub_url": "the name of the dataset repository on the Hugging Face hub", // if specified, ignore the three arguments below
-    "script_url": "the name of the local folder containing a dataset loading script", // if specified, ignore the two arguments below
-    "file_name": "the name of the dataset file in this directory", // required if the above are not specified
-    "file_sha1": "the SHA-1 hash value of the dataset file", // optional, leaving it empty does not affect training
-    "subset": "the name of the dataset subset", // optional, default: None
-    "ranking": "whether the dataset is a preference dataset", // optional, default: False
-    "formatting": "the format of the dataset", // optional, default: alpaca, can be alpaca or sharegpt
-    "columns": { // optional
-        "prompt": "the column name in the dataset containing the prompts", // default: instruction (alpaca format)
-        "query": "the column name in the dataset containing the queries", // default: input (alpaca format)
-        "response": "the column name in the dataset containing the responses", // default: output (alpaca format)
-        "history": "the column name in the dataset containing the chat histories", // default: None (alpaca format)
-        "messages": "the column name in the dataset containing the message lists", // default: conversations (sharegpt format)
-        "role": "the key in the message representing the sender's identity", // default: from (sharegpt format)
-        "content": "the key in the message representing the text content" // default: value (sharegpt format)
+    "hf_hub_url": "the name of the dataset repository on the Hugging Face hub (if specified, ignore the three arguments below)",
+    "script_url": "the name of the local folder containing a dataset loading script (if specified, ignore the two arguments below)",
+    "file_name": "the name of the dataset file in this directory (required if the above are not specified)",
+    "file_sha1": "the SHA-1 hash value of the dataset file (optional, leaving it empty does not affect training)",
+    "subset": "the name of the dataset subset (optional, default: None)",
+    "ranking": "whether the dataset is a preference dataset (optional, default: False)",
+    "formatting": "the format of the dataset (optional, default: alpaca, can be alpaca or sharegpt)",
+    "columns": {
+        "prompt": "the column name in the dataset containing the prompts (default: instruction, for the alpaca format)",
+        "query": "the column name in the dataset containing the queries (default: input, for the alpaca format)",
+        "response": "the column name in the dataset containing the responses (default: output, for the alpaca format)",
+        "history": "the column name in the dataset containing the chat histories (default: None, for the alpaca format)",
+        "messages": "the column name in the dataset containing the message lists (default: conversations, for the sharegpt format)",
+        "role": "the key in the message representing the sender's identity (default: from, for the sharegpt format)",
+        "content": "the key in the message representing the text content (default: value, for the sharegpt format)"
     }
 }
 ```
 
 After adding it, you can use the custom dataset by specifying the `--dataset dataset_name` argument.
 
-This project currently supports datasets in two formats: alpaca and sharegpt, where a dataset in the alpaca format should be organized as follows:
+This project currently supports datasets in two formats: **alpaca** and **sharegpt**, where a dataset in the alpaca format should be organized as follows:
 
 ```json
 [
   {
-    "instruction": "user instruction", // required
-    "input": "user input", // optional
-    "output": "model response", // required
-    "history": [ // optional
-      ["instruction in the first round", "response in the first round"],
-      ["instruction in the second round", "response in the second round"]
+    "instruction": "user instruction (required)",
+    "input": "user input (optional)",
+    "output": "model response (required)",
+    "history": [
+      ["instruction in the first round (optional)", "response in the first round (optional)"],
+      ["instruction in the second round (optional)", "response in the second round (optional)"]
     ]
   }
 ]
 ```
 
-For data in the above format, the `columns` in the definition should be:
+For data in the above format, the `columns` in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -54,7 +54,7 @@
 where the `prompt` and `response` columns should be non-empty strings, representing the user instruction and the model response respectively. The content of the `query` column will be concatenated with the `prompt` column and used as the model input.
 
-The `history` column is a list of string two-tuples, each representing the instruction and response of one round in the chat history. The model responses in every round **will be used for training**.
+The `history` column is a list of string two-tuples, each representing the instruction and response of one round in the chat history. Note that the model responses in every round **will be used for training**.
 
 For pre-training datasets, only the content of the `prompt` column will be used for training.
 
@@ -76,23 +76,21 @@
 ```json
 [
   {
-    "conversations": [ // required
-      [
-        {
-          "from": "human",
-          "value": "user instruction"
-        },
-        {
-          "from": "gpt",
-          "value": "model response"
-        }
-      ] // the length must be a multiple of 2 and must follow the u/a/u/a/u/a order
+    "conversations": [
+      {
+        "from": "human",
+        "value": "user instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      }
     ]
   }
 ]
 ```
 
-For data in the above format, the `columns` in the definition should be:
+For data in the above format, the `columns` in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -104,4 +102,6 @@
 }
 ```
 
+where the `messages` column must be a list of even length whose items follow the user/model/user/model order.
+
 Pre-training datasets and preference datasets do not yet support the sharegpt format.
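+
+For example, a sketch of a `dataset_info.json` entry for a preference dataset (the dataset name and file name are illustrative); the `ranking` flag marks it as a preference dataset:
+
+```json
+"example_preference_dataset": {
+  "file_name": "example_preference.json",
+  "ranking": true
+}
+```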