From 26bb024cb49ed78d1b6b71e62b78f174c299e8dc Mon Sep 17 00:00:00 2001 From: "chaoyu@qiyuanlab.com" Date: Tue, 16 Jul 2024 18:34:11 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- quick_start_clean/readmes/quick_start.md | 1 + 1 file changed, 1 insertion(+) diff --git a/quick_start_clean/readmes/quick_start.md b/quick_start_clean/readmes/quick_start.md index 0c4be0f..941ca0f 100644 --- a/quick_start_clean/readmes/quick_start.md +++ b/quick_start_clean/readmes/quick_start.md @@ -137,6 +137,7 @@ for line in sys.stdin: temp_json = {"input": "", "output": line.strip()}#预训练计算Loss时只计算output部分,所以input字段为空 print(json.dumps(temp_json, ensure_ascii=False)) ``` + 脚本使用方法如下,其中pretrain.txt是原始txt数据,pretrain.jsonl是输出的jsonl格式数据: ```shell cat pretrain.txt | python convert_txt2jsonl.py > pretrain.jsonl