From 42c90c8183a49cadb2c2abcc58f6ea27d325231d Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 19:58:47 +0800 Subject: [PATCH] merge data part to the text stream --- data/dataset_info.json | 3 --- data/mllm_example_dataset/README.md | 25 ------------------ data/mllm_example_dataset/data/test-0.parquet | Bin 4580 -> 0 bytes .../mllm_example_dataset/data/train-0.parquet | Bin 4580 -> 0 bytes scripts/test_mllm.py | 24 +++++++++++------ src/llmtuner/data/template.py | 4 +-- 6 files changed, 18 insertions(+), 38 deletions(-) delete mode 100644 data/mllm_example_dataset/README.md delete mode 100644 data/mllm_example_dataset/data/test-0.parquet delete mode 100644 data/mllm_example_dataset/data/train-0.parquet diff --git a/data/dataset_info.json b/data/dataset_info.json index f9adf108..4958d8c7 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -414,9 +414,6 @@ }, "folder": "python" }, - "llava_instruct": { - "hf_hub_url": "HuggingFaceH4/llava-instruct-mix-vsft" - }, "mllm_instruct_example": { "file_name": "llava_instruct_example.json", "formatting": "llava", diff --git a/data/mllm_example_dataset/README.md b/data/mllm_example_dataset/README.md deleted file mode 100644 index d5c8c0e6..00000000 --- a/data/mllm_example_dataset/README.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -dataset_info: - features: - - name: messages - list: - - name: content - list: - - name: index - dtype: int64 - - name: text - dtype: string - - name: type - dtype: string - - name: role - dtype: string - - name: images - sequence: image -configs: -- config_name: default - data_files: - - split: train - path: data/train-* - - split: test - path: data/test-* ---- \ No newline at end of file diff --git a/data/mllm_example_dataset/data/test-0.parquet b/data/mllm_example_dataset/data/test-0.parquet deleted file mode 100644 index 42c20b192497168523c3d39447cdae4495085b84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4580 zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4} z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2 zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ- zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_kMpPvX5(fPocD;ve`lr& zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAHu4)iVC5Rj~iDS@$#AeTYHxA`usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r> zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI z&cAEI zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn# zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL17zNC-8)&Nj5N{w3etSwT_2 z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5 I;r~1T1e%nlz5oCK diff --git a/data/mllm_example_dataset/data/train-0.parquet b/data/mllm_example_dataset/data/train-0.parquet deleted file mode 100644 index 42c20b192497168523c3d39447cdae4495085b84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4580 zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4} z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2 zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ- zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_kMpPvX5(fPocD;ve`lr& zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAHu4)iVC5Rj~iDS@$#AeTYHxA`usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r> zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI z&cAEI zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn# zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL17zNC-8)&Nj5N{w3etSwT_2 z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5 I;r~1T1e%nlz5oCK diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py index 94d8670b..b8fe3e0f 100644 --- a/scripts/test_mllm.py +++ b/scripts/test_mllm.py @@ -6,22 +6,23 @@ from datasets import load_dataset from peft import PeftModel from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor import shutil +from PIL import Image """usage python3 scripts/test_mllm.py \ --base_model_path llava-hf/llava-1.5-7b-hf \ --lora_model_path saves/llava-1.5-7b/lora/sft \ --model_path saves/llava-1.5-7b/lora/merged \ ---dataset_name data/mllm_example_dataset \ +--dataset_name data/llava_instruct_example.json \ --do_merge 1 """ def get_processor(model_path): - CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}""" + processor = AutoProcessor.from_pretrained(model_path) + CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }} ASSISTANT: {% else %}{{ message['content'] }}{% endif %} {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}""" tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) tokenizer.chat_template = CHAT_TEMPLATE - processor = AutoProcessor.from_pretrained(model_path) processor.tokenizer = tokenizer return processor @@ -69,7 +70,7 @@ def main( device_map="cuda", ) processor = get_processor(model_path) - raw_datasets = load_dataset(dataset_name) + raw_datasets = load_dataset("json", data_files=dataset_name) train_dataset = raw_datasets["train"] examples = train_dataset.select(range(3)) texts = [] @@ -80,11 +81,18 @@ def main( messages, tokenize=False, add_generation_prompt=False ) texts.append(text) - images.append(example["images"][0]) - batch = processor(texts, images, return_tensors="pt", padding=True).to("cuda") + images.append(Image.open(example["images"][0])) + batch = processor(text=texts, images=images, return_tensors="pt", padding=True).to( + "cuda" + ) output = model.generate(**batch, max_new_tokens=100) - res = processor.batch_decode(output, skip_special_tokens=True) - print(res) + res_list = processor.batch_decode(output, skip_special_tokens=True) + for i, prompt in enumerate(texts): + res = res_list[i] + print(f"#{i}") + print(f"prompt:{prompt}") + print(f"response:{res[len(prompt):].strip()}") + print() if __name__ == "__main__": diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index 311660aa..e6cdadd6 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -1012,8 +1012,8 @@ _register_template( _register_template( name="llava", - format_user=StringFormatter(slots=["USER: {{content}} "]), - format_assistant=StringFormatter(slots=["ASSISTANT: {{content}}"]), + format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]), + format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), default_system=( "A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the user's questions."