Update README.md

2024-11-12 13:41:57 +08:00 · 2024-11-12 12:22:21 +08:00 · 2024-11-12 11:16:38 +08:00 · 2024-11-03 20:04:38 +08:00 · 2024-11-03 13:30:04 +08:00 · 2024-11-03 13:29:30 +08:00
6 changed files with 4572 additions and 2 deletions
--- a/LLaMA-Factory.zip
+++ b/LLaMA-Factory.zip
--- a/README.md
+++ b/README.md
@ -1,2 +1,27 @@
-训练代码：
+夸克网盘 docker链接：https://pan.quark.cn/s/4cda395f13e8  
-LLaMA-Factory
+(没有会员请联系我下载)
 1.使用llama-factory对九格模型进行全参数微调。数据集见dataset
 2.训练和推理都已验证无误，在A100*8卡机器上。
 docker 启动：sudo docker run -it --runtime=nvidia --gpus all --shm-size=256g wjf:train
 推理：python inference.py
 训练：
 cd training
 sh training.sh
 3.推理使用多checkpoint、多次推理融合。
 4.所有资料都已打包进docker，只需要docker即可。
 5.启动训练时将覆盖提交的checkpoint。
 6.docker卡在数据处理可能是机器的问题，尝试docker中输入：
 export NCCL_DEBUG=INFO
 export NCCL_SHM_DISABLE=1
 export NCCL_P2P_DISABLE=1
 由于需要保存多个checkpoint，请务必保证磁盘空间足够，大于500G。
 7.提交不易，如有问题请及时联系我（电话：13121813131）
--- a/inference.py
+++ b/inference.py
@ -0,0 +1,278 @@
 import json, torch, re, sys, subprocess
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, StoppingCriteria
 device = "cuda" # the device to load the model onto
 from tqdm import tqdm
 def exec_code(test):
    """Run every generated solution against its test cases and score the run.

    Each entry of ``test`` is paired (in order) with an entry of
    ``test_case.json``. The code is extracted from ``item["raw_outputs"]``,
    trimmed after the last line that contains an indented ``return``, written
    to ``code_.py`` followed by the ``test_case["test_case"]`` lines, and
    executed with the current interpreter under ``timeout 3``.

    Args:
        test: list of dicts, each with a string under ``"raw_outputs"``.
            The dicts are mutated in place: ``"raw_outputs"`` becomes a
            one-element list holding the cleaned code and ``"result"`` is
            set to ``"True"`` or ``"False"``.

    Returns:
        A ``(test, accuracy)`` tuple, where ``accuracy`` is the fraction of
        executed items whose script exited successfully (``0.0`` when
        nothing was executed).

    Side effects:
        Reads ``test_case.json``; overwrites ``code_.py``; writes a
        ``wjf_<model_path><accuracy>.json`` report. The report name uses the
        module-level global ``model_path``, which must be set by the caller.
    """
    with open("test_case.json", "r", encoding="utf-8") as f:
        test_cases = json.load(f)
    right_num = 0
    all_num = 0
    package = "import os, sys, math, re, json, random\n"
    # NOTE: zip() stops at the shorter input, so surplus entries of `test`
    # are neither executed nor given a "result" key.
    for item, test_case in zip(test, test_cases):
        if "```python\n" in item["raw_outputs"]:
            # Prefer a single fenced block; otherwise fall back to the text
            # between the fence and the first "assert"; otherwise keep
            # everything after the opening fence.
            matches = re.findall('```python(.*?)```', item["raw_outputs"], re.DOTALL)
            if len(matches) == 1:
                item["raw_outputs"] = matches[0]
            else:
                matches = re.findall('```python(.*?)assert', item["raw_outputs"], re.DOTALL)
                if len(matches) == 1:
                    item["raw_outputs"] = matches[0]
                else:
                    item["raw_outputs"] = item["raw_outputs"][item["raw_outputs"].index("python\n") + len("python\n"):]
                    print(item)
        code = item["raw_outputs"].replace("<|im_end|>", "").replace("</s>", "").replace("```", "").strip().rstrip("\n")
        raw_code = code
        codes = raw_code.split("\n")
        # Drop whatever trails the last indented `return`. When there is no
        # such line, keep the whole snippet instead of cutting it down to
        # its first line (which left a bare `def ...:` header behind).
        last_line = len(codes) - 1
        for index, line in enumerate(codes):
            if "  return" in line:
                last_line = index
        code = "\n".join(codes[:last_line+1])
        # Explicit UTF-8 so non-ASCII generated code cannot fail on the
        # locale's default encoding.
        with open('code_.py', 'w', encoding="utf-8") as fout:
            fout.write(package + code + "\n" + "\n".join(test_case["test_case"]))
        # Argument list instead of a shell string: no quoting problems if the
        # interpreter path contains spaces, and no shell in between.
        batcmd = ['timeout', '3', sys.executable, 'code_.py']
        try:
            subprocess.check_output(batcmd)
        except (subprocess.SubprocessError, OSError) as e:
            # Non-zero exit (failed assert, crash, timeout) or the command
            # could not be started at all.
            print("++++++++++++++++++++++++++++++++++++++++++++++++++++\n", raw_code, "\n-----------------------------------------\n\n\n", package + code + "\n--------------------------\n" + "\n".join(test_case["test_case"]))
            print("--------------------------------------------------------\n\n\nitem:", item)
            print("e: ", e, "\n================================================\n")
            item["result"] = "False"
        else:
            right_num += 1
            item["result"] = "True"
        all_num += 1
        item["raw_outputs"] = [code]
    # Guard the empty case so an empty input does not raise ZeroDivisionError.
    accuracy = right_num / all_num if all_num else 0.0
    print(len(test), right_num, all_num, accuracy)
    with open(f'wjf_{model_path.replace("/", "-")}{accuracy}.json', "w") as f:
        json.dump(test, f, indent=4)
    return test, accuracy
 def get_result(model, tokenizer, data_path="/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl"):
    """Generate one completion per question, seeding the answer with a code prefix.

    Two question layouts are handled:

    * Questions whose first word is ``Write``: the first line is the
      instruction and the remaining lines are test cases. The function name
      is taken from the first test-case line and the answer is seeded with
      ``def <name>``.
    * Everything else: the text of the first triple-double-quoted docstring
      becomes the instruction, and the question with its docstrings removed
      becomes the answer prefix.

    Args:
        model: object exposing ``generate(input_ids, ...)``.
        tokenizer: object exposing ``apply_chat_template``, ``decode`` and
            the call interface used below.
        data_path: JSONL file with one object per line, each holding a
            ``"question"`` string. Defaults to the original hard-coded path.

    Returns:
        The list of loaded examples, each extended in place with
        ``"prompt"``, ``"raw_outputs"`` (prefix + generated text) and, for
        ``Write`` questions, ``"test_case"``.

    Note:
        Relies on the module-level ``device`` and ``tqdm`` names.
    """
    with open(data_path, "r") as f:
        # Skip blank lines so a trailing newline does not break json.loads.
        test = [json.loads(line) for line in f if line.strip()]
    for example in tqdm(test):
        # Normalise single-quoted docstrings so the regexes below match.
        example["question"] = example["question"].replace("'''", '"""')
        if example["question"].split(" ")[0] == "Write":
            newline_at = example["question"].index("\n")
            question = example["question"][:newline_at].strip()
            # The slice starts at the newline, so test_case[0] is "" and
            # test_case[1] is the first test-case line.
            test_case = example["question"][newline_at:].split("\n")
            print("test_case: ", test_case)
            # assumes that line looks like "assert name(...) ..." - verify against data
            function_name = test_case[1].split(" ")[1].split("(")[0]
            ai_prefix = "def " + function_name
            content = question + "\n\n" + ("\n".join(test_case))
            example["test_case"] = test_case
        else:
            docstring_lines = re.findall(r'"""(.*?)"""', example["question"], flags=re.DOTALL)[0].split("\n")
            # Flatten the docstring into one line, each kept line followed by a space.
            content = "".join(line.strip() + " " for line in docstring_lines if line.strip())
            # The question minus its docstrings is what the model must continue.
            ai_prefix = re.sub(r'"""(.*?)"""', '', example["question"], flags=re.DOTALL).strip()
        messages = [
            {"role": "user", "content": content}
        ]
        text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        text += ai_prefix
        example["prompt"] = text
        print("text: " , [text])
        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
        output = model.generate(input_ids,
                                max_new_tokens=600,
                                temperature=0.1,
                                ).squeeze()
        # Decode only the newly generated tokens, then restore the prefix.
        output_str = tokenizer.decode(output[input_ids.shape[1]:])
        output_str = ai_prefix + output_str
        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
        example["raw_outputs"] = output_str
    return test
 def get_result_1(model, tokenizer, data_path="/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl"):
    """Generate one completion per question using the raw question as the prompt.

    Unlike ``get_result``, the question text is sent to the chat template
    unchanged and no answer prefix is added.

    Args:
        model: object exposing ``generate(input_ids, ...)``.
        tokenizer: object exposing ``apply_chat_template``, ``decode`` and
            the call interface used below.
        data_path: JSONL file with one object per line, each holding a
            ``"question"`` string. Defaults to the original hard-coded path.

    Returns:
        The list of loaded examples, each extended in place with
        ``"prompt"`` and ``"raw_outputs"`` (the generated text).

    Note:
        Relies on the module-level ``device`` and ``tqdm`` names.
    """
    with open(data_path, "r") as f:
        # Skip blank lines so a trailing newline does not break json.loads.
        test = [json.loads(line) for line in f if line.strip()]
    for example in tqdm(test):
        messages = [
            {"role": "user", "content": example["question"]}
        ]
        text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        example["prompt"] = text
        print("text: " , [text])
        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
        output = model.generate(input_ids,
                                max_new_tokens=600,
                                temperature=0.1,
                                ).squeeze()
        # Decode only the tokens produced after the prompt.
        output_str = tokenizer.decode(output[input_ids.shape[1]:])
        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
        example["raw_outputs"] = output_str
    return test
 # Driver: evaluate every checkpoint with both prompting strategies, then
 # merge the runs by taking, per question, the first answer that passed.
 #
 # Runs are collected as (score, results) pairs. Keying a dict by the float
 # score (as before) silently dropped a run whenever two runs reached the
 # same accuracy.
 answers = []
 # NOTE: exec_code() reads the loop variable `model_path` as a global when
 # naming its report file, so the name must not be changed here.
 for model_path in [
    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all/TACO/",
    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_2/CodeNet4Repair/",
    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_1/CodeExercise-Python-27k/",
 ]:
    print("model_path: ", model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map=device,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Strategy 1: prompt seeded with a code prefix.
    test = get_result(model, tokenizer)
    test, score = exec_code(test)
    answers.append((score, test))
    # Strategy 2: raw question as the prompt.
    test = get_result_1(model, tokenizer)
    test, score = exec_code(test)
    answers.append((score, test))
 # Previously saved exec_code() reports could be merged instead of re-running
 # inference by loading each report and appending (score, results) to `answers`.
 #
 # Order the runs by ascending score, as the original sorted dict did; the
 # sort is stable, so equal scores keep their run order.
 # NOTE(review): ascending order makes answers[0] the lowest-scoring run,
 # which is also the fallback below - confirm this is intended.
 answers = [results for _, results in sorted(answers, key=lambda pair: pair[0])]
 print("answers: ", answers)
 right = 0
 jiuge_right = 0
 merge = []
 for i in range(len(answers[0])):
    for answer in answers:
        if answer[i]["result"] == "True":
            right += 1
            jiuge_right += 1
            merge.append(answer[i])
            break
    else:
        # No run solved this question: fall back to the first run's answer.
        merge.append(answers[0][i])
 print(right / len(answers[0]), jiuge_right / len(answers[0]))
 # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
 # rather than depending on the locale default.
 with open("wjf_jiuge.jsonl", "w", encoding="utf-8") as f:
    for item in merge:
        item.pop("result")
        f.write(json.dumps(item, ensure_ascii=False) + '\n')
--- a/model_final_url.txt
+++ b/model_final_url.txt
@ -0,0 +1,14 @@
 model_weight:通过百度网盘分享的文件：
 链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
 提取码：6666
 #https://www.alipan.com/s/FTPWUSBuz7s
 docker:
 链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
 提取码：6666
 #https://www.alipan.com/s/FTPWUSBuz7s
 train_data:
 链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
 提取码：6666
 #https://www.alipan.com/s/FTPWUSBuz7s
--- a/test_case.json
+++ b/test_case.json
--- a/train.sh
+++ b/train.sh
@ -0,0 +1,154 @@
 #!/bin/bash
 # Full-parameter SFT of the same base model on six datasets, one run after
 # another. Each run writes its checkpoints to a directory named after the
 # dataset. A failing run does not stop the following ones (no `set -e`),
 # matching the original sequence of independent commands.
 #
 # Per-step batch: 14 per device x 6 accumulation steps x 8 GPUs.
 #
 # Changes from the copy-pasted version:
 #   * --lr_scheduler_type was given twice (cosine, then
 #     cosine_with_restarts); only the later value is kept, which is the one
 #     that took effect.
 #   * --logging_step is spelled out as --logging_steps instead of relying
 #     on option-prefix matching.
 for dataset in \
    TACO \
    Tested-143k-Python-Alpaca \
    UltraInteract_sft \
    code_instructions_120k_alpaca \
    CodeExercise-Python-27k \
    CodeNet4Repair
 do
    deepspeed --include localhost:0,1,2,3,4,5,6,7 --master_port 21666 src/train.py \
        --stage sft \
        --model_name_or_path /mnt/diskhd/Backup/DownloadModel/2b_sft_model/ \
        --do_train \
        --dataset "$dataset" \
        --template jiuge \
        --finetuning_type full \
        --output_dir "$dataset" \
        --per_device_train_batch_size 14 \
        --gradient_accumulation_steps 6 \
        --logging_steps 1 \
        --save_steps 300 \
        --lr_scheduler_type cosine_with_restarts \
        --warmup_ratio 0.001 \
        --optim adamw_torch \
        --learning_rate 2e-5 \
        --num_train_epochs 2.0 \
        --plot_loss \
        --bf16 \
        --gradient_checkpointing \
        --report_to tensorboard \
        --deepspeed deepspeed_configs/zero2.json \
        --cutoff_len 2048
 done
Author	SHA1	Message	Date
p83651209	1033ad4a75	Update README.md	2024-11-12 13:41:57 +08:00
p83651209	b63fcef8d2	Update README.md	2024-11-12 12:22:21 +08:00
p83651209	db10b9114b	Update README.md	2024-11-12 11:16:38 +08:00
p83651209	58a7967a98	Update inference.py	2024-11-03 20:04:38 +08:00
p83651209	cd1bdcf117	Add model_final_url.txt	2024-11-03 13:30:04 +08:00
p83651209	4c8196bc84	Delete model_final	2024-11-03 13:29:30 +08:00
p83651209	124160cb1e	Add model_final	2024-11-03 12:49:21 +08:00
p83651209	b0406a26bb	Update README.md	2024-11-02 17:48:43 +08:00
p83651209	cc5b9a5ad8	Update README.md	2024-11-02 17:00:02 +08:00
p83651209	9441c81244	Update README.md	2024-11-02 16:54:13 +08:00
p83651209	ed4e38ea65	ADD file via upload	2024-11-02 16:52:22 +08:00
p83651209	0ff927cf92	Delete LLaMA-Factory.zip	2024-11-02 16:51:52 +08:00
p83651209	8807057563	Update README.md	2024-11-02 16:34:20 +08:00
p83651209	5b57b159f2	ADD file via upload	2024-11-02 16:32:04 +08:00
p83651209	7519028f67	Delete sft_code.sh	2024-11-02 16:30:05 +08:00
p83651209	2f35baea6c	ADD file via upload	2024-11-02 16:28:39 +08:00
p83651209	b6a00ea9ea	ADD file via upload	2024-11-02 16:18:55 +08:00
p83651209	5858ade20b	Update README.md	2024-11-02 16:18:09 +08:00
p83651209	defb7a8bdd	ADD file via upload	2024-11-02 16:15:33 +08:00
p83651209	2b55eb9f69	Update README.md	2024-11-02 16:09:56 +08:00
p83651209	93ba858875	ADD file via upload	2024-11-02 14:51:31 +08:00
p83651209	6b868588e1	Update README.md	2024-11-02 14:49:03 +08:00