3 changed files with 289 additions and 314 deletions
--- a/README.md
+++ b/README.md
@ -1,27 +1,24 @@
-夸克网盘 docker链接：https://pan.quark.cn/s/4cda395f13e8  
-(没有会员请联系我下载)
+方案：
+全参数微调，使用不同数据集训练多个模型和推理时增强进行融合。


-1.使用llama-factory对九格模型进行全参数微调。数据集见dataset
+训练代码：
+LLaMA-Factory.zip 解压后使用，可参照 https://github.com/hiyouga/LLaMA-Factory 配置环境，或将代码映射到 docker 中使用。
+训练：train.sh。将数据集放到LLaMA-Factory/data文件夹下，将train.sh放到LLaMA-Factory下使用。
+推理：python inference.py（需先在 inference.py 中修改好模型路径）。test_case.json 是从题目中提取出来的测试用例。

-2.训练和推理都已验证无误，在A100*8卡机器上。
-docker 启动：sudo docker run -it --runtime=nvidia --gpus all --shm-size=256g wjf:train
-推理：python inference.py
-训练：
-cd training
-sh training.sh
+百度网盘需要收费，使用阿里云盘
+model_weight: 通过百度网盘分享的文件：
+链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
+提取码：6666
+https://www.alipan.com/s/FTPWUSBuz7s

+docker:
+链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
+提取码：6666
+https://www.alipan.com/s/FTPWUSBuz7s

-3.推理使用多checkpoint、多次推理融合。
-
-4.所有资料都已打包进docker，只需要docker即可。
-
-5.启动训练时将覆盖提交的checkpoint。
-
-6.docker卡在数据处理可能是机器的问题，尝试docker中输入：
-export NCCL_DEBUG=INFO
-export NCCL_SHM_DISABLE=1
-export NCCL_P2P_DISABLE=1
-由于需要保存多个checkpoint，请务必保证磁盘空间足够，大于500G。
-
-7.提交不易，请有问题是及时联系我（电话：13121813131）
+train_data:
+链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
+提取码：6666
+https://www.alipan.com/s/FTPWUSBuz7s
--- a/inference.py
+++ b/inference.py
@ -1,278 +1,270 @@
-import json, torch, re, sys, subprocess
-
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, StoppingCriteria
-device = "cuda" # the device to load the model onto
-from tqdm import tqdm
-
-
-def exec_code(test):
-    with open("test_case.json", "r") as f:
-        test_cases = json.load(f)
-    
-    right_num = 0
-    all_num = 0
-    package = "import os, sys, math, re, json, random\n"
-    
-    for item, test_case in zip(test, test_cases):
-    
-        
-        if "```python\n" in item["raw_outputs"]:
-            matches = re.findall('```python(.*?)```', item["raw_outputs"], re.DOTALL)
-            if len(matches) == 1:
-                item["raw_outputs"] = matches[0]
-            else:
-                matches = re.findall('```python(.*?)assert', item["raw_outputs"], re.DOTALL)
-                if len(matches) == 1:
-                    item["raw_outputs"] = matches[0]
-                else:
-                    item["raw_outputs"] = item["raw_outputs"][item["raw_outputs"].index("python\n") + len("python\n"):]
-                    print(item)
-                    #break
-        
-    
-        code = item["raw_outputs"].replace("<|im_end|>", "").replace("</s>", "").replace("```", "").strip().rstrip("\n")
-        
-        raw_code = code
-         
-        codes = raw_code.split("\n")
-        last_line = 0
-        for index, line in enumerate(codes):
-            if "  return" in line:
-                last_line = index
-        
-        code = "\n".join(codes[:last_line+1])
-        
-        
-        '''
-        if raw_code != code:
-            print("\n--------------------------------------------------------\n", [raw_code], "\n--------------------------------------------------------\n")
-            print("clean:\n", [code], "\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n\n\n")
-        '''
-            
-    
-        with open('code_.py', 'w') as fout:
-            fout.write(package + code + "\n" + "\n".join(test_case["test_case"]))
-    
-        batcmd = 'timeout 3 ' + sys.executable + ' code_.py'
-        try:
-            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
-            right_num += 1
-            item["result"] = "True"
-        except Exception as e:
-            print("++++++++++++++++++++++++++++++++++++++++++++++++++++\n", raw_code, "\n-----------------------------------------\n\n\n", package + code + "\n--------------------------\n" + "\n".join(test_case["test_case"]))
-            print("--------------------------------------------------------\n\n\nitem:", item)
-            print("e: ", e, "\n================================================\n")#, e, )
-            item["result"] = "False"
-    
-        all_num += 1
-            
-        item["raw_outputs"] = [code]
-    
-    print(len(test), right_num, all_num, right_num / all_num)
-    with open(f'wjf_{model_path.replace("/", "-")}{right_num / all_num}.json', "w") as f:
-        json.dump(test, f, indent=4)
-        
-    return test, right_num / all_num
-    
-
-
-def get_result(model, tokenizer):
-    
-    test = []
-    with open("/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl", "r") as f:
-        #test = json.load(f)
-        for line in f:
-            test.append(json.loads(line))
-    
-    all_score = 0
-    all_num = 0
-    test_num = 1000
-    
-    
-    from tqdm import tqdm
-    for example in tqdm(test[:]):
-        #print(example["question"])
-        example["question"] = example["question"].replace("'''", '"""')
-        
-        ai_prefix = ""
-        if example["question"].split(" ")[0] == "Write":
-            question = example["question"][:example["question"].index("\n")].strip().rstrip()
-            test_case = example["question"][example["question"].index("\n"):].split("\n")
-            print("test_case: ", test_case)
-            
-            function_name = test_case[1].split(" ")[1].split("(")[0]
-            ai_prefix = "def " + function_name
-            
-            messages = [
-                {"role": "user", "content": question + "\n\n" + ("\n".join(test_case))}
-            ]
-            text = tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-            text += ai_prefix
-            example["test_case"] = test_case
-            
-    
-        else:
-            tmp = re.findall(r'"""(.*?)"""', example["question"], flags=re.DOTALL)[0].split("\n")
-            question = ""
-            for line in tmp:
-                line = line.strip().rstrip()
-                if len(line) == 0:
-                    continue
-                
-                #if "xample" in line and len(line) < 20:
-                #    break
-                
-                question += line + " "
-                
-            code = re.sub(r'"""(.*?)"""', '', example["question"], flags=re.DOTALL).strip().rstrip()
-            ai_prefix = code
-            
-            messages = [
-                {"role": "user", "content": question}
-            ]
-            text = tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-    
-            text += ai_prefix
-    
-    
-        example["prompt"] = text
-        print("text: " , [text])
-        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
-        
-
-        output = model.generate(input_ids, 
-                                #top_p=1.0, 
-                                max_new_tokens=600, 
-                                #repetition_penalty=1.1 + t*0.01,
-                                temperature=0.1,
-                                #no_repeat_ngram_size = 5,
-                                ).squeeze()
-
-        output_str = tokenizer.decode(output[input_ids.shape[1]:])
-        output_str = ai_prefix + output_str
-        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
-            
-        example["raw_outputs"] = output_str#re.findall(r'```python(.*?)```', output_str)
-    return test
-    
-    
-
-def get_result_1(model, tokenizer):
-    
-    test = []
-    with open("/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl", "r") as f:
-        #test = json.load(f)
-        for line in f:
-            test.append(json.loads(line))
-    
-    all_score = 0
-    all_num = 0
-    test_num = 1000
-    
-    
-    from tqdm import tqdm
-    for example in tqdm(test[:]):
-        #print(example["question"])
-        messages = [
-            {"role": "user", "content": example["question"]}
-        ]
-        text = tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True
-            )
-
-    
-        example["prompt"] = text
-        print("text: " , [text])
-        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
-        
-
-        output = model.generate(input_ids, 
-                                #top_p=1.0, 
-                                max_new_tokens=600, 
-                                #repetition_penalty=1.1 + t*0.01,
-                                temperature=0.1,
-                                #no_repeat_ngram_size = 5,
-                                ).squeeze()
-        output_str = tokenizer.decode(output[input_ids.shape[1]:])
-        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
-            
-        example["raw_outputs"] = output_str#re.findall(r'```python(.*?)```', output_str)
-    return test
-    
-
-answers = {}
-
-for model_path in [
-    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all/TACO/",
-    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_2/CodeNet4Repair/",
-    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_1/CodeExercise-Python-27k/",  
-]:
-    print("model_path: ", model_path)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        torch_dtype="auto",
-        device_map=device,
-        trust_remote_code=True,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    
-    test = get_result(model, tokenizer)
-    test, score = exec_code(test)
-    answers[score] = test
-    
-
-    test = get_result_1(model, tokenizer)
-    test, score = exec_code(test)
-    answers[score] = test
-
-'''
-import os
-for path in os.listdir("./"):
-    if "home-wujianfeng" in path: 
-        with open(path, "r") as f:
-            test = json.load(f)
-        answers[float(path.split(".")[-2].split("-")[-1])] = test
-''' 
-    
-answers = list(dict(sorted(answers.items())).values())
-print("answers: ", answers)
-right = 0
-jiuge_right = 0
-merge = []
-for i in range(len(answers[0])):
-#for i in range(2):
-    flag = 0
-    for answer in answers:
-        if answer[i]["result"] == "True":
-            right += 1
-            jiuge_right += 1
-            flag = 1
-            merge.append(answer[i])
-            break
-            
-    if flag == 0:
-        merge.append(answers[0][i])
-         
-
-
-print(right / len(answers[0]), jiuge_right / len(answers[0]))
-with open("wjf_jiuge.jsonl", "w") as f:
-    for item in merge:
-        item.pop("result")
-        f.write(json.dumps(item, ensure_ascii=False) + '\n')
-    
-
-
-
-        
-            
+import json, torch, re, sys, subprocess
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, StoppingCriteria
+device = "cuda" # the device to load the model onto
+from tqdm import tqdm
+
+
+def exec_code(test):
+    """Run each generated solution against its test cases and score the batch.
+
+    Mutates every item of ``test``: ``result`` is set to "True" or "False" and
+    ``raw_outputs`` is replaced by a one-element list holding the cleaned code.
+    Reads the module-level ``model_path`` to name the JSON dump it writes.
+    Returns ``(test, right_num / all_num)``; raises ZeroDivisionError if no item ran.
+    """
+    with open("test_case.json", "r") as f:
+        test_cases = json.load(f)
+    right_num = 0
+    all_num = 0
+    package = "import os, sys, math, re, json, random\n"
+    for item, test_case in zip(test, test_cases):
+        # Unwrap markdown: exactly one ```python ... ``` span, else exactly one
+        # ```python ... assert span, else everything after the first "python\n".
+        if "```python\n" in item["raw_outputs"]:
+            matches = re.findall('```python(.*?)```', item["raw_outputs"], re.DOTALL)
+            if len(matches) == 1:
+                item["raw_outputs"] = matches[0]
+            else:
+                matches = re.findall('```python(.*?)assert', item["raw_outputs"], re.DOTALL)
+                if len(matches) == 1:
+                    item["raw_outputs"] = matches[0]
+                else:
+                    item["raw_outputs"] = item["raw_outputs"][item["raw_outputs"].index("python\n") + len("python\n"):]
+                    print(item)
+                    #break
+        # Drop end-of-sequence markers and any leftover fences.
+        code = item["raw_outputs"].replace("<|im_end|>", "").replace("</s>", "").replace("```", "").strip().rstrip("\n")
+        raw_code = code
+        # Cut everything after the last line containing "  return"; when no line
+        # matches, last_line stays 0 and only the first line is kept.
+        codes = raw_code.split("\n")
+        last_line = 0
+        for index, line in enumerate(codes):
+            if "  return" in line:
+                last_line = index
+        
+        code = "\n".join(codes[:last_line+1])
+        '''
+        if raw_code != code:
+            print("\n--------------------------------------------------------\n", [raw_code], "\n--------------------------------------------------------\n")
+            print("clean:\n", [code], "\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n\n\n")
+        '''
+        # Write the import prelude, the cleaned code and the entry's test_case lines to a scratch script.
+        with open('code_.py', 'w') as fout:
+            fout.write(package + code + "\n" + "\n".join(test_case["test_case"]))
+        # Run it with the current interpreter under `timeout 3`; any exception from check_output marks a failure.
+        batcmd = 'timeout 3 ' + sys.executable + ' code_.py'
+        try:
+            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
+            right_num += 1
+            item["result"] = "True"
+        except Exception as e:
+            print("++++++++++++++++++++++++++++++++++++++++++++++++++++\n", raw_code, "\n-----------------------------------------\n\n\n", package + code + "\n--------------------------\n" + "\n".join(test_case["test_case"]))
+            print("--------------------------------------------------------\n\n\nitem:", item)
+            print("e: ", e, "\n================================================\n")#, e, )
+            item["result"] = "False"
+    
+        all_num += 1
+        # Store the cleaned code, wrapped in a list, in place of the raw generation.
+        item["raw_outputs"] = [code]
+    
+    print(len(test), right_num, all_num, right_num / all_num)
+    with open(f'wjf_{model_path.replace("/", "-")}{right_num / all_num}.json', "w") as f:
+        json.dump(test, f, indent=4)
+    return test, right_num / all_num
+    
+
+
+def get_result(model, tokenizer):
+    """Generate one completion per benchmark question with a code-prefix prompt.
+
+    Loads the examples from the hard-coded JSONL path. Questions whose first
+    word is "Write" are prompted with the task plus its test lines, and the
+    reply is seeded with ``def <function_name>``; all other questions send the
+    flattened docstring text and seed the reply with the remaining code.
+    Stores ``prompt`` and ``raw_outputs`` (plus ``test_case`` for "Write"
+    questions) on each example and returns the list of examples.
+    """
+    test = []
+    with open("/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl", "r") as f:
+        #test = json.load(f)
+        for line in f:
+            test.append(json.loads(line))
+    # NOTE: the three counters below are never read in this function.
+    all_score = 0
+    all_num = 0
+    test_num = 1000
+    from tqdm import tqdm
+    for example in tqdm(test[:]):
+        # Normalise ''' to """ so one regex can find the docstring below.
+        example["question"] = example["question"].replace("'''", '"""')
+        
+        ai_prefix = ""
+        if example["question"].split(" ")[0] == "Write":
+            question = example["question"][:example["question"].index("\n")].strip().rstrip()
+            test_case = example["question"][example["question"].index("\n"):].split("\n")
+            print("test_case: ", test_case)
+            # test_case[0] is "" because the slice starts at "\n"; the name is the second
+            # space-separated token of test_case[1] up to "(" (presumably an assert line - confirm).
+            function_name = test_case[1].split(" ")[1].split("(")[0]
+            ai_prefix = "def " + function_name
+            
+            messages = [
+                {"role": "user", "content": question + "\n\n" + ("\n".join(test_case))}
+            ]
+            text = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+            text += ai_prefix
+            example["test_case"] = test_case
+    
+        else:
+            tmp = re.findall(r'"""(.*?)"""', example["question"], flags=re.DOTALL)[0].split("\n")
+            question = ""
+            for line in tmp:
+                line = line.strip().rstrip()
+                if len(line) == 0:
+                    continue
+                #if "xample" in line and len(line) < 20:
+                #    break
+                question += line + " "
+            # What remains after removing the triple-quoted spans is the stub the model continues.
+            code = re.sub(r'"""(.*?)"""', '', example["question"], flags=re.DOTALL).strip().rstrip()
+            ai_prefix = code
+            
+            messages = [
+                {"role": "user", "content": question}
+            ]
+            text = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+            text += ai_prefix
+    
+        example["prompt"] = text
+        print("text: " , [text])
+        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
+        # NOTE(review): temperature is passed without do_sample - confirm sampling is enabled by the model's generation config.
+        output = model.generate(input_ids, 
+                                #top_p=1.0, 
+                                max_new_tokens=600, 
+                                #repetition_penalty=1.1 + t*0.01,
+                                temperature=0.1,
+                                #no_repeat_ngram_size = 5,
+                                ).squeeze()
+        # Decode only the newly generated tokens, then re-attach the seeded prefix.
+        output_str = tokenizer.decode(output[input_ids.shape[1]:])
+        output_str = ai_prefix + output_str
+        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
+        example["raw_outputs"] = output_str#re.findall(r'```python(.*?)```', output_str)
+    return test
+    
+    
+
+def get_result_1(model, tokenizer):
+    """Generate one completion per benchmark question from the raw question text.
+
+    Loads the examples from the hard-coded JSONL path, sends each question
+    unchanged as the user message, and stores ``prompt`` and ``raw_outputs``
+    (the decoded continuation only) on each example before returning the list.
+    """
+    test = []
+    with open("/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl", "r") as f:
+        #test = json.load(f)
+        for line in f:
+            test.append(json.loads(line))
+    # NOTE: the three counters below are never read in this function.
+    all_score = 0
+    all_num = 0
+    test_num = 1000
+    from tqdm import tqdm
+    for example in tqdm(test[:]):
+        #print(example["question"])
+        messages = [
+            {"role": "user", "content": example["question"]}
+        ]
+        text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+        example["prompt"] = text
+        print("text: " , [text])
+        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
+        # NOTE(review): temperature is passed without do_sample - confirm sampling is enabled by the model's generation config.
+        output = model.generate(input_ids, 
+                                #top_p=1.0, 
+                                max_new_tokens=600, 
+                                #repetition_penalty=1.1 + t*0.01,
+                                temperature=0.1,
+                                #no_repeat_ngram_size = 5,
+                                ).squeeze()
+        output_str = tokenizer.decode(output[input_ids.shape[1]:])
+        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
+            
+        example["raw_outputs"] = output_str#re.findall(r'```python(.*?)```', output_str)
+    return test
+    
+
+# Run every checkpoint with both prompt styles, then merge per test item: keep the first run that passed.
+answers = []  # (score, test) pairs; a list keeps runs whose scores tie (a dict keyed by score dropped them)
+for model_path in [
+  "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_1/checkpoint-600",
+  "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new/checkpoint-600/",
+  
+]:
+    print("model_path: ", model_path)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype="auto",
+        device_map=device,
+        trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    # Pass 1: prompt seeded with a code prefix (get_result).
+    test = get_result(model, tokenizer)
+    test, score = exec_code(test)
+    answers.append((score, test))
+    # Pass 2: the raw question as the whole prompt (get_result_1).
+    test = get_result_1(model, tokenizer)
+    test, score = exec_code(test)
+    answers.append((score, test))
+
+# Order the runs by ascending score, as before. Sorting on the score alone
+# avoids comparing the test lists themselves when two scores are equal.
+answers = [run for _, run in sorted(answers, key=lambda pair: pair[0])]
+print("answers: ", answers)
+right = 0
+jiuge_right = 0
+merge = []
+for i in range(len(answers[0])):
+    # i indexes test items, so the bound is the item count of a run, not the number of runs.
+    flag = 0
+    for answer in answers:
+        if answer[i]["result"] == "True":
+            right += 1
+            jiuge_right += 1
+            flag = 1
+            merge.append(answer[i])
+            break
+    # No run passed this item: fall back to the first (lowest-scoring) run's answer.
+    if flag == 0:
+        merge.append(answers[0][i])
+         
+
+# The encoding is explicit because ensure_ascii=False writes non-ASCII text verbatim.
+print(right / len(answers[0]), jiuge_right / len(answers[0]))
+with open("wjf_jiuge.jsonl", "w", encoding="utf-8") as f:
+    for item in merge:
+        item.pop("result")
+        f.write(json.dumps(item, ensure_ascii=False) + '\n')
+    
+
+
+
+        
+            
--- a/model_final_url.txt
+++ b/model_final_url.txt
@ -1,14 +0,0 @@
-model_wight:通过百度网盘分享的文件：
-链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
-提取码：6666
-#https://www.alipan.com/s/FTPWUSBuz7s
-
-docker:
-链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
-提取码：6666
-#https://www.alipan.com/s/FTPWUSBuz7s
-
-train_data:
-链接：https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 
-提取码：6666
-#https://www.alipan.com/s/FTPWUSBuz7s