Compare commits

...

7 Commits
FM_9G ... FM_9G

Author SHA1 Message Date
p83651209 1033ad4a75 Update README.md 2024-11-12 13:41:57 +08:00
p83651209 b63fcef8d2 Update README.md 2024-11-12 12:22:21 +08:00
p83651209 db10b9114b Update README.md 2024-11-12 11:16:38 +08:00
p83651209 58a7967a98 Update inference.py 2024-11-03 20:04:38 +08:00
p83651209 cd1bdcf117 Add model_final_url.txt 2024-11-03 13:30:04 +08:00
p83651209 4c8196bc84 Delete model_final 2024-11-03 13:29:30 +08:00
p83651209 124160cb1e Add model_final 2024-11-03 12:49:21 +08:00
3 changed files with 314 additions and 289 deletions

View File

@ -1,24 +1,27 @@
方案: 夸克网盘 docker链接https://pan.quark.cn/s/4cda395f13e8
全参数微调,使用不同数据集训练多个模型和推理时增强进行融合。 (没有会员请联系我下载)
训练代码: 1.使用llama-factory对九格模型进行全参数微调。数据集见dataset
LLaMA-Factory.zip 解压后使用可参照https://github.com/hiyouga/LLaMA-Factory配置环境或将代码映射到docker中使用。
训练train.sh。将数据集放到LLaMA-Factory/data文件夹下将train.sh放到LLaMA-Factory下使用。
推理: python inference.py(需在inference.py中修改好模型路径。) test_case.json是从题目中提取出来的测试用例。
百度网盘需要收费,使用阿里云盘 2.训练和推理都已验证无误在A100*8卡机器上。
model_wight:通过百度网盘分享的文件: docker 启动sudo docker run -it --runtime=nvidia --gpus all --shm-size=256g wjf:train
链接https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew 推理python inference.py
提取码6666 训练:
https://www.alipan.com/s/FTPWUSBuz7s cd training
sh training.sh
docker:
链接https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew
提取码6666
https://www.alipan.com/s/FTPWUSBuz7s
train_data: 3.推理使用多checkpoint、多次推理融合。
链接https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew
提取码6666 4.所有资料都已打包进docker只需要docker即可。
https://www.alipan.com/s/FTPWUSBuz7s
5.启动训练时将覆盖提交的checkpoint。
6.docker卡在数据处理可能是机器的问题尝试docker中输入
export NCCL_DEBUG=INFO
export NCCL_SHM_DISABLE=1
export NCCL_P2P_DISABLE=1
由于需要保存多个checkpoint请务必保证磁盘空间足够大于500G。
7.提交不易请有问题时及时联系我电话13121813131

View File

@ -1,270 +1,278 @@
# Standard library.
import json
import re
import subprocess
import sys

# Third party.
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, StoppingCriteria

# Device onto which every model is loaded for generation.
device = "cuda"
def exec_code(test):
    """Execute each generated solution against its test cases and score it.

    ``test`` is a list of dicts whose ``raw_outputs`` field holds the model's
    raw completion.  Each solution is paired positionally with an entry of
    ``test_case.json``, written to ``code_.py`` together with that entry's
    asserts, and run in a subprocess under a 3-second timeout.  Items are
    annotated in place with ``result`` ("True"/"False") and a cleaned
    ``raw_outputs``; the annotated list plus the pass rate are returned.

    NOTE(review): relies on the module-level global ``model_path`` for the
    output-file name, and on the Unix ``timeout`` command being available.
    """
    with open("test_case.json", "r") as f:
        test_cases = json.load(f)

    right_num = 0
    all_num = 0
    # Preamble prepended to every candidate so common imports are available.
    package = "import os, sys, math, re, json, random\n"

    for item, test_case in zip(test, test_cases):
        # Strip markdown fencing: prefer a single closed ```python block,
        # then a block terminated by an assert, then everything after "python\n".
        if "```python\n" in item["raw_outputs"]:
            fenced = re.findall('```python(.*?)```', item["raw_outputs"], re.DOTALL)
            if len(fenced) == 1:
                item["raw_outputs"] = fenced[0]
            else:
                fenced = re.findall('```python(.*?)assert', item["raw_outputs"], re.DOTALL)
                if len(fenced) == 1:
                    item["raw_outputs"] = fenced[0]
                else:
                    marker = "python\n"
                    item["raw_outputs"] = item["raw_outputs"][item["raw_outputs"].index(marker) + len(marker):]
                    print(item)

        code = item["raw_outputs"].replace("<|im_end|>", "").replace("</s>", "").replace("```", "").strip().rstrip("\n")
        raw_code = code

        # Truncate after the last indented "return" so trailing chatter
        # (explanations, examples) emitted by the model is discarded.
        lines = raw_code.split("\n")
        last_line = 0
        for index, line in enumerate(lines):
            if " return" in line:
                last_line = index
        code = "\n".join(lines[:last_line + 1])

        # Materialize candidate + asserts and run it with a hard timeout.
        with open('code_.py', 'w') as fout:
            fout.write(package + code + "\n" + "\n".join(test_case["test_case"]))
        batcmd = 'timeout 3 ' + sys.executable + ' code_.py'
        try:
            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
            right_num += 1
            item["result"] = "True"
        except Exception as e:
            # Any failure (assert, crash, timeout) marks the item wrong.
            print("++++++++++++++++++++++++++++++++++++++++++++++++++++\n", raw_code, "\n-----------------------------------------\n\n\n", package + code + "\n--------------------------\n" + "\n".join(test_case["test_case"]))
            print("--------------------------------------------------------\n\n\nitem:", item)
            print("e: ", e, "\n================================================\n")
            item["result"] = "False"
        all_num += 1
        item["raw_outputs"] = [code]

    print(len(test), right_num, all_num, right_num / all_num)
    with open(f'wjf_{model_path.replace("/", "-")}{right_num / all_num}.json', "w") as f:
        json.dump(test, f, indent=4)
    return test, right_num / all_num
def get_result(model, tokenizer):
    """Generate code completions with a task-specific prompt format.

    Reads the round-4 question file (JSONL), builds a chat prompt per example
    and seeds generation with an ``ai_prefix`` — either ``def <name>`` parsed
    from the first test case ("Write ..." questions) or the code skeleton left
    after stripping the docstring.  Each example is annotated in place with
    ``prompt`` (and ``test_case`` where applicable) plus ``raw_outputs``;
    the annotated list is returned.
    """
    test = []
    with open("/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl", "r") as f:
        for line in f:
            test.append(json.loads(line))

    all_score = 0
    all_num = 0
    test_num = 1000
    from tqdm import tqdm

    for example in tqdm(test[:]):
        # Normalize docstring quotes so the regex below matches either form.
        example["question"] = example["question"].replace("'''", '"""')
        ai_prefix = ""

        if example["question"].split(" ")[0] == "Write":
            # "Write a function ..." style: first line is the task, the rest
            # are asserts; derive the function name from the first assert.
            question = example["question"][:example["question"].index("\n")].strip().rstrip()
            test_case = example["question"][example["question"].index("\n"):].split("\n")
            print("test_case: ", test_case)
            function_name = test_case[1].split(" ")[1].split("(")[0]
            ai_prefix = "def " + function_name
            messages = [
                {"role": "user", "content": question + "\n\n" + ("\n".join(test_case))}
            ]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            text += ai_prefix
            example["test_case"] = test_case
        else:
            # Skeleton style: the docstring is the task; the remaining code
            # (signature etc.) seeds generation.
            tmp = re.findall(r'"""(.*?)"""', example["question"], flags=re.DOTALL)[0].split("\n")
            question = ""
            for line in tmp:
                line = line.strip().rstrip()
                if len(line) == 0:
                    continue
                question += line + " "
            code = re.sub(r'"""(.*?)"""', '', example["question"], flags=re.DOTALL).strip().rstrip()
            ai_prefix = code
            messages = [
                {"role": "user", "content": question}
            ]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            text += ai_prefix

        example["prompt"] = text
        print("text: " , [text])
        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
        output = model.generate(input_ids,
                                max_new_tokens=600,
                                temperature=0.1,
                                ).squeeze()
        # Decode only the newly generated tail and re-attach the seed prefix.
        output_str = tokenizer.decode(output[input_ids.shape[1]:])
        output_str = ai_prefix + output_str
        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
        example["raw_outputs"] = output_str
    return test
def get_result_1(model, tokenizer):
    """Generate code completions with the plain question as the prompt.

    Variant of ``get_result`` that performs no prompt engineering: the raw
    question text becomes the single user message and no prefix seeds the
    generation.  Each example is annotated in place with ``prompt`` and
    ``raw_outputs``; the annotated list is returned.
    """
    test = []
    with open("/mnt/disk2/home/wujianfeng/com/code/code_round4.jsonl", "r") as f:
        for line in f:
            test.append(json.loads(line))

    all_score = 0
    all_num = 0
    test_num = 1000
    from tqdm import tqdm

    for example in tqdm(test[:]):
        messages = [
            {"role": "user", "content": example["question"]}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        example["prompt"] = text
        print("text: " , [text])
        input_ids = tokenizer([text], return_tensors="pt").to(device).input_ids
        output = model.generate(input_ids,
                                max_new_tokens=600,
                                temperature=0.1,
                                ).squeeze()
        # Decode only the tokens produced after the prompt.
        output_str = tokenizer.decode(output[input_ids.shape[1]:])
        print("output_str:\n", output_str, "\n-----------------------------------------------------------------")
        example["raw_outputs"] = output_str
    return test
# ---------------------------------------------------------------------------
# Driver: run both prompting strategies for every checkpoint, score each run,
# then merge the per-question answers, preferring any run that passed.
# ---------------------------------------------------------------------------
answers = {}

for model_path in [
    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all/TACO/",
    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_2/CodeNet4Repair/",
    "/mnt/disk2/home/wujianfeng/LLaMA-Factory/all_new_1/CodeExercise-Python-27k/",
]:
    print("model_path: ", model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map=device,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Strategy 1: engineered prompt with seeded prefix.
    test = get_result(model, tokenizer)
    test, score = exec_code(test)
    answers[score] = test

    # Strategy 2: plain question prompt.
    test = get_result_1(model, tokenizer)
    test, score = exec_code(test)
    answers[score] = test

'''
import os
for path in os.listdir("./"):
    if "home-wujianfeng" in path:
        with open(path, "r") as f:
            test = json.load(f)
        answers[float(path.split(".")[-2].split("-")[-1])] = test
'''

# Order the runs by score (ascending) and fuse per question: take the first
# run whose answer passed its tests, otherwise fall back to the first run.
answers = list(dict(sorted(answers.items())).values())
print("answers: ", answers)
right = 0
jiuge_right = 0
merge = []
for i in range(len(answers[0])):
    flag = 0
    for answer in answers:
        if answer[i]["result"] == "True":
            right += 1
            jiuge_right += 1
            flag = 1
            merge.append(answer[i])
            break

    if flag == 0:
        merge.append(answers[0][i])

print(right / len(answers[0]), jiuge_right / len(answers[0]))
with open("wjf_jiuge.jsonl", "w") as f:
    for item in merge:
        item.pop("result")
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

14
model_final_url.txt Normal file
View File

@ -0,0 +1,14 @@
model_weight:通过百度网盘分享的文件:
链接https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew
提取码6666
#https://www.alipan.com/s/FTPWUSBuz7s
docker:
链接https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew
提取码6666
#https://www.alipan.com/s/FTPWUSBuz7s
train_data:
链接https://pan.baidu.com/s/1paYNO7d5OYESuyw3BVo7Ew
提取码6666
#https://www.alipan.com/s/FTPWUSBuz7s