# CPM-9G-8B/convert.py
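"""Convert a CPM-9G (BMTrain) checkpoint into a HuggingFace Llama-format
checkpoint directory, then run a short generation smoke test on the result.

convert_pkl() copies vocabs.txt, writes a Llama-style config.json derived from
the BMTrain config, remaps the BMTrain state-dict keys to Llama parameter
names, and saves pytorch_model.bin; test() reloads the converted directory
with LlamaForCausalLM and generates from a short Chinese prompt.
"""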

import sys
import shutil
import json

# Make the local CPM-9G training repo importable (source of CPM9GTokenizer).
sys.path.insert(0, "/home/wangshuo1/projects/CPM-9G/gejiu_train")

import torch
from transformers import LlamaForCausalLM, LlamaConfig
from cpm.cpm9g import CPM9GTokenizer

# Source BMTrain checkpoint directory, target HuggingFace output directory
# (both are expected to end with a trailing "/"), and the checkpoint file name.
source_path = "/data/public/zwl_data/11b-base/"
target_path = "/home/wangshuo1/projects/CPM-9G/convert_to_hf/11b-base-hf/"
file_name = "11b.pt"


def convert_pkl():
    # Copy the CPM-9G vocabulary file alongside the converted weights.
    shutil.copyfile(f"{source_path}vocabs.txt", f"{target_path}vocabs.txt")

    # Build a Llama-style config.json from the BMTrain training config.
    with open(f"{source_path}config.json") as f:
        bmt_config = json.load(f)
    config = {
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": bmt_config["dim_model"],
        "initializer_range": 0.02,
        "intermediate_size": bmt_config["dim_ff"],
        "max_length": 4096,
        "max_position_embeddings": 4096,
        "model_type": "llama",
        "num_attention_heads": bmt_config["num_heads"],
        "num_hidden_layers": bmt_config["num_layers"],
        "num_key_value_heads": bmt_config["num_kv_heads"],
        "pad_token_id": 0,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": None,
        "tie_word_embeddings": False,
        "torch_dtype": "float32",
        "transformers_version": "4.31.0",
        "use_cache": True,
        "vocab_size": bmt_config["vocab_size"],
    }
    with open(f"{target_path}config.json", "w") as f:
        json.dump(config, f, indent=2)

    # Remap BMTrain state-dict keys to HuggingFace Llama parameter names.
    state = torch.load(f"{source_path}{file_name}", map_location="cpu")
    new_state = {}
    new_state["model.embed_tokens.weight"] = state["input_embedding.weight"]
    new_state["lm_head.weight"] = state["lm_head.weight"]
    new_state["model.norm.weight"] = state["encoder.output_layernorm.weight"]
    layer_num = bmt_config["num_layers"]
    for lid in range(layer_num):
        print(lid)
        new_state[f"model.layers.{lid}.self_attn.q_proj.weight"] = state[f"encoder.layers.{lid}.self_att.self_attention.project_q.weight"]
        new_state[f"model.layers.{lid}.self_attn.k_proj.weight"] = state[f"encoder.layers.{lid}.self_att.self_attention.project_k.weight"]
        new_state[f"model.layers.{lid}.self_attn.v_proj.weight"] = state[f"encoder.layers.{lid}.self_att.self_attention.project_v.weight"]
        new_state[f"model.layers.{lid}.self_attn.o_proj.weight"] = state[f"encoder.layers.{lid}.self_att.self_attention.attention_out.weight"]
        new_state[f"model.layers.{lid}.mlp.gate_proj.weight"] = state[f"encoder.layers.{lid}.ffn.ffn.w_in.w_0.weight"]
        new_state[f"model.layers.{lid}.mlp.up_proj.weight"] = state[f"encoder.layers.{lid}.ffn.ffn.w_in.w_1.weight"]
        new_state[f"model.layers.{lid}.mlp.down_proj.weight"] = state[f"encoder.layers.{lid}.ffn.ffn.w_out.weight"]
        new_state[f"model.layers.{lid}.input_layernorm.weight"] = state[f"encoder.layers.{lid}.self_att.layernorm_before_attention.weight"]
        new_state[f"model.layers.{lid}.post_attention_layernorm.weight"] = state[f"encoder.layers.{lid}.ffn.layernorm_before_ffn.weight"]
    del state

    torch.save(new_state, f"{target_path}pytorch_model.bin")
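

# Optional sketch, not part of the original script: recent transformers versions
# prefer a model.safetensors file over pytorch_model.bin when both are present,
# so the converted weights could also be written in that format. Assumes the
# `safetensors` package is installed; `save_as_safetensors` is a hypothetical
# helper name and reuses target_path defined above.
def save_as_safetensors(state_dict):
    from safetensors.torch import save_file
    # safetensors refuses tensors that share storage, so clone each tensor
    # into its own contiguous buffer before writing.
    state_dict = {k: v.detach().clone().contiguous() for k, v in state_dict.items()}
    save_file(state_dict, f"{target_path}model.safetensors")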


def test():
    # Smoke test: reload the converted checkpoint and generate from a short prompt
    # using the CPM-9G tokenizer.
    LlamaConfig.from_pretrained(f"{target_path}")  # sanity check: the generated config.json parses
    tokenizer = CPM9GTokenizer(f"{target_path}vocabs.txt")
    model = LlamaForCausalLM.from_pretrained(f"{target_path}").cuda()
    text = "请介绍一下清华大学:"  # "Please introduce Tsinghua University:"
    inputs = torch.tensor([[tokenizer.bos_id] + tokenizer.encode(text)]).cuda()
    output = model.generate(inputs, max_length=200)[0].tolist()
    print(tokenizer.decode(output))
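

# Hypothetical sanity check, not in the original script: before running
# generation it can help to confirm that the converted state dict loads into a
# freshly built LlamaForCausalLM without missing or unexpected keys.
# `check_state_dict` is an assumed helper name; it only uses paths defined above.
def check_state_dict():
    config = LlamaConfig.from_pretrained(target_path)
    model = LlamaForCausalLM(config)
    state = torch.load(f"{target_path}pytorch_model.bin", map_location="cpu")
    missing, unexpected = model.load_state_dict(state, strict=False)
    print("missing keys:", missing)
    print("unexpected keys:", unexpected)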


if __name__ == "__main__":
    convert_pkl()
    test()