From faa909dcc396b1c3625708dc9a3e0d75ade3fe5f Mon Sep 17 00:00:00 2001 From: wql Date: Tue, 10 Sep 2024 15:38:33 +0800 Subject: [PATCH] add: add mindie file --- mindie/examples/README.md | 220 ++ mindie/examples/__init__.py | 0 mindie/examples/convert/__init__.py | 0 mindie/examples/convert/convert_utils.py | 27 + mindie/examples/convert/convert_weights.py | 41 + .../examples/convert/model_slim/__init__.py | 0 .../examples/convert/model_slim/boolq.jsonl | 50 + .../model_slim/get_calibration_dataset.py | 12 + .../examples/convert/model_slim/quantifier.py | 176 ++ .../convert/model_slim/sparse_compressor.py | 94 + .../model_slim/teacher_qualification.jsonl | 44 + mindie/examples/input.jsonl | 1 + mindie/examples/models/aquila/README.md | 181 ++ mindie/examples/models/aquila/run_fa.sh | 23 + mindie/examples/models/aquila/run_pa.sh | 24 + .../examples/models/atb_speed_sdk/README.md | 306 +++ .../atb_speed_sdk/atb_speed/__init__.py | 0 .../atb_speed/common/__init__.py | 0 .../atb_speed_sdk/atb_speed/common/config.py | 122 + .../atb_speed/common/cpu_binding.py | 178 ++ .../atb_speed/common/launcher/__init__.py | 12 + .../atb_speed/common/launcher/base.py | 244 ++ .../atb_speed/common/launcher/gpu.py | 57 + .../atb_speed/common/launcher/npu.py | 117 + .../atb_speed/common/log/__init__.py | 0 .../atb_speed/common/log/logging.py | 39 + .../log/multiprocess_logging_handler.py | 135 ++ .../atb_speed/common/performance/__init__.py | 0 .../atb_speed/common/performance/base.py | 231 ++ .../atb_speed/common/precision/__init__.py | 21 + .../atb_speed/common/precision/base.py | 256 +++ .../precision/ceval_subject_mapping.json | 262 +++ .../precision/mmlu_subject_mapping.json | 59 + .../atb_speed_sdk/atb_speed/common/timer.py | 101 + .../atb_speed_sdk/atb_speed/common/utils.py | 81 + mindie/examples/models/atb_speed_sdk/setup.py | 19 + .../test/sdk_ceval_config_test.py | 36 + .../test/sdk_perf_config_test.py | 32 + .../models/atb_speed_sdk/test/sdk_test.py | 40 + .../models/atb_speed_sdk/test/template.ini | 41 + .../models/atb_speed_sdk/test/test_config.py | 14 + .../models/atb_speed_sdk/test/test_timer.py | 49 + mindie/examples/models/baichuan/README.md | 302 +++ .../baichuan/quant_baichuan2_13b_w4a16.py | 208 ++ .../baichuan/quant_baichuan2_13b_w8a8.py | 197 ++ .../baichuan/quant_baichuan2_7b_w8a8.py | 746 +++++++ mindie/examples/models/baichuan/run_fa.sh | 23 + mindie/examples/models/baichuan/run_pa.sh | 20 + .../models/bge/large-zh-v1.5/README.md | 251 +++ .../models/bge/large-zh-v1.5/bin2onnx.py | 28 + .../models/bge/large-zh-v1.5/config_bge.json | 8 + .../bge/large-zh-v1.5/configuration_bert.py | 129 ++ .../models/bge/large-zh-v1.5/convert.sh | 42 + .../examples/models/bge/large-zh-v1.5/demo.py | 85 + .../models/bge/large-zh-v1.5/eval_cmteb.py | 304 +++ .../bge/large-zh-v1.5/eval_performance.py | 302 +++ .../models/bge/large-zh-v1.5/infer.py | 95 + .../examples/models/bge/large-zh-v1.5/main.py | 83 + .../bge/large-zh-v1.5/modeling_bert_ascend.py | 1982 +++++++++++++++++ .../models/bge/large-zh-v1.5/ops_info.json | 10 + .../models/bge/large-zh-v1.5/requirements.txt | 3 + .../models/bge/reranker-large/README.md | 251 +++ .../models/bge/reranker-large/config.json | 8 + .../models/bge/reranker-large/convert.sh | 39 + .../bge/reranker-large/eval_performance.py | 299 +++ .../bge/reranker-large/eval_precision.py | 351 +++ .../reranker-large/models/om/ops_info.json | 16 + .../reranker-large/models/pytorch/config.json | 39 + .../pytorch/configuration_xlm_roberta.py | 170 ++ 
.../models/pytorch/modeling_xlm_roberta_fa.py | 1899 ++++++++++++++++ .../bge/reranker-large/requirements.txt | 4 + .../examples/models/bge/reranker-large/run.py | 181 ++ mindie/examples/models/bloom/README.md | 138 ++ .../models/bloom/convert_quant_weights.py | 76 + mindie/examples/models/bloom/run_fa.sh | 37 + mindie/examples/models/bloom/run_pa.sh | 36 + .../examples/models/chatglm/v2_6b/README.md | 231 ++ .../models/chatglm/v2_6b/calib_data.jsonl | 15 + .../models/chatglm/v2_6b/generate_sparse.sh | 17 + .../chatglm/v2_6b/quant_chatglm_w4a16.py | 50 + .../chatglm/v2_6b/quant_chatglm_w8a8.py | 59 + .../models/chatglm/v2_6b/quant_utils.py | 62 + .../models/chatglm/v2_6b/run_300i_duo_pa.sh | 18 + .../models/chatglm/v2_6b/run_800i_a2_pa.sh | 27 + .../models/chatglm/v2_6b/web_demo.patch | 109 + .../models/chatglm/v2_6b/web_requirements.txt | 3 + .../examples/models/chatglm/v3_6b/README.md | 33 + .../examples/models/chinese_alpaca/README.md | 99 + .../examples/models/chinese_alpaca/run_pa.sh | 23 + .../examples/models/codegeex/v2_6b/README.md | 57 + .../codegeex/v2_6b/quant_codegeex2_6b_w8a8.py | 93 + mindie/examples/models/codellama/README.md | 172 ++ .../models/codellama/convert_quant_weights.py | 84 + .../models/codellama/humaneval_python.json | 7 + mindie/examples/models/codellama/run_pa.sh | 23 + mindie/examples/models/codeshell/README.md | 33 + .../models/deepseek/README_DeepSeek_Coder.md | 112 + .../models/deepseek/README_deepseek_llm.md | 101 + .../models/deepseek/README_deepseek_moe.md | 103 + mindie/examples/models/deepseek/run_pa.sh | 26 + .../models/deepseek/run_pa_deepseek_moe.sh | 26 + mindie/examples/models/gemma/README.md | 146 ++ mindie/examples/models/gemma/boolq.jsonl | 5 + .../gemma/convert_w8a8_quant_weights.py | 100 + mindie/examples/models/gemma/run_pa.sh | 19 + mindie/examples/models/gpt_neox/README.md | 328 +++ mindie/examples/models/gpt_neox/config.ini | 33 + .../models/gpt_neox/configuration_gpt_neox.py | 128 ++ .../models/gpt_neox/cut_model_and_run.sh | 21 + .../models/gpt_neox/cut_model_util.py | 100 + mindie/examples/models/gpt_neox/main.py | 125 ++ .../gpt_neox/modeling_gpt_neox_ascend.py | 1194 ++++++++++ .../models/gpt_neox/modeling_gpt_neox_cut.py | 761 +++++++ mindie/examples/models/gpt_neox/run_pa.sh | 18 + mindie/examples/models/internlm/README.md | 242 ++ mindie/examples/models/internlm/run_pa.sh | 19 + mindie/examples/models/llama/README.md | 312 +++ .../models/llama/convert_quant_weights.py | 85 + mindie/examples/models/llama/run_fa.sh | 19 + mindie/examples/models/llama/run_pa.sh | 19 + mindie/examples/models/llama3/README.md | 144 ++ .../models/llama3/convert_quant_weights.py | 125 ++ mindie/examples/models/llama3/run_pa.sh | 23 + mindie/examples/models/llava/README.md | 115 + mindie/examples/models/llava/llava.py | 370 +++ .../precision/GPU_NPU_result_example.json | 4 + .../llava/precision/clip_score_llava.py | 121 + .../models/llava/precision/run_coco_gpu.py | 74 + mindie/examples/models/llava/run_pa.sh | 81 + mindie/examples/models/minigpt4/README.md | 421 ++++ .../models/minigpt4/om_trans/eva_vit_model.py | 106 + .../models/minigpt4/om_trans/image_encoder.py | 9 + .../minigpt4/om_trans/onnx_model_export.py | 67 + .../minigpt4/performance/run_performance.sh | 52 + .../precision/GPU_NPU_result_example.json | 4 + .../minigpt4/precision/clip_score_minigpt4.py | 103 + .../precision/run_predict_walk_dir_gpu.py | 101 + .../models/minigpt4/predict/make_embeds.py | 83 + .../models/minigpt4/predict/run_predict.sh | 93 + 
mindie/examples/models/mistral/README.md | 129 ++ .../models/mistral/convert_quant_weights.py | 67 + mindie/examples/models/mistral/input.jsonl | 1 + mindie/examples/models/mistral/run_pa.sh | 29 + mindie/examples/models/mixtral/README.md | 110 + mindie/examples/models/mixtral/run_pa.sh | 25 + mindie/examples/models/phi3/README.md | 88 + mindie/examples/models/phi3/run_pa.sh | 28 + mindie/examples/models/qwen/README.md | 338 +++ .../models/qwen/convert_quant_weights.py | 64 + .../models/qwen/convert_quant_weights_14b.py | 66 + .../models/qwen/quant_qwen2_14b_w4a16_64.py | 63 + .../models/qwen/quant_qwen2_72b_w4a16_64.py | 56 + .../models/qwen/quant_qwen2_72b_w8a16_fast.py | 53 + .../models/qwen/quant_qwen_14b_w8a8.py | 339 +++ .../models/qwen/quant_qwen_72b_w8a16.py | 132 ++ mindie/examples/models/qwen/qwen.jinja | 10 + mindie/examples/models/qwen/run_fa.sh | 42 + mindie/examples/models/qwen/run_pa.sh | 43 + mindie/examples/models/qwen_vl/README.md | 107 + .../precision/GPU_NPU_result_example.json | 4 + .../qwen_vl/precision/clip_score_qwenvl.py | 121 + .../qwen_vl/precision/run_coco_rst_GPU.py | 62 + mindie/examples/models/qwen_vl/run_pa.py | 448 ++++ mindie/examples/models/qwen_vl/run_pa.sh | 20 + mindie/examples/models/stablelm/run_pa.sh | 21 + mindie/examples/models/starcoder/README.md | 171 ++ .../starcoder/convert_w8a8_quant_weights.py | 115 + .../models/starcoder/humaneval_python.txt | 10 + .../examples/models/starcoder/run_300i_duo.sh | 17 + .../models/starcoder/run_800i_a2_pa.sh | 15 + mindie/examples/models/starcoder2/README.md | 165 ++ .../starcoder2/convert_w8a8_quant_weights.py | 58 + .../models/starcoder2/humaneval_python.txt | 10 + .../models/starcoder2/run_800i_a2_pa.sh | 15 + mindie/examples/models/telechat/README.md | 97 + .../telechat/convert_w8a8_quant_weights.py | 112 + .../examples/models/telechat/run_300i_duo.sh | 17 + mindie/examples/models/vicuna/README.md | 100 + mindie/examples/models/vicuna/run_pa.sh | 23 + mindie/examples/models/vlmo/README.md | 264 +++ mindie/examples/models/yi/README.md | 121 + mindie/examples/models/yi/run_pa.sh | 23 + mindie/examples/models/ziya/README.md | 94 + mindie/examples/models/ziya/run_pa.sh | 21 + mindie/examples/run_fa.py | 231 ++ mindie/examples/run_pa.py | 363 +++ mindie/examples/server/__init__.py | 0 mindie/examples/server/batch.py | 156 ++ mindie/examples/server/cache.py | 153 ++ mindie/examples/server/example_text.csv | 10 + mindie/examples/server/example_token.csv | 10 + mindie/examples/server/generate.py | 227 ++ mindie/examples/server/request.py | 110 + 193 files changed, 24234 insertions(+) create mode 100644 mindie/examples/README.md create mode 100644 mindie/examples/__init__.py create mode 100644 mindie/examples/convert/__init__.py create mode 100644 mindie/examples/convert/convert_utils.py create mode 100644 mindie/examples/convert/convert_weights.py create mode 100644 mindie/examples/convert/model_slim/__init__.py create mode 100644 mindie/examples/convert/model_slim/boolq.jsonl create mode 100644 mindie/examples/convert/model_slim/get_calibration_dataset.py create mode 100644 mindie/examples/convert/model_slim/quantifier.py create mode 100644 mindie/examples/convert/model_slim/sparse_compressor.py create mode 100644 mindie/examples/convert/model_slim/teacher_qualification.jsonl create mode 100644 mindie/examples/input.jsonl create mode 100644 mindie/examples/models/aquila/README.md create mode 100644 mindie/examples/models/aquila/run_fa.sh create mode 100644 mindie/examples/models/aquila/run_pa.sh create mode 
100644 mindie/examples/models/atb_speed_sdk/README.md create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/__init__.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/__init__.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/config.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/cpu_binding.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/__init__.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/base.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/gpu.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/npu.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/log/__init__.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/log/logging.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/log/multiprocess_logging_handler.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/__init__.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/base.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/__init__.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/base.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/ceval_subject_mapping.json create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/mmlu_subject_mapping.json create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/timer.py create mode 100644 mindie/examples/models/atb_speed_sdk/atb_speed/common/utils.py create mode 100644 mindie/examples/models/atb_speed_sdk/setup.py create mode 100644 mindie/examples/models/atb_speed_sdk/test/sdk_ceval_config_test.py create mode 100644 mindie/examples/models/atb_speed_sdk/test/sdk_perf_config_test.py create mode 100644 mindie/examples/models/atb_speed_sdk/test/sdk_test.py create mode 100644 mindie/examples/models/atb_speed_sdk/test/template.ini create mode 100644 mindie/examples/models/atb_speed_sdk/test/test_config.py create mode 100644 mindie/examples/models/atb_speed_sdk/test/test_timer.py create mode 100644 mindie/examples/models/baichuan/README.md create mode 100644 mindie/examples/models/baichuan/quant_baichuan2_13b_w4a16.py create mode 100644 mindie/examples/models/baichuan/quant_baichuan2_13b_w8a8.py create mode 100644 mindie/examples/models/baichuan/quant_baichuan2_7b_w8a8.py create mode 100644 mindie/examples/models/baichuan/run_fa.sh create mode 100644 mindie/examples/models/baichuan/run_pa.sh create mode 100644 mindie/examples/models/bge/large-zh-v1.5/README.md create mode 100644 mindie/examples/models/bge/large-zh-v1.5/bin2onnx.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/config_bge.json create mode 100644 mindie/examples/models/bge/large-zh-v1.5/configuration_bert.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/convert.sh create mode 100644 mindie/examples/models/bge/large-zh-v1.5/demo.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/eval_cmteb.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/eval_performance.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/infer.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/main.py create mode 100644 
mindie/examples/models/bge/large-zh-v1.5/modeling_bert_ascend.py create mode 100644 mindie/examples/models/bge/large-zh-v1.5/ops_info.json create mode 100644 mindie/examples/models/bge/large-zh-v1.5/requirements.txt create mode 100644 mindie/examples/models/bge/reranker-large/README.md create mode 100644 mindie/examples/models/bge/reranker-large/config.json create mode 100644 mindie/examples/models/bge/reranker-large/convert.sh create mode 100644 mindie/examples/models/bge/reranker-large/eval_performance.py create mode 100644 mindie/examples/models/bge/reranker-large/eval_precision.py create mode 100644 mindie/examples/models/bge/reranker-large/models/om/ops_info.json create mode 100644 mindie/examples/models/bge/reranker-large/models/pytorch/config.json create mode 100644 mindie/examples/models/bge/reranker-large/models/pytorch/configuration_xlm_roberta.py create mode 100644 mindie/examples/models/bge/reranker-large/models/pytorch/modeling_xlm_roberta_fa.py create mode 100644 mindie/examples/models/bge/reranker-large/requirements.txt create mode 100644 mindie/examples/models/bge/reranker-large/run.py create mode 100644 mindie/examples/models/bloom/README.md create mode 100644 mindie/examples/models/bloom/convert_quant_weights.py create mode 100644 mindie/examples/models/bloom/run_fa.sh create mode 100644 mindie/examples/models/bloom/run_pa.sh create mode 100644 mindie/examples/models/chatglm/v2_6b/README.md create mode 100644 mindie/examples/models/chatglm/v2_6b/calib_data.jsonl create mode 100644 mindie/examples/models/chatglm/v2_6b/generate_sparse.sh create mode 100644 mindie/examples/models/chatglm/v2_6b/quant_chatglm_w4a16.py create mode 100644 mindie/examples/models/chatglm/v2_6b/quant_chatglm_w8a8.py create mode 100644 mindie/examples/models/chatglm/v2_6b/quant_utils.py create mode 100644 mindie/examples/models/chatglm/v2_6b/run_300i_duo_pa.sh create mode 100644 mindie/examples/models/chatglm/v2_6b/run_800i_a2_pa.sh create mode 100644 mindie/examples/models/chatglm/v2_6b/web_demo.patch create mode 100644 mindie/examples/models/chatglm/v2_6b/web_requirements.txt create mode 100644 mindie/examples/models/chatglm/v3_6b/README.md create mode 100644 mindie/examples/models/chinese_alpaca/README.md create mode 100644 mindie/examples/models/chinese_alpaca/run_pa.sh create mode 100644 mindie/examples/models/codegeex/v2_6b/README.md create mode 100644 mindie/examples/models/codegeex/v2_6b/quant_codegeex2_6b_w8a8.py create mode 100644 mindie/examples/models/codellama/README.md create mode 100644 mindie/examples/models/codellama/convert_quant_weights.py create mode 100644 mindie/examples/models/codellama/humaneval_python.json create mode 100644 mindie/examples/models/codellama/run_pa.sh create mode 100644 mindie/examples/models/codeshell/README.md create mode 100644 mindie/examples/models/deepseek/README_DeepSeek_Coder.md create mode 100644 mindie/examples/models/deepseek/README_deepseek_llm.md create mode 100644 mindie/examples/models/deepseek/README_deepseek_moe.md create mode 100644 mindie/examples/models/deepseek/run_pa.sh create mode 100644 mindie/examples/models/deepseek/run_pa_deepseek_moe.sh create mode 100644 mindie/examples/models/gemma/README.md create mode 100644 mindie/examples/models/gemma/boolq.jsonl create mode 100644 mindie/examples/models/gemma/convert_w8a8_quant_weights.py create mode 100644 mindie/examples/models/gemma/run_pa.sh create mode 100644 mindie/examples/models/gpt_neox/README.md create mode 100644 mindie/examples/models/gpt_neox/config.ini create mode 100644 
mindie/examples/models/gpt_neox/configuration_gpt_neox.py create mode 100644 mindie/examples/models/gpt_neox/cut_model_and_run.sh create mode 100644 mindie/examples/models/gpt_neox/cut_model_util.py create mode 100644 mindie/examples/models/gpt_neox/main.py create mode 100644 mindie/examples/models/gpt_neox/modeling_gpt_neox_ascend.py create mode 100644 mindie/examples/models/gpt_neox/modeling_gpt_neox_cut.py create mode 100644 mindie/examples/models/gpt_neox/run_pa.sh create mode 100644 mindie/examples/models/internlm/README.md create mode 100644 mindie/examples/models/internlm/run_pa.sh create mode 100644 mindie/examples/models/llama/README.md create mode 100644 mindie/examples/models/llama/convert_quant_weights.py create mode 100644 mindie/examples/models/llama/run_fa.sh create mode 100644 mindie/examples/models/llama/run_pa.sh create mode 100644 mindie/examples/models/llama3/README.md create mode 100644 mindie/examples/models/llama3/convert_quant_weights.py create mode 100644 mindie/examples/models/llama3/run_pa.sh create mode 100644 mindie/examples/models/llava/README.md create mode 100644 mindie/examples/models/llava/llava.py create mode 100644 mindie/examples/models/llava/precision/GPU_NPU_result_example.json create mode 100644 mindie/examples/models/llava/precision/clip_score_llava.py create mode 100644 mindie/examples/models/llava/precision/run_coco_gpu.py create mode 100644 mindie/examples/models/llava/run_pa.sh create mode 100644 mindie/examples/models/minigpt4/README.md create mode 100644 mindie/examples/models/minigpt4/om_trans/eva_vit_model.py create mode 100644 mindie/examples/models/minigpt4/om_trans/image_encoder.py create mode 100644 mindie/examples/models/minigpt4/om_trans/onnx_model_export.py create mode 100644 mindie/examples/models/minigpt4/performance/run_performance.sh create mode 100644 mindie/examples/models/minigpt4/precision/GPU_NPU_result_example.json create mode 100644 mindie/examples/models/minigpt4/precision/clip_score_minigpt4.py create mode 100644 mindie/examples/models/minigpt4/precision/run_predict_walk_dir_gpu.py create mode 100644 mindie/examples/models/minigpt4/predict/make_embeds.py create mode 100644 mindie/examples/models/minigpt4/predict/run_predict.sh create mode 100644 mindie/examples/models/mistral/README.md create mode 100644 mindie/examples/models/mistral/convert_quant_weights.py create mode 100644 mindie/examples/models/mistral/input.jsonl create mode 100644 mindie/examples/models/mistral/run_pa.sh create mode 100644 mindie/examples/models/mixtral/README.md create mode 100644 mindie/examples/models/mixtral/run_pa.sh create mode 100755 mindie/examples/models/phi3/README.md create mode 100755 mindie/examples/models/phi3/run_pa.sh create mode 100644 mindie/examples/models/qwen/README.md create mode 100644 mindie/examples/models/qwen/convert_quant_weights.py create mode 100644 mindie/examples/models/qwen/convert_quant_weights_14b.py create mode 100644 mindie/examples/models/qwen/quant_qwen2_14b_w4a16_64.py create mode 100644 mindie/examples/models/qwen/quant_qwen2_72b_w4a16_64.py create mode 100644 mindie/examples/models/qwen/quant_qwen2_72b_w8a16_fast.py create mode 100644 mindie/examples/models/qwen/quant_qwen_14b_w8a8.py create mode 100644 mindie/examples/models/qwen/quant_qwen_72b_w8a16.py create mode 100644 mindie/examples/models/qwen/qwen.jinja create mode 100644 mindie/examples/models/qwen/run_fa.sh create mode 100644 mindie/examples/models/qwen/run_pa.sh create mode 100644 mindie/examples/models/qwen_vl/README.md create mode 100644 
mindie/examples/models/qwen_vl/precision/GPU_NPU_result_example.json create mode 100644 mindie/examples/models/qwen_vl/precision/clip_score_qwenvl.py create mode 100644 mindie/examples/models/qwen_vl/precision/run_coco_rst_GPU.py create mode 100644 mindie/examples/models/qwen_vl/run_pa.py create mode 100644 mindie/examples/models/qwen_vl/run_pa.sh create mode 100644 mindie/examples/models/stablelm/run_pa.sh create mode 100644 mindie/examples/models/starcoder/README.md create mode 100644 mindie/examples/models/starcoder/convert_w8a8_quant_weights.py create mode 100644 mindie/examples/models/starcoder/humaneval_python.txt create mode 100644 mindie/examples/models/starcoder/run_300i_duo.sh create mode 100644 mindie/examples/models/starcoder/run_800i_a2_pa.sh create mode 100644 mindie/examples/models/starcoder2/README.md create mode 100644 mindie/examples/models/starcoder2/convert_w8a8_quant_weights.py create mode 100644 mindie/examples/models/starcoder2/humaneval_python.txt create mode 100644 mindie/examples/models/starcoder2/run_800i_a2_pa.sh create mode 100644 mindie/examples/models/telechat/README.md create mode 100644 mindie/examples/models/telechat/convert_w8a8_quant_weights.py create mode 100644 mindie/examples/models/telechat/run_300i_duo.sh create mode 100644 mindie/examples/models/vicuna/README.md create mode 100644 mindie/examples/models/vicuna/run_pa.sh create mode 100644 mindie/examples/models/vlmo/README.md create mode 100644 mindie/examples/models/yi/README.md create mode 100644 mindie/examples/models/yi/run_pa.sh create mode 100644 mindie/examples/models/ziya/README.md create mode 100644 mindie/examples/models/ziya/run_pa.sh create mode 100644 mindie/examples/run_fa.py create mode 100644 mindie/examples/run_pa.py create mode 100644 mindie/examples/server/__init__.py create mode 100644 mindie/examples/server/batch.py create mode 100644 mindie/examples/server/cache.py create mode 100644 mindie/examples/server/example_text.csv create mode 100644 mindie/examples/server/example_token.csv create mode 100644 mindie/examples/server/generate.py create mode 100644 mindie/examples/server/request.py
diff --git a/mindie/examples/README.md b/mindie/examples/README.md
new file mode 100644
index 00000000..4a774f69
--- /dev/null
+++ b/mindie/examples/README.md
@@ -0,0 +1,220 @@
+# README
+
+- This README describes the scripts shared by all models and how to use them.
+
+## Path variables
+| Variable | Meaning |
+|--------|--------------------------------------------------|
+| working_dir | Directory where the acceleration library and the model repository are placed after download |
+| llm_path | Path of the model repository. With the pre-built package it is `${working_dir}/MindIE-LLM/`; with code downloaded from gitee it is `${working_dir}/MindIE-LLM/examples/atb_models` |
+| weight_path | Path of the model weights |
+| w8a8s_weight_path | Path of the sparse-quantized weights |
+| w8a8sc_weight_path | Path of the sparse-quantized weights after splitting and compression |
+| cur_dir | Directory from which the command or script is run (the current directory) |
+
+## Weights
+
+### Weight settings
+- The `${weight_path}/config.json` file must carry the `torch_dtype` and `quantize` fields, which identify the precision and quantization type of the weights.
+  - If the `torch_dtype` and `quantize` fields are missing, add them (a minimal sketch follows the examples below).
+
+- Configuration
+  | Quantization type and precision | torch_dtype | quantize |
+  |----------------|-------------|----------|
+  | FP16 | "float16" | none |
+  | BF16 | "bfloat16" | none |
+  | W8A8 | "float16" | "w8a8" |
+  | W8A8S | "float16" | "w8a8s" |
+  | W8A8SC | "float16" | "w8a8sc" |
+  | W8A16 | "float16" | "w8a16" |
+
+- Examples
+  - LLaMa weights in BF16 precision, not quantized
+    ```json
+    {
+      "architectures": [
+        "LlamaForCausalLM"
+      ],
+      ...
+      "torch_dtype": "bfloat16",
+      ...
+    }
+    ```
+  - LLaMa weights in FP16 precision with W8A16 quantization
+    ```json
+    {
+      "architectures": [
+        "LlamaForCausalLM"
+      ],
+      ...
+      "torch_dtype": "float16",
+      ...
+      "quantize": "w8a16",
+    }
+    ```
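+- The snippet below is a minimal sketch (the path is a placeholder) of adding the two fields to an existing `config.json` with plain Python; the `modify_config` helper in `examples/convert/convert_utils.py` from this patch does the same job for the conversion scripts.
+  ```python
+  import json
+
+  config_file = "/path/to/weights/config.json"  # placeholder for ${weight_path}/config.json
+
+  with open(config_file, "r", encoding="utf-8") as f:
+      config = json.load(f)
+
+  # Add or overwrite the two fields described above (FP16 + W8A16 in this sketch).
+  config["torch_dtype"] = "float16"
+  config["quantize"] = "w8a16"
+
+  with open(config_file, "w", encoding="utf-8") as f:
+      json.dump(config, f, indent=4)
+  ```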
+ "quantize": "w8a16", + } + ``` + +### 权重转换 +> 当前仅支持加载safetensor格式的权重文件 +> 若下载的权重文件中已有safetensor格式的文件,则无需进行权重转换 +> 若环境中仅有bin格式的权重文件,请按照如下方式进行转换 +> 若当前环境不存在模型权重,请至hugging face官网下载 +- 使用`${llm_path}/examples/convert/convert_weights.py`将bin转成safetensor格式 +- 示例 + ```shell + cd ${llm_path} + python examples/convert/convert_weights.py --model_path ${weight_path} + ``` + - 注意:必须先进入`${llm_path}`路径下执行以上命令,否则由于脚本中存在相对路径,会导致moudle not found的问题 +- 输出结果会保存在bin权重同目录下 + +### NPU多卡量化 +- 环境要求 + - 硬件环境:910A或910B环境 + - Pytorch、PTA配套在2.1版本以上 + - CANN >= 8.0.RC2.B010 + - accelerate >= 0.28.0 + - 关闭虚拟内存:`PYTORCH_NPU_ALLOC_CONF`环境变量需设置为`expandable_segments:False`(虚拟内存默认关闭) +- 调用`${llm_path}/examples/convert/model_slim/quantifier.py`脚本时,`--device_type`参数需设置为`npu` +- 参数配置和运行指令见各模型README文件 + +### 稀疏量化权重生成 +- Step 1:生成稀疏量化权重 + ```shell + cd ${llm_path} + python -m examples.convert.model_slim.quantifier --model_path ${weight_path} --save_directory ${w8a8s_weight_path} --w_bit 4 --a_bit 8 --calib_dataset_type TeacherQualification --fraction 0.011 --co_sparse True + ``` + - 参数配置以模型README文件中的描述为准 +- Step 2:量化权重切分及压缩 + ```shell + torchrun --nproc_per_node {TP数} -m examples.convert.model_slim.sparse_compressor --model_path ${w8a8s_weight_path} --save_directory ${w8a8sc_weight_path} + ``` + - TP数为tensor parallel并行个数 + - 注意:若权重生成时以TP=4进行切分,则运行时也需以TP=4运行 + - 示例 + ```shell + torchrun --nproc_per_node 2 -m examples.convert.model_slim.sparse_compressor --model_path /data1/weights/model_slim/llama2-7b_w8a8s --save_directory /data1/weights/model_slim/llama2-7b_w8a8sc_temp + ``` + +## 启动脚本 +- Flash Attention的启动脚本路径为`${llm_path}/examples/run_fa.py` +- Page Attention的启动脚本路径为`${llm_path}/examples/run_pa.py` + +### 启动脚本相关环境变量 + - `ASCEND_RT_VISIBLE_DEVICES` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心编号需要通过 npu-smi info 指令查阅 + - Atlas 800I A2服务器需基于输出的 NPU 列查阅 + ![npu_smi_info](../images/npu_smi_info_800i_a2.png) + - Atlas 300I DUO服务器需基于输出的 Device 列查阅 + ![npu_smi_info](../images/npu_smi_info_300i_duo.png) + - 若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - `BIND_CPU` + - 绑定CPU核心开关 + - 设置为1进行绑核,设置为0则不绑核;默认进行绑核 + - 若当前机器未设置NUMA或绑核失败,可将 BIND_CPU 设为 0 + - `PROFILING_LEVEL` + - 设置ProfilerLevel,默认为0 + - `ATB_PROFILING_ENABLE` + - 是否落性能profiling文件 + - 设置为1生成profiling文件,设置为0则不生成;默认不生成profiling文件 + - `PROFILING_FILEPATH` + - (若生成profiling文件)profiling文件的路径 + - 默认为`${cur_dir}/profiling` + - `ATB_LLM_BENCHMARK_ENABLE` + - 是否统计端到端和各token的性能数据 + - 设置为1统计耗时,设置为0则不统计;默认不统计 + - `ATB_LLM_BENCHMARK_FILEPATH` + - 性能数据的保存路径 + - 默认为`${cur_dir}/benchmark_result/benchmark.csv` + - `ATB_LLM_LCOC_ENABLE` + - 是否开启通信计算掩盖功能 + - 在Prefill阶段开启通信计算掩盖会提升性能 + - `ATB_LLM_LOGITS_SAVE_ENABLE` + - 是否保存每个token的logits,每个logits会保存成一个单独的pth文件 + - 设置为1保存,设置为0则不保存;默认不保存 + - `ATB_LLM_LOGITS_SAVE_FOLDER` + - logits保存路径 + - 默认为`${cur_dir}` + - `ATB_LLM_TOKEN_IDS_SAVE_ENABLE` + - 是否保存每个token的id,输入和输出token会单独保存成两个文件 + - 设置为1保存,设置为0则不保存;默认不保存 + - `ATB_LLM_TOKEN_IDS_SAVE_FOLDER` + - token id保存路径 + - 默认为`${cur_dir}` + +### run_fa.py脚本参数 +- `--model_path` + - 模型权重路径 +- `--input_text` + - 输入问题 + - 支持字符串列表或者字符串 + - 若此值为字符串,则构造推理输入时会基于batch size入参复制多份 + - 若此值为列表,则构造推理输入时会忽略batch size入参,真实的batch size为此列表实际长度 +- `--max_input_length` + - 最大输入长度 + - 默认512个token + - 若输入长度不足512个token,会自动使用padding补齐 +- `--max_output_length` + - 最大输出长度 + - - 默认输出20个token +- `--batch_size` + - 推理时固定的batch数量 + - 默认单batch +- `--is_flash_causal_lm` + - 是否使用Paged Attention,默认不使用 +- 示例 + ```shell + # 使用多卡运行Flash Attention,设置模型权重路径,设置输出长度为2048个token,精度使用BF16 + torchrun --nproc_per_node 2 --master_port 20038 -m examples.run_fa 
+
+### Special scenarios
+- Multiple users on a single machine
+  - On 300I DUO and 800I A2, the communication operators talk to each other through shared memory, so in multi-user scenarios every user must set the following environment variable to keep the shared memory segments apart:
+  ```shell
+  export ATB_SHARE_MEMORY_NAME_SUFFIX="user1"
+  ```
+  - Multi-user example: if a 300I DUO machine has 4 cards and each card runs its own inference task, the variable above must be given a different value per task, e.g. `user1`, `user2`.
+- On 300I DUO cards the following environment variable must be enabled:
+  ```shell
+  export INT8_FORMAT_NZ_ENABLE=1
+  ```
diff --git a/mindie/examples/__init__.py b/mindie/examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mindie/examples/convert/__init__.py b/mindie/examples/convert/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mindie/examples/convert/convert_utils.py b/mindie/examples/convert/convert_utils.py
new file mode 100644
index 00000000..84eb71bd
--- /dev/null
+++ b/mindie/examples/convert/convert_utils.py
@@ -0,0 +1,27 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+import json
+import os.path
+import shutil
+from atb_llm.utils.file_utils import safe_open
+
+
+def copy_tokenizer_files(model_dir, dest_dir):
+    os.makedirs(dest_dir, exist_ok=True)
+    for filename in os.listdir(model_dir):
+        if 'tokenizer' in filename or 'tokenization' in filename or 'special_token_map' in filename:
+            src_filepath = os.path.join(model_dir, filename)
+            dest_filepath = os.path.join(dest_dir, filename)
+            shutil.copyfile(src_filepath, dest_filepath)
+
+
+def modify_config(model_dir, dest_dir, torch_dtype, quantize_type, kv_quant_type=False):
+    src_config_filepath = os.path.join(model_dir, 'config.json')
+    with open(src_config_filepath, 'r', encoding='utf-8') as fr:
+        data = json.load(fr)
+    data['torch_dtype'] = str(torch_dtype).split(".")[1]
+    data['quantize'] = quantize_type
+    if kv_quant_type:
+        data['kv_quant'] = "C8"  # only the C8 quantization type is currently supported for the kv cache
+    dest_config_filepath = os.path.join(dest_dir, 'config.json')
+    with safe_open(dest_config_filepath, 'w', encoding='utf-8', is_exist_ok=False) as fw:
+        json.dump(data, fw, indent=4)
\ No newline at end of file
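A minimal usage sketch of the two helpers above, assuming the snippet is run from `${llm_path}` and using placeholder directories:

```python
# Sketch: copy tokenizer files into a quantized-weight directory and update its config.json.
import torch

from examples.convert.convert_utils import copy_tokenizer_files, modify_config

float_weight_dir = "/data/models/llama2-7b"         # original ${weight_path} (placeholder)
quant_weight_dir = "/data/models/llama2-7b_w8a16"   # quantized output directory (placeholder)

# Carry the tokenizer files over to the quantized weight directory,
# then stamp its config.json with the dtype and quantize type.
copy_tokenizer_files(float_weight_dir, quant_weight_dir)
modify_config(float_weight_dir, quant_weight_dir, torch.float16, "w8a16")
```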
diff --git a/mindie/examples/convert/convert_weights.py b/mindie/examples/convert/convert_weights.py
new file mode 100644
index 00000000..51300b8e
--- /dev/null
+++ b/mindie/examples/convert/convert_weights.py
@@ -0,0 +1,41 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+import argparse
+
+from atb_llm.utils.convert import convert_files
+from atb_llm.utils.hub import weight_files
+from atb_llm.utils.log import logger
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', help="model and tokenizer path")
+    return parser.parse_args()
+
+
+def convert_bin2st(model_path):
+    local_pt_files = weight_files(model_path, revision=None, extension=".bin")
+    local_st_files = [
+        # drop the "pytorch_" prefix (str.lstrip would strip characters, not the prefix)
+        p.parent / f"{p.stem.removeprefix('pytorch_')}.safetensors"
+        for p in local_pt_files
+    ]
+    convert_files(local_pt_files, local_st_files, discard_names=[])
+    _ = weight_files(model_path)
+
+
+def convert_bin2st_from_pretrained(model_path):
+    from transformers import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_path,
+        low_cpu_mem_usage=True,
+        torch_dtype="auto")
+    model.save_pretrained(model_path, safe_serialization=True)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+
+    try:
+        convert_bin2st(args.model_path)
+    except RuntimeError:
+        logger.warning('converting weights with torch.load failed; loading the full model to convert instead')
+        convert_bin2st_from_pretrained(args.model_path)
\ No newline at end of file
diff --git a/mindie/examples/convert/model_slim/__init__.py b/mindie/examples/convert/model_slim/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mindie/examples/convert/model_slim/boolq.jsonl b/mindie/examples/convert/model_slim/boolq.jsonl
new file mode 100644
index 00000000..993a08ac
--- /dev/null
+++ b/mindie/examples/convert/model_slim/boolq.jsonl
@@ -0,0 +1,50 @@
+{"id": 0, "inputs_pretokenized": "Ghost in the Shell -- Animation studio Production I.G has produced several different anime adaptations of Ghost in the Shell, starting with the 1995 film of the same name, telling the story of Section 9's investigation of the Puppet Master. The television series Ghost in the Shell: Stand Alone Complex followed in 2002, telling an alternate story from the manga and first film, featuring Section 9's investigations of government corruption in the Laughing Man and Individual Eleven incidents. A sequel to the 1995 film, Ghost in the Shell 2: Innocence, was released in 2004. In 2006, the film Ghost in the Shell: Stand Alone Complex - Solid State Society retook the story of the television series. 2013 saw the start of the Ghost in the Shell: Arise original video animation (OVA) series, consisting of four parts through mid-2014. The series was recompiled in early 2015 as a television series titled Ghost in the Shell: Arise - Alternative Architecture, airing with an additional two episodes (one part). An animated feature film produced by most of the Arise staff, titled Ghost in the Shell: The New Movie, was released on June 20, 2015. A live-action American film of the same name was released on March 31, 2017.\nQuestion: is ghost in the shell based on the anime?\nAnswer:"}
+{"id": 1, "inputs_pretokenized": "The Walking Dead (season 8) -- The eighth season of The Walking Dead, an American post-apocalyptic horror television series on AMC, premiered on October 22, 2017, and concluded on April 15, 2018, consisting of 16 episodes. Developed for television by Frank Darabont, the series is based on the eponymous series of comic books by Robert Kirkman, Tony Moore, and Charlie Adlard. The executive producers are Kirkman, David Alpert, Scott M. Gimple, Greg Nicotero, Tom Luse, and Gale Anne Hurd, with Gimple as showrunner for his fifth and final season.
The eighth season received positive reviews from critics. It was nominated for multiple awards and won two, including Best Horror Television Series for the third consecutive year, at the 44th Saturn Awards.\nQuestion: is there gonna be a season 8 of the walking dead?\nAnswer:"} +{"id": 2, "inputs_pretokenized": "Onyx -- Brazilian green onyx was often used as plinths for art deco sculptures created in the 1920s and 1930s. The German sculptor Ferdinand Preiss used Brazilian green onyx for the base on the majority of his chryselephantine sculptures. Green onyx was also used for trays and pin dishes -- produced mainly in Austria -- often with small bronze animals or figures attached.\nQuestion: is there such a thing as green onyx?\nAnswer:"} +{"id": 3, "inputs_pretokenized": "Wachovia -- The acquisition of Wachovia by Wells Fargo was completed on December 31, 2008 after a government-forced sale to avoid Wachovia's failure. The Wachovia brand was absorbed into the Wells Fargo brand in a process that lasted three years: on October 15, 2011, the last Wachovia branches in North Carolina were converted to Wells Fargo.\nQuestion: is wells fargo and wachovia the same bank?\nAnswer:"} +{"id": 4, "inputs_pretokenized": "Friday Night Lights (film) -- Friday Night Lights is a 2004 American sports drama film, directed by Peter Berg, which 'dramatized' the coach and players of a high school football team in the Texas city of Odessa that supported and was obsessed with them. The book on which it was based, Friday Night Lights: A Town, a Team, and a Dream (1990) by H.G. Bissinger, followed the story of the 1988 Permian High School Panthers football team as they made a run towards the state championship. A television series of the same name premiered on October 3, 2006 on NBC. The film won the Best Sports Movie ESPY Award and was ranked number 37 on Entertainment Weekly's list of the Best High School Movies.\nQuestion: is friday night lights movie based on a true story?\nAnswer:"} +{"id": 5, "inputs_pretokenized": "Peace bond -- The use of peace bonds is rather uncommon in the U.S. justice system, but a deferred prosecution has a similar effect. Since there is no conviction or admission of any guilt, signing a peace bond in Canada does not usually result in U.S. inadmissibility under INA \u00a7 212 (a) (2).\nQuestion: is a peace bond an admission of guilt?\nAnswer:"} +{"id": 6, "inputs_pretokenized": "Eating mucus -- Mucophagy, despite its benefits on one's immunity, comes with some health risks due to the potential physical aggravation resulting from the action of nose picking, and the germs on fingers and in mucus. Picking one's nose can cause upper airway irritation as well as other injuries including nasal septal perforation (a ``through-and-through defect'' of the cartilage separating the nostrils), and epistaxis (nosebleed). In a study by Andrade and Srihari, 25% of subjects were ailed by nose bleeds, 17% with nasal infections, and 2% with damage more serious than bleeding. W. Buzina studied the fungal diversity in nasal mucus in 2003. 104 samples were gathered with 331 identifiable strains of fungi and 9 different species per patient.\nQuestion: does eating your boogers improve your immune system?\nAnswer:"} +{"id": 7, "inputs_pretokenized": "High-altitude flatus expulsion -- High-altitude flatus expulsion (HAFE) is a gastrointestinal syndrome which involves the spontaneous passage of increased quantities of rectal gases at high altitudes. First described by Joseph Hamel in c. 
1820 and occasionally described afterward, a landmark study of this phenomenon was published in 1981 by Paul Auerbach and York Miller.\nQuestion: do you have more gas at higher altitudes?\nAnswer:"} +{"id": 8, "inputs_pretokenized": "Big Boss (Metal Gear) -- Big Boss is one of the central characters in the Metal Gear video game series. He was introduced in the original Metal Gear games for the MSX2 as the commanding officer and subsequent nemesis of Solid Snake. He is later featured as Naked Snake, the protagonist of Metal Gear Solid prequels where he is initially depicted as an American Special Forces Operator and decorated war hero until political manipulations cause him to be disillusioned and start his own private mercenary company. Big Boss's character has been praised by video game publications for his role as a villain as well for his relationship with Solid Snake. As the series' chronology progressed, his exact allegiance and motivations became increasingly complex; his first appearances are depicted as a traitor dreaming of a world of perpetual war, but subsequent appearances have revealed him to be a key figure in an ideological dispute that shaped the latter half of the twentieth century and a man whose conscience was disturbed by the attitude of leaders towards soldiers, prompting his decision to become a soldier of fortune and Venom Snake's mental template.\nQuestion: is solid snake and big boss the same person?\nAnswer:"} +{"id": 9, "inputs_pretokenized": "Jessie (2011 TV series) -- After casting was finalized and changes were made to several of the characters to suit the actors chosen, the series skipped the pilot phase and was put directly into production. Filming began in June 2011 on Stage 3/8 at Hollywood Center Studios which, prior to start of production, served as the sound stage where the Disney Channel series Wizards of Waverly Place was taped. 13 episodes were originally ordered for the first season, but while the show's first season was in production, Disney Channel ordered an additional seven episodes, bringing the total number of episodes for the first season to 20. When asked about the atmosphere on set during an interview with MSN TV, Ryan described her relationship with the young cast: ``I definitely feel like a nanny! They are smart kids, but they're real kids. They like to have fun. 
My policy is: We can play hard, as long as we work hard, and because we work hard, we need to play hard.'' Filming on the series wrapped on February 22, 2015.\nQuestion: is the show jessie filmed in new york?\nAnswer:"} +{"id": 10, "inputs_pretokenized": "Song of Songs -- The Song of Songs, also Song of Solomon or Canticles (Hebrew: \u05e9\u05b4\u05c1\u05d9\u05e8 \u05d4\u05b7\u05e9\u05b4\u05bc\u05c1\u05d9\u05e8\u05b4\u05d9\u05dd\u202c, \u0160\u00eer Ha\u0161\u0160\u00eer\u00eem, Greek: \u1f8e\u03c3\u03bc\u03b1 \u1f8e\u03c3\u03bc\u03ac\u03c4\u03c9\u03bd, asma asmaton, both meaning Song of Songs), is one of the megillot (scrolls) found in the last section of the Tanakh, known as the Ketuvim (or ``Writings''), and a book of the Old Testament.\nQuestion: is the song of songs the same as the song of solomon?\nAnswer:"} +{"id": 11, "inputs_pretokenized": "Northwest Florida State College -- The school voted to change its name to Okaloosa-Walton Community College in 1988, and gained four-year status in 2003, thus changing its name to Okaloosa-Walton College.\nQuestion: is northwest florida state college a 4 year college?\nAnswer:"} +{"id": 12, "inputs_pretokenized": "A Quiet Place (film) -- A Quiet Place is a production of Sunday Night and Platinum Dunes; it was produced on a budget of $17 million. Krasinski wrote the screenplay with story co-writers Scott Beck and Bryan Woods. Beck and Woods grew up together in the US state of Iowa, and had watched numerous silent films in college. By 2013, they began working on the story that would lead to the film. They used their experience growing up close to farmland as the basis, including a grain silo setting as a place considered dangerous in their upbringing. They initiated their approach with a 15-page proof of concept. Initially, the writers had considered developing the film into a Cloverfield installment, but after pitching their ideas to the studio collectively, all of those involved decided to keep the film its own entity.\nQuestion: is the movie the quiet place based on a book?\nAnswer:"} +{"id": 13, "inputs_pretokenized": "2018 FIFA World Cup qualification \u2013 UEFA Group G -- The group winners, Spain, qualified directly for the 2018 FIFA World Cup. The group runners-up, Italy, advanced to the play-offs as one of the best 8 runners-up, where they lost to Sweden and thus failed to qualify for the first time since 1958.\nQuestion: did spain qualify for the 2018 world cup?\nAnswer:"} +{"id": 14, "inputs_pretokenized": "Red squirrel -- The eastern grey squirrel and the red squirrel are not directly antagonistic, and violent conflict between these species is not a factor in the decline in red squirrel populations. However, the eastern grey squirrel appears to be able to decrease the red squirrel population due to several reasons:\nQuestion: are grey and red squirrels the same species?\nAnswer:"} +{"id": 15, "inputs_pretokenized": "Bermuda -- Bermuda is a group of low-forming volcanoes in the Atlantic Ocean, near the western edge of the Sargasso Sea, roughly 578 nautical miles (1,070 km; 665 mi) east-southeast of Cape Hatteras on the Outer Banks of North Carolina and about 594 nautical miles (1,100 km; 684 mi) southeast of Martha's Vineyard of Massachusetts. It is 898 nautical miles (1,663 km; 1,033 mi) northeast of Miami, Florida, and 667 nautical miles (1,235 km; 768 mi) from Cape Sable Island, in Nova Scotia, Canada. 
The islands lie due east of Fripp Island, South Carolina, west-northwest of Cape Verde, southeast of New York City, New York, north-northwest of Brazil and 1,759 km (1,093 mi) north of Cuba.\nQuestion: is bermuda off the coast of south carolina?\nAnswer:"} +{"id": 16, "inputs_pretokenized": "The People's Court -- The losing party does not actually need to pay the judgment, as such. Instead (as is stated in the disclaimer at the end of each show), both parties are paid from a fund (set up by Ralph Edwards-Stu Billett Productions). This fund was based on the amount of the lawsuit claim, but an exact formula was not stated. The fund was to be first divided equally, then any monetary judgment ordered was subtracted from the loser's half (and presumably both halves in the case of cross judgments). Each litigant received at least what remained of their half in shows concluding with that disclaimer.\nQuestion: do litigants on people's court get paid?\nAnswer:"} +{"id": 17, "inputs_pretokenized": "Texas -- Texas (/\u02c8t\u025bks\u0259s/, locally /-s\u0259z/; Spanish: Texas or Tejas (\u02c8texas)) is the second largest state in the United States by both area and population. Geographically located in the South Central region of the country, Texas shares borders with the U.S. states of Louisiana to the east, Arkansas to the northeast, Oklahoma to the north, New Mexico to the west, and the Mexican states of Chihuahua, Coahuila, Nuevo Le\u00f3n, and Tamaulipas to the southwest, while the Gulf of Mexico is to the southeast.\nQuestion: is texas the biggest state in the us?\nAnswer:"} +{"id": 18, "inputs_pretokenized": "The Adventures of Tintin (film) -- Spielberg acquired rights to produce a film based on The Adventures of Tintin series following Herg\u00e9's death in 1983, and re-optioned them in 2002. Filming was due to begin in October 2008 for a 2010 release, but release was delayed to 2011 after Universal opted out of producing the film with Paramount, who provided $30 million on pre-production. Sony chose to co-produce the film. The delay resulted in Thomas Sangster, who had been originally cast as Tintin, departing from the project. Producer Peter Jackson, whose company Weta Digital provided the computer animation, intends to direct a sequel. Spielberg and Jackson also hope to co-direct a third film. The world premi\u00e8re took place on 22 October 2011 in Brussels. The film was released in the United Kingdom and other European countries on 26 October 2011, and in the United States on 21 December 2011, in Digital 3D and IMAX.\nQuestion: will there be a adventures of tintin 2?\nAnswer:"} +{"id": 19, "inputs_pretokenized": "Emma Pillsbury -- Emma Pillsbury Schuester (previously Pillsbury-Howell) is a fictional character from the Fox musical comedy-drama series Glee. Portrayed by actress Jayma Mays, Emma has appeared in Glee from its pilot episode, first broadcast on May 19, 2009. Emma was developed by Glee creators Ryan Murphy, Brad Falchuk and Ian Brennan. She is a guidance counselor at the fictional William McKinley High School in Lima, Ohio where the series is set. Emma suffers from obsessive-compulsive disorder and has romantic feelings for glee club director Will Schuester (Matthew Morrison), but becomes engaged to football coach Ken Tanaka (Patrick Gallagher) as Will is married. Ken ultimately breaks up with her on their wedding day because of her feelings for Will, and when Will leaves his wife Terri (Jessalyn Gilsig), he and Emma share a kiss. 
Their relationship is short-lived, and in the second season, Emma and her dentist boyfriend Carl Howell (John Stamos) marry in Las Vegas. The wedding is later annulled as it was unconsummated. At the beginning of the third season, she and Will are living together; they become engaged shortly after New Years, and consummate their relationship near the end of the school year. Emma leaves Will at the altar midway through the fourth season, but the two later reconcile and marry in the season finale. She becomes pregnant during the middle of the fifth season.\nQuestion: do will and emma get together in glee?\nAnswer:"} +{"id": 20, "inputs_pretokenized": "The Princess and the Goblin (film) -- The Princess and the Goblin (Hungarian: A hercegn\u0151 \u00e9s a kobold) is a 1991 British-Hungarian-American animated musical fantasy film directed by J\u00f3zsef G\u00e9mes and written by Robin Lyons, an adaptation of George MacDonald's 1872 novel of the same name.\nQuestion: is the princess and the goblin a disney movie?\nAnswer:"} +{"id": 21, "inputs_pretokenized": "WWE draft -- On May 25, 2016, due to SmackDown moving to Tuesdays and to a live broadcast starting July 19, necessitating a brand extension, WWE announced that the draft would be returning. It would later be announced that the 2016 WWE draft would take place on July 19 during SmackDown's first live broadcast, which was also the first time that the draft took place on SmackDown. The 2017 draft was labeled the Superstar Shake-up as instead of a traditional draft, the general managers of Raw and SmackDown could trade and make deals between their respective talent.\nQuestion: is there going to be a wwe draft in 2017?\nAnswer:"} +{"id": 22, "inputs_pretokenized": "Izzie Stevens -- Heigl garnered critical acclaim for her performance as Izzie and received numerous awards and nominations for her role, winning the ``Outstanding Supporting Actress In A Drama Series'' at the 2007 Emmy Awards. She was critical of the character's development during the show's fourth season, particularly her romance with George. She declined to put herself forward for the 2008 Emmy Awards, citing insufficient material in the role. After speculation that Izzie would be killed off in the fifth season, the character was diagnosed with Stage 4 metastatic melanoma. She married Alex in the series' one-hundredth episode, and afterwards, her tumor was successfully removed. Izzie made her final appearance in the sixth season, leaving Seattle after Alex refused to resume their marriage. Heigl requested to be released from her contract 18 months early, in order to spend more time with her family. In January 2012, Heigl reported that she would like to return to Grey's Anatomy to give closure to her character, however, Rhimes confirmed that there were no plans to have the character return at that time and has since stated that she has no plans to ever re-approach Izzie's storyline again.\nQuestion: does izzie come back in grey's anatomy?\nAnswer:"} +{"id": 23, "inputs_pretokenized": "Sam Beckett -- When Sam corrected the timeline, he leaped forward, but not all the way home; this time, he found himself assuming the identity of a minor-league professional baseball player named Tim Fox. 
For the rest of his life (an epilogue in the series finale tells us Sam never gets home, but in our terms, it was the next four years/five seasons, the duration of the show) Sam would continue to travel back and forth through time; swapping identities with various people and as a tagline for the show reiterated, ``setting right what once went wrong.''\nQuestion: did sam ever make it home in quantum leap?\nAnswer:"} +{"id": 24, "inputs_pretokenized": "Safety (gridiron football score) -- In gridiron football, the safety (American football) or safety touch (Canadian football) is a scoring play that results in two points (or, in rare cases, one point) being awarded to the scoring team. Safeties can be scored in a number of ways, such as when a ball carrier is tackled in his own end zone or when a foul is committed by the offense in their own end zone. After a safety is scored in American football, the ball is kicked off to the team that scored the safety from the 20-yard line; in Canadian football, the scoring team also has the options of taking control of the ball at their own 35-yard line or kicking off the ball, also at their own 35-yard line. The ability of the scoring team to receive the ball through a kickoff differs from the touchdown and field goal, which require the scoring team to kick the ball off to the scored upon team. Despite being of relatively low point value, safeties can have a significant impact on the result of games, and Brian Burke of Advanced NFL Stats estimated that safeties have a greater abstract value than field goals, despite being worth a point less, due to the field position and reclaimed possession gained off the safety kick.\nQuestion: is it possible to get 1 point in football?\nAnswer:"} +{"id": 25, "inputs_pretokenized": "Atomic number -- The atomic number or proton number (symbol Z) of a chemical element is the number of protons found in the nucleus of an atom. It is identical to the charge number of the nucleus. The atomic number uniquely identifies a chemical element. In an uncharged atom, the atomic number is also equal to the number of electrons.\nQuestion: is the atomic number equal to the number of protons?\nAnswer:"} +{"id": 26, "inputs_pretokenized": "Tick (comics) -- In the Amazon Prime video series, The Tick is fixated on Arthur, and even mentions at one point that his thinking is fuzzy when away from Arthur. Despite Arthur's repeated attempts to push The Tick away, the hero won't leave Arthur's side for long. The Tick also frequently talks about Destiny as if she is a literal person, guiding Arthur's path (``Destiny gave him the suit. I just acted in more of a 'delivery man' role''), alluding to the Parcae in Roman mythology. At one point, Arthur starts to believe that The Tick is merely another hallucination, but that thought is quickly dispelled when Arthur's sister, Dot, interacts with ``The Blue Guy.''\nQuestion: is the tick part of arthur's imagination?\nAnswer:"} +{"id": 27, "inputs_pretokenized": "Game of Thrones -- Game of Thrones is an American fantasy drama television series created by David Benioff and D.B. Weiss. It is an adaptation of A Song of Ice and Fire, George R.R. Martin's series of fantasy novels, the first of which is A Game of Thrones. It is filmed in Belfast and elsewhere in the United Kingdom, Canada, Croatia, Iceland, Malta, Morocco, Spain, and the United States. The series premiered on HBO in the United States on April 17, 2011, and its seventh season ended on August 27, 2017. 
The series will conclude with its eighth season premiering either in 2018 or 2019.\nQuestion: is this the last season of gsme of thrones?\nAnswer:"} +{"id": 28, "inputs_pretokenized": "State supreme court -- The court consists of a panel of judges selected by methods outlined in the state constitution. State supreme courts are completely distinct from any United States federal courts located within the geographical boundaries of a state's territory, or the federal United States Supreme Court (although appeals, on some issues, from judgments of a state's highest court can be sought in the U.S. Supreme Court).\nQuestion: can a state supreme court decision be appealed?\nAnswer:"} +{"id": 29, "inputs_pretokenized": "Snake River -- The Snake River is the thirteenth longest river in the United States. Its watershed is the 10th largest among North American rivers, and covers almost 108,000 square miles (280,000 km) in portions of six U.S. states: Wyoming, Idaho, Nevada, Utah, Oregon, and Washington, with the largest portion in Idaho. Most of the Snake River watershed lies between the Rocky Mountains on the east and the Columbia Plateau on the northwest. The largest tributary of the Columbia River, the Snake River watershed makes up about 41% of the entire Columbia River Basin. Its average discharge at the mouth constitutes 31% of the Columbia's flow at that point. Above the confluence, the Snake is slightly longer than the Columbia--1,078 miles (1,735 km) compared to 928 miles (1,493 km)--and its drainage basin is slightly larger--4% bigger than the upstream Columbia River watershed.\nQuestion: does the snake river flow into the columbia river?\nAnswer:"} +{"id": 30, "inputs_pretokenized": "Outlier -- Deletion of outlier data is a controversial practice frowned upon by many scientists and science instructors; while mathematical criteria provide an objective and quantitative method for data rejection, they do not make the practice more scientifically or methodologically sound, especially in small sets or where a normal distribution cannot be assumed. Rejection of outliers is more acceptable in areas of practice where the underlying model of the process being measured and the usual distribution of measurement error are confidently known. An outlier resulting from an instrument reading error may be excluded but it is desirable that the reading is at least verified.\nQuestion: can there be outliers in a normal distribution?\nAnswer:"} +{"id": 31, "inputs_pretokenized": "Ready Player One -- Ready Player One is a 2011 science fiction novel, and the debut novel of American author Ernest Cline. The story, set in a dystopian 2040s, follows protagonist Wade Watts on his search for an Easter egg in a worldwide virtual reality game, the discovery of which will lead him to inherit the game creator's fortune. Cline sold the rights to publish the novel in June 2010, in a bidding war to the Crown Publishing Group (a division of Random House). The book was published on August 16, 2011. An audiobook was released the same day; it was narrated by Wil Wheaton, who was mentioned briefly in one of the chapters. In 2012, the book received an Alex Award from the Young Adult Library Services Association division of the American Library Association and won the 2012 Prometheus Award.\nQuestion: is ready player one based on a true story?\nAnswer:"} +{"id": 32, "inputs_pretokenized": "Four-leaf clover -- The four-leaf clover is a rare variation of the common three-leaf clover. 
According to traditional superstition, such clovers bring good luck, though it is not clear when or how that superstition got started. The earliest mention of ``Fower-leafed or purple grasse'' is from 1640 and simply says that it was kept in gardens because it was ``good for the purples in children or others''. A description from 1869 says that four-leaf clovers were ``gathered at night-time during the full moon by sorceresses, who mixed it with vervain and other ingredients, while young girls in search of a token of perfect happiness made quest of the plant by day''. The first reference to luck might be from an 11-year-old girl, who wrote in an 1877 letter to St. Nicholas Magazine, ``Did the fairies ever whisper in your ear, that a four-leaf clover brought good luck to the finder?''\nQuestion: is there such a thing as a four leaf clover?\nAnswer:"} +{"id": 33, "inputs_pretokenized": "Statutory declaration -- Statutory declarations are commonly used to allow a person to declare something to be true for the purposes of satisfying some legal requirement or regulation when no other evidence is available. They are thus similar to affidavits (which are made on oath).\nQuestion: can a statutory declaration be used as evidence?\nAnswer:"} +{"id": 34, "inputs_pretokenized": "Convention to propose amendments to the United States Constitution -- To become part of the Constitution, an amendment must be ratified by either--as determined by Congress--the legislatures of three-fourths (presently 38) of the states or State ratifying conventions in three-fourths of the states. Thirty-three amendments to the United States Constitution have been approved by Congress and sent to the states for ratification. Twenty-seven of these amendments have been ratified and are now part of the Constitution. As of 2018, the convention process has never been used for proposing constitutional amendments.\nQuestion: has there ever been a convention of states?\nAnswer:"} +{"id": 35, "inputs_pretokenized": "South African English -- SAE is an extraterritorial (ET) variety of English, or a language variety that has been ``transported'' outside its mainland home. More specifically, SAE is a Southern hemisphere ET originating from later English colonisation in the 18th and 19th centuries (Zimbabwean, Australian, and New Zealand English are also Southern hemisphere ET varieties). SAE resembles British English more closely than it does American English due to the close ties that South African colonies maintained with the mainland in the 19th and 20th centuries. However, with the increasing influence of American pop-culture around the world via modes of contact like television, American English has become more familiar in South Africa. Indeed, some American lexical items are becoming alternatives to comparable British terms.\nQuestion: is south african english similar to british english?\nAnswer:"} +{"id": 36, "inputs_pretokenized": "Haroun and the Sea of Stories -- Haroun and the Sea of Stories is a 1990 children's book by Salman Rushdie. It was Rushdie's fifth novel after The Satanic Verses. It is a phantasmagorical story that begins in a city so old and ruinous that it has forgotten its name.\nQuestion: is haroun and the sea of stories a children's book?\nAnswer:"} +{"id": 37, "inputs_pretokenized": "Mandalay Bay -- Mandalay Bay is a 43-story luxury resort and casino on the Las Vegas Strip in Paradise, Nevada. It is owned and operated by MGM Resorts International. 
One of the property's towers operates as the Delano; the Four Seasons Hotel is independently operated within the Mandalay Bay tower, occupying 5 floors (35--39).\nQuestion: is four seasons las vegas part of mandalay bay?\nAnswer:"} +{"id": 38, "inputs_pretokenized": "Lynette Scavo -- Her world is further shocked when Tom asks for a divorce, and announces that he and Jane will be moving in together. Lynette is devastated, and her rivalry with Jane becomes more heated at Penny's birthday party when they continually try to one up each other. Jane then later tries to reconcile with Lynette, but then she begins to choke on a snack. Lynette hesitates to help Jane, but ultimately comes to her aid and saves her. However, Jane is alarmed at Lynette thinking such an action over believing she thought of letting Jane die. Then on the day of Mike Delfino's funeral, Tom and Lynette comfort each other as Jane looks on. Sparks of their marriage appear and while sitting at the service Lynette thinks back to the day Tom moved out. Mike tries to understand why Lynette isn't fighting for her marriage. He then reveals that everyone in the neighborhood knows that she and Tom belong together. This memory finally causes Lynette to make the decision to fight for her marriage, win Tom back, and dissolve his romance with Jane. In With So Little to Be Sure Of Lynette and Tom officially sign their divorce papers ending their marriage. When Lynette hears Tom hasn't filed the papers, she is hopeful but after seeing Tom and Jane kiss at the office, she accepts a date from Tom's boss. It goes well at first but when he plans to transfer Tom to India, Lynette breaks it off. The boss sardonically insults Lynette before Tom about her being hung up on another man and after insults to her, Tom punches him. He and Jane argue with Jane realizing that Tom still loves Lynette and they break up. Tom goes to see Lynette but sees her hugging Lee and (not seeing who it is), thinks Lynette has moved on. He tells her he is filing but in a later talk, they realize how much they love each other and reconcile.\nQuestion: do tom and lynette get back together spoiler?\nAnswer:"} +{"id": 39, "inputs_pretokenized": "List of Major League Baseball single-game home run leaders -- Writers of Sporting News described hitting four home runs in a single Major League Baseball (MLB) game as ``baseball's greatest single-game accomplishment''. Eighteen players have accomplished the feat to date, the most recent being Scooter Gennett on June 6, 2017 against the St. Louis Cardinals. No player has done this more than once in his career and no player has ever hit more than four in a game. Bobby Lowe was the first to hit four home runs in a single game, doing so on May 30, 1894. Fans were reportedly so excited that they threw $160 in silver coins ($4,500 today) onto the field after his fourth home run.\nQuestion: has there ever been a 5 home run game?\nAnswer:"} +{"id": 40, "inputs_pretokenized": "Virginia Cavaliers men's basketball -- The Wahoos, as they are unofficially known, have appeared in the NCAA Tournament twenty-two times, advancing to the Elite Eight six times (1981, 1983, 1984, 1989, 1995, 2016). They further advanced to the 1981 and 1984 Final Fours; in the former winning the last NCAA third place game ever played, defeating No. 1 LSU 78--74. 
The Cavaliers won the post-season NIT Tournaments of 1980 and 1992.\nQuestion: has university of virginia ever won the ncaa tournament?\nAnswer:"} +{"id": 41, "inputs_pretokenized": "Chiko Roll -- A Chiko Roll's filling is primarily cabbage and barley, as well as carrot, green beans, beef, beef tallow, wheat cereal, celery and onion. This filling is partially pulped and enclosed in a thick egg and flour pastry tube designed to survive handling at football matches. The roll is typically deep-fried in vegetable oil.\nQuestion: is there any meat in a chiko roll?\nAnswer:"} +{"id": 42, "inputs_pretokenized": "Pupil -- The pupil is a hole located in the center of the iris of the eye that allows light to strike the retina. It appears black because light rays entering the pupil are either absorbed by the tissues inside the eye directly, or absorbed after diffuse reflections within the eye that mostly miss exiting the narrow pupil.\nQuestion: is your pupil a hole in your eye?\nAnswer:"} +{"id": 43, "inputs_pretokenized": "Interleague play -- Interleague play in Major League Baseball refers to regular-season baseball games played between an American League (AL) team and a National League (NL) team. Interleague play was first introduced in the 1997 Major League Baseball season. Prior to that, matchups between AL teams and NL teams occurred only during spring training, the All-Star Game, other exhibition games (such as the Hall of Fame Game in Cooperstown, New York), and the World Series. Unlike modern interleague play, none of these contests, except for the World Series, counted toward official team or league records.\nQuestion: does the national league play the american league in the world series?\nAnswer:"} +{"id": 44, "inputs_pretokenized": "Steel-toe boot -- A steel-toe boot (also known as a safety boot, steel-capped boot or safety shoe) is a durable boot or shoe that has a protective reinforcement in the toe which protects the foot from falling objects or compression, usually combined with a mid sole plate to protect against punctures from below.\nQuestion: are steel toe boots made to cut toes off?\nAnswer:"} +{"id": 45, "inputs_pretokenized": "51st state -- Voters in Washington, D.C. and Puerto Rico have both voted for statehood in referendums. As statehood candidates, their admission to the Union requires congressional approval. American Samoa, Guam, the Northern Mariana Islands, and the United States Virgin Islands are also U.S. territories and could potentially become U.S. states someday.\nQuestion: is puerto rico the 51st state of the united states?\nAnswer:"} +{"id": 46, "inputs_pretokenized": "List of The Waltons characters -- Mary Ellen (Judy Norton Taylor) is the oldest of Liv and John's daughters, born in April 1920, aged 13 in season one. Throughout the first few seasons, she is a typically whiny, sometimes rebellious teenager, somewhat of a tomboy who enjoys playing baseball, but could also be vain, engaging in a rivalry with rich-girl Martha-Rose Coverdale for the affections of the awkward G.W. Haines (David Doremus). Mary Ellen matures into a wiser young woman and her childish fantasy of becoming a movie star gives way for a more reasonable and realistic ambition to go into medicine after reading up on it and developing an interest. She then works to gain an education as a medical worker, and becomes a nurse. 
However, when she ends up taking care of the people out in the country by herself, she concludes they need more medical expertise than she can offer them and continues studying medicine until she succeeds in becoming a fully-fledged doctor. Even though some people frown upon female doctors and she receives mixed support from her family, she refuses to let this stop her. Mary Ellen has a special relationship with each of her six siblings, but she is especially close to her younger sister Erin. Mary Ellen and Erin fought a lot when they were younger girls, particularly in seasons 1 and 2. But in the middle seasons, Mary Ellen and Erin matured and became friends. In season 5 after Mary Ellen married Curt, her relationship with her sister deepened even further and by the end of the show, they truly did become each other's best friend.\nQuestion: does mary ellen become a doctor on the waltons?\nAnswer:"} +{"id": 47, "inputs_pretokenized": "Switched at Birth (film) -- Switched at Birth is a 1991 American television film directed by Waris Hussein. It is based on the true story of Kimberly Mays and Arlena Twigg, babies switched soon after birth in a Florida hospital in 1978.\nQuestion: is switched at birth based on a real story?\nAnswer:"} +{"id": 48, "inputs_pretokenized": "Pine oil -- Pine oil is distinguished from other products from pine, such as turpentine, the low-boiling fraction from the distillation of pine sap, and rosin, the thick tar remaining after turpentine is distilled.\nQuestion: is pine oil and turpentine the same thing?\nAnswer:"} +{"id": 49, "inputs_pretokenized": "Mayfly -- Mayflies (also known as Canadian soldiers in the United States, or shadflies or fishflies in Canada and Michigan; also up-winged flies in the United Kingdom ) are aquatic insects belonging to the order Ephemeroptera. This order is part of an ancient group of insects termed the Palaeoptera, which also contains dragonflies and damselflies. Over 3,000 species of mayfly are known worldwide, grouped into over 400 genera in 42 families.\nQuestion: are canadian soldiers and mayflies the same thing?\nAnswer:"} diff --git a/mindie/examples/convert/model_slim/get_calibration_dataset.py b/mindie/examples/convert/model_slim/get_calibration_dataset.py new file mode 100644 index 00000000..4f3705f3 --- /dev/null +++ b/mindie/examples/convert/model_slim/get_calibration_dataset.py @@ -0,0 +1,12 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import json + + +def load_jsonl(dataset_path, key_name='inputs_pretokenized'): + dataset = [] + with open(dataset_path, encoding='utf-8') as file: + for line in file: + data = json.loads(line) + text = data[key_name] + dataset.append(text) + return dataset diff --git a/mindie/examples/convert/model_slim/quantifier.py b/mindie/examples/convert/model_slim/quantifier.py new file mode 100644 index 00000000..02c5c1ee --- /dev/null +++ b/mindie/examples/convert/model_slim/quantifier.py @@ -0,0 +1,176 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
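+# Quantization entry point: loads a float checkpoint with transformers,
+# tokenizes the calibration texts (the bundled teacher_qualification.jsonl
+# by default, or --calib_texts / --calib_file), optionally applies
+# msmodelslim anti-outlier processing, then runs the Calibrator and saves
+# the quantized weights as safetensors; finally config.json is rewritten
+# via modify_config and the tokenizer files are copied alongside it.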
+import os +import argparse +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlier, AntiOutlierConfig +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + + +from examples.convert.convert_utils import copy_tokenizer_files, modify_config +from examples.convert.model_slim.get_calibration_dataset import load_jsonl + + +CPU = "cpu" +NPU = "npu" + + +def cmd_bool(cmd_arg): + if cmd_arg == "True": + return True + elif cmd_arg == "False": + return False + raise ValueError(f"{cmd_arg} should be a boolean") + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', help="model and tokenizer path") + parser.add_argument('--save_directory') + parser.add_argument( + '--calib_texts', + type=str, + nargs='+', + default=["What's deep learning?"]) + parser.add_argument( + '--calib_file', + type=str, + help='CSV or Numpy file containing tokenized input. Alternative to text input.', + default=f"{os.path.join(os.path.dirname(__file__), 'teacher_qualification.jsonl')}") + parser.add_argument( + '--calib_dataset_length', + type=int, + help='Max calibration dataset length.', + default=50) + parser.add_argument('--w_bit', type=int, default=8) + parser.add_argument('--a_bit', type=int, default=8) + parser.add_argument('--disable_names', type=str, nargs='+', default=None) + parser.add_argument('--device_type', type=str, choices=[CPU, NPU], default=CPU) + parser.add_argument('--fraction', type=float, default=0.01) + parser.add_argument("--act_method", type=int, choices=[1, 2, 3], default=1, + help=" `1`: `MinMax`, `2`: `Histogram`, `3`: `Auto`") + parser.add_argument('--co_sparse', type=cmd_bool, default=False) + parser.add_argument('--anti_method', type=str, default='',help=" `m3`: `AWQ`") + parser.add_argument('--disable_level', type=str, default='L0') + parser.add_argument('--input_ids_name', type=str, default='input_ids') + parser.add_argument('--attention_mask_name', type=str, default='attention_mask') + parser.add_argument('--do_smooth', type=cmd_bool, default=False) + parser.add_argument('--use_sigma', type=cmd_bool, default=False) + parser.add_argument('--sigma_factor', type=float, default=3.0) + parser.add_argument('--is_lowbit', type=cmd_bool, default=False) + parser.add_argument('--mm_tensor', type=cmd_bool, default=True) + parser.add_argument('--w_sym', type=cmd_bool, default=True) + parser.add_argument('--use_kvcache_quant', type=cmd_bool, default=False) + parser.add_argument('--open_outlier', type=cmd_bool, default=True) + parser.add_argument('--group_size', type=int, default=64) + return parser.parse_args() + + +class Quantifier: + def __init__(self, model_path_or_name, quant_config=None, anti_outlier_config=None, device_type='cpu', **kwargs): + self.device_type = device_type + device_map = CPU if self.device_type == CPU else "auto" + + self.quant_config = quant_config + self.anti_outlier_config = anti_outlier_config + self.model_path_or_name = model_path_or_name + self.config = AutoConfig.from_pretrained(self.model_path_or_name, trust_remote_code=True) + self.dtype = self.config.torch_dtype if self.device_type == NPU else torch.float32 + self.model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_path_or_name, + low_cpu_mem_usage=True, torch_dtype=self.dtype, + device_map=device_map, + use_safetensors=True, trust_remote_code=True) + + tokenizer_args = kwargs.get("tokenizer_args", {}) + self.tokenizer = 
AutoTokenizer.from_pretrained( + model_path_or_name, use_fast=False, trust_remote_code=True, legacy=False, **tokenizer_args + ) + + def get_tokenized_data(self, input_texts, + input_ids_name='input_ids', + attention_mask_name='attention_mask'): + tokenized_data = [] + for input_text in input_texts: + inputs = self.tokenizer(input_text, return_tensors='pt', padding=True).to(self.device_type) + tokenized_data.append( + [inputs.data[input_ids_name], inputs.data[attention_mask_name]]) + return tokenized_data + + def convert(self, tokenized_data, save_path, disable_level): + if self.device_type == NPU: + # 避免在线编译算子,使用二进制编译的算子 + torch.npu.set_compile_mode(jit_compile=False) + + if self.anti_outlier_config is not None: + anti_outlier = AntiOutlier(self.model, calib_data=tokenized_data, cfg=self.anti_outlier_config) + anti_outlier.process() + + if not os.path.exists(save_path): + os.mkdir(save_path) + + calibrator = Calibrator(self.model, self.quant_config, calib_data=tokenized_data, disable_level=disable_level) + calibrator.run() + calibrator.save(save_path, save_type=["safe_tensor"]) + + +if __name__ == '__main__': + args = parse_arguments() + rank = int(os.getenv("RANK", "0")) + + calib_file = args.calib_file + calib_texts = load_jsonl(calib_file) if calib_file else args.calib_texts + model_path = args.model_path + save_directory = args.save_directory + + quant_conf = QuantConfig( + w_bit=args.w_bit, + a_bit=args.a_bit, + disable_names=args.disable_names, + dev_type=args.device_type, + dev_id=rank, + act_method=args.act_method, + pr=1.0, # randseed + nonuniform=False, + w_sym=args.w_sym, + mm_tensor=False, + co_sparse=args.co_sparse, + fraction=args.fraction, + sigma_factor=args.sigma_factor, + use_sigma=args.use_sigma, + is_lowbit=args.is_lowbit, + do_smooth=args.do_smooth, + use_kvcache_quant=args.use_kvcache_quant, + open_outlier=args.open_outlier, + group_size=args.group_size + ) + anti_outlier_config = None + if args.anti_method == 'm3': + anti_outlier_config = AntiOutlierConfig(a_bit=args.a_bit, w_bit=args.w_bit, + anti_method=args.anti_method, w_sym=args.w_sym, dev_type=args.device_type) + elif args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method) + quantifier = Quantifier( + model_path, quant_conf, anti_outlier_config, + device_type=args.device_type + ) + tokenized_calib_data = None + if calib_texts is not None: + tokenized_calib_data = quantifier.get_tokenized_data( + calib_texts, + input_ids_name=args.input_ids_name, + attention_mask_name=args.attention_mask_name + ) + + if not os.path.exists(save_directory): + os.makedirs(save_directory, exist_ok=True) + #为适配工具稀疏量化传入w_bit=4,a_bit=8暂时修改quant_type + quantifier.convert(tokenized_calib_data, save_directory, args.disable_level) + quant_type = f"w{args.w_bit}a{args.a_bit}" + is_sparseCompress = args.w_bit == 4 and args.a_bit == 8 and (args.co_sparse or args.is_lowbit) + if is_sparseCompress: + quant_type = "w8a8s" + auto_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + modify_config(model_path, save_directory, auto_config.torch_dtype, + quant_type, args.use_kvcache_quant) + copy_tokenizer_files(model_path, save_directory) diff --git a/mindie/examples/convert/model_slim/sparse_compressor.py b/mindie/examples/convert/model_slim/sparse_compressor.py new file mode 100644 index 00000000..c0cc3301 --- /dev/null +++ b/mindie/examples/convert/model_slim/sparse_compressor.py @@ -0,0 +1,94 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
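+# Sparse weight compression: each rank loads its shard of the w8a8s
+# quantized model through ModelRunner, runs the msmodelslim Compressor over
+# the unwrapped state dict, and exports the compressed tensors to
+# part{rank}-of-{world_size} under save_directory; rank 0 then rewrites
+# config.json for the w8a8sc format and copies the tokenizer files.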
+import argparse +import os +import torch +from atb_llm.runner import ModelRunner +from atb_llm.utils.cpu_binding import NpuHbmInfo +from atb_llm.utils.log import logger, print_log +from atb_llm.models.base.model_utils import unwrap_model_state_dict + +from msmodelslim.pytorch.weight_compression import CompressConfig, Compressor +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +class SparseCompressor: + def __init__(self, **kwargs): + self.rank = kwargs.get('rank', '0') + self.world_size = kwargs.get('world_size', '1') + + self.model_path = kwargs.get('model_path', None) + self.save_directory = kwargs.get('save_directory', None) + self.multiprocess_num = kwargs.get('multiprocess_num', 8) + self.save_split_w8a8s_dir = kwargs.get('save_split_w8a8s_dir', None) + + self.model = ModelRunner(self.model_path, rank=self.rank, world_size=self.world_size) + self.dtype = self.model.dtype + self.quantize = self.model.quantize + self.model.load_weights() + + self.device = self.model.device + self.max_memory = NpuHbmInfo.get_hbm_capacity(self.rank, self.world_size, self.model.soc_info.need_nz) + self.init_memory = int( + self.max_memory * NpuHbmInfo.get_hbm_usage(self.rank, self.world_size, self.model.soc_info.need_nz)) + print_log(self.rank, logger.info, f'hbm_capacity(GB): {self.max_memory / (1024 ** 3)}, ' + f'init_memory(GB): {self.init_memory / (1024 ** 3)}') + + self.warm_up_memory = 0 + self.warm_up_num_blocks = 0 + self.cache_manager = None + + if self.save_split_w8a8s_dir is not None: + self.model.save_pretrained(save_directory=f'{self.save_split_w8a8s_dir}_{self.world_size}', + safe_serialization=True) + modify_config(model_path, save_directory, torch.float16, 'w8a8s') + copy_tokenizer_files(model_path, save_directory) + + def compress(self): + model_dict = unwrap_model_state_dict(self.model.model.state_dict()) + quant_desc = self.model.model.generate_description() + compress_config = CompressConfig(do_pseudo_sparse=False, sparse_ratio=1, is_debug=True, + record_detail_root=self.save_directory, + multiprocess_num=self.multiprocess_num) + compressor = Compressor(compress_config, weight=model_dict, quant_model_description=quant_desc) + compressor.run() + part_save_directory = os.path.join(self.save_directory, f'part{self.rank}-of-{self.world_size}') + os.makedirs(part_save_directory, exist_ok=True) + compressor.export_safetensors(part_save_directory) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', + help="model and tokenizer path", + default='/data/acltransformer_testdata/weights/llama2/llama-2-70b', + ) + parser.add_argument('--save_directory', type=str, required=True) + parser.add_argument('--multiprocess_num', type=int, default=8) + parser.add_argument('--save_split_w8a8s_dir', type=str, default=None) + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + input_dict = { + 'rank': rank, + 'world_size': world_size, + **vars(args) + } + + model_path = args.model_path + save_directory = args.save_directory + if not os.path.exists(save_directory): + os.makedirs(save_directory, exist_ok=True) + + sparse_compressor = SparseCompressor(**input_dict) + + sparse_compressor.compress() + + if rank == 0: + modify_config(model_path, save_directory, torch.float16, 'w8a8sc') + copy_tokenizer_files(model_path, save_directory) diff --git 
a/mindie/examples/convert/model_slim/teacher_qualification.jsonl b/mindie/examples/convert/model_slim/teacher_qualification.jsonl new file mode 100644 index 00000000..fa12f636 --- /dev/null +++ b/mindie/examples/convert/model_slim/teacher_qualification.jsonl @@ -0,0 +1,44 @@ +{"id": 0, "inputs_pretokenized": "编写中小学教科书的直接依据是____。\nA. 《中华人民共和国教育法》\nB. 课程计划\nC. 课程标准\nD. 课程表", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 1, "inputs_pretokenized": "下列关于课程的三种文本表现形式说法正确的是____\nA. 课程计划是由当地教育主管部门制订的\nB. 课程标准是依据课程计划制定的\nC. 课程标准的核心是实施建议\nD. 教材编写的基本方式有直线式、螺旋式、交叉式", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 2, "inputs_pretokenized": "悦悦是一名右耳失聪的残疾儿童,活动课上有时会听不清楚周老师所讲的内容,因此经常提问题。对此,周老师应当采取的措施是____。\nA. 给予悦悦更多的帮助和指导\nB. 指导家长带悦悦回家自学\nC. 建议家长将悦悦转到特殊幼儿园\nD. 照顾大多数幼儿,不理会悦悦", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 3, "inputs_pretokenized": "内流河也称“内陆河”,是指没有流入海洋的河流,大多分布在大陆内部干燥地区,上游降水或冰雪融水为其主要补给水源,最终消失于沙漠或注入内陆湖泊。下列中国内流河中,最长的是____。\nA. 塔里木河\nB. 柴达木河\nC. 尼雅河\nD. 疏勒河", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 4, "inputs_pretokenized": "学校规定学生不能烫染头发,但是小文为了彰显个性,在假期把头发染成了棕色。面对小文的情况,教师应该怎样处理?____\nA. 年轻人追求个性是合情合理的,应该宽容对待\nB. 违反学校的校规,应该严格处分\nC. 强制要求小文将头发颜色染回来才可以进校门\nD. 探明小文违反校规的原因,并对其进行劝导和教育", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 5, "inputs_pretokenized": "张老师根据自己班级的情况,为解决班级内部班干部的人际关系问题,建立和谐融洽的班级氛围,自主开发了“和谐人际”的班级课程,这体现了教师____。\nA. 是教育教学的研究者\nB. 是课程的建设者和开发者\nC. 是学生学习的促进者\nD. 是社区型的开放教师", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 6, "inputs_pretokenized": "刘老师工作很负责,学生在学校出现一点问题他就会与家长联系,在与家长沟通时他经常以前辈的姿态对待家长,对家长的教育方式指指点点。刘老师的做法____。\nA. 正确,老师就应该与家长经常沟通\nB. 正确,老师的经验比家长丰富,应该多指导家长\nC. 不正确,教师没有权利指导家长\nD. 不正确,教师应该与家长建立平等的沟通关系,尊重家长的人格", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 7, "inputs_pretokenized": "在古代印度,有一户人家经营一家棉布店销售自己手工制作的衣服。你认为这户人家属于哪个等级?____\nA. 婆罗门\nB. 刹帝利\nC. 吠舍\nD. 首陀罗", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 8, "inputs_pretokenized": "“小型分散,便于开展多种多样的活动,满足学生不同的兴趣、爱好,发展学生的才能,使学生得到更多的学习和锻炼的机会。”这种课外活动的形式是____。\nA. 科技活动\nB. 学科活动\nC. 个人活动\nD. 小组活动", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 9, "inputs_pretokenized": "小红每天晚上临睡前都要多次反复检查自己的书包,确保带齐了第二天需要用的教材和文具。她明知道没有这个必要,但就是控制不住。她可能出现了____。\nA. 抑郁症\nB. 焦虑症\nC. 强迫症\nD. 恐惧症", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 10, "inputs_pretokenized": "国家管理和评价课程的基础是____。\nA. 课程计划\nB. 课程标准\nC. 教学目标\nD. 教育目的", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 11, "inputs_pretokenized": "儿童坚持性发生明显质变的年龄约在____\nA. 3~4岁\nB. 4~5岁\nC. 5~6岁\nD. 6岁以后", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 12, "inputs_pretokenized": "《红楼梦》中人物众多、关系繁杂。为了帮助读者阅读,许多红学爱好者都在网络上发布了自己整理制作的主要人物关系图。这属于____。\nA. 纲要策略\nB. 精细加工策略\nC. 资源管理策略\nD. 监控策略", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 13, "inputs_pretokenized": "学期结束时,班主任王老师会对学生思想品德的发展变化情况进行评价。这项工作属于____。\nA. 工作总结\nB. 工作计划\nC. 操行评定\nD. 
建立学生档案", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 14, "inputs_pretokenized": "人们常说:“教学有法而教无定法。”这反映了教师的劳动具有____。\nA. 连续性\nB. 示范性\nC. 长期性\nD. 创造性", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 15, "inputs_pretokenized": "县级以上地方各级人民代表大会是县级以上地方国家权力机关,其职权不包括____。\nA. 改变或撤销本级人大常务委员会不适当的决定\nB. 选举并有权罢免本级人民法院院长\nC. 批准本行政区域内的预算执行情况的报告\nD. 决定并宣布下一级行政区城进入紧急状态", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 16, "inputs_pretokenized": "在心理健康课上,同一批学生在第二次进行同样内容的人格测验时获得的分数与上次测验差别较大。这说明该测验存在的问题是____。\nA. 信度问题\nB. 效度问题\nC. 难度问题\nD. 区分度问题", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 17, "inputs_pretokenized": "李老师在教学生区分形近字“渴”“竭”“碣”“谒”时,将四个字相同的右半部分用白色粉笔写出,相异的左半部分用彩色粉笔写出。李老师运用了知觉的____。\nA. 整体性\nB. 选择性\nC. 理解性\nD. 恒常性", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 18, "inputs_pretokenized": "兰兰学会走路后,就要很喜欢尝试自己穿衣、吃饭、捡东西,喜欢探索周围世界。按照埃里克森人格发展阶段理论,兰兰所处的发展阶段是____\nA. 信任对怀疑\nB. 自立对羞怯\nC. 主动感对内疚感\nD. 勤奋感对自卑感", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 19, "inputs_pretokenized": "杨老师在教授生字词的过程中发现部分学生有缺笔少画的现象,于是他把“小学生缺笔少画现象的原因及对策研究”作为研究课题,拟订相应的研究计划,在工作中收集、整理相关资料并实施教学措施,最后根据反馈信息调整教学方案。这种研究方法属于____。\nA. 教育行动研究法\nB. 教育实验法\nC. 教育叙事研究法\nD. 个案研究法", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 20, "inputs_pretokenized": "小青的数学成绩不好,她认为这是因为自己脑子笨,不是学数学的料。她的这种归因属于____。\nA. 内部、稳定,不可控的归因\nB. 外部、稳定、可控的归因\nC. 内部、不稳定,可控的归因\nD. 外部,不稳定,不可控的归因", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 21, "inputs_pretokenized": "中小学教科书不同于其他任何书籍的基本特点是内容的____。\nA. 准确性\nB. 示范性\nC. 新颖性\nD. 基础性", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 22, "inputs_pretokenized": "王老师在课堂上给学生演示了与知识点有关的几个实验。这属于____。\nA. 实物直观\nB. 模象直观\nC. 言语直观\nD. 思维直观", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 23, "inputs_pretokenized": "在Excel中,单元格A1, A2, A3中的内容依次为数值1,2,3,单元格A4中的内容为字符前添加了英文单撇号“,”的文本字符“3”,在单元格A5的编辑栏输入公式“=COUNT( A1:A4) +12”并点击回车键,A5单元格的内容为____。\nA. 15\nB. 21\nC. 12\nD. 18", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 24, "inputs_pretokenized": "唐朝时形成了“父教其子,子教其弟”“五尺童子耻不言文墨焉”的社会风尚,它的形成主要得益于____。\nA. 社会经济的繁荣\nB. 科举制度的推行\nC. 学校体系的完备\nD. 三省六部制的确立", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 25, "inputs_pretokenized": "教导处的刘老师抓到两名学生藏在厕所里偷偷抽烟,于是把他们叫到办公室,慢悠悠地点燃了一根香烟,准备耐心细致地给他们做思想工作。对此,以下说法错误的是____。\nA. 刘老师既禁止学生抽烟,又能耐心劝导,严慈相济,真正做到了关爱学生\nB. 刘老师要求学生不要抽烟,却在学生面前抽烟,违背了为人师表的要求\nC. 刘老师的抽烟行为与他教导学生不能抽烟的言词相悖,很容易损害自己的威信\nD. 刘老师的行为表明教师队伍中存在一些教师需要对其加强师风师德建设的", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 26, "inputs_pretokenized": "小班幼儿看木偶剧表演时,看到“老虎”会感到害怕。这说明幼儿的____\nA. 想象脱离现实\nB. 想象与现实混淆\nC. 想象容易受情绪影响\nD. 想象内容零散", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 27, "inputs_pretokenized": "有的成语与历史人物密切相关。下列选项中,与“狡兔三窟”相关的历史人物是____。\nA. 管仲与齐桓公\nB. 毛遂与平原君\nC. 冯谖与孟尝君\nD. 
曹刿与鲁庄公", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 28, "inputs_pretokenized": "王浩同学活动过多、注意力不集中、冲动行为多。这种心理障碍可能是____。\nA. 多动综合征\nB. 学习困难综合征\nC. 儿童厌学症\nD. 儿童强迫行为", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 29, "inputs_pretokenized": "在对班级学生进行教育时,班主任李老师引导学生对自己每日的学习、行为进行反省。李老师主要运用的德育方法是____。\nA. 自我修养法\nB. 榜样示范法\nC. 实践锻炼法\nD. 情感陶冶法", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 30, "inputs_pretokenized": "在讲解方程时,王老师先讲一元一次方程,再讲二元一次方程,然后讲一元二次方程,逐步加深难度。这种教学方式所遵循的原则是____。\nA. 理论联系实际原则\nB. 启发性原则\nC. 循序渐进原则\nD. 巩固性原则", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 31, "inputs_pretokenized": "近代原子核物理学之父是____。\nA. 普朗克\nB. 卢瑟福\nC. 玻尔\nD. 霍金", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 32, "inputs_pretokenized": "很多人因为有了受教育的机会而得到了和父辈完全不同的人生发展机遇。这说明教育在人的发展中起到____。\nA. 辅助作用\nB. 决定作用\nC. 次要作用\nD. 主导作用", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 33, "inputs_pretokenized": "下面是中国古代四大名著中的人物与情节,其中搭配不当的一项是____。\nA. 鲁智深——倒拔垂杨柳\nB. 孙悟空——大闹天宫\nC. 周瑜——三顾茅庐\nD. 刘姥姥——进大观园", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 34, "inputs_pretokenized": "找规律填数字是一项很有趣的活动,特别锻炼观察和思考能力。下列选项中,填入数列“1、7、8、57、____、26050”空缺处的数字,符合该组数字排列规律的是____。\nA. 456\nB. 457\nC. 458\nD. 459", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 35, "inputs_pretokenized": "教育自身的许多规律,是人类长期教育实践认识的结果,它们不会因政治经济制度和其他文化的发展而过时,更不会随时代的发展而被否定。这说明教育具有____。\nA. 历史性\nB. 永恒性\nC. 阶级性\nD. 相对独立性", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 36, "inputs_pretokenized": "高中毕业会考是一种达标考试,属于____。\nA. 定量评价\nB. 相对性评价\nC. 形成性评价\nD. 绝对性评价", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 37, "inputs_pretokenized": "下列选项中,与“图书”和“音乐书”的逻辑关系相同的一组是____。\nA. “钢笔”和“铅笔”\nB. “蛋糕”和“香油”\nC. “水果”和“西瓜”\nD. “白菜”和“黄瓜”", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 38, "inputs_pretokenized": "语文教师裴老师每天下课后都会对自己一天的工作进行总结反思,并记录下来。这属于布鲁巴奇反思方法中的____。\nA. 反思日记\nB. 详细描述\nC. 交流讨论\nD. 行动研究", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 39, "inputs_pretokenized": "以下关于幼儿有意注意发展的表述,不正确的是____\nA. 幼儿有意注意发展受大脑发育水平局限\nB. 幼儿有意注意的发展水平较低,无法依靠活动和操作来维持\nC. 幼儿在幼儿园需要遵守各种行为规则,完成各项任务,这都需要幼儿形成或发展有意注意\nD. 教师在组织活动时,要求幼儿保持注意的对象应该是幼儿认知范围以内或幼儿易于理解的事物", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 40, "inputs_pretokenized": "某幼儿园根据幼儿的发展情况将班级分为快班、中班和慢班。对于快班的幼儿安排大量优秀师资和先进设备,而对于慢班的幼儿则给予较少的优良教育资源。该幼儿园的做法违背了素质教育内涵中的____。\nA. 以提高国民素质为基本宗旨\nB. 面向全体幼儿\nC. 促进幼儿全面发展\nD. 促进幼儿个性发展", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 41, "inputs_pretokenized": "作为古埃及文明的象征之一,____既寄托了古埃及人对死后重生的向往,又证明了新一代法老王权统治的神圣不可侵犯,充分显示了古埃及人的高度智慧和精湛的建筑艺术。\nA. 金字塔\nB. 帕特农神庙\nC. 圆形竞技场\nD. 麦加清真寺", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 42, "inputs_pretokenized": "在太阳系的八大行星中,质量最大和最小的行星分别是____。\nA. 木星;水星\nB. 火星;地球\nC. 金星;水星\nD. 
土星;天王星", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 43, "inputs_pretokenized": "据调查,教师对学生拳打脚踢的情况现在已经较少存在,取而代之的是“心罚”。比如,对于成绩不好的学生罚做题目、罚抄单词一百遍。教师这样的行为____。\nA. 是正确的,教育中适当的惩罚是必不可少的\nB. 是正确的,教师没有侵犯学生的身体健康\nC. 是不正确的,教师没能做到依法执教\nD. 是不正确的,教师没能做到团结合作", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} diff --git a/mindie/examples/input.jsonl b/mindie/examples/input.jsonl new file mode 100644 index 00000000..3e94fd3b --- /dev/null +++ b/mindie/examples/input.jsonl @@ -0,0 +1 @@ +[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What's deep learning?"}, {"role": "assistant", "content": "Deep learning is a subset of machine learning that uses artificial neural networks to learn from data."}, {"role": "user", "content": "Can you explain in more detail?"}] diff --git a/mindie/examples/models/aquila/README.md b/mindie/examples/models/aquila/README.md new file mode 100644 index 00000000..d82ccc60 --- /dev/null +++ b/mindie/examples/models/aquila/README.md @@ -0,0 +1,181 @@ +# README + +- 悟道·天鹰(Aquila) 语言大模型是首个具备中英双语知识、支持商用许可协议、国内数据合规需求的开源语言大模型。 + +- 此代码仓中实现了一套基于NPU硬件的Aquila推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各Aquila模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +| ---------------------- |-------------------------|---------------------------| ---- |-----| --------------- | --------------- | -------- | --------- | --------- | ------------ | -------------------------- | ---- | ------ | ---- |-----| +| Aquila-7B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | √ | × | √ | √ | × | × | × | × | × | × | × | × | × | +| Aquila2-7B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | √ | × | √ | √ | × | × | × | × | × | × | × | × | × | +| Aquila2-34B | 支持world size 4,8 | × | √ | × | √ | √ | × | × | × | × | × | × | × | × | × | + +- 此模型仓已适配的模型版本 + - [FalshAI GitHub仓](https://github.com/FlagAI-Open/FlagAI/) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|---------------------------------------------------------------------------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Aquila和Aquila2的工作脚本所在路径为`${llm_path}/examples/models/aquila` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B/tree/main) +- [Aquila2-7B](https://huggingface.co/BAAI/Aquila2-7B/tree/main) +- [Aquila2-34B](https://huggingface.co/BAAI/Aquila2-34B/tree/main) +**权重转换** +- 参考[此README文件](../../README.md) + +**量化权重生成** +- 基于原始的FP16的权重,生成量化权重 +- W8A8 Antioutlier量化权重请使用以下指令生成 +- 暂不支持 + +- W8A8量化权重请使用以下指令生成 +- 暂不支持 + +- W8A16量化权重请使用以下指令生成 +- 暂不支持 + +- 稀疏量化权重请使用以下指令生成 +- 暂不支持 + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 +**运行Flash Attention FP16** +- 其余Aquila模型参考以下运行方式 + - 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_fa.sh ${weight_path} + ``` + - 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20031` + - 设置卡间通信端口 + - 默认使用20031端口 + - 
目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export HCCL_BUFFSIZE=120 + export HCCL_WHITELIST_DISABLE=1 + export ATB_CONTEXT_WORKSPACE_RING=1 + export ATB_CONTEXT_WORKSPACE_SIZE=2629145600 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=0 + export ATB_LAUNCH_KERNEL_WITH_TILING=0 + export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=1 + export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=0 + + ``` + +**运行Flash Attention BF16** +- 暂不支持 + +**运行Flash Attention W8A8** +- 暂不支持 + +**运行Flash Attention W8A16** +- 暂不支持 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20031` + - 设置卡间通信端口 + - 默认使用20031端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +**运行Paged Attention BF16** +- 暂不支持 + +**运行Paged Attention W8A8** +- 暂不支持 + +**运行Paged Attention W8A16** +- 暂不支持 + +**运行KV cache量化** +- 暂不支持 + +**运行稀疏量化** +- 暂不支持 + +**运行MOE量化** +- 暂不支持 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 aquila_7b ${aquila-7b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 aquila2_7b ${aquila2-7b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 aquila2_34b ${aquila2-34b权重路径} 8 + ``` + - MMLU测试集精度测试 + - 使用GPU测试Aquila模型测试MMLU数据集,需修改如下配置: + - 1、修改开源文件config.json中max_position_embeddings大于3072 + - 2、修改开源文件tokenizer_config.json中model_max_length为3072 + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 aquila_7b ${aquila-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 aquila2_7b ${aquila2-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 aquila2_34b ${aquila2-34b权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 \ No newline at end of file diff --git a/mindie/examples/models/aquila/run_fa.sh b/mindie/examples/models/aquila/run_fa.sh new file mode 100644 index 00000000..a9ff42f8 --- /dev/null +++ b/mindie/examples/models/aquila/run_fa.sh @@ -0,0 +1,23 @@ +#Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
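+# Flash Attention FP16 launch script; see the README.md in this directory
+# for the meaning of each environment variable.
+# Usage: bash run_fa.sh ${weight_path}   (run from the ${llm_path} directory)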
+# +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20031 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export HCCL_BUFFSIZE=120 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +extra_param="--max_output_length=128" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_fa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_fa --model_path $1 $extra_param --input_text='假如你是小明,请给小红写一封情书?' +fi diff --git a/mindie/examples/models/aquila/run_pa.sh b/mindie/examples/models/aquila/run_pa.sh new file mode 100644 index 00000000..bd1343e3 --- /dev/null +++ b/mindie/examples/models/aquila/run_pa.sh @@ -0,0 +1,24 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=4,5 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi \ No newline at end of file diff --git a/mindie/examples/models/atb_speed_sdk/README.md b/mindie/examples/models/atb_speed_sdk/README.md new file mode 100644 index 00000000..7b3b535a --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/README.md @@ -0,0 +1,306 @@ +# atb_speed_sdk + +*提高加速库的易用性,统一下游任务,集成公共能力* +优点: + +1. 同时兼容GPU与NPU,最大程度减少迁移适配的工作量 +2. 屏蔽NPU与GPU的差异,用户无感切换 +3. 一个配置文件覆盖所有配置 +4. 进程安全的日志 + +# sdk安装 + +```shell +pip install . 
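+# run this from the atb_speed_sdk directory (the one containing setup.py)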
+``` + +# 配置文件使用及样例 + +## 使用 + +```python +from atb_speed.common.config import atb_speed_config + +config_path = "xxxx" +atb_speed_config.init_config(config_path) +``` + +## 样例 + +``` +[model] +;模型路径 +model_path=../model +;使用的设备号,多卡用逗号分隔,设置多卡,将默认使用并行模式 +device_ids=2 +;并行通信类型,默认是hccl,可选hccl/nccl(GPU) +;parallel_backend=hccl +;日志保存路径,默认是执行脚本所在路径 +;log_dir=./ +;是否绑核,0或1,默认是1表示开启 +;bind_cpu=1 + +[precision] +;精度测试方法,默认为ceval,可选ceval/mmlu +mode=ceval +;精度测试工作路径 +work_dir=./ +;批量精度测试,默认是1 +batch=1 +;每个科目的shot数量,默认是5 +shot=5 +;每个问题的回答长度,默认是32 +;seq_len_out=32 + +[performance] +;性能测试模型名称,用于结果文件的命名 +model_name=vicuna_13b +;测试的batch size +batch_size=1 +;测试的输入的最大2的幂 +max_len_exp=10 +;测试的输入的最小2的幂 +min_len_exp=5 +;特定用例测试,格式为[[seq_in,seq_out]],注意当设置这个参数时,max_len_exp min_len_exp不生效 +;case_pair=[[1,2],[2,3]] +;生成的结果文件名称,默认会自动生成,一般不设置 +;save_file_name= +;性能测试方法,detail / normal , 默认是normal.要使用detail需要配合装饰器计时,并加上环境变量 TIMEIT=1 +;perf_mode= +;性能测试时是否只测试generate而跳过decode,0/1 默认是0 +;skip_decode= +``` + +# 使用说明 + +最核心的模块是launcher,所有的下游任务都围绕launcher来执行 + +## launcher [model] + +用户通过继承Launcher,多卡继承ParallelLauncher 基类来实现自定义launcher。 +当前的launcher对GPU和NPU做了自适应适配,因此可以通用。 +使用launcher时,用户需要实现自定义的init_model方法,这里需要注意的是,self.model_path是从配置文件中读出的。 +如果要进行功能测试,则需要实现自定义的infer方法。 + +```python +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher import Launcher +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class BaichuanLM(Launcher): + + def init_model(self): + """ + 模型初始化 + :return: + """ + tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +if __name__ == '__main__': + atb_speed_config.init_config() + baichuan = BaichuanLM() + print("---------------warm-up---------------") + baichuan.infer('Hamlet->Shakespeare\nOne Hundred Years of Solitude->') + + print("---------------inference---------------") + baichuan.infer('登鹳雀楼->王之涣\n夜雨寄北->') + baichuan.infer('苹果公司的CEO是') + + query_list = ["谷歌公司的CEO是", + '登鹳雀楼->王之涣\n夜雨寄北->', + '苹果公司的CEO是', + '华为公司的CEO是', + '微软公司的CEO是'] + baichuan.infer_batch(query_list) + +``` + +# 精度测试 + +SDK提供了两种精度测试方法,ceval和mmlu + +## 配置说明 [precision] + +| 配置项key | 默认值 | 备注 | +|-------------|-------|-----------------------------------| +| mode | ceval | 精度测试方法。可选ceval/mmlu | +| work_dir | | 精度测试工作路径。必填 | +| batch | 1 | 批量精度测试的批数,请注意batch大于1时精度会和等于1时有差别 | +| shot | 5 | 每个科目的shot数量 | +| seq_len_out | 32 | 每个问题的回答长度 | + +### 1. 下载测试数据集 + +ceval + +``` +wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip +unzip ceval-exam.zip -d data +``` + +mmlu + +```shell +wget https://people.eecs.berkeley.edu/~hendrycks/data.tar +tar -xvf data.tar +``` + +注:wget网络不通请从网页下载并复制 + +### 2. 配置精度测试相关项 + +0. 按照推理指导,下载模型及配置路径,并安装atb_speed_sdk +1. 新建工作文件夹${precision_work_dir}。 +2. 将下载的测试数据集进行解压后的数据和脚本放置在${precision_work_dir} +3. 
修改config.ini文件设置,设置ceval相关路径 + +目录结构示例${ceval_work_dir}: +--test_result 跑完之后生成 +--data (包含:数据文件夹dev、test、val三者) + +## 运行脚本 + +只需要声明一个launcher即可使用 + +```python +from atb_speed.common.precision import get_precision_test_cls +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher import Launcher +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class BaichuanLM(Launcher): + def init_model(self): + """ + 模型初始化 + :return: + """ + tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +if __name__ == '__main__': + atb_speed_config.init_config("config.ini") + baichuan = BaichuanLM() + c_t = get_precision_test_cls()(baichuan) + c_t.run() +``` + +# 性能测试 [performance] + +SDK提供了两种性能测试的方法,常规估计法,精确打点法。也提供了两种测试方案,2幂测试和特定case测试 + +## 配置说明 + +| 配置项key | 默认值 | 备注 | +|----------------|--------|---------------------------------------------------------------------------------------| +| model_name | | 性能测试模型名称,用于结果文件的命名 | +| batch_size | 1 | 测试的batch size | +| max_len_exp | 10 | 测试的输入的最大2的幂 | +| min_len_exp | 5 | 测试的输入的最小2的幂 | +| case_pair | | 特定用例测试,格式为[[seq_in,seq_out]],注意当设置这个参数时,max_len_exp min_len_exp不生效 | +| save_file_name | | 生成的结果文件名称,默认会自动生成,一般不设置 | +| perf_mode | normal | 性能测试方法,detail / normal , 默认是normal.要使用detail需要侵入式替换utils,并加上环境变量 RETURN_PERF_DETAIL=1 | +| skip_decode | 0 | 性能测试时是否只测试generate而跳过decode,0/1 默认是0 | + +## 精确打点法 + +- 通过在modeling中使用sdk里的计时装饰器进行计时 +- 不再需要侵入式修改任何的三方件中的源码,支持任意版本的transformers +- perf_mode设为detail +- 将环境变量`TIMEIT`设置成1来开启性能测试,为了不影响正常使用,默认是0 + +### Timer介绍 + +- 将环境变量`TIMEIT`设置成1来开计时,为了不影响正常使用,默认是0 +- 计时的数据是累积的,使用 Timer.reset() 来重置计时器 +- 硬件设备上的数据需要同步才能准确计时。在计时前,请使用`Timer.sync = getattr(torch, device_type).synchronize`设置计时器的同步函数 + +### 如何使用 + +只需要在最外层的forward函数上方增加timing的计时器即可。 +例如: + +```python +import torch +from torch import nn + +from atb_speed.common.timer import Timer + + +class AddNet(nn.Module): + def __init__(self, in_dim, h_dim=5, out_dim=1): + super().__init__() + self.fc1 = nn.Linear(in_dim, h_dim) + self.fc2 = nn.Linear(h_dim, out_dim) + + @Timer.timing + def forward(self, x, y): + out = torch.cat([x, y], dim=1) + out = torch.relu(self.fc1(out)) + out = self.fc2(out) + return out + + +if __name__ == '__main__': + add_net = AddNet(in_dim=2) + Timer.sync = torch.cuda.synchronize + Timer.reset() + for i in range(5): + x = torch.randn(1, 1) + y = torch.randn(1, 1) + result = add_net.forward(x, y) + print(result) + print(Timer.timeit_res) + print(Timer.timeit_res.first_token_delay) + print(Timer.timeit_res.next_token_avg_delay) +``` + +## 常规估计法 + +- 通过第一次生成1个token,第2次生成n个token,计时作差来估计性能。 +- *假设两次推理首token的时延相同* +- perf_mode设为normal + +## 运行脚本 + +```python +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher import Launcher +from atb_speed.common.performance.base import PerformanceTest +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class LMLauncher(Launcher): + """ + Baichuan2_7B_NPU + """ + + def init_model(self): + """ + 模型初始化 + :return: + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=True, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device) + model.eval() + 
model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +if __name__ == '__main__': + atb_speed_config.init_config("config.ini") + performance_test = PerformanceTest(LMLauncher()) + performance_test.warm_up() + performance_test.run_test() +``` \ No newline at end of file diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/__init__.py b/mindie/examples/models/atb_speed_sdk/atb_speed/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/__init__.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/config.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/config.py new file mode 100644 index 00000000..c9c4599f --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/config.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +config +""" +import ast +import configparser +import os +import warnings +from dataclasses import dataclass +from typing import Optional, List, Union, Type + + +class ConfigInitializationError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + +@dataclass +class PrecisionConfig: + work_dir: str = "" + batch: int = 1 + shot: int = 5 + seq_len_out: int = 32 + mode: str = "ceval" + + def __post_init__(self): + int_attr = ("batch", "shot", "seq_len_out") + for attr in int_attr: + self.__dict__[attr] = int(self.__dict__[attr]) + self.work_dir = os.path.realpath(self.work_dir) + + +@dataclass +class PerformanceConfig: + model_name: str = "" + batch_size: int = 1 + max_len_exp: int = 11 + min_len_exp: int = 5 + case_pair: Union[Optional[List[int]], str] = None + save_file_name: str = "" + perf_mode: str = "normal" + skip_decode: int = 0 + + def __post_init__(self): + int_attr = ("batch_size", "max_len_exp", "min_len_exp", "skip_decode") + for attr in int_attr: + self.__dict__[attr] = int(self.__dict__[attr]) + if self.case_pair is not None: + self.case_pair = ast.literal_eval(self.case_pair) + + +@dataclass +class ModelConfig: + model_path: str = "" + device_ids: str = "0" + parallel_backend: str = "hccl" + device_num: int = 1 + log_dir: str = os.path.join(os.getcwd(), "atb_speed_log") + bind_cpu: int = 1 + + def __post_init__(self): + self.model_path = os.path.realpath(self.model_path) + self.device_num = len(self.device_ids.split(",")) + int_attr = ("bind_cpu",) + for attr in int_attr: + self.__dict__[attr] = int(self.__dict__[attr]) + + +@dataclass +class Config: + model: ModelConfig = None + performance: PerformanceConfig = None + precision: PrecisionConfig = None + + def init_config(self, raw_content_path, allow_modify=False): + if not os.path.exists(raw_content_path): + raise FileNotFoundError(f"{raw_content_path} not exists.") + + section_map = { + "model": ModelConfig, + "performance": PerformanceConfig, + "precision": PrecisionConfig + } + if allow_modify: + warn_msg = "Warning, allow_modify has been set as True. " \ + "It is dangerous to modify the reserved fields below.\n" + for cfg_key, cfg_cls in section_map.items(): + warn_msg = warn_msg + "\n".join( + f"{cfg_key}.{sub_k} is reserved." 
+ for sub_k in cfg_cls.__dict__ if not sub_k.startswith("__")) + "\n" + warnings.warn(warn_msg, DeprecationWarning, stacklevel=2) + conf = configparser.ConfigParser() + conf.read(raw_content_path, encoding="utf-8") + for section_name, section_content in conf.items(): + if section_name == "DEFAULT": + continue + if section_name == "ceval": + warnings.warn( + "The section_name [ceval] is deprecated, " + "please refer to readme and use [precision] instead", + DeprecationWarning, + stacklevel=2) + section_name = "precision" + if not hasattr(self, section_name) and not allow_modify: + warnings.warn(f"The section [{section_name}] is not recognized and not allowed to modify.", + UserWarning, + stacklevel=2) + continue + config_cls: Type | None = section_map.get(section_name) + if not config_cls: + raise ConfigInitializationError(f"No configuration class found for section [{section_name}].") + try: + attr = config_cls(**section_content) + except TypeError as e: + raise ConfigInitializationError(f"Invalid configuration for section [{section_name}].") from e + setattr(self, section_name, attr) + + +atb_speed_config = Config() diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/cpu_binding.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/cpu_binding.py new file mode 100644 index 00000000..0d4e2ccd --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/cpu_binding.py @@ -0,0 +1,178 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import logging +import os +import subprocess +from dataclasses import dataclass +from typing import List, Dict, Union + +import psutil + + +def execute_command(cmd_list): + with subprocess.Popen(cmd_list, + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) as p: + out, err = p.communicate(timeout=1000) + res = out.decode() + return res + + +@dataclass +class DeviceInfo: + _info_line: str = "" + npu_id: int = 0 + chip_id: int = 0 + chip_logic_id: Union[int, str] = 0 + chip_name: str = "" + + def __post_init__(self): + self.npu_id, self.chip_id, self.chip_logic_id, self.chip_name = self._info_line.strip().split(None, 3) + self.npu_id = int(self.npu_id) + self.chip_id = int(self.chip_id) + if self.chip_logic_id.isnumeric(): + self.chip_logic_id = int(self.chip_logic_id) + + +@dataclass +class CPUBinder: + logger: logging.Logger = logging.getLogger() + + @staticmethod + def _get_device_map_info() -> Dict[int, DeviceInfo]: + device_map_info = {} + device_map = execute_command([f"npu-smi", "info", "-m"]).strip().split("\n")[1:] + for line in device_map: + device_info = DeviceInfo(line.strip()) + if isinstance(device_info.chip_logic_id, int): + device_map_info[device_info.chip_logic_id] = device_info + return device_map_info + + @staticmethod + def _get_pcie_info(devices: List[int], keyword="PCIeBusInfo"): + device_map_info = CPUBinder._get_device_map_info() + device_pcie_tbl = {} + for device in devices: + device_info = device_map_info.get(device) + if not device_info: + raise RuntimeError("Can not get device info, binding cpu will skip.") + pcie_info = execute_command(["npu-smi", "info", "-t", "board", "-i", f"{device_info.npu_id}", + "-c", f"{device_info.chip_id}"]).strip().split("\n") + for _ in pcie_info: + line = ''.join(_.split()) # 此处是因为310P的关键字是 PCIe Bus Info 910是 PCIeBusInfo,故去掉空格以此兼容 + if line.startswith(keyword): + device_pcie_tbl[device] = line[len(keyword) + 1:] + break + + return device_pcie_tbl + + @staticmethod + def _get_numa_info(pcie_tbl, keyword="NUMAnode"): + 
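+        # For every device's PCIe bus id, parse `lspci -s <bus> -vvv` output
+        # for the NUMA node field, recording device -> numa id and
+        # numa id -> [device ids] lookup tables.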
device_numa_tbl = {} # key is device id, value is numa id + numa_devices_tbl = {} # key is numa id, value is device id list + + for device, pcie_no in pcie_tbl.items(): + numa_info = execute_command(["lspci", "-s", f"{pcie_no}", "-vvv"]).strip().split("\n") + for _ in numa_info: + line = ''.join(_.split()) + if line.startswith(keyword): + numa_id = int(line[len(keyword) + 1:]) + device_numa_tbl[device] = numa_id + + devices = numa_devices_tbl.get(numa_id, None) + if devices is None: + numa_devices_tbl[numa_id] = list() + + numa_devices_tbl[numa_id].append(device) + break + + return device_numa_tbl, numa_devices_tbl + + @staticmethod + def _get_cpu_info(numa_ids, keyword1="NUMAnode", keyword2="CPU(s)"): + cpu_idx_tbl = dict() + numa_keywords = [keyword1 + str(idx) + keyword2 for idx in numa_ids] + cpu_info = execute_command(["lscpu"]).strip().split("\n") + for _ in cpu_info: + line = ''.join(_.split()) + if any(line.startswith(word) for word in numa_keywords): + split_info = line.split(":") + cpu_id_ranges = split_info[-1].split(",") + + ranges = list() + for range_str in cpu_id_ranges: + endpoints = range_str.split("-") + if len(endpoints) != 2: + raise Exception("lscpu command output error, please check !") + + ranges += [cid for cid in range(int(endpoints[0]), int(endpoints[1]) + 1)] + + numa_id = int(split_info[0].replace(keyword1, '').replace(keyword2, '')) + cpu_idx_tbl[numa_id] = ranges + return cpu_idx_tbl + + def bind_cpus(self, visible_devices: List[int] = None, rank_id: int = 0, ratio: float = 0.5): + """ + 可以用export CPU_BINDING_NUM设置每个进程绑的核数;如果不设置CPU_BINDING_NUM, + 会根据ratio(numa利用率)进行计算,如果有64个核,0.5表示用一半,用32个核, 平分给亲和在这个numa上的npu + :param visible_devices: + :param rank_id: + :param ratio: + :return: + """ + + if visible_devices is None: + devices = [ + int(item.strip()) + for item in os.getenv("ASCEND_RT_VISIBLE_DEVICES", None).split(",") + if item.isnumeric() + ] + else: + devices = visible_devices + + # 获取npu和pcie的对应关系 + device_pcie_tbl = self._get_pcie_info(devices) + # 根据pcie信息获取npu和numa的对应关系 + device_numa_tbl, numa_devices_tbl = self._get_numa_info(device_pcie_tbl) + # 获取使用的numa对应的cpu核分配信息 + cpu_idx_tbl = self._get_cpu_info(list(numa_devices_tbl.keys())) + + # 当前rank的npu id + cur_device = devices[rank_id] + # 获取npu对应的numa id + numa_id = device_numa_tbl.get(cur_device) + + # 获取共享该numa的npu信息 + shard_devices = numa_devices_tbl.get(numa_id) + # 按照npu id进行排序 + shard_devices.sort() + + # 获取该numa上所有的cpu id信息 + all_cpus = cpu_idx_tbl[numa_id] + info_msg = (f"rank_id: {rank_id}, device_id: {cur_device}, numa_id: {numa_id}, " + f"shard_devices: {shard_devices}, cpus: {all_cpus}") + self.logger.info(info_msg) + + cpu_nums = len(all_cpus) + # 计算给该共享numa的npu分配的核的个数 + cpu_binding_num = os.environ.get("CPU_BINDING_NUM", None) + if cpu_binding_num is None: + cpu_num_per_device = int(cpu_nums * ratio // len(shard_devices)) + else: + cpu_num_per_device = int(cpu_binding_num) + if len(shard_devices) * cpu_num_per_device > cpu_nums: + raise Exception( + f"Cpu num in numa {numa_id} to assign {cpu_num_per_device} for every device is not enough, " + f"please decrease the value of CPU_BINDING_NUM!") + + # 获取该npu的下标信息 + idx = shard_devices.index(cur_device) + # 给该npu分配要绑定的cpu id + binding_cpus = [all_cpus[_] for _ in range(idx * cpu_num_per_device, (idx + 1) * cpu_num_per_device)] + + # cpu bind + p = psutil.Process() + p.cpu_affinity(binding_cpus) + new_affinity = p.cpu_affinity() + info_msg = f"process {p.pid}, new_affinity is {new_affinity}, cpu count {cpu_num_per_device}" + 
self.logger.info(info_msg) diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/__init__.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/__init__.py new file mode 100644 index 00000000..03adbe2e --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/__init__.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +common launcher +""" +from atb_speed.common.launcher.base import get_device, DeviceType + +if get_device() == DeviceType.npu: + from atb_speed.common.launcher.npu import Launcher, ParallelLauncher +else: + from atb_speed.common.launcher.gpu import Launcher, ParallelLauncher diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/base.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/base.py new file mode 100644 index 00000000..718af695 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/base.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +common launcher +""" +import inspect +import logging +import os +import time +from abc import abstractmethod +from enum import Enum +from typing import Dict, Tuple + +import torch +from atb_speed.common.config import atb_speed_config +from atb_speed.common.log.logging import init_logger +from transformers import GenerationConfig + + +class DeviceType(str, Enum): + npu = "npu" + cuda = "cuda" + cpu = "cpu" + + +def get_device() -> str: + """ + 获取当前所在设备 + :return: + """ + flag = torch.cuda.is_available() + if flag: + return DeviceType.cuda + try: + import torch_npu + flag = torch.npu.is_available() + except ImportError: + flag = False + return DeviceType.npu if flag else DeviceType.cpu + + +class BaseLauncher: + """ + BaseLauncher + """ + + def __init__(self, device_ids: str = None, model_path="", options=None): + options = {} if options is None else options + self.model_path = atb_speed_config.model.model_path if not model_path else model_path + + if device_ids is None and atb_speed_config.model: + device_ids = atb_speed_config.model.device_ids + self.device_ids = device_ids + self.device_id_list = [int(item.strip()) for item in self.device_ids.split(",") if item.isnumeric()] + self.local_rank, self.world_size = self.setup_model_parallel() + + self.logger_name = f"device{self.local_rank}_{self.world_size}_{time.time()}.log" + os.makedirs(atb_speed_config.model.log_dir, exist_ok=True) + self.logger_path = os.path.join(atb_speed_config.model.log_dir, self.logger_name) + self.logger = init_logger(logging.getLogger(f"device_{self.local_rank}"), self.logger_path) + if atb_speed_config.model.bind_cpu: + try: + self.bind_cpu() + except Exception as err: + self.logger.error(f"Failed to bind cpu, skip to bind cpu. 
\nDetail: %s ", err) + self.set_torch_env(self.device_ids, options) + self.model, self.tokenizer = self.init_model() + self.logger.info(self.model.device) + self.logger.info(f"load model from %s successfully!", os.path.basename(inspect.getmodule(self.model).__file__)) + self.logger.info(f"load model from %s successfully!", os.path.realpath(inspect.getmodule(self.model).__file__)) + + @property + def _device(self) -> str: + """ + 获取当前所在设备 + :return: + """ + return get_device() + + @property + def device(self) -> torch.device: + """ + 获取模型所在的设备 + :return: + """ + return self.model.device + + @property + def device_type(self) -> str: + """ + 获取模型所在的设备的字符串 + :return: + """ + return self.model.device.type + + @property + def device_name(self) -> str: + """ + 获取所在设备的详细硬件名称 + :return: + """ + if self.device_type == DeviceType.npu: + device_name = torch.npu.get_device_name() + elif self.device_type == DeviceType.cuda: + device_name = torch.cuda.get_device_name() + else: + device_name = "cpu" + return "_".join(device_name.split()) + + @abstractmethod + def init_model(self): + """ + 模型初始化 + :return: + """ + ... + + @staticmethod + def set_torch_env(device_ids, options: Dict = None): + """ + + :param device_ids: + :param options: + :return: + """ + + @staticmethod + def bind_cpu(): + ... + + @staticmethod + def setup_model_parallel() -> Tuple[int, int]: + local_rank, world_size = 0, 1 + return local_rank, world_size + + @classmethod + def safe_serialization(cls, model, tokenizer, save_dir): + """ + 权重转safetensors + :param model: + :param tokenizer: + :param save_dir: + :return: + """ + os.makedirs(save_dir, exist_ok=True) + model.save_pretrained(save_dir, safe_serialization=True) + tokenizer.save_pretrained(save_dir) + + def infer(self, query, model_params=None): + """ + 推理代码 + :param query: + :param model_params: + :return: + """ + inputs = self.tokenizer(query, return_tensors='pt') + inputs = inputs.to(self.model.device) + with torch.no_grad(): + start_time = time.time() + model_params = model_params if model_params is not None else {} + pred = self.model.generate(**inputs, **model_params) + end_time = time.time() + time_cost = end_time - start_time + output = self.tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) + self.logger.info(output) + self.logger.info(f"cost %s s", time_cost) + new_tokens = len(pred[0]) - len(inputs.input_ids[0]) + final_msg = f"generate {new_tokens} new tokens,({new_tokens / time_cost:.2f} tokens/s)" + self.logger.info(final_msg) + return output + + def infer_batch(self, query, model_params=None): + """ + 推理代码 + :param query: + :param model_params: + :return: + """ + inputs = self.tokenizer(query, return_tensors='pt', padding=True) + inputs = inputs.to(self.model.device) + with torch.no_grad(): + start_time = time.time() + model_params = model_params if model_params is not None else {} + pred = self.model.generate(**inputs, **model_params) + end_time = time.time() + time_cost = end_time - start_time + output = self.tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False) + + for ind, item in enumerate(output): + self.logger.info(f"###### batch %s ", ind) + self.logger.info(item) + + self.logger.info(f"cost %s s", time_cost) + new_tokens = len(pred[0]) - len(inputs.input_ids[0]) + final_msg = f"generate {new_tokens} new tokens,({new_tokens / time_cost:.2f} tokens/s)" + self.logger.info(final_msg) + return output + + def infer_test(self, batch_size: int = 1, seq_in: int = 2048, seq_out: int = 64): + """ + 推理代码 + :param batch_size: 
特定batch size + :param seq_in: 特定长度输入 + :param seq_out: 特定长度输出 + :return: + """ + inputs = self.tokenizer("hi", return_tensors='pt') + dummy_input_ids_nxt = torch.randint(0, self.model.config.vocab_size, [batch_size, seq_in], dtype=torch.int64) + dummy_attention_mask = torch.ones((batch_size, seq_in), dtype=torch.int64) + inputs["input_ids"] = dummy_input_ids_nxt + inputs["attention_mask"] = dummy_attention_mask + inputs = inputs.to(self.model.device) + with torch.no_grad(): + start_time = time.time() + pred = self.model.generate(**inputs, max_new_tokens=seq_out, + eos_token_id=self.model.config.vocab_size * 2) + end_time = time.time() + time_cost = end_time - start_time + output = self.tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) + self.logger.info(f"cost %s s", time_cost) + new_tokens = len(pred[0]) - seq_in + final_msg = (f"generate {batch_size * new_tokens} new tokens," + f"({batch_size * new_tokens / time_cost:.2f} tokens/s)") + self.logger.info(final_msg) + return output + + def remove_part_of_generation_config(self, generation_config) -> GenerationConfig: + """ + 移除部分当前不支持后处理相关参数 + :param generation_config: + :return: + """ + ori_gen = GenerationConfig() + diff_dict = generation_config.to_diff_dict() + self.logger.info(diff_dict) + for key in diff_dict: + if key.endswith("_id"): + continue + ori_value = getattr(ori_gen, key, None) + if ori_value is not None: + setattr(generation_config, key, getattr(ori_gen, key)) + self.logger.info(f"replace %s", key) + return generation_config diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/gpu.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/gpu.py new file mode 100644 index 00000000..4763046b --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/gpu.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +common launcher +""" +import abc +import os +from typing import Dict + +import torch +from atb_speed.common.launcher.base import BaseLauncher + + +class Launcher(BaseLauncher): + """ + BaseLauncher + """ + + @staticmethod + def set_torch_env(device_ids, options: Dict = None): + """ + + :param device_ids: + :param options: + :return: + """ + os.environ['CUDA_VISIBLE_DEVICES'] = device_ids + + @abc.abstractmethod + def init_model(self): + """ + 模型初始化 + :return: + """ + ... + + +class ParallelLauncher(Launcher): + @staticmethod + def set_torch_env(device_ids, options: Dict = None): + os.environ['CUDA_VISIBLE_DEVICES'] = device_ids + + @abc.abstractmethod + def init_model(self): + """ + 模型初始化 + :return: + """ + ... + + def setup_model_parallel(self): + torch.distributed.init_process_group() + local_rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + torch.manual_seed(1) + return local_rank, world_size diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/npu.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/npu.py new file mode 100644 index 00000000..c6c3709c --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/launcher/npu.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. 
All rights reserved +""" +common launcher +""" +import abc +from dataclasses import dataclass +from typing import Dict + +import torch +import torch_npu +from atb_speed.common.config import atb_speed_config +from atb_speed.common.cpu_binding import CPUBinder +from atb_speed.common.launcher.base import BaseLauncher + + +@dataclass +class NPUSocInfo: + soc_name: str = "" + soc_version: int = -1 + need_nz: bool = False + + def __post_init__(self): + self.soc_version = torch_npu._C._npu_get_soc_version() + if self.soc_version in (100, 101, 102, 103, 104, 200, 201, 202, 203): + self.need_nz = True + + +class Launcher(BaseLauncher): + """ + BaseLauncher + """ + + def __init__(self, device_ids: str = None, model_path="", options=None): + super().__init__(device_ids, model_path, options) + self.soc_info = NPUSocInfo() + self.fit_npu(self.model) + + @staticmethod + def set_torch_env(device_ids, options: Dict = None): + """ + + :param device_ids: + :param options: + :return: + """ + torch_npu.npu.set_device(int(device_ids.split(",")[0])) + torch.npu.set_compile_mode(jit_compile=False) + torch.npu.set_option(options) + + @abc.abstractmethod + def init_model(self): + """ + 模型初始化 + :return: + """ + ... + + def fit_npu(self, model): + """ + 芯片适配,提前转换,提高性能 + :param model: + :return: + """ + if not self.soc_info.need_nz: + for _, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 2) + self.logger.info(f"soc info: {self.soc_info.soc_version} , {self.soc_info.soc_name}, support ND") + else: + # if on 910A or 310P chip, eliminate the TransData and Transpose ops by converting weight data types + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + if name == 'lm_head': + # eliminate TransData op before lm_head calculation + module.weight.data = torch.nn.parameter.Parameter(module.weight.data) + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 29) + self.logger.info(f"soc info: {self.soc_info.soc_version} , {self.soc_info.soc_name}, support NZ") + + for _, module in model.named_modules(): + if isinstance(module, torch.nn.Embedding): + module.weight.data = torch_npu.npu_format_cast(module.weight.data, 2) + + def bind_cpu(self): + """ + 绑核 + :return: + """ + cpu_binder = CPUBinder(self.logger) + cpu_binder.bind_cpus(self.device_id_list, self.local_rank, 1.0) + self.logger.info("Bind cpu successfully!") + + +class ParallelLauncher(Launcher): + + @staticmethod + def set_torch_env(device_ids, options: Dict = None): + torch.npu.set_compile_mode(jit_compile=False) + torch.npu.set_option(options) + + @abc.abstractmethod + def init_model(self): + """ + 模型初始化 + :return: + """ + ... 
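For reference, a concrete launcher only has to fill in `init_model`. Below is a minimal sketch of such a subclass, following the same pattern the SDK's own test scripts further down in this patch use for a Hugging Face-style checkpoint; the class name is a placeholder and the weight path comes from the `[model]` section of the config file:

```python
from atb_speed.common.launcher.npu import Launcher
from transformers import AutoModelForCausalLM, AutoTokenizer


class MyModelLauncher(Launcher):
    def init_model(self):
        # Load the tokenizer and fp16 weights from [model].model_path, move them to the
        # detected device, and strip generation options the backend does not support.
        tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True, use_fast=False)
        model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device)
        model.eval()
        model.generation_config = self.remove_part_of_generation_config(model.generation_config)
        return model, tokenizer


# An instance can then be used directly, e.g. MyModelLauncher(device_ids="0").infer("hello").
```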
+ + def setup_model_parallel(self): + torch.distributed.init_process_group(atb_speed_config.model.parallel_backend) + local_rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + torch_npu.npu.set_device(self.device_id_list[local_rank]) + # seed must be the same in all processes + torch.manual_seed(1) + return local_rank, world_size diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/__init__.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/logging.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/logging.py new file mode 100644 index 00000000..50e14626 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/logging.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +logging +""" +import logging +import os +from logging.handlers import RotatingFileHandler + +from atb_speed.common.log.multiprocess_logging_handler import install_logging_handler + + +def init_logger(logger: logging.Logger, file_name: str): + """ + 日志初始化 + :param logger: + :param file_name: + :return: + """ + logger.setLevel(logging.INFO) + # 创建日志记录器,指明日志保存路径,每个日志的大小,保存日志的上限 + flask_file_handle = RotatingFileHandler( + filename=file_name, + maxBytes=int(os.getenv('PYTHON_LOG_MAXSIZE', "1073741824")), + backupCount=10, + encoding="utf-8") + formatter = logging.Formatter('%(asctime)s [%(levelname)s] pid: %(process)d %(filename)s-%(lineno)d: %(message)s') + # 将日志记录器指定日志的格式 + flask_file_handle.setFormatter(formatter) + # 为全局的日志工具对象添加日志记录器 + logger.addHandler(flask_file_handle) + + # 添加控制台输出日志 + console_handle = logging.StreamHandler() + console_handle.setFormatter(formatter) + logger.addHandler(console_handle) + install_logging_handler(logger) + return logger diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/multiprocess_logging_handler.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/multiprocess_logging_handler.py new file mode 100644 index 00000000..9d3f221a --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/log/multiprocess_logging_handler.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved. +""" +multiprocess_logging_handler +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import unicode_literals + +import logging +import multiprocessing +import threading + + +def install_logging_handler(logger=None): + """ + Wraps the handlers in the given Logger with an MultiProcessingHandler. + :param logger: whose handlers to wrap. By default, the root logger. + """ + if logger is None: + logger = logging.getLogger("service_operation") + + for index, org_handler in enumerate(list(logger.handlers)): + handler = MultiLoggingHandler('mp-handler-{0}'.format(index), log_handler=org_handler) + logger.removeHandler(org_handler) + logger.addHandler(handler) + + +class MultiLoggingHandler(logging.Handler): + """ + multiprocessing handler. 
+ """ + + def __init__(self, name, log_handler=None): + """ + Init multiprocessing handler + :param name: + :param log_handler: + :return: + """ + super().__init__() + + if log_handler is None: + log_handler = logging.StreamHandler() + + self.log_handler = log_handler + self.queue = multiprocessing.Queue(-1) + self.setLevel(self.log_handler.level) + self.set_formatter(self.log_handler.formatter) + # The thread handles receiving records asynchronously. + t_thd = threading.Thread(target=self.receive, name=name) + t_thd.daemon = True + t_thd.start() + + def set_formatter(self, fmt): + """ + + :param fmt: + :return: + """ + logging.Handler.setFormatter(self, fmt) + self.log_handler.setFormatter(fmt) + + def receive(self): + """ + + :return: + """ + while True: + try: + record = self.queue.get() + self.log_handler.emit(record) + except (KeyboardInterrupt, SystemExit) as err: + raise err + except EOFError: + break + except ValueError: + pass + + def send(self, message): + """ + + :param message: + :return: + """ + self.queue.put_nowait(message) + + def emit(self, record): + """ + + :param record: + :return: + """ + try: + sd_record = self._format_record(record) + self.send(sd_record) + except (KeyboardInterrupt, SystemExit) as err: + raise err + except ValueError: + self.handleError(record) + + def close(self): + """ + + :return: + """ + self.log_handler.close() + logging.Handler.close(self) + + def handle(self, record): + """ + + :param record: + :return: + """ + rsv_record = self.filter(record) + if rsv_record: + self.emit(record) + return rsv_record + + def _format_record(self, org_record): + """ + + :param org_record: + :return: + """ + if org_record.args: + org_record.msg = org_record.msg % org_record.args + org_record.args = None + if org_record.exc_info: + self.format(org_record) + org_record.exc_info = None + return org_record diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/__init__.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/base.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/base.py new file mode 100644 index 00000000..c700660b --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/performance/base.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. 
All rights reserved +""" +performance test base +""" +import time +from dataclasses import dataclass +from enum import Enum +from typing import List, Callable + +import torch +import torch.distributed as dist +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher.base import BaseLauncher +from atb_speed.common.timer import Timer +from atb_llm.utils.file_utils import safe_open + + +class PerfMode(str, Enum): + detail = "detail" + normal = "normal" + + +@dataclass +class PerformanceTestConfig: + """ + PerformanceTestGPUConfig + """ + batch_size: int = 1 + max_len_exp: int = 5 + min_len_exp: int = 11 + model_name: str = "model" + device_name: str = "cpu" + save_file_name: str = "" + case_pair: List[List[int]] = None + + def __post_init__(self): + self.batch_size = atb_speed_config.performance.batch_size + self.max_len_exp = atb_speed_config.performance.max_len_exp + self.min_len_exp = atb_speed_config.performance.min_len_exp + self.model_name = atb_speed_config.performance.model_name + self.case_pair = atb_speed_config.performance.case_pair + if not atb_speed_config.performance.save_file_name: + self.save_file_name = f"performance_test_{self.model_name}_{self.device_name}_bs{self.batch_size}.csv" + else: + self.save_file_name = atb_speed_config.performance.save_file_name + + +class PerformanceTest: + """ + PerformanceTestNPU + """ + + def __init__(self, launcher: BaseLauncher): + """ + + :param launcher: + """ + self.launcher = launcher + self.local_rank, self.world_size = launcher.local_rank, launcher.world_size + self.config = PerformanceTestConfig(device_name=self.launcher.device_name) + self.launcher.logger.info(self.config.__dict__) + self.model, self.tokenizer = launcher.model, launcher.tokenizer + self.dummy_input = "Common sense questions and answers\n\nQuestion: Why do people need sleep\nFactual answer:" + if atb_speed_config.performance.perf_mode == PerfMode.detail: + self.perf = self._perf_detail_v2 + else: + self.perf = self._perf + self.test_case = self.generate_test_case() + + def generate_test_case(self): + if self.config.case_pair is None: + return [[2 ** i, 2 ** j] + for i in range(self.config.min_len_exp, self.config.max_len_exp + 1) + for j in range(self.config.min_len_exp, self.config.max_len_exp + 1)] + return self.config.case_pair + + def warm_up(self, seq_len_in=None, seq_len_out=None): + """ + + :return: + """ + if seq_len_in is None: + seq_len_in = max(case[0] for case in self.test_case) + if seq_len_out is None: + seq_len_out = max(case[1] for case in self.test_case) + dummy_input_ids_nxt = torch.randint(0, self.model.config.vocab_size, [self.config.batch_size, seq_len_in], + dtype=torch.int64) + dummy_attention_mask = torch.ones((self.config.batch_size, seq_len_in), dtype=torch.int64) + inputs = self.tokenizer([self.dummy_input] * self.config.batch_size, return_tensors="pt", padding='max_length', + max_length=seq_len_in) + inputs["input_ids"] = dummy_input_ids_nxt + inputs["attention_mask"] = dummy_attention_mask + inputs = inputs.to(self.model.device) + with torch.no_grad(): + _ = self.model.generate( + **inputs, + max_new_tokens=seq_len_out, + eos_token_id=self.model.config.vocab_size * 2 + ) + self.launcher.logger.info("warm up finished.") + + def run_test(self): + self.launcher.logger.info("---------------inference---------------") + file = None + if self.local_rank == 0: + file = safe_open(self.config.save_file_name, "w", encoding="utf-8") + file.write( + "batch_size," + "input_seq_len(Encoding)," + "output_seq_len(Decoding)," + 
"ResponseTime(s)," + "forward_first_token_time(ms)," + "forward_next_token_time(ms)," + "pre_next_token_time(ms)," + "post_next_token_time_post(ms)\n") + for seq_len_in, seq_len_out in self.test_case: + time_tensor = self._run(seq_len_in, seq_len_out) + if self.local_rank == 0: + file.write( + f"{self.config.batch_size}," + f"{seq_len_in}," + f"{seq_len_out}," + f"{round(time_tensor[0], 2)}," + f"{time_tensor[1]}," + f"{time_tensor[2]}," + f"{time_tensor[3]}," + f"{time_tensor[4]}\n") + if self.local_rank == 0: + file.close() + + def _run(self, seq_len_in, seq_len_out): + dummy_input_ids_nxt = torch.randint(0, self.model.config.vocab_size, [self.config.batch_size, seq_len_in], + dtype=torch.int64) + dummy_attention_mask = torch.ones((self.config.batch_size, seq_len_in), dtype=torch.int64) + inputs = self.tokenizer( + [self.dummy_input] * self.config.batch_size, + return_tensors="pt", padding='max_length', max_length=seq_len_in) + inputs["input_ids"] = dummy_input_ids_nxt + inputs["attention_mask"] = dummy_attention_mask + inputs = inputs.to(self.model.device) + self.launcher.logger.info("---------------inputs shape---------------") + self.launcher.logger.info(inputs.input_ids.shape) + self.launcher.logger.info(f"seq_len_in: {seq_len_in}, seq_len_out: {seq_len_out}") + start_time = time.time() + forward_first_token_time, forward_next_token_time, pre_next_token_time, post_next_token_time_post = ( + self.perf(inputs, seq_len_out)) + end_time = time.time() + # output + # time analysis + total_time = end_time - start_time + time_tensor = torch.tensor( + [total_time, + forward_first_token_time, + forward_next_token_time, + pre_next_token_time, + post_next_token_time_post], device=self.model.device) + if self.world_size > 1: + dist.all_reduce(time_tensor, dist.ReduceOp.MAX) + time_tensor = time_tensor.tolist() + return time_tensor + + def _perf_detail_v2(self, inputs, seq_len_out): + """ + 使用装饰器的方式进行计时,从而从根本上解决侵入式修改打点的方式 + :param inputs: + :param seq_len_out: + :return: + """ + Timer.reset() + Timer.sync = getattr(torch, self.launcher.device_type).synchronize + with torch.no_grad(): + generate_ids = self.model.generate(**inputs, max_new_tokens=seq_len_out, + eos_token_id=self.model.config.vocab_size * 2 # 避免提前停止 + ) + # decode + if not atb_speed_config.performance.skip_decode: + _ = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, + clean_up_tokenization_spaces=False) + return [Timer.timeit_res.first_token_delay, Timer.timeit_res.next_token_avg_delay, 0, 0] + + def _perf_detail(self, inputs, seq_len_out): + with torch.no_grad(): + generate_ids, \ + forward_first_token_time, \ + forward_next_token_time, \ + pre_next_token_time, \ + post_next_token_time_post = \ + self.model.generate(**inputs, max_new_tokens=seq_len_out, + eos_token_id=self.model.config.vocab_size * 2 # 避免提前停止 + ) + # decode + if not atb_speed_config.performance.skip_decode: + _ = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, + clean_up_tokenization_spaces=False) + return [forward_first_token_time, + forward_next_token_time, + pre_next_token_time, + post_next_token_time_post] + + def _perf(self, inputs, seq_len_out): + with torch.no_grad(): + getattr(torch, self.launcher.device_type).synchronize() + first_token_start = time.time() + generate_ids = self.model.generate(**inputs, + min_new_tokens=1, + max_new_tokens=1) + getattr(torch, self.launcher.device_type).synchronize() + first_token_end = time.time() + if not atb_speed_config.performance.skip_decode: + _ = 
self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, + clean_up_tokenization_spaces=False) + + getattr(torch, self.launcher.device_type).synchronize() + total_start = time.time() + generate_ids = self.model.generate( + **inputs, + min_new_tokens=seq_len_out, + max_new_tokens=seq_len_out + ) + getattr(torch, self.launcher.device_type).synchronize() + total_end = time.time() + if not atb_speed_config.performance.skip_decode: + _ = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + # time analysis + forward_first_token_time = (first_token_end - first_token_start) * 1000 + time_inc_total = (total_end - total_start) * 1000 + + forward_next_token_time = (time_inc_total - forward_first_token_time) / (seq_len_out - 1) + return [forward_first_token_time, forward_next_token_time, 0, 0] diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/__init__.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/__init__.py new file mode 100644 index 00000000..205b6880 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +common launcher +""" +from atb_speed.common.config import atb_speed_config + +from .base import CEVALPrecisionTest, MMLUPrecisionTest + + +def get_precision_test_cls(mode=""): + """ + + :return: + """ + cls_map = { + "mmlu": MMLUPrecisionTest, + "ceval": CEVALPrecisionTest + } + return cls_map.get(mode or atb_speed_config.precision.mode.lower(), CEVALPrecisionTest) diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/base.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/base.py new file mode 100644 index 00000000..22770a4c --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/base.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +precision base +""" +import json +import os +from string import ascii_letters + +import pandas as pd +import torch +from atb_llm.utils.file_utils import safe_open +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher.base import BaseLauncher +from atb_speed.common.utils import torch_parallel_info +from tqdm import tqdm + +HARD_TASK = ( + "advanced_mathematics", "discrete_mathematics", "probability_and_statistics", "college_chemistry", + "college_physics", "high_school_mathematics", "high_school_chemistry", "high_school_physics" +) + + +class Record: + """only keep one card result when debug is False""" + + def __init__(self, log_dir, log_flag, debug=False): + self.debug = debug + self.flag = log_flag if debug else "" + self.log_name = os.path.join(log_dir, f"device{self.flag}.log") + self.cache_name = os.path.join(log_dir, f"cache{self.flag}.csv") + self.begin_idx = self.load_cache() + + def log(self, *msg): + if self.debug or torch_parallel_info.is_rank_0: + with safe_open(self.log_name, "a", encoding="utf-8") as f: + f.write(" ".join([str(i) for i in msg]) + '\n') + + def load_cache(self): + if not os.path.exists(self.cache_name): + self.log("[-] No cache file, cache will be created") + return 0 + self.log("[~] Loading cache on last abnormal exit ... 
(and continue with the cache)") + with safe_open(self.cache_name, "r", encoding="utf-8") as f: + cache = f.read().strip().split() + if not cache: + return 0 + cache = [row.split(",") for row in cache] + start_idx = cache[-1][0] + self.log(f"[+] Load cache successfully! start idx: {start_idx}") + return int(start_idx) + 1 + + def update_cache(self, task_name, question_id, truth_answer, predict_answer): + if self.debug or torch_parallel_info.is_rank_0: + with safe_open(self.cache_name, "a", encoding="utf-8") as f: + f.write(f"{question_id},{task_name},{truth_answer},{predict_answer}\n") + + +class PrecisionTestBase: + def __init__(self, launcher: BaseLauncher, workdir="", **kwargs): + workdir = atb_speed_config.precision.work_dir if not workdir else workdir + self.data_dir = os.path.join(workdir, "data") + self.result_output_dir = os.path.join(workdir, "test_result") + self.init_result_dir() + self.choices = ["A", "B", "C", "D"] + self.shot = 5 + self.batch = 1 + self.seq_len_out = 32 + + self.model, self.tokenizer = launcher.model, launcher.tokenizer + self.local_rank = launcher.local_rank + self.launcher = launcher + self.recorder = Record(self.result_output_dir, self.local_rank) + self.subject_mapping_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + f"{atb_speed_config.precision.mode}_subject_mapping.json") + # kwargs have higher priority + if atb_speed_config.precision: + self.update_param(atb_speed_config.precision.__dict__) + self.update_param(kwargs) + + @staticmethod + def format_subject(subject): + sub_list = subject.split("_") + final_str = "" + for entry in sub_list: + final_str += " " + entry + return final_str + + def update_param(self, param_dict): + for key, value in param_dict.items(): + setattr(self, key, value) + self.recorder.log(f"[+] set {key} to {value}") + + def init_result_dir(self): + if torch_parallel_info.is_rank_0: + os.makedirs(self.result_output_dir, exist_ok=True) + if torch_parallel_info.world_size > 1: + torch.distributed.barrier() + + def compute_metric(self, subject_mapping): + run_results = pd.read_csv( + self.recorder.cache_name, + names=['question_id', 'task_name', 'truth_answer', 'predict_answer']) + classes_acc = dict() + subject_acc = dict() + hard_task = [0, 0] + for task in subject_mapping: + class_of_task = subject_mapping[task][2] + this_task = run_results.loc[run_results.task_name == task] + if not this_task.shape[0]: + continue + correct_num = (this_task.truth_answer == this_task.predict_answer).sum() + if class_of_task not in classes_acc: + classes_acc[class_of_task] = [0, 0] # correct num, total num + if task in HARD_TASK: + hard_task[0] += correct_num + hard_task[1] += this_task.shape[0] + subject_acc[task] = correct_num / this_task.shape[0] + classes_acc[class_of_task][0] += correct_num + classes_acc[class_of_task][1] += this_task.shape[0] + + avg_acc = sum([i[0] for i in classes_acc.values()]) / sum([j[1] for j in classes_acc.values()]) + for c in classes_acc: + classes_acc[c] = classes_acc[c][0] / classes_acc[c][1] + classes_acc["Avg"] = avg_acc + classes_acc["Avg(Hard)"] = hard_task[0] / hard_task[1] + with safe_open(os.path.join(self.result_output_dir, f"result{self.recorder.flag}_subject_acc.json"), "w") as fp: + json.dump(subject_acc, fp) + with safe_open(os.path.join(self.result_output_dir, f"result{self.recorder.flag}_classes_acc.json"), "w") as fp: + json.dump(classes_acc, fp) + if torch_parallel_info.is_rank_0: + self.launcher.logger.info(f"[+] Avg acc: {classes_acc['Avg']}") + + def get_subject_mapping(self): + 
with safe_open(self.subject_mapping_path, "r", encoding="utf-8") as f: + subject_mapping = json.load(f) + return subject_mapping + + def load_csv_by_task_name(self, task_name): + dev_df = pd.read_csv(os.path.join(self.data_dir, "dev", task_name + "_dev.csv"), header=None)[ + :self.shot + 1] + val_df = pd.read_csv(os.path.join(self.data_dir, "val", task_name + "_val.csv"), header=None) + + return dev_df, val_df + + def format_example(self, df, idx, include_answer=True): + prompt = df.iloc[idx, 0] + k = len(self.choices) + for j in range(k): + prompt += "\n{}. {}".format(self.choices[j], df.iloc[idx, j + 1]) + prompt += "\nAnswer:" + if include_answer: + prompt += " {}\n\n".format(df.iloc[idx, k + 1]) + return prompt + + def gen_prompt(self, train_df, subject, k=-1): + prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format( + self.format_subject(subject)) + if k == -1: + k = train_df.shape[0] + for i in range(k): + prompt += self.format_example(train_df, i) + return prompt + + def batch_infer(self, qr_pair, begin_idx): + prompts = [item['prompt'] for item in qr_pair] + truth_answers = [item['answer'] for item in qr_pair] + task_names = [item['task_name'] for item in qr_pair] + + inputs = self.tokenizer(prompts, return_tensors="pt", padding='longest') + inputs = inputs.to(self.model.device) + input_len = len(inputs.input_ids[0]) + with torch.no_grad(): + output = self.model.generate(inputs.input_ids, + attention_mask=inputs.attention_mask, + max_new_tokens=self.seq_len_out) + answers = self.tokenizer.batch_decode(output.to(torch.int32)[:, input_len:]) + + for prompt, truth_answer, task_name, ori_answer in zip(prompts, truth_answers, task_names, answers): + self.recorder.log("\n========== prompt start ==========\n", prompt, + "\n========== prompt end ==========\n") + self.recorder.log(f"[+] prompt length: {input_len}") + self.recorder.log("\n========== answer start ==========\n", ori_answer, + "\n========== answer end ==========\n") + answer_list = [char.upper() for char in ori_answer if char in ascii_letters] + answer = answer_list[0] if answer_list else "-1" + is_correct = "Correct" if answer == truth_answer else "Wrong" + self.recorder.log(f"[{is_correct}] predict: {answer}, label: {truth_answer}") + self.recorder.update_cache(task_name, begin_idx, truth_answer, answer) + begin_idx += 1 + + def run(self): + subject_mapping = self.get_subject_mapping() + subject_name_list = sorted(list(subject_mapping.keys())) + qr_pair = [] + + total_len = 0 + begin_idx = self.recorder.begin_idx + for task_name in subject_name_list: + dev_df, val_df = self.load_csv_by_task_name(task_name) + total_len += len(val_df) + if len(val_df) <= begin_idx: + self.recorder.log(f"[~] Skip Task: {task_name}") + begin_idx -= len(val_df) + continue + + for i in range(val_df.shape[0]): + if begin_idx > 0: + begin_idx -= 1 + continue + for cut_shot in range(self.shot): + prompt_end = self.format_example(val_df, i, include_answer=False) + train_prompt = self.gen_prompt(dev_df, task_name, self.shot - cut_shot) + prompt = train_prompt + prompt_end + input_len = len(self.tokenizer(prompt, return_tensors="pt").input_ids[0]) + if input_len > 2000: + continue + label = val_df.iloc[i, val_df.shape[1] - 1] + qr_pair.append({'task_name': task_name, 'prompt': prompt, 'answer': label}) + break + pbar = None + if torch_parallel_info.is_rank_0: + pbar = tqdm(total=total_len, initial=self.recorder.begin_idx) + for i in range(0, len(qr_pair), self.batch): + self.batch_infer(qr_pair[i: i + self.batch], i + 
self.recorder.begin_idx) + if torch_parallel_info.is_rank_0: + pbar.update(self.batch if i + self.batch <= len(qr_pair) else len(qr_pair) - i) + if torch_parallel_info.is_rank_0: + pbar.close() + self.compute_metric(subject_mapping) + + +class CEVALPrecisionTest(PrecisionTestBase): + """ + CEVAL + """ + + def load_csv_by_task_name(self, task_name): + dev_df, val_df = super().load_csv_by_task_name(task_name) + + # remove the first row "column names" and the first column "id" + dev_df = dev_df.iloc[1:, 1:] + val_df = val_df.iloc[1:, 1:] + + return dev_df, val_df + + +class MMLUPrecisionTest(PrecisionTestBase): + """ + MMLU + """ + + def compute_metric(self, subject_mapping): + subject_mapping_adapt = {k: [None, None, v] for k, v in subject_mapping.items()} + return super().compute_metric(subject_mapping_adapt) diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/ceval_subject_mapping.json b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/ceval_subject_mapping.json new file mode 100644 index 00000000..493c0f38 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/ceval_subject_mapping.json @@ -0,0 +1,262 @@ +{ + "computer_network": [ + "Computer Network", + "\u8ba1\u7b97\u673a\u7f51\u7edc", + "STEM" + ], + "operating_system": [ + "Operating System", + "\u64cd\u4f5c\u7cfb\u7edf", + "STEM" + ], + "computer_architecture": [ + "Computer Architecture", + "\u8ba1\u7b97\u673a\u7ec4\u6210", + "STEM" + ], + "college_programming": [ + "College Programming", + "\u5927\u5b66\u7f16\u7a0b", + "STEM" + ], + "college_physics": [ + "College Physics", + "\u5927\u5b66\u7269\u7406", + "STEM" + ], + "college_chemistry": [ + "College Chemistry", + "\u5927\u5b66\u5316\u5b66", + "STEM" + ], + "advanced_mathematics": [ + "Advanced Mathematics", + "\u9ad8\u7b49\u6570\u5b66", + "STEM" + ], + "probability_and_statistics": [ + "Probability and Statistics", + "\u6982\u7387\u7edf\u8ba1", + "STEM" + ], + "discrete_mathematics": [ + "Discrete Mathematics", + "\u79bb\u6563\u6570\u5b66", + "STEM" + ], + "electrical_engineer": [ + "Electrical Engineer", + "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", + "STEM" + ], + "metrology_engineer": [ + "Metrology Engineer", + "\u6ce8\u518c\u8ba1\u91cf\u5e08", + "STEM" + ], + "high_school_mathematics": [ + "High School Mathematics", + "\u9ad8\u4e2d\u6570\u5b66", + "STEM" + ], + "high_school_physics": [ + "High School Physics", + "\u9ad8\u4e2d\u7269\u7406", + "STEM" + ], + "high_school_chemistry": [ + "High School Chemistry", + "\u9ad8\u4e2d\u5316\u5b66", + "STEM" + ], + "high_school_biology": [ + "High School Biology", + "\u9ad8\u4e2d\u751f\u7269", + "STEM" + ], + "middle_school_mathematics": [ + "Middle School Mathematics", + "\u521d\u4e2d\u6570\u5b66", + "STEM" + ], + "middle_school_biology": [ + "Middle School Biology", + "\u521d\u4e2d\u751f\u7269", + "STEM" + ], + "middle_school_physics": [ + "Middle School Physics", + "\u521d\u4e2d\u7269\u7406", + "STEM" + ], + "middle_school_chemistry": [ + "Middle School Chemistry", + "\u521d\u4e2d\u5316\u5b66", + "STEM" + ], + "veterinary_medicine": [ + "Veterinary Medicine", + "\u517d\u533b\u5b66", + "STEM" + ], + "college_economics": [ + "College Economics", + "\u5927\u5b66\u7ecf\u6d4e\u5b66", + "Social Science" + ], + "business_administration": [ + "Business Administration", + "\u5de5\u5546\u7ba1\u7406", + "Social Science" + ], + "marxism": [ + "Marxism", + "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", + "Social Science" + ], + "mao_zedong_thought": [ + "Mao Zedong 
Thought", + "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", + "Social Science" + ], + "education_science": [ + "Education Science", + "\u6559\u80b2\u5b66", + "Social Science" + ], + "teacher_qualification": [ + "Teacher Qualification", + "\u6559\u5e08\u8d44\u683c", + "Social Science" + ], + "high_school_politics": [ + "High School Politics", + "\u9ad8\u4e2d\u653f\u6cbb", + "Social Science" + ], + "high_school_geography": [ + "High School Geography", + "\u9ad8\u4e2d\u5730\u7406", + "Social Science" + ], + "middle_school_politics": [ + "Middle School Politics", + "\u521d\u4e2d\u653f\u6cbb", + "Social Science" + ], + "middle_school_geography": [ + "Middle School Geography", + "\u521d\u4e2d\u5730\u7406", + "Social Science" + ], + "modern_chinese_history": [ + "Modern Chinese History", + "\u8fd1\u4ee3\u53f2\u7eb2\u8981", + "Humanities" + ], + "ideological_and_moral_cultivation": [ + "Ideological and Moral Cultivation", + "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", + "Humanities" + ], + "logic": [ + "Logic", + "\u903b\u8f91\u5b66", + "Humanities" + ], + "law": [ + "Law", + "\u6cd5\u5b66", + "Humanities" + ], + "chinese_language_and_literature": [ + "Chinese Language and Literature", + "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", + "Humanities" + ], + "art_studies": [ + "Art Studies", + "\u827a\u672f\u5b66", + "Humanities" + ], + "professional_tour_guide": [ + "Professional Tour Guide", + "\u5bfc\u6e38\u8d44\u683c", + "Humanities" + ], + "legal_professional": [ + "Legal Professional", + "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", + "Humanities" + ], + "high_school_chinese": [ + "High School Chinese", + "\u9ad8\u4e2d\u8bed\u6587", + "Humanities" + ], + "high_school_history": [ + "High School History", + "\u9ad8\u4e2d\u5386\u53f2", + "Humanities" + ], + "middle_school_history": [ + "Middle School History", + "\u521d\u4e2d\u5386\u53f2", + "Humanities" + ], + "civil_servant": [ + "Civil Servant", + "\u516c\u52a1\u5458", + "Other" + ], + "sports_science": [ + "Sports Science", + "\u4f53\u80b2\u5b66", + "Other" + ], + "plant_protection": [ + "Plant Protection", + "\u690d\u7269\u4fdd\u62a4", + "Other" + ], + "basic_medicine": [ + "Basic Medicine", + "\u57fa\u7840\u533b\u5b66", + "Other" + ], + "clinical_medicine": [ + "Clinical Medicine", + "\u4e34\u5e8a\u533b\u5b66", + "Other" + ], + "urban_and_rural_planner": [ + "Urban and Rural Planner", + "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", + "Other" + ], + "accountant": [ + "Accountant", + "\u6ce8\u518c\u4f1a\u8ba1\u5e08", + "Other" + ], + "fire_engineer": [ + "Fire Engineer", + "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", + "Other" + ], + "environmental_impact_assessment_engineer": [ + "Environmental Impact Assessment Engineer", + "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", + "Other" + ], + "tax_accountant": [ + "Tax Accountant", + "\u7a0e\u52a1\u5e08", + "Other" + ], + "physician": [ + "Physician", + "\u533b\u5e08\u8d44\u683c", + "Other" + ] +} \ No newline at end of file diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/mmlu_subject_mapping.json b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/mmlu_subject_mapping.json new file mode 100644 index 00000000..09c850e4 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/precision/mmlu_subject_mapping.json @@ -0,0 +1,59 @@ +{ + "abstract_algebra": "STEM", + "anatomy": "other", + "astronomy": "STEM", + "business_ethics": "other", + 
"clinical_knowledge": "other", + "college_biology": "STEM", + "college_chemistry": "STEM", + "college_computer_science": "STEM", + "college_mathematics": "STEM", + "college_medicine": "other", + "college_physics": "STEM", + "computer_security": "STEM", + "conceptual_physics": "STEM", + "econometrics": "social sciences", + "electrical_engineering": "STEM", + "elementary_mathematics": "STEM", + "formal_logic": "humanities", + "global_facts": "other", + "high_school_biology": "STEM", + "high_school_chemistry": "STEM", + "high_school_computer_science": "STEM", + "high_school_european_history": "humanities", + "high_school_geography": "social sciences", + "high_school_government_and_politics": "social sciences", + "high_school_macroeconomics": "social sciences", + "high_school_mathematics": "STEM", + "high_school_microeconomics": "social sciences", + "high_school_physics": "STEM", + "high_school_psychology": "social sciences", + "high_school_statistics": "STEM", + "high_school_us_history": "humanities", + "high_school_world_history": "humanities", + "human_aging": "other", + "human_sexuality": "social sciences", + "international_law": "humanities", + "jurisprudence": "humanities", + "logical_fallacies": "humanities", + "machine_learning": "STEM", + "management": "other", + "marketing": "other", + "medical_genetics": "other", + "miscellaneous": "other", + "moral_disputes": "humanities", + "moral_scenarios": "humanities", + "nutrition": "other", + "philosophy": "humanities", + "prehistory": "humanities", + "professional_accounting": "other", + "professional_law": "humanities", + "professional_medicine": "other", + "professional_psychology": "social sciences", + "public_relations": "social sciences", + "security_studies": "social sciences", + "sociology": "social sciences", + "us_foreign_policy": "social sciences", + "virology": "other", + "world_religions": "humanities" + } \ No newline at end of file diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/timer.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/timer.py new file mode 100644 index 00000000..f3066f9b --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/timer.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved. +""" +decorator +""" + +import logging +import os +import time +import uuid +from dataclasses import dataclass, field +from functools import wraps, partial +from typing import List +from typing import Union + + +@dataclass +class TimeData: + step: int = 0 + time_cost: Union[float, int] = 0 + + +@dataclass +class SeqTimeData: + task_id: str = "" + time_data_list: List[TimeData] = field(default_factory=list) + + @property + def generated_tokens(self): + return len(self.time_data_list) + + @property + def first_token_delay(self): + return self.time_data_list[0].time_cost if self.time_data_list else 0 + + @property + def next_token_avg_delay(self): + if self.generated_tokens <= 1: + return 0 + return sum(item.time_cost for item in self.time_data_list[1:]) / (self.generated_tokens - 1) + + +class Timer: + """ + CommonDecorator + """ + step: int = 0 + timeit_res: SeqTimeData = SeqTimeData(str(uuid.uuid4())) + + @classmethod + def reset(cls): + cls.step = 0 + cls.timeit_res = SeqTimeData(str(uuid.uuid4())) + + @classmethod + def sync(cls): + ... 
+ + @classmethod + def timing(cls, func=None, *, logger=None, level=logging.INFO): + """ + 函数计时 + :return: + """ + if logger is None: + logger = logging.getLogger() + if func is None: + # 没有括号的时候args是func,有括号的时候args是None + return partial(Timer.timing, logger=logger, level=level) + + run = cls._timeit_run if os.getenv("TIMEIT", "0") == "1" else cls._run + + @wraps(func) + def wrapper(*args, **kwargs): + """ + wrapper + :param args: + :param kwargs: + :return: + """ + res = run(func, *args, **kwargs) + return res + + return wrapper + + @classmethod + def _run(cls, func, *args, **kwargs): + res = func(*args, **kwargs) + return res + + @classmethod + def _timeit_run(cls, func, *args, **kwargs): + cls.sync() + start_time = time.time() + res = func(*args, **kwargs) + cls.sync() + end_time = (time.time() - start_time) * 1000 # ms + cls.timeit_res.time_data_list.append(TimeData(cls.step, end_time)) + cls.step = cls.step + 1 + return res diff --git a/mindie/examples/models/atb_speed_sdk/atb_speed/common/utils.py b/mindie/examples/models/atb_speed_sdk/atb_speed/common/utils.py new file mode 100644 index 00000000..372d207e --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/atb_speed/common/utils.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# coding:utf-8 +# Copyright Huawei Technologies Co., Ltd. 2010-2018. All rights reserved +""" +utils +""" +import os +from dataclasses import dataclass + +import torch + +FLAG_OS_MAP = { + 'r': os.O_RDONLY, 'r+': os.O_RDWR, + 'w': os.O_CREAT | os.O_TRUNC | os.O_WRONLY, + 'w+': os.O_CREAT | os.O_TRUNC | os.O_RDWR, + 'a': os.O_CREAT | os.O_APPEND | os.O_WRONLY, + 'a+': os.O_CREAT | os.O_APPEND | os.O_RDWR, + 'x': os.O_CREAT | os.O_EXCL, + "b": getattr(os, "O_BINARY", 0) +} + + +@dataclass +class TorchParallelInfo: + __is_initialized: bool = False + __world_size: int = 1 + __local_rank: int = 0 + + def __post_init__(self): + self.try_to_init() + + @property + def is_initialized(self): + return self.__is_initialized + + @property + def world_size(self): + _ = self.try_to_init() + return self.__world_size + + @property + def local_rank(self): + _ = self.try_to_init() + return self.__local_rank + + @property + def is_rank_0(self) -> bool: + return self.local_rank == 0 + + @staticmethod + def get_rank() -> int: + return 0 if not torch.distributed.is_initialized() else torch.distributed.get_rank() + + @staticmethod + def get_world_size() -> int: + return 1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size() + + def try_to_init(self): + """ + 没有初始化的时候,刷新初始化状态以及world_size local_rank + :return: + """ + if not self.__is_initialized: + is_initialized = torch.distributed.is_initialized() + if is_initialized: + self.__local_rank = self.get_rank() + self.__world_size = self.get_world_size() + self.__is_initialized = is_initialized + return self.__is_initialized + + +def load_atb_speed(): + env_name = "ATB_SPEED_HOME_PATH" + atb_speed_home_path = os.getenv(env_name) + if atb_speed_home_path is None: + raise RuntimeError(f"env {env_name} not exist, source set_env.sh") + lib_path = os.path.join(atb_speed_home_path, "lib", "libatb_speed_torch.so") + torch.classes.load_library(lib_path) + + +torch_parallel_info = TorchParallelInfo() diff --git a/mindie/examples/models/atb_speed_sdk/setup.py b/mindie/examples/models/atb_speed_sdk/setup.py new file mode 100644 index 00000000..e4013d26 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/setup.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., 
Ltd. 2022-2022. All rights reserved. +""" +setup +""" + +from setuptools import find_packages, setup + +setup(name='atb_speed', + version='1.1.0', + description='atb speed sdk', + license='MIT', + keywords='atb_speed', + packages=find_packages(), + install_requires=["pandas"], + package_data={"atb_speed": ["**/*.json"]}, + include_package_data=True + ) diff --git a/mindie/examples/models/atb_speed_sdk/test/sdk_ceval_config_test.py b/mindie/examples/models/atb_speed_sdk/test/sdk_ceval_config_test.py new file mode 100644 index 00000000..50e00c06 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/test/sdk_ceval_config_test.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher import Launcher +from atb_speed.common.precision import get_precision_test_cls +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class BaichuanLM(Launcher): + def init_model(self): + """ + 模型初始化 + :return: + """ + tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +def demo_ceval(launcher: Launcher): + """ + + :param launcher: + :return: + """ + c_t = get_precision_test_cls()(launcher) + c_t.run() + + +if __name__ == '__main__': + atb_speed_config.init_config("config.ini") + baichuan = BaichuanLM() + demo_ceval(baichuan) diff --git a/mindie/examples/models/atb_speed_sdk/test/sdk_perf_config_test.py b/mindie/examples/models/atb_speed_sdk/test/sdk_perf_config_test.py new file mode 100644 index 00000000..2195bce1 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/test/sdk_perf_config_test.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher import Launcher +from atb_speed.common.performance.base import PerformanceTest +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class LMLauncher(Launcher): + """ + LMLauncher + """ + + def init_model(self): + """ + 模型初始化 + :return: + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=True, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +if __name__ == '__main__': + atb_speed_config.init_config("config.ini") + performance_test = PerformanceTest(LMLauncher("0")) + performance_test.warm_up() + performance_test.run_test() diff --git a/mindie/examples/models/atb_speed_sdk/test/sdk_test.py b/mindie/examples/models/atb_speed_sdk/test/sdk_test.py new file mode 100644 index 00000000..721e7a93 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/test/sdk_test.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. 
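After `run_test()` in the performance script above finishes, each (input length, output length) case becomes one row in the CSV named by `PerformanceTestConfig` (by default `performance_test_<model>_<device>_bs<batch>.csv`). A small post-processing sketch; the file name below is a made-up example and the throughput column simply inverts the per-token decode latency that `run_test()` records:

```python
import pandas as pd

# Hypothetical result file; substitute the name produced by your own run.
df = pd.read_csv("performance_test_vicuna_13b_Ascend_bs1.csv")

# One decode step per generated token, so throughput = batch * 1000 / per-token latency (ms).
df["decode_tokens_per_s"] = df["batch_size"] * 1000.0 / df["forward_next_token_time(ms)"]

print(df[["input_seq_len(Encoding)", "output_seq_len(Decoding)",
          "forward_first_token_time(ms)", "decode_tokens_per_s"]].to_string(index=False))
```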
+import os + +from atb_speed.common.launcher import Launcher +from transformers import AutoTokenizer, AutoModelForCausalLM + + +class BaichuanLM(Launcher): + + def init_model(self): + """ + 模型初始化 + :return: + """ + pwd = os.path.realpath(os.path.dirname(__file__)) + model_path = os.path.join(pwd, "..", "model") + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +if __name__ == '__main__': + baichuan = BaichuanLM(device_ids="1", ) + baichuan.infer('Hamlet->Shakespeare\nOne Hundred Years of Solitude->') + + baichuan.infer('登鹳雀楼->王之涣\n夜雨寄北->') + baichuan.infer('苹果公司的CEO是') + + query_list = [ + "谷歌公司的CEO是", + '登鹳雀楼->王之涣\n夜雨寄北->', + '苹果公司的CEO是', + '华为公司的CEO是', + '微软公司的CEO是' + ] + baichuan.infer_batch(query_list) diff --git a/mindie/examples/models/atb_speed_sdk/test/template.ini b/mindie/examples/models/atb_speed_sdk/test/template.ini new file mode 100644 index 00000000..6eebde22 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/test/template.ini @@ -0,0 +1,41 @@ +[model] +;模型路径 +model_path=../model +;使用的设备号,多卡用逗号分隔,设置多卡,将默认使用并行模式 +device_ids=2 +;并行通信类型,默认是hccl,可选hccl/nccl(GPU) +;parallel_backend=hccl +;日志保存路径,默认是执行脚本所在路径 +;log_dir=./ +;是否绑核,0或1,默认是1表示开启 +;bind_cpu=1 + +[precision] +;精度测试方法,默认为ceval,可选ceval/mmlu +mode=ceval +;精度测试工作路径 +work_dir=./ +;批量精度测试,默认是1 +batch=1 +;每个科目的shot数量,默认是5 +shot=5 +;每个问题的回答长度,默认是32 +;seq_len_out=32 + +[performance] +;性能测试模型名称,用于结果文件的命名 +model_name=vicuna_13b +;测试的batch size +batch_size=1 +;测试的输入的最大2的幂 +max_len_exp=10 +;测试的输入的最小2的幂 +min_len_exp=5 +;特定用例测试,格式为[[seq_in,seq_out]],注意当设置这个参数时,max_len_exp min_len_exp不生效 +;case_pair=[[1,2],[2,3]] +;生成的结果文件名称,默认会自动生成,一般不设置 +;save_file_name= +;性能测试方法,detail / normal , 默认是normal.要使用detail需要配合装饰器计时,并加上环境变量 TIMEIT=1 +;perf_mode= +;性能测试时是否只测试generate而跳过decode,0/1 默认是0 +;skip_decode= \ No newline at end of file diff --git a/mindie/examples/models/atb_speed_sdk/test/test_config.py b/mindie/examples/models/atb_speed_sdk/test/test_config.py new file mode 100644 index 00000000..0210d428 --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/test/test_config.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. +import os +from unittest import TestCase + +from atb_speed.common.config import atb_speed_config + + +class ConfigTest(TestCase): + def test_1(self): + pwd = os.path.dirname(os.path.realpath(__file__)) + atb_speed_config.init_config(os.path.join(pwd, "template.ini")) + self.assertEqual(atb_speed_config.performance.batch_size, 1) diff --git a/mindie/examples/models/atb_speed_sdk/test/test_timer.py b/mindie/examples/models/atb_speed_sdk/test/test_timer.py new file mode 100644 index 00000000..e5f2579f --- /dev/null +++ b/mindie/examples/models/atb_speed_sdk/test/test_timer.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. 
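One field in template.ini that is easy to misread is `case_pair`: configparser hands it to `PerformanceConfig` as a raw string, `__post_init__` parses it with `ast.literal_eval`, and when it is set `generate_test_case()` uses it directly instead of sweeping `min_len_exp`..`max_len_exp`. A tiny illustration of that parsing, with made-up values:

```python
import ast

# What configparser delivers for a line like "case_pair=[[256,64],[512,128]]" (hypothetical values).
raw_case_pair = "[[256,64],[512,128]]"

# PerformanceConfig.__post_init__ turns the string into [seq_len_in, seq_len_out] pairs;
# when case_pair is present, max_len_exp / min_len_exp are ignored by generate_test_case().
case_pair = ast.literal_eval(raw_case_pair)
assert case_pair == [[256, 64], [512, 128]]
```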
+""" +@Time : 2024/2/9 14:46 +""" +import logging +import os +from unittest import TestCase + +import torch +import torch.nn as nn +from atb_speed.common.timer import Timer + +logging.basicConfig(level=logging.NOTSET) + +os.environ["TIMEIT"] = "1" + + +class AddNet(nn.Module): + def __init__(self, in_dim, h_dim=5, out_dim=1): + super().__init__() + self.fc1 = nn.Linear(in_dim, h_dim) + self.fc2 = nn.Linear(h_dim, out_dim) + + @Timer.timing + def forward(self, x_tensor, y_tensor): + out = torch.cat([x_tensor, y_tensor], dim=1) + out = torch.relu(self.fc1(out)) + out = self.fc2(out) + return out + + +class TimerTest(TestCase): + @classmethod + def setUpClass(cls): + Timer.reset() + # Timer.sync= xxxx + cls.add_net = AddNet(in_dim=2) + + def test_1(self): + for _ in range(5): + x_tensor = torch.randn(1, 1) + y_tensor = torch.randn(1, 1) + result = self.add_net.forward(x_tensor, y_tensor) + logging.info(result) + logging.info(Timer.timeit_res) + logging.info(Timer.timeit_res.first_token_delay) + logging.info(Timer.timeit_res.next_token_avg_delay) diff --git a/mindie/examples/models/baichuan/README.md b/mindie/examples/models/baichuan/README.md new file mode 100644 index 00000000..51efe293 --- /dev/null +++ b/mindie/examples/models/baichuan/README.md @@ -0,0 +1,302 @@ +# README + +- Baichuan大模型,融合了意图理解、信息检索以及强化学习技术,结合有监督微调与人类意图对齐,在知识问答、文本创作领域表现突出。 + +- 此代码仓中实现了一套基于NPU硬件的Baichuan推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 + +- 此矩阵罗列了各Baichuan模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-----------------------|----------------------------|-----------------------------| ---- |-----| --------------- | --------------- | -------- | --------- | --------- | ------------ | -------------------------- | ---- | ------ | ---- |-----| +| Baichuan2-7B | 支持world size 1,2,4,8 | 支持world size 2 | √ | × | √ | √ | √ | × | × | × | √ | × | √ | √ | × | +| Baichuan2-13B | 支持world size 2,4,8 | 支持world size 2,4 | √ | × | √ | √ | √ | × | √ | × | √ | × | √ | √ | × | +| Baichuan-7B | 支持world size 1,2,4,8 | 支持world size 2 | √ | × | √ | √ | × | × | × | × | × | × | √ | × | × | +| Baichuan-13B | 支持world size 2,4,8 | 支持world size 2,4 | √ | × | √ | √ | × | × | × | × | × | × | √ | × | × | + +# 使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +|-------------|--------------------------------------------------------------------------------------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/ModelLink/`;若使用gitee下载的代码,则路径为`${working_dir}/ModelLink/mindie_ref/mindie_llm/atb_models` | +| script_path | 脚本所在路径。Baichuan系列模型的工作脚本所在路径为${llm_path}/examples/models/baichuan | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B/tree/main) +- [Baichuan-13B](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/tree/main) +- [Baichuan2-7B](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/tree/main) +- [Baichuan2-13B](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/tree/main) + +**权重转换** +- Paged Attention 场景下需要.safetensors 格式的权重,如果没有,参考[此README文件](../../README.md)转换 + +**量化权重生成** +- 基于原始的FP16的权重,生成量化权重 +- W8A8 Antioutlier量化权重请使用以下指令生成 + - 暂不支持 + +- W8A8量化权重请使用以下指令生成 + - baichuan2-7b使用quant_baichuan2_7b_w8a8.py,baichuan2-13b使用quant_baichuan2_13b_w8a8.py + - 备注:建议精度测试使用cpu生成量化权重。npu生成的量化权重可作为调试使用,精度会有损失。 + - 修改权重路径 + - 
根据模型,将当前目录下的quant_baichuan2_7b_w8a8.py或quant_baichuan2_13b_w8a8.py文件中的input_fp16_path 和output_w8a8_path修改为自己的浮点权重路径和输出权重路径 + - 如果想用npu转换权重,需要根据注释修改代码将设备设置为npu + - 执行 + ``` + python quant_baichuan2_7b_w8a8.py (baichuan2-7b) + python quant_baichuan2_13b_w8a8.py (baichuan2-13b) + ``` + - 将原权重文件夹下所有文件(除权重文件*。bin)拷贝到新的量化权重文件下 + - `${weight_path}/config.json`文件中需设置`dtype`和`quantize`类型来标识量化类型和精度 + - 若`dtype`和`quantize`字段不存在,需新增 + - 配置 + + | 量化类型及精度 | torch_dtype | quantize | + | -------------- | ----------- | -------- | + | FP16 | "float16" | "" | + | W8A8 | "float16" | "w8a8" | + - 示例 + - baichuan模型使用FP16精度,W8A8量化 + ``` + { + "torch_dtype": "float16", + "quantize": "w8a8" + } + ``` + +- W8A16量化权重请使用以下指令生成 + - 暂不支持 + +- W4A16量化权重请使用以下指令生成 + - 当前w4a16只支持baichuan2-13b模型 + - baichuan2-13b使用quant_baichuan2_13b_w4a16.py + - 备注:建议精度测试使用cpu生成量化权重。 + - 修改权重路径 + - 根据模型,将当前目录下的quant_baichuan2_13b_w4a16.py文件中的FP16_PATH 和OUTPUT_PATH修改为自己的浮点权重路径和输出权重路径 + - 执行 + ``` + python quant_baichuan2_13b_w4a16.py (baichuan2-13b) + ``` + - 将原权重文件夹下所有文件(除权重文件*。bin)拷贝到新的量化权重文件下 + - `${weight_path}/config.json`文件中需设置`dtype`和`quantize`类型来标识量化类型和精度 + - 若`dtype`和`quantize`字段不存在,需新增 + - 配置 + + | 量化类型及精度 | torch_dtype | quantize | + | -------------- | ----------- | -------- | + | FP16 | "float16" | "" | + | W4A16 | "float16" | "w4a16" | + - 示例 + - baichuan模型使用FP16精度,W8A8量化 + ``` + { + "torch_dtype": "float16", + "quantize": "w4a16" + } + ``` + +- 稀疏量化权重请使用以下指令生成 + - Step 1 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8S量化权重路径} --w_bit 4 --a_bit 8 --calib_file ${llm_path}/examples/convert/model_slim/teacher_qualification.jsonl --fraction 0.011 --co_sparse True + ``` + 请确保转换量化权重时transformer是==4.30.2 + - Step 2:量化权重切分及压缩 + > 运行前需要确保压缩工具编译过 + > + > `cd /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/msmodelslim/pytorch/weight_compression/compress_graph` + > + > `bash build.sh /usr/local/Ascend/ascend-toolkit/latest` + + ``` + torchrun --nproc_per_node {TP数} -m examples.convert.model_slim.sparse_compressor --model_path {W8A8S量化权重路径} --save_directory {W8A8SC量化权重路径} + ``` + + - TP数为tensor parallel并行个数 + - 注意:若权重生成时以TP=4进行切分,则运行时也需以TP=4运行 + - 示例 + ``` + torchrun --nproc_per_node 4 -m examples.convert.model_slim.sparse_compressor --model_path /data1/weights/model_slim/baichuan2-7b_w8a8s --save_directory /data1/weights/model_slim/baichuan2-7b_w8a8sc + ``` + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 +### 对话测试 +**运行Flash Attention FP16** +- 其余Baichuan模型参考以下运行方式 + - 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash examples/models/baichuan/run_fa.sh ${weight_path} + ``` + - 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20036` + - 设置卡间通信端口 + - 默认使用20036端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export HCCL_BUFFSIZE=120 + export HCCL_WHITELIST_DISABLE=1 + export ATB_CONTEXT_WORKSPACE_RING=1 + export ATB_CONTEXT_WORKSPACE_SIZE=2629145600 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=0 + export ATB_LAUNCH_KERNEL_WITH_TILING=0 
+ export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=1 + export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=0 + + ``` + +**运行Flash Attention BF16** +- 暂不支持 + +**运行Flash Attention W8A8** +- 暂不支持 + +**运行Flash Attention W8A16** +- 暂不支持 + +**运行Flash Attention W4A16** +- 暂不支持 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + chat模式(仅支持baichuan2系列): + bash examples/models/baichuan/run_pa.sh ${weight_path} chat + + 非chat模式: + bash examples/models/baichuan/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20036` + - 设置卡间通信端口 + - 默认使用20036端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +**运行Paged Attention BF16** +- 暂不支持 + +**运行Paged Attention W8A8** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8` + - 若config.json中无此字段,则新增 + +**运行Paged Attention W8A16** +- 暂不支持 + +**运行Paged Attention W4A16** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W4A16量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W4A16量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w4a16` + - 若config.json中无此字段,则新增 + +**运行KV cache量化** +- 待补充 + +**运行稀疏量化** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8sc` + - 若config.json中无此字段,则新增 +- 注意:压缩算法与硬件强相关,当前仅300I DUO卡支持稀疏量化 + +**运行MOE量化** +- 待补充 + +## 精度测试 +- 参考[此README文件](https://gitee.com/ascend/MindIE-LLM/blob/master/examples/atb_models/tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 baichuan2_7b ${baichuan-7b权重路径} 4 + bash run.sh pa_fp16 full_BoolQ 1 baichuan2_13b ${baichuan-13b权重路径} 4 + bash run.sh pa_fp16 full_BoolQ 1 baichuan2_7b ${baichuan2-7b权重路径} 4 + bash run.sh pa_fp16 full_BoolQ 1 baichuan2_13b ${baichuan2-13b权重路径} 4 + ``` +- 注意:baichuan-7b和baichuan-13b模型测试时复用baichuan2_7b和baichuan2_13b的model_name +- 运行量化权重时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](https://gitee.com/ascend/MindIE-LLM/blob/master/examples/atb_models/examples/README.md) + +## 性能测试 +- 支持ALiBi Mask Free。默认关闭,如需开启,请修改当前目录下的run_pa.sh中环境变量如下: +``` +export IS_ALIBI_MASK_FREE=1 +``` +- 参考[此README文件](https://gitee.com/ascend/MindIE-LLM/blob/master/examples/atb_models/tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 baichuan2_7b ${baichuan-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 baichuan2_13b 
${baichuan-13b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 baichuan2_7b ${baichuan2-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 baichuan2_13b ${baichuan2-13b权重路径} 8 + ``` +- 注意:baichuan-7b和baichuan-13b模型测试时复用baichuan2_7b和baichuan2_13b的model_name +- 运行量化权重时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](https://gitee.com/ascend/MindIE-LLM/blob/master/examples/atb_models/examples/README.md) +- 特殊场景说明: 若在性能测试时发现有波动情况,可配置透明大页,提升内存访问性能。该功能请按需开启,对内存占用有一定影响。 +```shell +# 性能测试时,可按需开启透明大页 +echo always > /sys/kernel/mm/transparent_hugepage/enabled +# 关闭透明大页 +echo never > /sys/kernel/mm/transparent_hugepage/enabled +``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/baichuan/quant_baichuan2_13b_w4a16.py b/mindie/examples/models/baichuan/quant_baichuan2_13b_w4a16.py new file mode 100644 index 00000000..21b4ba16 --- /dev/null +++ b/mindie/examples/models/baichuan/quant_baichuan2_13b_w4a16.py @@ -0,0 +1,208 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import torch.utils.data +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig, AntiOutlier +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + +SEQ_LEN_OUT = 32 + + +# for local path +OUTPUT_PATH = "your output path" +FP16_PATH = "your path to model" # 原始模型路径,其中的内容如下图 +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=FP16_PATH, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=FP16_PATH, + torch_dtype=torch.float32, + trust_remote_code=True +) + +W_SYM = True + + +# 获取校准数据函数定义 +def get_calib_dataset(input_tokenizer, calib_list, device="cpu"): # 如果需要使用npu进行量化, device="npu:0"。使用cpu,device="cpu" + calib_dataset = [] + for calib_data in calib_list: + inputs = input_tokenizer(calib_data, return_tensors='pt') + calib_dataset.append([ + inputs.data['input_ids'].to(device), + inputs.data['attention_mask'].to(device) + ]) + return calib_dataset + + +CALIB_SET = [ + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\ +B. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\ +A. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n编写中小学教科书的直接依据是____。\nA. 《中华人民共和国教育法》\nB. 课程计划\nC. 课程标准\ +D. 课程表\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 
计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的\ +是____。\nA. 坐井观天,所见甚少B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n下列关于课程的三种文本表现形式说法正确的是____。\nA. 课程计划是由当\ +地教育主管部门制订的\nB. 课程标准是依据课程计划制定的C. 课程标准的核心是实施建议\nD. 教材编写的基本方式有直线式、螺旋式、交叉式\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的\ +是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n悦悦是一名右耳失聪的残疾儿童,活动课上有时会听不清楚周老师所讲的内容,因此\ +经常提问题。对此,周老师应当采取的措施是____。\nA. 给予悦悦更多的帮助和指导\nB. 指导家长带悦悦回家自学\nC. 建议家长将悦悦转到特殊幼儿园\nD. 照顾大多数幼儿,不理会悦悦\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同\ +的是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n内流河也称“内陆河”,是指没有流入海洋的河流,大多分布在大陆内部干燥地区,上\ +游降水或冰雪融水为其主要补给水源,最终消失于沙漠或注入内陆湖泊。下列中国内流河中,最长的是____。\nA. 塔里木河\nB. 柴达木河\nC. 尼雅河\nD. 疏勒河\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同\ +的是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n学校规定学生不能烫染头发,但是小文为了彰显个性,在假期把头发染成了棕色。面\ +对小文的情况,教师应该怎样处理?____。\nA. 年轻人追求个性是合情合理的,应该宽容对待\nB. 违反学校的校规,应该严格处分\nC. 强制要求小文将头发颜色染回来才可以进校门\nD. 探明\ +小文违反校规的原因,并对其进行劝导和教育\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的\ +是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n张老师根据自己班级的情况,为解决班级内部班干部的人际关系问题,建立和谐融洽\ +的班级氛围,自主开发了“和谐人际”的班级课程,这体现了教师____。\nA. 是教育教学的研究者\nB. 是课程的建设者和开发者\nC. 是学生学习的促进者\nD. 
是社区型的开放教师\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n刘老师工作很负责,学生在学校出现一点问题他就会与家长联系,在与家长沟通时他经常以前辈的姿态对待家长,对家长的教育方式指指点点。刘老师的做法\ +____。\nA. 正确,老师就应该与家长经常沟通\nB. 正确,老师的经验比家长丰富,应该多指导家长\nC. 不正确,教师没有权利指导家长\nD. 不正确,教师应该与家长建立平等的沟通关系,尊重家长的人格\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n在古代印度,有一户人家经营一家棉布店销售自己手工制作的衣服。你认为这户人家属于哪个等级?____\nA. 婆罗门\nB. 刹帝利\ +C. 吠舍\nD. 首陀罗\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n“小型分散,便于开展多种多样的活动,满足学生不同的兴趣、爱好,发展学生的才能,使学生得到更多的学习和锻炼的机会。\ +”这种课外活动的形式是____。\nA. 科技活动\nB. 学科活动\nC. 个人活动\nD. 小组活动\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n小红每天晚上临睡前都要多次反复检查自己的书包,确保带齐了第二天需要用的教材和文具。她明知道没有这个必要,但就是控制不住。她可\ +能出现了____。\nA. 抑郁症\nB. 焦虑症\nC. 强迫症\nD. 恐惧症\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 
1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n国家管理和评价课程的基础是____。\nA. 课程计划\nB. 课程标准\nC. 教学目标\nD. 教育目的\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n儿童坚持性发生明显质变的年龄约在____\nA. 3~4岁\nB. 4~5岁\nC. 5~6岁\nD. 6岁以后\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n《红楼梦》中人物众多、关系繁杂。为了帮助读者阅读,许多红学爱好者都在网络上发布了自己整理制作的主要人物关系图。这属于____。\ +A. 纲要策略\nB. 精细加工策略\nC. 资源管理策略\nD. 监控策略\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n学期结束时,班主任王老师会对学生思想品德的发展变化情况进行评价。这项工作属于____。\nA. 工作总结\nB. 工作计划\nC. 操行评定\ +D. 建立学生档案\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n人们常说:“教学有法而教无定法。”这反映了教师的劳动具有____。\nA. 连续性\nB. 示范性\nC. 长期性\nD. 创造\ +性\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 
计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n县级以上地方各级人民代表大会是县级以上地方国家权力机关,其职权不包括____。\nA. 改变或撤销本级人大常务委员会不适当的决定\ +B. 选举并有权罢免本级人民法院院长\nC. 批准本行政区域内的预算执行情况的报告\nD. 决定并宣布下一级行政区城进入紧急状态\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n在心理健康课上,同一批学生在第二次进行同样内容的人格测验时获得的分数与上次测验差别较大。这说明该测验存在的问题是____。\ +A. 信度问题\nB. 效度问题\nC. 难度问题\nD. 区分度问题\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n李老师在教学生区分形近字“渴”“竭”“碣”“谒”时,将四个字相同的右半部分用白色粉笔写出,相异的左半部分用彩色粉笔写出。李老师运用了\ +知觉的____。\nA. 整体性\nB. 选择性\nC. 理解性\nD. 恒常性\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n兰兰学会走路后,就要很喜欢尝试自己穿衣、吃饭、捡东西,喜欢探索周围世界。按照埃里克森人格发展阶段理论,兰兰所处的发展阶段是____\ +A. 信任对怀疑\nB. 自立对羞怯\nC. 主动感对内疚感\nD. 勤奋感对自卑感\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n杨老师在教授生字词的过程中发现部分学生有缺笔少画的现象,于是他把“小学生缺笔少画现象的原因及对策研究”作为研究课题,拟订相应的研究计划,\ +在工作中收集、整理相关资料并实施教学措施,最后根据反馈信息调整教学方案。这种研究方法属于____。\nA. 教育行动研究法\nB. 教育实验法\nC. 教育叙事研究法\nD. 
个案研究法\nAnswer:" +] + + +def main(): + dataset_calib = get_calib_dataset(tokenizer, CALIB_SET) + ''' + 对于linear算子中的激活值如果有表示范围过大,或者“尖刺”的异常值过多, + 需要使用anti outlier功能,使用方法如下 + ''' + anti_config = AntiOutlierConfig(a_bit=16, w_bit=4, anti_method="m3", dev_type="cpu", w_sym=W_SYM) + anti_outlier = AntiOutlier(model, calib_data=dataset_calib, cfg=anti_config, norm_class_name="RMSNorm") + anti_outlier.process() + ''' + 下面是回退层的设置,因为w8a8的对激活值也进行了量化,会有部分网络层对激活值的表示 + 范围较为敏感所以需要回退这些网络层使用浮点权重进行计算 + ''' + + disable_names = [] + baichuan_layers = 40 + disable_idx_lst = list(range(baichuan_layers)) + for layer_index in disable_idx_lst: + down_proj_name = "model.layers.{}.mlp.down_proj".format(layer_index) + disable_names.append(down_proj_name) + + model.eval() + quant_config = QuantConfig(a_bit=16, w_bit=4, disable_names=disable_names, dev_type='cpu', + w_sym=W_SYM, mm_tensor=False, is_lowbit=True, open_outlier=False, + group_size=64, disable_last_linear=False) + calibrator = Calibrator(model, quant_config, calib_data=[], disable_level='L0') + calibrator.run() + calibrator.save(OUTPUT_PATH, save_type=["safe_tensor", "numpy"]) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/mindie/examples/models/baichuan/quant_baichuan2_13b_w8a8.py b/mindie/examples/models/baichuan/quant_baichuan2_13b_w8a8.py new file mode 100644 index 00000000..7176b5fe --- /dev/null +++ b/mindie/examples/models/baichuan/quant_baichuan2_13b_w8a8.py @@ -0,0 +1,197 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig, AntiOutlier +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + +INPORT_FP16_PATH = 'the_path_of_fp16_model_input' +OUTPORT_W8A8_PATH = 'the_path_of_w8a8_model_output' +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=INPORT_FP16_PATH, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=INPORT_FP16_PATH, trust_remote_code=True).\ + float().cpu() + + +# 获取校准数据函数定义 +def get_calib_dataset(tokenizer, calib_list, device="cpu"): # 如果需要使用npu进行量化, device="npu:0"。使用cpu,device="cpu" + calib_dataset = [] + for calib_data in calib_list: + inputs = tokenizer(calib_data, return_tensors='pt') + calib_dataset.append([ + inputs.data['input_ids'].to(device), + inputs.data['attention_mask'].to(device) + ]) + return calib_dataset + + +CALIB_SET = [ + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\ +B. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\ +A. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n编写中小学教科书的直接依据是____。\nA. 《中华人民共和国教育法》\nB. 课程计划\nC. 课程标准\ +D. 课程表\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 
关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的\ +是____。\nA. 坐井观天,所见甚少B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n下列关于课程的三种文本表现形式说法正确的是____。\nA. 课程计划是由当\ +地教育主管部门制订的\nB. 课程标准是依据课程计划制定的C. 课程标准的核心是实施建议\nD. 教材编写的基本方式有直线式、螺旋式、交叉式\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的\ +是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n悦悦是一名右耳失聪的残疾儿童,活动课上有时会听不清楚周老师所讲的内容,因此\ +经常提问题。对此,周老师应当采取的措施是____。\nA. 给予悦悦更多的帮助和指导\nB. 指导家长带悦悦回家自学\nC. 建议家长将悦悦转到特殊幼儿园\nD. 照顾大多数幼儿,不理会悦悦\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同\ +的是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n内流河也称“内陆河”,是指没有流入海洋的河流,大多分布在大陆内部干燥地区,上\ +游降水或冰雪融水为其主要补给水源,最终消失于沙漠或注入内陆湖泊。下列中国内流河中,最长的是____。\nA. 塔里木河\nB. 柴达木河\nC. 尼雅河\nD. 疏勒河\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同\ +的是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n学校规定学生不能烫染头发,但是小文为了彰显个性,在假期把头发染成了棕色。面\ +对小文的情况,教师应该怎样处理?____。\nA. 年轻人追求个性是合情合理的,应该宽容对待\nB. 违反学校的校规,应该严格处分\nC. 强制要求小文将头发颜色染回来才可以进校门\nD. 探明\ +小文违反校规的原因,并对其进行劝导和教育\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,\ +学习迁移产生的关键是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现\ +“####”符号时,表明____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914\ +年\nB. 1918年\nC. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的\ +是____。\nA. 坐井观天,所见甚少\nB. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n张老师根据自己班级的情况,为解决班级内部班干部的人际关系问题,建立和谐融洽\ +的班级氛围,自主开发了“和谐人际”的班级课程,这体现了教师____。\nA. 
是教育教学的研究者\nB. 是课程的建设者和开发者\nC. 是学生学习的促进者\nD. 是社区型的开放教师\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n刘老师工作很负责,学生在学校出现一点问题他就会与家长联系,在与家长沟通时他经常以前辈的姿态对待家长,对家长的教育方式指指点点。刘老师的做法\ +____。\nA. 正确,老师就应该与家长经常沟通\nB. 正确,老师的经验比家长丰富,应该多指导家长\nC. 不正确,教师没有权利指导家长\nD. 不正确,教师应该与家长建立平等的沟通关系,尊重家长的人格\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n在古代印度,有一户人家经营一家棉布店销售自己手工制作的衣服。你认为这户人家属于哪个等级?____\nA. 婆罗门\nB. 刹帝利\ +C. 吠舍\nD. 首陀罗\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n“小型分散,便于开展多种多样的活动,满足学生不同的兴趣、爱好,发展学生的才能,使学生得到更多的学习和锻炼的机会。\ +”这种课外活动的形式是____。\nA. 科技活动\nB. 学科活动\nC. 个人活动\nD. 小组活动\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n小红每天晚上临睡前都要多次反复检查自己的书包,确保带齐了第二天需要用的教材和文具。她明知道没有这个必要,但就是控制不住。她可\ +能出现了____。\nA. 抑郁症\nB. 焦虑症\nC. 强迫症\nD. 恐惧症\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 
计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n国家管理和评价课程的基础是____。\nA. 课程计划\nB. 课程标准\nC. 教学目标\nD. 教育目的\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n儿童坚持性发生明显质变的年龄约在____\nA. 3~4岁\nB. 4~5岁\nC. 5~6岁\nD. 6岁以后\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n《红楼梦》中人物众多、关系繁杂。为了帮助读者阅读,许多红学爱好者都在网络上发布了自己整理制作的主要人物关系图。这属于____。\ +A. 纲要策略\nB. 精细加工策略\nC. 资源管理策略\nD. 监控策略\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n学期结束时,班主任王老师会对学生思想品德的发展变化情况进行评价。这项工作属于____。\nA. 工作总结\nB. 工作计划\nC. 操行评定\ +D. 建立学生档案\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n人们常说:“教学有法而教无定法。”这反映了教师的劳动具有____。\nA. 连续性\nB. 示范性\nC. 长期性\nD. 创造\ +性\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 
关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n县级以上地方各级人民代表大会是县级以上地方国家权力机关,其职权不包括____。\nA. 改变或撤销本级人大常务委员会不适当的决定\ +B. 选举并有权罢免本级人民法院院长\nC. 批准本行政区域内的预算执行情况的报告\nD. 决定并宣布下一级行政区城进入紧急状态\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n在心理健康课上,同一批学生在第二次进行同样内容的人格测验时获得的分数与上次测验差别较大。这说明该测验存在的问题是____。\ +A. 信度问题\nB. 效度问题\nC. 难度问题\nD. 区分度问题\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n李老师在教学生区分形近字“渴”“竭”“碣”“谒”时,将四个字相同的右半部分用白色粉笔写出,相异的左半部分用彩色粉笔写出。李老师运用了\ +知觉的____。\nA. 整体性\nB. 选择性\nC. 理解性\nD. 恒常性\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n兰兰学会走路后,就要很喜欢尝试自己穿衣、吃饭、捡东西,喜欢探索周围世界。按照埃里克森人格发展阶段理论,兰兰所处的发展阶段是____\ +A. 信任对怀疑\nB. 自立对羞怯\nC. 主动感对内疚感\nD. 勤奋感对自卑感\nAnswer:", + "The following are multiple choice questions (with answers) about teacher qualification.\n\n下列对于多动症的说法,不正确的是____\ +A. 由多种原因引起的一组综合征\nB. 某种神经递质的缺陷可诱发该病\nC. 神经髓鞘发育落后可诱发该病\nD. 营养不良可诱发该病\nAnswer: D\n\n学习迁移发生的必要条件是两种学习活动之间存在共同原理,学习迁移产生的关键\ +是学习者通过活动能概括出其共同原理。持这种观点的迁移理论被称为____\nA. 形式训练说\nB. 相同要素说\nC. 概括化理论\nD. 关系理论\nAnswer: C\n\nExcel中,通常在单元格内出现“####”符号时,表明\ +____。\nA. 显示的是字符串“####”\nB. 列宽不够,无法显示数值数据\nC. 数值溢出\nD. 计算错误\nAnswer: B\n\n第二次世界大战开始时间是____。\nA. 1914年\nB. 1918年\ +C. 1939年\nD. 1945年\nAnswer: C\n\n在日常生活中,我们经常会接触一些民谚、俗语,这些民谚、俗语蕴含着丰富的物理知识。下列民谚、俗语蕴含的物理知识所属领域不同的是____。\nA. 坐井观天,所见甚少\ +B. 瑞雪兆丰年\nC. 酒香不怕巷子深\nD. 下雪不寒化雪寒\nAnswer: A\n\n杨老师在教授生字词的过程中发现部分学生有缺笔少画的现象,于是他把“小学生缺笔少画现象的原因及对策研究”作为研究课题,拟订相应的研究计划,\ +在工作中收集、整理相关资料并实施教学措施,最后根据反馈信息调整教学方案。这种研究方法属于____。\nA. 教育行动研究法\nB. 教育实验法\nC. 教育叙事研究法\nD. 
个案研究法\nAnswer:" +] + + +dataset_calib = get_calib_dataset(tokenizer, CALIB_SET) +# 对于linear算子中的激活值如果有表示范围过大,或者“尖刺”的异常值过多, +# 需要使用anti outleir功能,使用方法如下 +anti_config = AntiOutlierConfig(anti_method="m2", dev_type="cpu") # dev_type="npu", dev_id=0 如果需要使用npu进行量化。 +anti_outlier = AntiOutlier(model, calib_data=dataset_calib, cfg=anti_config, norm_class_name="RMSNorm") +anti_outlier.process() +# 下面是回退层的设置,因为w8a8的对激活值也进行了量化,会有部分网络层对激活值的表示 +# 范围较为敏感所以需要回退这些网络层使用浮点权重进行计算。 +disable_names = [] +baichuan_layers = 40 +disable_idx_lst = list(range(baichuan_layers)) +for layer_index in disable_idx_lst: + down_proj_name = "model.layers.{}.mlp.down_proj".format(layer_index) + disable_names.append(down_proj_name) +quant_config = QuantConfig( + a_bit=8, + w_bit=8, + disable_names=disable_names, + disable_last_linear=False, + dev_type='cpu', # dev_type="npu", dev_id=0 如果需要使用npu进行量化 + act_method=3, + pr=1.0, + w_sym=True, + mm_tensor=False +) + +calibrator = Calibrator(model, quant_config, calib_data=dataset_calib, disable_level='L0') +calibrator.run() # 执行PTQ量化校准 +# "safe_tensor"对应safetensors格式权重,"numpy"对应npy格式权重 +calibrator.save(OUTPORT_W8A8_PATH, save_type=["safe_tensor"]) diff --git a/mindie/examples/models/baichuan/quant_baichuan2_7b_w8a8.py b/mindie/examples/models/baichuan/quant_baichuan2_7b_w8a8.py new file mode 100644 index 00000000..7b8bf795 --- /dev/null +++ b/mindie/examples/models/baichuan/quant_baichuan2_7b_w8a8.py @@ -0,0 +1,746 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import logging +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig, AntiOutlier +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + +INPORT_FP16_PATH = 'the_path_of_fp16_model_input' +OUTPORT_W8A8_PATH = 'the_path_of_w8a8_model_output' +tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=INPORT_FP16_PATH, + use_fast=False, + padding_side='left', + trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=INPORT_FP16_PATH, + trust_remote_code=True).float().cpu() + +# model = model.half().npu() # 如果需要使用npu进行量化 + + +# 获取校准数据函数定义 +def get_calib_dataset( + auto_tokenizer, + calib_list, + device="cpu"): # 如果需要使用npu进行量化, device="npu:0"。使用cpu,device="cpu" + calib_dataset = [] + for calib_data in calib_list: + inputs = auto_tokenizer(calib_data, return_tensors='pt') + calib_dataset.append([ + inputs.data['input_ids'].to(device), + inputs.data['attention_mask'].to(device) + ]) + return calib_dataset + + +calib_set = [ + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,\ +静谧中含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸\ +露着贫瘠\nB. 也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都\ +是荒村野店。时而会有一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只\螃蟹\ +放进去时,渔夫就用重物将口封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如\ +此下去,即使篓口没有盖盖子,但也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必\ +然内耗,团结就是力量\nD. 与人方便,自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer:\ + A\n\n①我的奶奶是这样,我的父亲也是这样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像\ +抽离出记忆,那么,在那个时代里成长、生活的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,\ +也会认认真真卷起来放好,我曾看别人卷过这画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生\ +活 ⑥从这个意义上说,尽管也许并不懂他,但人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. 
②⑥④③①⑤\nC. ①④③②⑥⑤\nD. \ +⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n甲与乙准备进行一个游戏:向空中扔\ +三枚硬币,如果它们落地后全是\正面向上或全是反面向上,乙就给甲钱;但若出现两正面一反面或两反面一正面的情况,则由甲给乙钱。乙要求甲每次给10元,那\ +么,从长远来看,甲应该要求乙每次至少给____元才可考虑参加这个游戏。\nA. 10\nB. 15\nC. 20\nD. 30\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧\ +中含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫\ +瘠\nB. 也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野\ +店。时而会有一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔\ +夫就用重物将口封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓\ +口没有盖盖子,但也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是\ +力量\nD. 与人方便,自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶\ +是这样,我的父亲也是这样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,\ +在那个时代里成长、生活的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起\ +来放好,我曾看别人卷过这画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义\ +上说,尽管也许并不懂他,但人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: \ +D\n\n相机:拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n下列著名诗人与其代表作对应有误的是____。\nA. \ +李白——《将进酒》\nB. 白居易——《琵琶行》\nC. 王之焕——《登鹳雀楼》\nD. 杜甫——《长恨歌》\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧\ +中含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫\ +瘠\nB. 也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。\ +时而会有一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用\ +重物将口封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖\ +盖子,但也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与\ +人方便,自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父\ +亲也是这样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成\ +长、生活的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别\ +人卷过这画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不\ +懂他,但人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n经济学上所推崇的“橄榄型”收入分配结构,是指低收入和高收入相对较\ +少、中等收入占绝大多数的分配结构。我国正在采取措施,实施“提低、扩中、调高、打非、保困”的方针,使收入分配朝着“橄榄型”方向发展。这主要是为了\ +促进____。\nA. 生产的发展\nB. 效率的提高\nC. 社会的公平\nD. 内需的扩大\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. 
⑤②⑥①④③\nAnswer: D\n\n相机:拍摄\ +____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n-81,-36,-9,0,9,36,____\nA. 49\nB. 64\nC. 81\nD. 100\nAns\ +wer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\nVIP服务本来是个好东西,大企业作为市场竞争的主体,实行差别化服\ +务,无可厚非。但近年来,一些企业纷纷进军医院、机场、车站等公共场所,掏些赞助费,设立所谓“贵宾厅”,霸占公共资源,不仅带来浪费,更造成公共资源分配的不\ +公。这段文字主要强调的是____。\nA. 公共资源不该过度VIP\nB. VIP服务导致了公共资源的不公平分配\nC. 一些企业搬进医院、机场、车站办公\nD. 实行差别化\ +服务是VIP服务的优势所在\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n2,5,8,12,17,24,____\nA. 30\nB. 32\nC. 34\nD. 36\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 
地图:交通\nAnswer: B\n\n4,4,6,12,30,____\nA. 48\nB. 64\nC. 80\nD. 90\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n当下中国文学描写官斗、职斗、婚斗、家斗的作品比较流行,这些作品\ +中包含了不少对日常生活中权术和心机的描写。这样的写作有可能削弱文学对社会的积极影响。文学有必要与正义结盟,形成诗性正义,以提升生活。 作者想表达的主\ +要观点是____。\nA. 当下文学作品的社会影响力有下降的趋势\nB. 流行作品未必是好作品,这需要时间的检验\nC. 文学不应过度渲染权术机诈,否则有可能泯灭正\ +义\nD. 生活中没那么多权术机诈,文学创作应该贴近生活,不能闭门造车\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n一天,一个农民的驴子掉到枯井里,那可怜的驴子在井里凄凉地惨叫了\ +几个钟头,农民亦急得团团转,就是毫无办法把它救起来,最后,他断然认定:驴子已老了,这口枯井也该填起来,不值得花精力去救驴子。他请来所有邻居帮他填井。\ +大家抓起铁锹,开始往井里填土。驴子很快意识到发生了什么事,起初,它恐慌地大哭,不一会儿,居然安静下来。人们忍不住往井里看,奇迹发生了。每一铲砸到驴子\ +背上的土,它都作了出人意料的处理:迅速抖落一身尘土,然后狠狠地用脚踩紧。这样,没过多久,驴子竟然自己把自己升了起来,到了井口,它纵身一跳,平安地跑开\ +了,在场的人均惊诧不已。 这段文字告诉我们的道理是____。\nA. 人生中的每一个困难都是通往成功的垫脚石\nB. 换一种思维常常能够产生意想不到的效果\nC. 冷\ +静思考是克服困难的首要条件\nD. 求人不如求己,很多时候,自己才是自己最大的救星\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中含\ +着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. 也\ +许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有一\ +座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口封\ +住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也没\ +有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自己\ +方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 
好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样——\ +那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画像,\ +那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人们心\ +甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空\ +调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n在现代社会,教育符号也即文凭和学历是一种重要的文化货币,手持符号资本,可进入相\ +应职业群体、身份团体和社会位置。譬如,凭借医学博士文凭,可成为医生。此为教育的筛选功能,亦被喻为人才的分类编码场,如同公共汽车总站,目的地不同的人选\ +择不同的路线,乘坐不同的车辆,到达不同的地方。 下列选项不符合文意的一项是____。\nA. 文凭与学历都是符号资本\nB. 教育符号是人才的分类编码\nC. 文凭体\ +现了教育的筛选功能\nD. 手持相应的符号资本才能进入相应的职业群体\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n侯方域:《桃花扇》____\nA. 蒲松龄:《聊斋志异》\nB. 石头记:\ +《红楼梦》\nC. 崔莺莺:《西厢记》\nD. 秦始皇:《后汉书》\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n____全党同志和全国人民团结一心,坚持不懈地奋斗,不断取得扎扎实\ +实的成效,我们____一定能够使社会主义新农村建设真正成为惠及广大农民群众的民心工程。 填入画横线部分最恰当的一项是____。\nA. 如果 就\nB. 只有 才\ +能\nC. 只要 就\nD. 倘若 也就\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中含\ +着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会\ +有一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物\ +将口封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖\ +子,但也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 
与\ +人方便,自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父\ +亲也是这样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成\ +长、生活的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别\ +人卷过这画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不\ +懂他,但人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n下列关于世界银行的说法中不正确的是____。\nA. 原名国际复兴开发\ +银行,于1944年开始营业\nB. 它是联合国下属的一个专门机构\nC. 是负责长期贷款的国际金融机构\nD. 贷款期限较长,一般为数年,最长可达30年\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n外资银行进入新兴市场国家,新兴市场国家银行业的各主体为了维持自\ +身的生存,会尽可能争取较大的市场份额,充分拓展自身竞争优势,努力向客户提供质优价廉的金融产品和金融服务,这个过程必然带动银行业微观效率的提升。 “这个\ +过程”指的是____。\nA. 外资银行进入新兴市场国家的过程\nB. 新兴市场国家银行业发展的过程\nC. 外资银行提供优质服务的过程\nD. 新兴市场国家银行业扩大市场\ +份额的过程\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中含\ +着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会\ +有一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将\ +口封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,\ +但也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方\ +便,自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也\ +是这样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、\ +生活的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷\ +过这画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂\ +他,但人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n按照行政层级标准来划分,我国政府机构的类型有____。\nA. 一\ +般地方国家行政机关和基层国家行政机关两大类\nB. 常设机构与非常设机构两类\nC. 领导机构、办公办事机构、职能机构和派出机构四类\nD. 中央国家行政机关和地\ +方国家行政机关两大类\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 
恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n在某市一项对公司年轻人员的最新调查中,与往年相比,今年有70%的\ +人打算购买房屋,这一比例已达到历史最高值。然而,在房屋管理局的统计中,该市今年的房屋成交量却比往年有所下降。以下哪项如果为真,最不能解释上述现\ +象?____\nA. 一些打算购买房屋的年轻人目前并不具备该市购买房屋的条件\nB. 往年资料表明,年轻人员购买房屋的比例不足购买房屋成员的30%\nC. 近年来爆发的\ +金融风暴,对房地产行业有一定的打击\nD. 近几个月该市楼市价格不稳定,使得一些购房者持观望态度\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n我们以往所理解的“现代化”概念仅仅局限于物质层面,局限于表层经济现代化,这也是\ +迟发展国家长期存在的一个普遍性问题:在物质层面上求变的欲望很强,而在制度层面和观念层面上却是文化守成主义的,这种状况对于现代化实际进程的影响自不必说,\ +它对于学术的影响是导致知识的流俗化。不断地更换新词语,在新词语的装潢下重复古老的思想观念,结果是词语和口号不断地更换而社会精神气质则没有实质性的变化。 \ +这段文字要表达的主要意思是____。\nA. 现代化应包括物质的、制度的、观念的三个层面\nB. 片面理解现代化是迟发展国家长期存在的一个普遍性问题\nC. 物质层面\ +的落后现状是迟发展国家片面理解现代化的一个重要因素\nD. 片面理解现代化会导致知识的流俗化\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n皮肤破损出血、颈髓损伤、锐器插入体内、严重挤压伤等是灾害发生时\ +的常见损伤类型.掌握科学的自救方法对于延续生命、等待救援很重要。下列自救措施中,恰当的是____。\nA. 锐器插人体内后,应快速将锐器拔出,简单处理伤口后\ +立即送往医院救治\nB. 对颈后锐痛、活动时疼痛加剧等症状,即用颈托,一时无颈托,可临时用敷料、硬板纸或塑料板做成颈圈固定颈部\nC. 伤口发生喷射状出血时,\ +应立即用厚消毒纱布(或毛巾)包扎好伤口\nD. 
被重物挤压引起肢体肿胀或青紫时,应尽快在患处用热毛巾湿敷消肿\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画像,\ +那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人们心\ +甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空\ +调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n多年以来,医生和家属对待癌症患者大多采取这样的态度:即向患者隐瞒已得癌症的实情,\ +这样的做法在医学上叫作“保护性医疗”,其目的在于减少患者的心理负担。但是,某肿瘤医生新设立的康复科的张主任却主张实行“公开性治疗”。 由此可推知下文将要论\ +述的是____。\nA. 家属对实行“公开性治疗”的态度\nB. “保护性医疗”的弊端\nC. “公开性治疗”将使病情得到控制和好转\nD. “公开性治疗”的含义和形式\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画像\ +,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人们\ +心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空\ +调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n古人归纳总结出许多观天象识天气的谚语。下列与天气变化无关的谚语是____。\nA. 朝\ +霞不出门,晚霞行千里\nB. 天上鱼鳞云,地下雨淋淋\nC. 东风是个精,不下也要阴\nD. 百日连阴雨,总有一日晴\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧\ +中含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸瘠\nB. 也许\ +是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有一座\ +小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口封\ +住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 
地图:交通\nAnswer: B\n\n从《论语》看,孔子对音乐的重视,可以说远远超出了后世那些尊敬他\ +的人的想象,这一方面来自他对于乐的精神艺术的新发现。艺术,只在人们精神的发现中才存在,可以说,就现在见到的材料看,孔子可能是中国历史上最伟大的艺术精\ +神的发现者。这段文字重点强调____。\nA. 孔子在音乐方面的成就与贡献\nB. 后人评价孔子时所存在的偏颇\nC. 艺术精神在乐教传承中的作用\nD. 《论语》作为文\ +献的重要意义\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n①当地球撞进尘埃带时,从地球上看,是短时间内无数尘埃以极高的速\ +度划破大气层下落 ②因此,流星雨实际上是彗星留下的无数尘埃形成的 ③进入大气层的尘埃被大气加热,发出明亮的光 ④彗星释放出的尘埃,并非顷刻扩散到宇宙空间,\ +消失得无影无踪,而是留在彗星的轨道上继续公转 ⑤这样看上去就有许多流星,也就是流星雨 ⑥这样形成的“尘埃带”,有些和地球的公转轨道交叉 将以上6个句子重新排\ +列,语序正确的是____。\nA. ④②⑥③⑤①\nB. ①④③⑥⑤②\nC. ④⑥①③⑤②\nD. ①③⑤②④⑥\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:\ +拍摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n3,7,16,107,____\nA. 1704\nB. 1072\nC. 1707\nD. \ +\1068\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有一\ +座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口封\ +住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. 
②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n我始终____,开始在内心生活得更严肃的人,也会在外在上开始生活得更____。在一个\ +奢华浪费的年代,我希望能向世界____,人类真正要的东西是非常之微小的。 填入画横线部分最恰当的一项是____。\nA. 确认 朴素 表明\nB. 相信 质朴 证明\nC. \ +确认 质朴 证明\nD. 相信 朴素 表明\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n一特殊跑道为正三角形,某运动员用6米/秒的速度跑一圈耗时50秒,问该运动员提\ +速10%后从跑道的某个顶点横穿跑道跑向对边,问最少约需多少秒可踏足对边?(四舍五入到个位)____\nA. 9秒\nB. 10秒\nC. 13秒\nD. 15秒\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n文学资料在思想史领域著作中,被使用得还是相当少。其实,作为记述史实的历史,可\ +能对有些夸张和虚构的小说需要警惕,但是,作为考察理性和情感的思想史,却不必胶柱鼓瑟或因噎废食,任何文学作品也许在事实上有想象,但在语言、立场和情感上,\ +却仿佛“当堂呈供”,并不能把自己的本相全盘隐匿。 对这段文字的主旨理解最准确的是____。\nA. 文学作品呈现艺术的真实\nB. 思想史研究应体现理性和情\ +感\nC. 文学资料可以作为思想史研究的史料\nD. 思想史研究中要慎用文学资料\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 
好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n下列关于国际组织的表述不正确的是____。\nA. 石油输出国组织通过实行\ +石油生产配额限制维护石油生产国利益\nB. 博鳌亚洲论坛是第一个总部设在中国的国际会议组织\nC. 蒙古国是上海合作组织的成员国之一\nD. 国际货币基金组织是联\ +合国的专门机构\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n实验证明,植物体内含有一种觉察光的蛋白质,可以“分辨”光的强弱。这\ +种能力很可能使植物看到人类视力所看不到的波长,而且具有较高的灵敏度。植物能感觉光照射过来的方向,光使植物知道早上什么时候该醒来,同样也能促使植物额外\ +分泌栎精和堪非醇这两种无色色素,他们能过滤强烈的阳光,充分发挥遮光剂的作用,从而保护植物免受紫外线的强烈照射。 这段文字主要介绍的是____。\nA. 植物是\ +怎么辨别方向的\nB. 植物是如何避免阳光暴晒的\nC. 植物具有一定意义上的“视觉”\nD. 感知阳光对植物生长的重要性\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方\ +便,自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也\ +是这样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、\ +生活的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷\ +过这画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂\ +他,但人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n1,10,37,82,145,____\nA. 170\nB. 197\nC. 224\nD. \ +226\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口封\ +住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也没\ +有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自己\ +方便\nAnswer: C\n\n谨慎:成就____\nA. 
温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样——\ +那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许多\ +人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画像,\ +那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人们心\ +甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空调\ +:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n某县在一次招商引资活动中,投资商刁难引资方说:“我有三个项目:环境项目、旅游项目\ +和化工项目。如果你说的话是正确的,我会把其中一个项目投资到贵县,但是如果你说的话是错误的,我就一个项目也不投资。”引资方当然想获得环境项目,那么引资\ +方该如何说呢?____\nA. 你不会把环境项目或旅游项目投资到我县\nB. 你不会把环境项目或化工项目投资到我县\nC. 你不会把旅游项目或化工项目投资到我县\nD. 你\ +不会把旅游项目和化工项目都投资到我县\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n民意“被满意”,民众“不满意”,甚至“很生气”。尊重民意、顺应民意、采\ +纳民意是服务型政府的执政要义,是政治文明建设的题中之意。民意的力量一方面取决于民意征集占全民的比例,即广泛性;另一方面也体现在政府对民意的尊重程度\ +上。保障民众的知情权、参与权、表达权和监督权,就是要随时随地与民众进行多种途径的沟通、交流。民意内涵民智,民意关乎民生。我们不仅要从民意中看到民众欢\ +迎什么、反对什么,为科学决策提供依据,而且要充分发挥民智的作用。尊重民意、吸纳民智是科学决策的重要保证,也是衡量政府亲民为民的重要标志。阅读上面文\ +,最符合文意的一项是____。\nA. 让民众“不满意”“很生气”的政府就不是服务型政府\nB. 知情权是监督权的前提,参与权是表达权的前提\nC. 尊重民意、吸纳民智\ +是科学决策的决定性因素\nD. 民意力量的发挥取决于民意征集的广度和尊重民意的程度\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n3,5,16,82,1315,____\nA. 107834\nB. 12849\nC. 12847\nD. 108847\nAns\ +wer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 
偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画像\ +,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人们\ +心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空\ +调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n下列可以反映气候垂直变化的诗句是____。\nA. 东边日出西边雨,道是无晴却有晴\nB. \ +罗浮山下四时春,卢橘杨梅次第新\nC. 人间四月芳菲尽,山寺桃花始盛开\nD. 横看成岭侧成峰,远近高低各不同\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口封\ +住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也没\ +有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n日本松下公司日前在东京“松下中心”向当地媒体展示了其面向未来的“零排放概念环保房\ +屋”。环保屋的主要特点是“节能、创能、蓄能”。“节能”就是提高对自然界既有资源的利用率,同时采用环保隔热的建筑材料以及最先进的环保节能家电设备等。 下文最\ +有可能介绍的是____。\nA. 环保屋是怎样设计出来的\nB. 环保屋的创能、蓄能特点\nC. 环保屋的推广\nD. 环保屋的材料\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n下列没有歧义的一项是____。\nA. 几个派出所的民警。\nB. 法院门前的石狮\ +子。\nC. 这份起诉书我写不好。\nD. 咬死了主人的藏獒。\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 
偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n我们发现零工制度有一个重要的支持机制就是完善的、科学化的员工培训系统。几乎所\ +有的现代企业和公司都非常重视内部培训,有的企业主甚至成为了培训狂,哪怕有一秒钟的空闲也要为员工安排一次培训。但真正有效的培训并不是无休止的洗脑和课程\ +轰炸,不是“潜能激发”和“感恩教育”,而是适合公司运营需求的专业性、针对性、科学性的业务训练。这种培训机制如果能够建立起来,无论你是否采用零工制度都会对\ +企业的发展起到重要的推动作用。 这段文字意在说明____。\nA. 很多公司培训缺乏科学性\nB. 科学的员工培训对企业很重要\nC. 零工制度不一定适合所有企业\nD.\ +过度培训可能会造成相反效果\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n全国人民代表大会举行会议时,主持大会正式会议的是____。\nA. 全国人\ +大常委会\nB. 大会主席团\nC. 全国人大常委会委员长\nD. 大会秘书长\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样—\ +—那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许\ +多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n改革开放以来,中国农学会____“献身、创新、求实、协作”的宗旨,始终不渝地坚持以\ +推动农业科技进步、促进农村发展为己任,大力开展学术交流和科技普及,积极____和举荐人才,为提高广大农民科技素质、加快农业科技进步作出了重要贡献。 填入画\ +横线部分最恰当的一项是____。\nA. 继承 出谋划策\nB. 继承 建言献策\nC. 秉承 建言献策\nD. 
秉承 出谋划策\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n0, 4, 3, 10, 6, 7, ____\nA. 101\nB. 102\nC. 103\nD. 1\ +04\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n“新生代散文”作家大多有写现代诗的背景,诗人所拥有的____的思维、大胆的想象、敏\ +锐的感觉,将“诗质”____在散文语言的血液和肌理里。这不同于平铺直叙式的浅浮的诗意,而是自我心灵的体认中____而成的诗质。 填入画横线部分最恰当的一项\ +是____。\nA. 跳脱 镶嵌 凝结\nB. 另类 浓缩 升华\nC. 感性 渗透 铸就\nD. 活跃 散播 提炼\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB.\ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n据《咬文嚼字》编辑部透露,编制年度“十大流行语”是一项十分严肃的事,既要____到\ +词语在当年的流行度,又要从语文伦理角度加以必要的____,选优汰劣,力争通过“十大流行语”向社会____正能量。 填入画横线部分最恰当的一项是____。\nA. 斟酌 \ +估量 传播\nB. 思考 权衡 传送\nC. 思索 考察 传达\nD. 
考虑 考量 传递\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口封\ +住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也没\ +有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自己\ +方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样——\ +那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的许多\ +人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画像,\ +那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人们心\ +甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. 空\ +调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n20世纪60年代以前,世界各国普遍注重防洪的工程措施,即通过修建大堤、水库水利设施\ +对洪水进行控制。但在60年代以后,世界各国在防洪规划中越来越重视非工程措施的运用,即通过洪水预警、灾情评估、洪灾保险等多种手段,结合各种工程措施,从而\ +尽可能减少洪灾对人类经济、环境和社会发展的影响。 这段文字主要谈的是____。\nA. 世界各国防洪理念的转变\nB. 世界各国控制洪水的新途径\nC. 单纯重视防洪\ +工程不能有效控制洪水\nD. 非工程措施逐渐成为防洪规划的主导\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n近年来,国家房地产调控措施的出台十分密集,除了增加公共租赁住房供应\ +外,再加上央行加息,多个城市出现了房屋成交量下跌的态势,房价涨幅开始放缓。这表明____。\nA. 国家通过宏观调控平衡供求关系\nB. 价格的波动通过供求关系表\ +现出来\nC. 宏观调控是资源配置的基础性手段\nD. 宏观调控可以克服市场调节的滞后性\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 
地图:交通\nAnswer: B\n\n学生在操场上列队做操,只知人数在90-110之间。如果排成3排则不多不\ +少:排成5排则少2人;排成7排则少4人。问学生人数是多少人?____\nA. 102\nB. 98\nC. 104\nD. 108\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n有人说:人本是散落的珍珠,随地乱滚。文化就是那极____又强韧的细\ +线,将珠子串起来成为社会。也有人说:文化犹如空气中的氧气,自然界的春雨,不可或缺却____,飘飘洒洒,润物无声。可见,文化资源价值是无法用尺度衡量的。 填\ +入画横线部分最恰当的一项是____。\nA. 柔弱 视之无形\nB. 纤细 不可名状\nC. 结实 视而不见\nD. 薄弱 不可捉摸\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但也\ +没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,自\ +己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这样\ +——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活的\ +许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这画\ +像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但人\ +们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍摄____\nA. \ +空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n政府职能与成本问题一直备受争议,但这方面的研究似乎还处于一种观点与立场远未一致\ +的状态,一个重要原因是研究视角与方法的局限。大体上看,这类研究有两条思路,一条是信守新古典经济学理论预设,认为市场可以有效解决经济社会发展中的问\ +题,持“小政府”观点;另一条是信守政府干预主义理论预设,认为政府不时干预是市场能够健康运转的必要条件。笔者认为,要解决这种困境,必须有新的理论视野和新\ +的研究方法,而新兴古典经济学理论就是其中之一。 这段文字接下来最有可能讲述的是____。\nA. 新兴古典经济学的理论框架与研究方法\nB. 新理论视野对提高政府\ +的行政效率有何帮助\nC. 新古典经济学理论预设的局限性\nD. 政府职能与成本之间矛盾难解的原因\nAnswer:", + "The following are multiple choice questions (with answers) about civil servant.\n\n透过车轮卷起的黄土,却见山野人秋,庄稼割过,静谧中\ +含着一些寂静,只有阳光在切割过的根茬上烁烁闪亮。____。 填入横线上最恰当的是____。\nA. 这是一段颠簸的行程,一路上景色苍凉雄浑,寂静中裸露着贫瘠\nB. \ +也许是久旱的缘故,这边的溪流也变成了涓涓细流,在盘踞的石缝间流动\nC. 同绿色的南方相比,这里是荒凉的,乃至荒蛮\nD. 偶见人迹,大都是荒村野店。时而会有\ +一座小小的孤庙一闪而过\nAnswer: D\n\n据说,在东南沿海一带,渔民在捕到螃蟹后,将螃蟹放进一个上小肚大的竹篓里面,第一只螃蟹放进去时,渔夫就用重物将口\ +封住,当第二只、第三只放进去后,渔夫就不再盖重物了,因为,第一只即将爬出篓口的螃蟹,会被第二只、第三只螃蟹拉到篓底。如此下去,即使篓口没有盖盖子,但\ +也没有一只蟹能够爬出去。 这个故事意在告诉我们____。\nA. 人多不一定好办事\nB. 恶性竞争必然导致两败俱伤\nC. 内讧必然内耗,团结就是力量\nD. 与人方便,\ +自己方便\nAnswer: C\n\n谨慎:成就____\nA. 温和:好感\nB. 勤奋:努力\nC. 轻松:普通\nD. 
好学:智慧\nAnswer: A\n\n①我的奶奶是这样,我的父亲也是这\ +样——那张画像,已经成为许多老百姓生活必需品的一部分,没有它,似乎客厅都是空的 ②如果因为认知能力的提升而将偶像抽离出记忆,那么,在那个时代里成长、生活\ +的许多人,脑子里将空空如也,甚至不记得自己曾经活过这一回 ③卷的过程,是在收叠他个人的历史 ④有时挂旧了、破了,也会认认真真卷起来放好,我曾看别人卷过这\ +画像,那种澄澈的眼神令人难忘 ⑤有些伟大者永远不会被人遗忘,因为那个伟大者,在那个时代,其实是一种生活,精神生活 ⑥从这个意义上说,尽管也许并不懂他,但\ +人们心甘情愿尊他的名为圣 将以上6个句子重新排列,语序正确的是____。\nA. ②⑥⑤①④③\nB. ②⑥④③①⑤\nC. ①④③②⑥⑤\nD. ⑤②⑥①④③\nAnswer: D\n\n相机:拍\ +摄____\nA. 空调:降温\nB. B超:诊断\nC. 电脑:操作\nD. 地图:交通\nAnswer: B\n\n2009年有两次“立春”,很容易让人联想到“第二春”“二度春”,可想而知这\ +样的婚姻不稳定,所以网络上有“2009年不能结婚,或者2009年爱情不会长久”等传闻。但是,大多数年轻人认为,登记结婚是件水到渠成的事,不会因为赶日子仓促提前\ +或延迟。 根据这段文字,下列说法正确的是____。\nA. 作者认为2009年不适合结婚\nB. 大多数年轻人认为2009年是结婚的好年头\nC. 2009年结婚会使婚姻不稳定的\ +说法是无稽之谈\nD. 大多数年轻人不会因为2009年有两次“立春”而改变自己的结婚计划\nAnswer:" +] +dataset_calib = get_calib_dataset(tokenizer, calib_set) +# 对于linear算子中的激活值如果有表示范围过大,或者“尖刺”的异常值过多, +# 需要使用anti outleir功能,使用方法如下 + +logging.info("===============start AntiOutlier==============") +anti_config = AntiOutlierConfig( + w_bit=8, a_bit=8, anti_method="m2", + dev_type="cpu") # dev_type="npu", dev_id=0 如果需要使用npu进行量化。 +anti_outlier = AntiOutlier(model, + calib_data=dataset_calib, + cfg=anti_config, + norm_class_name="RMSNorm") +anti_outlier.process() + +#下面是回退层的设置,因为w8a8的对激活值也进行了量化,会有部分网络层对激活值的表示 +#范围较为敏感所以需要回退这些网络层使用浮点权重进行计算。 + +logging.info("===============end AntiOutlier==============") +disable_names = [] +BAICHUAN_LAYERS = 32 +disable_idx_lst = list(range(BAICHUAN_LAYERS)) +for layer_index in disable_idx_lst: + down_proj_name = "model.layers.{}.mlp.down_proj".format(layer_index) + disable_names.append(down_proj_name) +quant_config = QuantConfig( + a_bit=8, + w_bit=8, + disable_names=disable_names, + disable_last_linear=False, + dev_type='cpu', # dev_type="npu", dev_id=0, 如果需要使用npu进行量化 + act_method=3, + pr=1.0, + w_sym=True, + mm_tensor=False) +logging.info("===============start Calibrator==============") +calibrator = Calibrator(model, + quant_config, + calib_data=dataset_calib, + disable_level='L0') +calibrator.run() # 执行PTQ量化校准 +calibrator.save(OUTPORT_W8A8_PATH, save_type=[ + "safe_tensor" +]) # "safe_tensor"对应safetensors格式权重,"numpy"对应npy格式权重 +logging.info("===============end Calibrator==============") diff --git a/mindie/examples/models/baichuan/run_fa.sh b/mindie/examples/models/baichuan/run_fa.sh new file mode 100644 index 00000000..2c7bc560 --- /dev/null +++ b/mindie/examples/models/baichuan/run_fa.sh @@ -0,0 +1,23 @@ +# copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20031 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export HCCL_BUFFSIZE=120 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_fa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_fa --model_path $1 $extra_param +fi diff --git a/mindie/examples/models/baichuan/run_pa.sh b/mindie/examples/models/baichuan/run_pa.sh new file mode 100644 index 00000000..4a3b74a4 --- /dev/null +++ b/mindie/examples/models/baichuan/run_pa.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -ex +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
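+# Note (assumed usage, inferred from the commands below): run as `bash run_pa.sh ${model_path} [chat]`,
+# where $1 is the model weight path and an optional second argument "chat" enables chat-model mode.
+# TP_WORLD_SIZE is derived from the device list in ASCEND_RT_VISIBLE_DEVICES, and torchrun launches
+# one process per visible NPU for tensor-parallel PagedAttention inference via examples.run_pa.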
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export BIND_CPU=1 +export IS_QUANT=0 +export RESERVED_MEMORY_GB=3 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +export MASTER_PORT=20036 +export IS_ALIBI_MASK_FREE=0 +export TP_WORLD_SIZE=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) +export INT8_FORMAT_NZ_ENABLE=1 +atb_options="ATB_LAUNCH_KERNEL_WITH_TILING=1 ATB_LAYER_INTERNAL_TENSOR_REUSE=1 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' HCCL_BUFFSIZE=120" +atb_async_options="ATB_OPERATION_EXECUTE_ASYNC=1 TASK_QUEUE_ENABLE=1" +base_cmd="torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1" +if [[ "$2" == "chat" ]]; then + base_cmd+=" --is_chat_model" +fi +run_cmd="${atb_options} ${atb_async_options} ${base_cmd}" +eval "${run_cmd}" diff --git a/mindie/examples/models/bge/large-zh-v1.5/README.md b/mindie/examples/models/bge/large-zh-v1.5/README.md new file mode 100644 index 00000000..da505c8d --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/README.md @@ -0,0 +1,251 @@ +# README + +# 特性矩阵 +- 此矩阵罗列了各bge-large-zh模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|--------------|-------------------------|---------------------------| ---- |-----| --------------- | --------------- | -------- | --------- | --------- | ------------ |------| ---- | ------ | ---- |-----| +| bge-large-zh | 支持world size 1 | 支持world size 1 | √ | × | × | × | × | × | × | × | × | × | × | × | × | + +## 离线模型版本 + +### 模型介绍 + +bge-large-zh是由智源研究院研发的中文版文本表示模型,可将任意文本映射为低维稠密向量,以用于检索、分类、聚类或语义匹配等任务,并可支持为大模型调用外部知识。其中**1.5版本**的相似度分布更加合理 + +[开源模型地址](https://huggingface.co/BAAI/bge-large-zh-v1.5) + +`Commit-id 79e7739b6ab944e86d6171e44d24c997fc1e0116` + +### 模型转换流程 + +首先获取`huggingface`开源模型,将其转换为ONNX格式,再使用Ascend ATC工具将ONNX格式的模型转换为om格式,我们主要关注该模型在昇腾设备上的精度和性能表现。 + +### 变量名称解释 + +|变量名 |含义 | +| ------------ | ------------ | +|save_directory |onnx模型以及转换后om离线模型存放目录 | +|soc_version |昇腾AI处理器的版本,可以通过执行**npu-smi info** 命令查询,在查询到的型号前加Ascend信息,例如:**Ascend910B4、Ascend310P3** | +|precision_mode_v2 |设置网络模型的精度模式。例如:**fp16、mixed_float16、origin** +| cur_dir |运行指令或执行脚本时的路径(当前目录) | +|device_id |npu芯片的id,在装了CANN驱动的服务器上使用npu-smi info查看可用的npu芯片的id | + +### 安装python依赖 + +```shell +cd ${cur_dir} +pip install -r requirements.txt +``` + +### 安装ais_bench推理工具 + +[ais_bench推理工具使用指南](https://gitee.com/ascend/tools/blob/master/ais-bench_workload/tool/ais_bench/README.md) + +- 需安装**aclruntime**包和**ais_bench**推理程序包 + +#### 开源模型转换onnx格式 + +```shell +cd ${cur_dir} +python bin2onnx.py --model_path ${save_directory} +``` + +#### onnx转换om离线模型 + +在环境上使用[昇腾ATC](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/devaids/auxiliarydevtool/atlasatc_16_0001.html)将onnx格式转换为om格式的离线模型 + +- ATC工具集成在CANN中,source相应的环境变量即可 + +```shell +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +在 ${cur_dir} 下运行脚本 + +```shell +atc --model=${save_directory}/model.onnx --framework=5 --output=${save_directory}/bge --soc_version=${soc_version} --input_shape="input_ids:-1,-1;attention_mask:-1,-1;token_type_ids:-1,-1" --optypelist_for_implmode="Gelu" --op_select_implmode=high_performance --input_format=ND --precision_mode_v2=${precision_mode} --modify_mixlist=${cur_dir}/ops_info.json +``` + +#### 参数说明 + +- bert模型的三个输入依次为**input_ids**、 **attention_mask**、 **token_type_ids**, 按顺序指定模型输入数据的shape。 + +- 
参照ATC说明文档,设置shape范围时,若设置为 -1,表示此维度可以使用 >=0 的任意取值,该场景下取值上限为 int64 数据类型表达范围,但受限于host和device侧物理内存的大小,用户可以通过增大内存来支持。 +- Gelu算子在不影响精度的情况下开启高性能模式,提升模型性能 + +- 所配置的精度模式不同,网络模型精度以及性能有所不同,具体为: + +精度高低排序:`origin>mixed_float16>fp16` + +性能优劣排序:`fp16>=mixed_float16>origin` + +推荐配置: **mixed_float16** + +- modify_mixlist参数为配置混合精度下的黑白灰名单,目的是控制在fp16精度溢出的算子保持原精度格式,避免其溢出,这里定义了一个将算子写入黑名单的json文件 + +### 获取测试数据集 + +```shell +cd ${cur_dir} +mkdir dataset +cd dataset +``` + +将[corpus、queries](https://huggingface.co/datasets/C-MTEB/T2Retrieval/tree/main/data)和[dev](https://huggingface.co/datasets/C-MTEB/T2Retrieval-qrels/tree/main/data)下载到该路径下 + +### 离线模型推理脚本指南 + +- om模型推理脚本的启动路径为`${cur_dir}/infer.py` +- hf开源模型推理脚本的启动路径为`${cur_dir}/demo.py` + +在昇腾机器上**运行**`python infer.py --model-path ${save_directory} --device ${device_id}` + +或者GPU的权重存放路径上**运行**`python demo.py` + +- **说明:**执行infer.py时,脚本会运行模型存放的目录的第一个以.om为结尾的模型,若想指定某个om模型,可以在infer.py中修改 +`session = InferSession(device_id=device, model_path=model_path)` 的 **model_path** 为$`{save_directory}/*.om` + +其中,*为OM离线模型文件名。 + +### 精度 & 性能测试 + +- 修改Config_bge.json内的模型路径为各模型所在的相应路径 + +- 精度测试脚本 + +```shell +python eval_cmteb.py --model_type_or_path om --device ${device_id} +``` + +- 性能测试脚本 + +```shell +python eval_performance.py --model_type_or_path om --input_shape [batch_size, seq_len] --device ${device_id} +``` + +#### 模型推理性能 + +性能验证NPU环境使用 `OM` 模型,GPU环境使用 `ONNX` 模型 + +吞吐率:1000 * batch_size / compute_time + +| 环境 | 芯片型号 | batch_size | seq_len | 吞吐率(fps) | +|-----|-------------|------------|---------|----------| +| NPU | Ascend310P3 | 8 | 100 | 449.22 | +| NPU | Ascend310P3 | 20 | 512 | 39.40 | +| NPU | Ascend310P3 | 128 | 512 | 39.63 | +| GPU | NVIDIA A10 | 8 | 100 | 149.93 | +| GPU | NVIDIA A10 | 20 | 512 | 48.21 | +| GPU | NVIDIA A10 | 128 | 512 | 49.38 | + +说明:Atlas 300I Duo 推理卡为单卡双芯,比较吞吐率时需要×2 + +| 环境 | 芯片型号 | batch_size | seq_len | 吞吐率(fps) | +|-----|-------------|------------|---------|----------| +| NPU | Ascend910B4 | 8 | 100 | 696.06 | +| NPU | Ascend910B4 | 20 | 512 | 132.96 | +| NPU | Ascend910B4 | 128 | 512 | 123.94 | +| GPU | NVIDIA L20 | 8 | 100 | 384.60 | +| GPU | NVIDIA L20 | 20 | 512 | 112.80 | +| GPU | NVIDIA L20 | 128 | 512 | 104.37 | + +#### 模型推理精度 + +精度验证NPU环境使用 `OM` 模型,GPU环境使用 `ONNX` 模型 + +| 环境 | 芯片型号 | ndcg@10(%) | +|-----|-------------|--------| +| NPU | Ascend310P3 | 83.66 | +| GPU | Nvidia A10 | 83.67 | + +| 环境 | 芯片型号 | ndcg@10(%) | +|-----|-------------|--------| +| NPU | Ascend910B4 | 83.86 | +| GPU | Nvidia L20 | 83.67 | + +### Ascend310P3性能说明 + +在昇腾310P3上,需要进行一项操作来发挥出算子更好的性能 + +1. SoftmaxV2使能VectorCore:需要在以下路径的json文件中找到SoftmaxV2 + +``` +/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +``` + +加入使能VectorCore + +```json +"enableVectorCore":{ + "flag":"true" +} +``` + +2. 并且在以下路径中把已经存在的softmax_v2改为其它名称,否则使能不生效 + +```shell +ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/kernel/ascend310p +``` + +3. 重新进行ATC转换再进行性能测试即可 + +------------ + +## 加速库版本 + +### 离线模型推理脚本指南 + +- 接入FA加速库模型推理脚本的启动路径为`${cur_dir}/main.py` + +1. 把 **modeling_bert_ascend.py** 的代码替换原生transformers内的 **modeling_bert.py** 的代码 + +路径为 + +```shell +/miniconda/envs/${conda_name}/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py +``` + +2. 
在昇腾机器上**运行**`python main.py` + +### 精度 & 性能测试 + +- 修改Config_bge.json内的模型路径为各模型所在的相应路径 + +- 精度测试脚本 + +```shell +python eval_cmteb.py --model_type_or_path pytorch --device ${device_id} +``` + +- 性能测试脚本 + +```shell +python eval_performance.py --model_type_or_path pytorch --input_shape [batch_size, seq_len] --device ${device_id} +``` + +#### 模型推理性能 + +性能验证NPU环境使用 `PYTORCH` 模型,GPU环境使用 `PYTORCH` 模型 + +吞吐率:1000 * batch_size / compute_time + +| 环境 | 芯片型号 | batch_size | seq_len | 吞吐率(fps) | +|-----|-------------|------------|---------|----------| +| NPU | Ascend910B4 | 8 | 100 | 486.66 | +| NPU | Ascend910B4 | 20 | 512 | 1100.48 | +| NPU | Ascend910B4 | 128 | 512 | 4885.53 | +| GPU | NVIDIA L40 | 8 | 100 | 453.42 | +| GPU | NVIDIA L40 | 20 | 512 | 575.13 | +| GPU | NVIDIA L40 | 128 | 512 | 2104.04 | + +#### 模型推理精度 + +精度验证NPU环境使用 `PYTORCH` 模型,GPU环境使用 `PYTORCH` 模型 + +| 环境 | 芯片型号 | ndcg@10(%) | +|-----|------------- |--------| +| NPU | Ascend910B4 (fp16) | 83.67 | +| GPU | Nvidia L40 (fp32) | 83.67 | + +- Ascend310P3待测试 diff --git a/mindie/examples/models/bge/large-zh-v1.5/bin2onnx.py b/mindie/examples/models/bge/large-zh-v1.5/bin2onnx.py new file mode 100644 index 00000000..df625617 --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/bin2onnx.py @@ -0,0 +1,28 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from optimum.onnxruntime import ORTModelForFeatureExtraction + +parser = argparse.ArgumentParser(description="Export a model from transformers to ONNX format.") +parser.add_argument("--model_path", type=str, required=True, help="Path to the model checkpoint to convert.") + +args = parser.parse_args() + +model_checkpoint = args.model_path + +ort_model = ORTModelForFeatureExtraction.from_pretrained(model_checkpoint, export=True, from_transformers=True) + +# Save the ONNX model +ort_model.save_pretrained(model_checkpoint) \ No newline at end of file diff --git a/mindie/examples/models/bge/large-zh-v1.5/config_bge.json b/mindie/examples/models/bge/large-zh-v1.5/config_bge.json new file mode 100644 index 00000000..cce01e4e --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/config_bge.json @@ -0,0 +1,8 @@ +{ + "default_path": { + "tokenizer_path": "./bge-large-zh-v1.5", + "pytorch_model_path": "./bge-large-zh-v1.5", + "onnx_model_path": "./bge-large-zh-v1.5", + "om_model_path": "./bge-large-zh-v1.5/bge_liunx_aarch.om" + } +} \ No newline at end of file diff --git a/mindie/examples/models/bge/large-zh-v1.5/configuration_bert.py b/mindie/examples/models/bge/large-zh-v1.5/configuration_bert.py new file mode 100644 index 00000000..9aced32d --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/configuration_bert.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration""" +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. 
For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import BertConfig, BertModel + + >>> # Initializing a BERT google-bert/bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout diff --git a/mindie/examples/models/bge/large-zh-v1.5/convert.sh b/mindie/examples/models/bge/large-zh-v1.5/convert.sh new file mode 100644 index 00000000..27545a09 --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/convert.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# 定义模型检查点和保存目录 +model_checkpoint="$1" +save_directory="$model_checkpoint" +soc_version=$(python -c "import torch;import torch_npu;print(torch.npu.get_device_name())") + +precision_mode=allow_mix_precision + +# 确保当前模型路径下没有同名的model.onnx文件 +if [ -f "$save_directory/model.onnx" ]; then + echo "Error: model.onnx already exists in the current path" + exit 1 +fi + +# 使用Python脚本加载并导出模型到ONNX +python -c " +from optimum.onnxruntime import ORTModelForFeatureExtraction + +ort_model = ORTModelForFeatureExtraction.from_pretrained('$model_checkpoint', export=True, from_transformers=True) +ort_model.save_pretrained('$save_directory') +" + +# 检查ONNX模型是否成功保存 +if [ -f "$save_directory/model.onnx" ]; then + echo "ONNX model successfully saved at $save_directory/model.onnx" +else + echo "Error: Failed to save ONNX model." 
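+    # A likely cause is a missing optimum[onnxruntime] dependency or an incomplete checkpoint directory.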
+ exit 1 +fi + + +# 使用ATC命令对ONNX模型进行转换或优化 +atc --model=$save_directory/model.onnx --framework=5 --output=$save_directory/bge_"$soc_version" --soc_version="$soc_version" --input_shape="input_ids:-1,-1;attention_mask:-1,-1;token_type_ids:-1,-1" --precision_mode="$precision_mode" + +# 检查ATC命令是否执行成功 +if [ $? -eq 0 ]; then + echo "Model conversion with ATC successful." +else + echo "Error: Failed to convert model with ATC." + exit 1 +fi \ No newline at end of file diff --git a/mindie/examples/models/bge/large-zh-v1.5/demo.py b/mindie/examples/models/bge/large-zh-v1.5/demo.py new file mode 100644 index 00000000..02c86035 --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/demo.py @@ -0,0 +1,85 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import logging +import torch + +try: + import torch_npu + + device = "npu:0" + torch_npu.npu.set_device(0) + torch.npu.set_compile_mode(jit_compile=False) +except ImportError: + device = "cuda:0" +from transformers import AutoTokenizer, AutoModel + +logging.getLogger().setLevel(logging.INFO) + +# Sentences we want sentence embeddings for +sentences = ["样例数据-1", "样例数据-2"] +MODEL_PATH = "./" +# Load model from HuggingFace Hub +tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) +model = AutoModel.from_pretrained(MODEL_PATH).to(device) +model.eval() + + +def infer(text): + # Tokenize sentences + encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512) + encoded_input = encoded_input.to(device) + logging.info(encoded_input.input_ids.shape) + + # Compute token embeddings + with torch.no_grad(): + model_output = model(**encoded_input) + # Perform pooling. In this case, cls pooling. + sentence_embeddings = model_output[0][:, 0] + # normalize embeddings + sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) + logging.info("Sentence embeddings:", sentence_embeddings) + logging.info("Sentence embeddings.shape:", sentence_embeddings.shape) + + +def infer_test(text): + # Tokenize sentences + encoded_input = tokenizer(text, padding="max_length", return_tensors='pt', max_length=512) + encoded_input = encoded_input.to(device) + logging.info(encoded_input.input_ids.shape) + + # Compute token embeddings + with torch.no_grad(): + start_time = time.time() + model_output = model(**encoded_input) + end_time = time.time() + # Perform pooling. In this case, cls pooling. 
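+        # model_output[0] is the last hidden state; [:, 0] keeps the first ([CLS]) token as the sentence vector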
+ sentence_embeddings = model_output[0][:, 0] + # normalize embeddings + sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) + time_cost = end_time - start_time + logging.info("Sentence embeddings:", sentence_embeddings) + logging.info("Sentence embeddings.shape:", sentence_embeddings.shape) + logging.info("generate cost %g ms", time_cost * 1000) + return sentence_embeddings + + +if __name__ == '__main__': + try: + infer_test(sentences) + infer_test(sentences) + + except Exception as e: + logging.error("An error occurred during inference:", str(e)) \ No newline at end of file diff --git a/mindie/examples/models/bge/large-zh-v1.5/eval_cmteb.py b/mindie/examples/models/bge/large-zh-v1.5/eval_cmteb.py new file mode 100644 index 00000000..af258767 --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/eval_cmteb.py @@ -0,0 +1,304 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import os +from typing import List, Any, Union +from collections import defaultdict +import json +import numpy as np +import torch +import transformers.tokenization_utils_base +from mteb import MTEB, AbsTaskRetrieval +from datasets import load_dataset, DatasetDict +from optimum.onnxruntime import ORTModelForFeatureExtraction +from transformers import AutoTokenizer, AutoModel +from tqdm import tqdm as progressbar + +from atb_llm.utils.file_utils import safe_open + +logging.getLogger().setLevel(logging.INFO) + + +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description='Evaluate LLM.') + parser.add_argument( + '--model_type_or_path', + type=str, + required=True, + help='Specipy model type to load default model or path to the directory containing model file.' + ) + parser.add_argument( + '--batch_size', + type=int, + default=20, + help='Batch size of dataset for computing.' + ) + parser.add_argument( + '--device', + type=int, + default=0, + choices=list(range(8)), + help='Adapt model on device id x.' 
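+        # device ids 0-7 correspond to the NPUs typically visible on a single Ascend server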
+ ) + return parser.parse_args() + + +def load_retrieval_data(hf_hub_name, eval_splits): + eval_split = eval_splits[0] + dataset = load_dataset("parquet", data_files={'corpus': 'dataset/corpus-00000-of-00001-8afe7b7a7eca49e3.parquet', + 'queries': 'dataset/queries-00000-of-00001-930bf3b805a80dd9.parquet'}) + qrels = load_dataset("parquet", data_files={eval_split: 'dataset/dev-00000-of-00001-92ed0416056ff7e1.parquet'})[ + eval_split] + + corpus = {e['id']: {'text': e['text']} for e in dataset['corpus']} + queries = {e['id']: e['text'] for e in dataset['queries']} + relevant_docs = defaultdict(dict) + for e in qrels: + relevant_docs[e['qid']][e['pid']] = e['score'] + + corpus = DatasetDict({eval_split: corpus}) + queries = DatasetDict({eval_split: queries}) + relevant_docs = DatasetDict({eval_split: relevant_docs}) + return corpus, queries, relevant_docs + + +class T2RetrievalLocal(AbsTaskRetrieval): + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + self.data_loaded = None + self.corpus = None + self.queries = None + self.relevant_docs = None + + @property + def description(self) -> dict: + return { + 'name': 'T2RetrievalLocal', + 'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking', + 'hf_hub_name': 'C-MTEB/T2Retrieval', + 'reference': "https://arxiv.org/abs/2304.03679", + 'type': 'Retrieval', + 'category': 's2p', + 'eval_splits': ['test'], + 'eval_langs': ['zh'], + 'main_score': 'ndcg_at_10', + } + + def load_data(self, **kwargs) -> None: + if self.data_loaded: + return + try: + self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], + self.description['eval_splits']) + except KeyError as e: + raise RuntimeError('load dataset failed because {}'.format(e)) from e + else: + self.data_loaded = True + + +class Model: + def __init__(self, tokenizer_path: str, batch_size: int) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + self.batch_size = batch_size + + def encode(self, sentences: List[str], **kwargs: Any) -> torch.Tensor: + """ Returns a list of embeddings for the given sentences. 
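+        Note: this base implementation is a stub; PyTorchModel, ONNXModel and OMModel each override it.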
+ Args: + sentences (`List[str]`): List of sentences to encode + + Returns: + `torch.Tensor`: Tensor of embeddings for the given sentences + """ + pass + + def _tokenize_sentences(self, sentences: List[str]) -> transformers.tokenization_utils_base.BatchEncoding: + return self.tokenizer( + sentences, + padding='max_length', + truncation=True, + return_tensors='pt', + max_length=512 + ) + + +class PyTorchModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, batch_size: int, device_id: int): + super(PyTorchModel, self).__init__(tokenizer_path, batch_size) + + # init model runtime + try: + import torch_npu + except ImportError: + self.device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu' + else: + self.device = 'npu:{}'.format(device_id) + torch_npu.npu.set_device(device_id) + torch.npu.set_compile_mode(jit_compile=False) + + self.model = AutoModel.from_pretrained( + model_path, + local_files_only=True, + trust_remote_code=True + ).half().to(self.device) + self.model.eval() + + def encode(self, sentences: List[str], **kwargs: Any) -> Union[np.ndarray, torch.Tensor]: + all_embs = [] + + for start_index in progressbar(range(0, len(sentences), self.batch_size)): + sentences_batch = sentences[start_index:start_index + self.batch_size] + # Tokenize sentences + encoded_inputs = self._tokenize_sentences(sentences_batch) + # Compute token embeddings + with torch.no_grad(): + embs = self.model(**encoded_inputs.to(self.device)).float() + sentence_embeddings = embs[:, 0] + all_embs.extend(sentence_embeddings.cpu()) + + if all_embs: + if isinstance(all_embs, np.ndarray): + all_embs = torch.from_numpy(all_embs) + else: + all_embs = torch.stack(all_embs) + else: + all_embs = torch.Tensor() + + return all_embs + + +class ONNXModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, batch_size: int, device_id: int): + super(ONNXModel, self).__init__(tokenizer_path, batch_size) + + # init model runtime + try: + import torch_npu + except ImportError: + self.device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu' + else: + self.device = 'npu:{}'.format(device_id) + torch_npu.npu.set_device(device_id) + torch.npu.set_compile_mode(jit_compile=False) + + self.ort = ORTModelForFeatureExtraction.from_pretrained(model_path).to(self.device) + + def encode(self, sentences: List[str], **kwargs: Any) -> Union[np.ndarray, torch.Tensor]: + all_embs = [] + for start_index in progressbar(range(0, len(sentences), self.batch_size)): + sentences_batch = sentences[start_index:start_index + self.batch_size] + # Tokenize sentences + encoded_inputs = self._tokenize_sentences(sentences_batch) + # Compute token embeddings + encoded_input = encoded_inputs.to(self.device) + with torch.no_grad(): + model_output = self.ort(**encoded_input) + # Perform pooling. In this case, cls pooling. 
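+                # the pooled [CLS] embedding is then L2-normalized, so cosine similarity reduces to a dot product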
+ sentence_embeddings = model_output[0][:, 0] + embs = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) + all_embs.extend(embs) + + if all_embs: + if isinstance(all_embs, np.ndarray): + all_embs = torch.from_numpy(all_embs) + else: + all_embs = torch.stack(all_embs) + else: + all_embs = torch.Tensor() + + return all_embs + + +class OMModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, batch_size: int, device_id: int) -> None: + super(OMModel, self).__init__(tokenizer_path, batch_size) + + # init model runtime + from ais_bench.infer.interface import InferSession + + self.session = InferSession(device_id, model_path) + + def encode(self, sentences: List[str], **kwargs: Any) -> Union[np.ndarray, torch.Tensor]: + all_embs = [] + + for start_index in progressbar(range(0, len(sentences), self.batch_size)): + sentences_batch = sentences[start_index:start_index + self.batch_size] + # Tokenize sentences + encoded_inputs = self._tokenize_sentences(sentences_batch) + input_ids = encoded_inputs.data['input_ids'] + attention_mask = encoded_inputs.data['attention_mask'] + token_type_ids = encoded_inputs.data['token_type_ids'] + # Compute token embeddings + outputs = self.session.infer(feeds=[input_ids, attention_mask, token_type_ids], mode='dymshape', + custom_sizes=10000000)[0][:, 0] + outputs = torch.from_numpy(outputs) + embs = torch.nn.functional.normalize(outputs, p=2, dim=1) + all_embs.extend(embs) + + if all_embs: + if isinstance(all_embs, np.ndarray): + all_embs = torch.from_numpy(all_embs) + else: + all_embs = torch.stack(all_embs) + else: + all_embs = torch.Tensor() + + return all_embs + + +def load_model(model_args: argparse.Namespace) -> Model: + # default model path + with safe_open('config_bge.json', 'r', encoding='utf-8') as reader: + text = reader.read() + default_path = json.loads(text)['default_path'] + pytorch_model_path = tokenizer_path = os.path.abspath(default_path['tokenizer_path']) + onnx_model_path = os.path.abspath(default_path['onnx_model_path']) + om_model_path = os.path.abspath(default_path['om_model_path']) + + model_path_map = {'pytorch': pytorch_model_path, 'onnx': onnx_model_path, 'om': om_model_path} + model_map = {'pytorch': PyTorchModel, 'onnx': ONNXModel, 'om': OMModel} + + model_type = model_args.model_type_or_path.removesuffix('/').split('.')[-1].split('/')[-1] + default_model_path = model_path_map.get(model_type, 'not exist') + if default_model_path != 'not exist': + model_path = ( + model_args.model_type_or_path + if os.path.isdir(model_args.model_type_or_path) or os.path.isfile(model_args.model_type_or_path) + else default_model_path + ) + else: + raise RuntimeError( + 'load model failed because ' + '\'{}\' is not a valid model type or path'.format(model_args.model_type_or_path) + ) + try: + model_for_eval = model_map[model_type]( + tokenizer_path=tokenizer_path, + model_path=model_path, + batch_size=model_args.batch_size, + device_id=model_args.device + ) + except KeyError as e: + raise RuntimeError('load {} model failed because {}'.format(model_type, e)) from e + return model_for_eval + + +if __name__ == '__main__': + args = get_args() + model = load_model(args) + task = ['T2RetrievalLocal'] + evaluation = MTEB(tasks=task, task_langs=['zh']) + results = evaluation.run(model) + logging.info(results) diff --git a/mindie/examples/models/bge/large-zh-v1.5/eval_performance.py b/mindie/examples/models/bge/large-zh-v1.5/eval_performance.py new file mode 100644 index 00000000..08d1ec1b --- /dev/null +++ 
b/mindie/examples/models/bge/large-zh-v1.5/eval_performance.py @@ -0,0 +1,302 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import logging +import os +import time +from typing import Any, List, Union, Tuple + +import datasets +import numpy as np +import torch +import transformers.tokenization_utils_base +from transformers import AutoTokenizer, AutoModel +from optimum.onnxruntime import ORTModelForFeatureExtraction +from tqdm import tqdm as progressbar + +from atb_llm.utils.file_utils import safe_open + +logging.getLogger().setLevel(logging.INFO) + + +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description='Evaluate LLM.') + parser.add_argument( + '--model_type_or_path', + type=str, + required=True, + help='Specipy model type to load default model or path to the directory containing model file.' + ) + parser.add_argument( + '--input_shape', + type=str, + required=True, + help='Shape of input tensors.' + ) + parser.add_argument( + '--device', + type=int, + default=4, + choices=list(range(8)), + help='Adapt model on device id x.' + ) + parser.add_argument( + '--loop', + type=int, + default=50, + help='Evaluation loops.' + ) + return parser.parse_args() + + +class Model: + def __init__(self, tokenizer_path: str) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + def init_runtime(self, device_id: int) -> Tuple[Union[str, int], Any]: + if self.__class__.__name__.startswith(('PyTorch', 'ONNX')): + try: + import torch_npu + except ImportError: + device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu' + else: + device = 'npu:{}'.format(device_id) + torch_npu.npu.set_device(device_id) + torch.npu.set_compile_mode(jit_compile=False) + return device, 0 + elif self.__class__.__name__.startswith('OMModel'): + from ais_bench.infer.interface import InferSession + return device_id, InferSession + else: + raise RuntimeError + + def tokenize( + self, + sentences_batch: List[List[str]], + seq_len: int + ) -> transformers.tokenization_utils_base.BatchEncoding: + encoded_inputs = self.tokenizer( + sentences_batch, + padding='max_length', + truncation=True, + return_tensors='pt', + max_length=512 # seq_len + ).to(self.device) + return encoded_inputs + + def encode(self, pairs: List[List[str]], seq_len: int) -> float: + # Tokenize sentences + encoded_inputs = self.tokenize(pairs, seq_len) + # Compute token embedding time + computing_time = self._encode_batched(encoded_inputs) + + return computing_time + + def compute_scores(self, pairs: List[List[str]], batch_size: int, seq_len: int, loop: int) -> dict: + all_computing_time = [] + + for _ in progressbar(range(loop), 'Evaluating...'): + computing_time = self.encode(pairs, seq_len) + all_computing_time.append(computing_time) + + try: + throughput = 1000 * batch_size / np.mean(all_computing_time) + except ZeroDivisionError as e: + raise RuntimeError('{} because no evaluation results'.format(e)) from e + + scores 
= { + 'compute_time': { + 'min': np.min(all_computing_time), + 'max': np.max(all_computing_time), + 'mean': np.mean(all_computing_time), + 'median': np.median(all_computing_time), + 'percentile(99%)': np.percentile(all_computing_time, 99) + }, + 'throughput': throughput + } + + return scores + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + """ Returns a list of embeddings for the given sentences. + + Args: + inputs (`BatchEncoding`): List of sentences to encode + + Returns: + `float: Computing time of embeddings for the given sentences + """ + _ = self + return 0.0 + + +class PyTorchModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, device_id: int): + super(PyTorchModel, self).__init__(tokenizer_path) + self.device, _ = self.init_runtime(device_id) + self.model = AutoModel.from_pretrained(model_path).half().to(self.device) + self.model.eval() + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + tick = time.time() + with torch.no_grad(): + model_output = self.model(**inputs) + _ = model_output[0][:, 0] + tock = time.time() + return 1000 * (tock - tick) + + +class ONNXModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, device_id: int): + super(ONNXModel, self).__init__(tokenizer_path) + self.device = self.init_runtime(device_id) + self.ort = ORTModelForFeatureExtraction.from_pretrained(model_path).to(self.device) + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + tick = time.time() + with torch.no_grad(): + _ = self.ort(**inputs) + # Perform pooling. In this case, cls pooling. + tock = time.time() + return 1000 * (tock - tick) + + +class OMModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, device_id: int) -> None: + super(OMModel, self).__init__(tokenizer_path) + self.device, infer_session = self.init_runtime(device_id) + self.session = infer_session(device_id, model_path, loop=4) + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + input_ids = inputs.data['input_ids'] + attention_mask = inputs.data['attention_mask'] + token_type_ids = inputs.data['token_type_ids'] + tick = time.time() + _ = self.session.infer(feeds=[input_ids, attention_mask, token_type_ids], + mode='dymshape', custom_sizes=5000000)[0][:, 0] + tock = time.time() + + return 1000 * (tock - tick) / 4 + + +class PerformanceEvaluator: + def __init__(self, metadata: dict) -> None: + self.metadata = metadata + self.dataset = datasets.load_dataset("parquet", data_files={ + 'corpus': 'dataset/corpus-00000-of-00001-8afe7b7a7eca49e3.parquet', + 'queries': 'dataset/queries-00000-of-00001-930bf3b805a80dd9.parquet'}) + + self.samples = self.dataset[self.metadata['eval_splits'][0]] + + def __call__( + self, + model: Model, + input_shape: Union[Tuple, List], + loop: int) -> dict: + """This is called during training to evaluate the model. + It returns scores. 
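+        The returned dict holds per-loop compute-time statistics in milliseconds (min/max/mean/median/p99)
+        and the derived throughput in samples per second.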
+ + Args: + model (`Model`): the model to evaluate + input_shape (`Union[Tuple[int, int], List[int, int]]`): shape of input tensors + loop (`int`): evaluation loops + """ + return self.compute_performance(model, input_shape, loop) + + def compute_performance( + self, + model: Model, + input_shape: Union[Tuple, List], + loop: int) -> dict: + batch_size, seq_len = input_shape + + pairs = [] + docs = [] + for sample in self.samples: + docs.append(sample['text']) + pairs = docs + pairs = pairs[:batch_size] + + scores = model.compute_scores(pairs, batch_size, seq_len, loop) + + return scores + + +class Evaluation: + def __init__(self, eval_args: argparse.Namespace): + self.input_shape = tuple(map(int, eval_args.input_shape.split(','))) + self.device_id = eval_args.device + self.loop = eval_args.loop + # dataset metadata + self.metadata = { + 'name': 'T2RetrievalLocal', + 'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking', + 'reference': 'https://arxiv.org/abs/2304.03679', + 'type': 'Retrieval', + 'category': 's2p', + 'eval_splits': ['corpus'], + 'eval_langs': ['zh'], + 'main_score': 'ndcg_at_10' + } + + # default model path + with safe_open('config_bge.json', 'r', encoding='utf-8') as reader: + text = reader.read() + default_path = json.loads(text)['default_path'] + pytorch_model_path = self.tokenizer_path = os.path.abspath(default_path['tokenizer_path']) + onnx_model_path = os.path.abspath(default_path['onnx_model_path']) + om_model_path = os.path.abspath(default_path['om_model_path']) + + model_path_map = {'pytorch': pytorch_model_path, 'onnx': onnx_model_path, 'om': om_model_path} + + self.model_type = eval_args.model_type_or_path.removesuffix('/').split('.')[-1].split('/')[-1] + default_model_path = model_path_map.get(self.model_type, 'not exist') + if default_model_path != 'not exist': + self.model_path = ( + eval_args.model_type_or_path + if os.path.isdir(eval_args.model_type_or_path) or os.path.isfile(eval_args.model_type_or_path) + else default_model_path + ) + else: + raise RuntimeError( + 'load model failed because ' + '\'{}\' is not a valid model type or path'.format(eval_args.model_type_or_path) + ) + + def load_model(self) -> Model: + model_map = {'pytorch': PyTorchModel, 'onnx': ONNXModel, 'om': OMModel} + try: + model = model_map[self.model_type]( + tokenizer_path=self.tokenizer_path, + model_path=self.model_path, + device_id=self.device_id + ) + except KeyError as e: + raise RuntimeError('load {} model failed because {}'.format(self.model_type, e)) from e + return model + + def run(self) -> dict: + model = self.load_model() + evaluator = PerformanceEvaluator(self.metadata) + eval_results = evaluator(model, self.input_shape, self.loop) + return eval_results + + +if __name__ == '__main__': + args = get_args() + evaluation = Evaluation(args) + results = evaluation.run() + logging.info(results) diff --git a/mindie/examples/models/bge/large-zh-v1.5/infer.py b/mindie/examples/models/bge/large-zh-v1.5/infer.py new file mode 100644 index 00000000..fa1d677e --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/infer.py @@ -0,0 +1,95 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import logging +import torch +from transformers import AutoTokenizer +from ais_bench.infer.interface import InferSession + +parser = argparse.ArgumentParser(description='Infer with a specified .om model file and device id') +parser.add_argument('--model-path', type=str, required=True, help='Path to the directory containing the .om model file') +parser.add_argument('--device', type=int, default=0, choices=[0, 1, 2, 3, 4, 5, 6, 7], + help='load the model.om on device id x') + +logging.getLogger().setLevel(logging.INFO) + + +class InferEngine: + + def __init__(self, device_id, model_path): + self.device_id = device + self.model_path = model_path + self.tokenizer = AutoTokenizer.from_pretrained(hf_model_path) + # InferSession的初始化表示在某个device的npu芯片上加载模型model.om + self.session = InferSession(device_id=device_id, model_path=model_path) + + def infer(self, text): + encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='np', max_length=512) + input_ids = encoded_input['input_ids'] + attention_mask = encoded_input['attention_mask'] + token_type_ids = encoded_input['token_type_ids'] + inputs = [input_ids, attention_mask, token_type_ids] + # feeds传入一组输入数据;mode选择模型类型,static表示输入节点shape固定的静态模型 + outputs = self.session.infer(feeds=inputs, mode="dymshape", custom_sizes=10000000)[0][:, 0] + outputs = torch.from_numpy(outputs) + outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) + + logging.info("Sentence embeddings: %s", outputs) + logging.info("Sentence embeddings.shape: %s", outputs.shape) + return outputs + + def infer_test(self, text): + encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='np', max_length=512) + input_ids = encoded_input['input_ids'] + attention_mask = encoded_input['attention_mask'] + token_type_ids = encoded_input['token_type_ids'] + inputs = [input_ids, attention_mask, token_type_ids] + # feeds传入一组输入数据;mode选择模型类型,static表示输入节点shape固定的静态模型 + outputs = self.session.infer(feeds=inputs, mode="dymshape", custom_sizes=10000000)[0][:, 0] + outputs = torch.from_numpy(outputs) + outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) + + logging.info("Sentence embeddings: %s", outputs) + logging.info("Sentence embeddings.shape: %s", outputs.shape) + # exec_time_list 按先后顺序保留了所有session在执行推理的时间。 + exec_time = self.session.summary().exec_time_list[-1] + time_cost = exec_time[1] - exec_time[0] + logging.info("generate cost %g ms", time_cost * 1000) + return outputs + + def free(self): + self.session.free_resource() + + +if __name__ == '__main__': + args = parser.parse_args() + device = args.device + # Load model from HuggingFace Hub + hf_model_path = args.model_path + # Sentences we want sentence embeddings for + sentences = ["样例数据-1", "样例数据-2"] + + om_files = [f for f in os.listdir(hf_model_path) if f.endswith('.om')] + if not om_files: + raise ValueError(f"No .om files found in {hf_model_path}") + + # 选择第一个找到的.om文件 + om_file_name = om_files[0] + om_model_path = os.path.join(hf_model_path, om_file_name) + infer_engine = InferEngine(device_id=device, model_path=om_model_path) + 
infer_engine.infer_test(sentences) + infer_engine.infer_test(sentences) + infer_engine.free() \ No newline at end of file diff --git a/mindie/examples/models/bge/large-zh-v1.5/main.py b/mindie/examples/models/bge/large-zh-v1.5/main.py new file mode 100644 index 00000000..275f63e8 --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/main.py @@ -0,0 +1,83 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import logging +import torch + +try: + import torch_npu + + device = "npu:0" + torch_npu.npu.set_device(0) + torch.npu.set_compile_mode(jit_compile=False) +except ImportError: + device = "cuda:0" +from transformers import AutoTokenizer, AutoModel + +logging.getLogger().setLevel(logging.INFO) + + +class ModelInference: + def __init__(self, model_path): + self.model_path = model_path + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + self.model = AutoModel.from_pretrained(model_path).half().to(device) + self.model.eval() + + def infer(self, text): + encoded_input = self.tokenizer( + text, padding=True, truncation=True, return_tensors="pt", max_length=512 + ) + encoded_input = encoded_input.to(device) + logging.info(encoded_input.input_ids.shape) + + with torch.no_grad(): + model_output = self.model(**encoded_input) + sentence_embeddings = model_output[0][:, 0] + + sentence_embeddings = torch.nn.functional.normalize( + sentence_embeddings, p=2, dim=1 + ) + logging.info("Sentence embeddings: %s", sentence_embeddings) + logging.info("Sentence embeddings.shape: %s", sentence_embeddings.shape) + + def infer_test(self, text): + encoded_input = self.tokenizer( + text, padding="max_length", return_tensors="pt", max_length=512 + ) + encoded_input = encoded_input.to(device) + + with torch.no_grad(): + start_time = time.time() + model_output = self.model(**encoded_input) + end_time = time.time() + sentence_embeddings = model_output[:, 0] + + sentence_embeddings = torch.nn.functional.normalize( + sentence_embeddings, p=2, dim=1 + ) + time_cost = end_time - start_time + logging.info("Sentence embeddings: %s", sentence_embeddings) + logging.info("Sentence embeddings.shape: %s", sentence_embeddings.shape) + logging.info("generate cost %g ms", time_cost * 1000) + return sentence_embeddings + + +if __name__ == "__main__": + MODEL_PATH = "/data1/models/BAAI/bge-large-zh-v1.5" + sentences = ["样例数据-1", "样例数据-2"] + model_inference = ModelInference(MODEL_PATH) + model_inference.infer_test(sentences) + model_inference.infer_test(sentences) diff --git a/mindie/examples/models/bge/large-zh-v1.5/modeling_bert_ascend.py b/mindie/examples/models/bge/large-zh-v1.5/modeling_bert_ascend.py new file mode 100644 index 00000000..223a8182 --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/modeling_bert_ascend.py @@ -0,0 +1,1982 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import math +import os +import warnings +import json +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union +import torch_npu + +import torch +import torch.utils.checkpoint +from atb_speed.common.utils import load_atb_speed +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN + +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bert import BertConfig + + +def load_acl_transformer(): + acl_transformer_home_path = os.getenv("ATB_SPEED_HOME_PATH", "") + if not acl_transformer_home_path or not os.path.exists(acl_transformer_home_path): + raise RuntimeError("env ACLTRANSFORMER_HOME_PATH not exist, source set_env.sh") + lib_path = os.path.join(acl_transformer_home_path, "lib/libatb_speed_torch.so") + torch.classes.load_library(lib_path) + + +def is_nd(): + soc_version = torch_npu._C._npu_get_soc_version() + return soc_version in [104, 220, 221, 222, 223, 224] + + +IS_ND = is_nd() + +load_acl_transformer() +load_atb_speed() + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" + +# TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 +MASK_INC_DIM1 = 1 if IS_ND else 16 + +# QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2" +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity" +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with 
TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length: seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually + # occurs when its auto-generated, registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + 
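+            # the distance_embedding table covers relative offsets in [-(max_position_embeddings-1), max_position_embeddings-1],
+            # hence its 2 * max_position_embeddings - 1 rows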
+ self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
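+        # the raw scores below have shape (batch, num_heads, query_len, key_len); they are scaled by 1/sqrt(head_size) further down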
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
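+        # nn.Dropout acts as an identity when the model is in eval() mode, so this does not affect inference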
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + 
self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + present_key_value = 0 + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs 
+ (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[Union[Tuple[torch.Tensor], torch.Tensor], ...], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
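+        # hidden_states: (batch, seq_len, hidden_size); index 0 selects the [CLS] position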
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + base_model_prefix = "bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class KVAttentionManager: + def __init__(self, config: BertConfig, batch_size): + self.seq_len_tensor_inc = None + self.token_offset_tensor = None + self.seq_len_list_inc = None + self.seq_len_tensor_full = None + self.seq_len_list_full = None + self.nz_dim = 16 + self.is_full = True + self.batch_size = batch_size + self.num_layers = config.num_hidden_layers + self.hidden_size = config.hidden_size # // self.world_size + self.max_seq_len = int(os.getenv("MAX_SEQ_LEN", config.max_position_embeddings)) + self.token_offset = 1 + self.ori_len_list = [] + self.min_cache = None + if not IS_ND: + self.k_cache_input = torch.zeros(self.num_layers, + self.batch_size, # batch + self.hidden_size // self.nz_dim, + self.max_seq_len, + self.nz_dim, + device="npu", + dtype=torch.half) + + self.v_cache_input = torch.zeros(self.num_layers, + self.batch_size, # batch + self.hidden_size // self.nz_dim, + self.max_seq_len, + self.nz_dim, + device="npu", + dtype=torch.half) + self.k_cache_input = torch_npu.npu_format_cast(self.k_cache_input, 29) + torch.npu.empty_cache() + self.v_cache_input = torch_npu.npu_format_cast(self.v_cache_input, 29) + else: + self.k_cache_input = torch.zeros(self.num_layers, + batch_size, # batch + self.max_seq_len, + self.hidden_size, + device="npu", + dtype=torch.half) + self.v_cache_input = torch.zeros(self.num_layers, + batch_size, # batch + self.max_seq_len, + self.hidden_size, + device="npu", + dtype=torch.half) + torch.npu.empty_cache() + + self.attention_mask_max = torch.zeros( + (self.batch_size, self.max_seq_len, self.max_seq_len), device="npu", dtype=torch.half) + self.attention_mask_max_inc = torch.zeros( + (self.batch_size, MASK_INC_DIM1, self.max_seq_len), device="npu", dtype=torch.half) + + def init_seq_len_and_token_offset(self, seq_len): + self.token_offset = seq_len + self.seq_len_list_full = [self.token_offset] * self.batch_size + self.seq_len_tensor_full = torch.full((self.batch_size,), self.token_offset, dtype=torch.int32).npu() + self.seq_len_list_inc = [1] * self.batch_size + self.seq_len_tensor_inc = torch.full((self.batch_size,), 1, dtype=torch.int32).npu() + self.token_offset_tensor = torch.full((self.batch_size,), self.token_offset, dtype=torch.int32).npu() + + @property + def seq_len_list(self): + if self.is_full: + return self.seq_len_list_full + return self.seq_len_list_inc + + @property + def seq_len_tensor(self): + if self.is_full: + return self.seq_len_tensor_full + return self.seq_len_tensor_inc + + @property + def token_offset_list(self): + return [self.token_offset] * self.batch_size + + def init_attention_mask(self): + if IS_ND: + self.attention_mask_max.zero_() + self.attention_mask_max_inc.zero_() + else: + self.attention_mask_max.zero_() + self.attention_mask_max_inc = 
torch.zeros( + (self.batch_size, MASK_INC_DIM1, self.max_seq_len), device="npu", dtype=torch.half) + + def trans_data(self, tensor, trans_type="full"): + """ + :param tensor: + :param trans_type:full or inc + :return: + """ + if trans_type == "full": + return torch_npu.npu_format_cast(tensor.view( + self.batch_size, self.max_seq_len, + self.max_seq_len // self.nz_dim, self.nz_dim).transpose(1, 2).contiguous(), 29) + else: + return torch_npu.npu_format_cast(tensor.view( + self.batch_size, self.nz_dim, + self.max_seq_len // self.nz_dim, self.nz_dim).transpose(1, 2).contiguous(), 29) + + def get_attention_mask(self, attention_mask=None): + if not self.is_full: + return self.attention_mask_max_inc + else: + for i in range(self.batch_size): + self.attention_mask_max[i][:self.token_offset, :self.token_offset] = attention_mask[i] + if not IS_ND: + self.attention_mask_max_inc = self.trans_data(self.attention_mask_max_inc, "inc") + return self.trans_data(self.attention_mask_max, "full") + else: + return self.attention_mask_max + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the parameters of the model. 
+ Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
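+
+    In this Ascend-adapted variant the transformer stack is executed by the ATB "bge_large_FlashAttentionModel"
+    operator (see `init_ascend_operations` and `execute_ascend_operator`), so `forward` returns the last hidden
+    state tensor directly rather than a `BaseModelOutputWithPoolingAndCrossAttentions` object.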
+ """ + + def __init__(self, config: BertConfig, add_pooling_layer: bool = True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + # for ascend init + self.init_ascend_operations(config) + self.layer_id_list = [torch.tensor([i], dtype=torch.int32).npu() for i in range(config.num_hidden_layers)] + + def init_ascend_operations(self, config: BertConfig): + self.head_size = config.hidden_size // config.num_attention_heads + self.head_num = config.num_attention_heads + if hasattr(config, 'world_size'): + rank = torch.distributed.get_rank() + rank_size = torch.distributed.get_world_size() + self.acl_param = json.dumps({"headNum": self.head_num, "layerNormEps": config.layer_norm_eps, + "dk": self.head_size, "layerNum": config.num_hidden_layers, "rank": rank, + "rankSize": rank_size}) + else: + self.acl_param = json.dumps({"headNum": self.head_num, "layerNormEps": config.layer_norm_eps, + "dk": self.head_size, "layerNum": config.num_hidden_layers}) + self.max_position_embeddings = config.max_position_embeddings + + self.acl_fa_operation = torch.classes.ModelTorch.ModelTorch("bge_large_FlashAttentionModel") + + self.acl_fa_operation.set_param(self.acl_param) + + self.num_layers = config.num_hidden_layers + self.hidden_size = config.hidden_size + self.ascend_weight = [] + self.batch_size = 0 + self.kv_attention_manager = None + self.min_cache = torch.full( + (self.max_position_embeddings, self.max_position_embeddings), + torch.finfo(torch.half).min, dtype=torch.half).npu() + + def init_ascend_weight(self): + weights: List = [] + weights = [self.state_dict()["embeddings.word_embeddings.weight"], + self.state_dict()["embeddings.position_embeddings.weight"], + self.state_dict()["embeddings.token_type_embeddings.weight"], + self.state_dict()["embeddings.LayerNorm.weight"], + self.state_dict()["embeddings.LayerNorm.bias"] + ] + for i in range(self.num_layers): + weights_t = [] + weights_layer = self.encoder.layer[i].state_dict() + weights_t.append(weights_layer["attention.self.query.weight"]) + weights_t.append(weights_layer["attention.self.query.bias"]) + weights_t.append(weights_layer["attention.self.key.weight"]) + weights_t.append(weights_layer["attention.self.key.bias"]) + weights_t.append(weights_layer["attention.self.value.weight"]) + weights_t.append(weights_layer["attention.self.value.bias"]) + weights_t.append(weights_layer["attention.output.dense.weight"]) + weights_t.append(weights_layer["attention.output.dense.bias"]) + weights_t.append(weights_layer["attention.output.LayerNorm.weight"]) + weights_t.append(weights_layer["attention.output.LayerNorm.bias"]) + weights_t.append(weights_layer["intermediate.dense.weight"]) + weights_t.append(weights_layer["intermediate.dense.bias"]) + weights_t.append(weights_layer["output.dense.weight"]) + weights_t.append(weights_layer["output.dense.bias"]) + weights_t.append(weights_layer["output.LayerNorm.weight"]) + weights_t.append(weights_layer["output.LayerNorm.bias"]) + weights.extend(weights_t) + self.ascend_weight = weights + self.acl_fa_operation.set_weight(weights) + + def prepare_inputs_for_ascend(self, input_ids, position_ids, token_type_ids, attention_mask=None, + past_key_values=None): + self.kv_attention_manager.is_full = not past_key_values + position_ids = position_ids.npu() + token_type_ids = token_type_ids.npu() + 
attention_mask = attention_mask.float().half() + mask = attention_mask.clone() + + mask[mask == 0] = -65504.0 + mask[mask == 1] = -0.0 + inputs = [input_ids, + position_ids, + token_type_ids, + self.kv_attention_manager.k_cache_input, + self.kv_attention_manager.v_cache_input, + self.kv_attention_manager.get_attention_mask(mask), + self.kv_attention_manager.token_offset_tensor, + self.kv_attention_manager.seq_len_tensor, + ] + self.layer_id_list + return inputs + + def execute_ascend_operator(self, input_ids, position_ids, token_type_ids, attention_mask=None, + past_key_values=None): + acl_inputs = self.prepare_inputs_for_ascend(input_ids, position_ids, token_type_ids, attention_mask, + past_key_values) + tmp_param = json.dumps( + {"tokenOffset": self.kv_attention_manager.token_offset_list, + "seqLen": self.kv_attention_manager.seq_len_list + }) + acl_model_out = self.acl_fa_operation.execute(acl_inputs, tmp_param) + acl_hidden_state = acl_model_out[0] + return acl_hidden_state + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. past_key_values (`tuple( + tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `( + batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value + hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        past_key_values_length = 0
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # (re)build the KV cache manager whenever the batch size changes
+        if batch_size != self.batch_size:
+            self.batch_size = batch_size
+            self.kv_attention_manager = KVAttentionManager(self.config, batch_size)
+            self.kv_attention_manager.min_cache = self.min_cache
+
+        if past_key_values is None:
+            self.kv_attention_manager.init_attention_mask()
+            # assume every sequence in the batch has the same length
+            self.kv_attention_manager.init_seq_len_and_token_offset(seq_length)
+
+        if past_key_values is not None:
+            past_key_values_length = self.kv_attention_manager.token_offset
+            # incremental step: advance the token offset by one for the newly generated token
+            self.kv_attention_manager.token_offset = self.kv_attention_manager.token_offset + 1
+            self.kv_attention_manager.token_offset_tensor += 1
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+        if not past_key_values:  # with flash attention, the incremental attention mask is computed along with the first token
+            self.kv_attention_manager.ori_len_list = attention_mask.sum(dim=-1)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        # lazily load the weights into the Ascend ACL model, then run it
+        if not self.ascend_weight:
+            self.init_ascend_weight()
+
+        hidden_states = self.execute_ascend_operator(input_ids,
+                                                     position_ids,
+                                                     token_type_ids,
+                                                     attention_mask,
+                                                     past_key_values)
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    """
+    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config: BertConfig): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING +) +class BertLMHeadModel(BertPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config: BertConfig): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> 
Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors
+            of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs + ): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config: BertConfig): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + BERT_START_DOCSTRING, +) +class 
BertForNextSentencePrediction(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BertForNextSentencePrediction + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_QA, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/mindie/examples/models/bge/large-zh-v1.5/ops_info.json b/mindie/examples/models/bge/large-zh-v1.5/ops_info.json new file mode 100644 index 00000000..03e4db2f --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/ops_info.json @@ -0,0 +1,10 @@ +{ + "black-list": { + "to-add": [ + "Add", + "Sub", + "Mul", + "SoftmaxV2" + ] + } +} \ No newline at end of file diff --git a/mindie/examples/models/bge/large-zh-v1.5/requirements.txt b/mindie/examples/models/bge/large-zh-v1.5/requirements.txt new file mode 100644 index 00000000..b5a5a6fb --- /dev/null +++ b/mindie/examples/models/bge/large-zh-v1.5/requirements.txt @@ -0,0 +1,3 @@ +optimum==1.18.0 +onnx==1.16.0 +onnxruntime==1.17.1 \ No newline at end of file diff --git a/mindie/examples/models/bge/reranker-large/README.md b/mindie/examples/models/bge/reranker-large/README.md new file mode 100644 index 00000000..e78cfc6f --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/README.md @@ -0,0 +1,251 @@ +# README + +# 特性矩阵 +- 此矩阵罗列了各bge-reranker-large模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|--------------|-------------------------|---------------------------| ---- |-----| --------------- | --------------- | -------- | --------- | --------- | ------------ | -------------------------- | ---- | ------ | ---- |-----| +| bge-reranker-large | 支持world size 1 | 支持world size 1 | √ | × | × | × | × | × | × | × | × | × | × | × | × | + +# bge-reranker-large模型-推理指导 + +- [概述](#概述) + - [输入输出数据](#输入输出数据) +- [推理环境准备](#推理环境准备) +- [快速上手](#快速上手) + - [获取源码](#获取源码) + - [模型转换](#模型转换) + - [模型推理](#模型推理) +- [模型推理性能&精度](#模型推理性能精度) + - [模型推理性能](#模型推理性能) + - [精度](#精度) + +## 概述 + +### 
模型介绍 + +`bge-reranker-large` 是由智源研究院研发的交叉编码器重排模型,可对查询和答案实时计算相关性分数,这比向量模型(即双编码器)更准确,但比向量模型更耗时。 + +### 开源模型地址 + +```text +url=https://huggingface.co/BAAI/bge-reranker-large +commit_id=bc0c7056d15eaea221616887bf15da63743d19e1 +model_name=bge-reranker-large +``` + +### 路径变量解释 + +```text +{cur_dir} +├─ .cache +│ ├─ huggingface +│ │ └─ datasets +│ │ └─ C-MTEB +│ │ └─ T2Reranking +│ │ └─ dev-00000-of-00001-65d96bde8023d9b9.parquet +├─ models +│ ├─ om +│ │ ├─ bge-reranker-large_{soc_version}_{precision_mode}_linux_aarch64.om +│ ├─ onnx +│ │ ├─ model.onnx +│ └─ pytorch +│ └─ pytorch_model.bin +├─ eval_performance.py +├─ eval_precision.py +└─ run.py +``` + +| 变量名 | 含义 | +|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| +| soc_version | npu芯片的处理器的版本,可以使用 `npu-smi info` 查询 | +| precision_mode | 转换的om模型的精度模式,参考[ATC工具参数](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/devaids/auxiliarydevtool/atlasatc_16_0099.html) | + + +### 输入输出数据 + +**输入数据** + +| 输入数据 | 数据类型 | 大小 | 数据排布格式 | +|----------------|-------|----------------------|--------| +| input_ids | INT64 | batch_size * seq_len | ND | +| attention_mask | INT64 | batch_size * seq_len | ND | + +**输出数据** + +| 输出数据 | 数据类型 | 大小 | 数据排布格式 | +|--------|---------|--------------------|--------| +| output | FLOAT32 | batch_size * class | ND | + +## 推理环境准备 + +**该模型需要以下插件与驱动** + +| 配套 | 版本 | 环境准备指导 | +|---------|----------|---------------------------------------------------------------------------------------------------------------| +| 固件与驱动 | 23.0.RC3 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies/pies_00001.html) | +| CANN | 7.0.RC1 | - | +| Python | 3.10 | - | +| Pytorch | 2.1.0 | - | + +说明:Atlas 300I Duo 推理卡请以 CANN 版本选择实际固件与驱动版本。 + +## 快速上手 + +### 获取源码 + +1. 获取本项目源码 + ```shell + git clone https://gitee.com/ascend/MindIE-LLM.git # 克隆本仓库代码 + git checkout master # 切换对应分支 + cd examples/atb_models/pytorch/examples/BAAI/bge-reranker-large # 打开工作(当前)目录 {cur_dir} + ``` + +2. 安装依赖 + + 安装python依赖 + ```shell + pip install -r requirements.txt + ``` + 下载安装 `ais_bench` 推理工具 + + [ais_bench推理工具使用指南](https://gitee.com/ascend/tools/blob/master/ais-bench_workload/tool/ais_bench/README.md) + ```shell + pip install ./aclruntime-{version}-{python_version}-linux_{arch}.whl + pip install ./ais_bench-{version}-py3-none-any.whl + # {version}表示软件版本号,{python_version}表示Python版本号,{arch}表示CPU架构 + ``` + +3. 获取开源模型 + ```shell + git lfs install + GIT_LFS_SKIP_SMUDGE=1 git clone https://gitee.com/ascend/MindIE-LLM.git + ``` + +4. 准备数据集 + + 下载 [C-MTEB/T2Reranking](https://huggingface.co/datasets/C-MTEB/T2Reranking) 数据集 + + ```shell + mkdir .cache/huggingface/datasets/C-MTEB/ + cd .cache/huggingface/datasets/C-MTEB/ + git clone https://huggingface.co/datasets/C-MTEB/T2Reranking + mv T2Reranking/data/dev-00000-of-00001-65d96bde8023d9b9.parquet T2Reranking/ + ``` + +### 模型转换 + +1. 获取开源模型 pytorch 权重文件 [pytorch_model.bin](https://huggingface.co/BAAI/bge-reranker-large/blob/main/pytorch_model.bin),放在 `models/pytorch` 目录中 + +2. 获取开源模型 onnx 权重文件 [model.onnx](https://huggingface.co/BAAI/bge-large-zh-v1.5/resolve/main/pytorch_model.bin?download=true),放在 `models/onnx` 目录中 + +3. 
运行脚本转换模型 + + ```shell + bash ${cur_dir}/convert.sh ${onnx} ${om} ${precision_mode} + ``` + + - 参数说明,参考 [ATC工具参数](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/devaids/auxiliarydevtool/atlasatc_16_0039.html) + - `onnx`:转换后的onnx模型文件路径 + - `om`:转换后的om模型文件路径 + - `precision_mode`:模型精度模式,精度高低排序 `origin>mixed_float16>fp16`,性能优劣排序 `fp16>=mixed_float16>origin`,推荐使用 `mixed_float16` 以在保证精度的前提下获得最大性能,默认为 `mixed_float16` + +### 模型推理 + +1. 执行推理 + + ```shell + python run.py \ + --model_type_or_path=${model_type} or ${model_path} + --device=${device} + ``` + + - 参数说明 + - `model_type_or_path`:选择需要推理的模型类型或模型文件路径 + - `device`:选择加载模型的芯片id + +2. 性能测试 + + ```shell + python eval_performance.py \ + --model_type_or_path=${model_type} or ${model_path} \ + --input_shape=${batch_size},${seq_len} \ + --device=${device} \ + --loop=${loop} + ``` + + - 参数说明 + - `model_type_or_path`:选择需要推理的模型类型或模型文件路径 + - `batch_size`:选择每次推理时加载的数据集长度 + - `seq_len`:选择每次推理时加载的文本长度 + - `device`:选择加载模型的芯片id + - `loop`:验证循环次数 + +3. 精度测试 + + ```shell + python eval_precision.py \ + --model_type_or_path=${model_type} or ${model_path} \ + --batch_size=${batch_size} \ + --device=${device} + ``` + + - 参数说明 + - `model_type_or_path`:选择需要推理的模型类型或模型文件路径 + - `batch_size`:选择每次推理时加载的数据集长度 + - `device`:选择加载模型的芯片id + +## 模型推理性能&精度 + +### 模型推理性能 + +吞吐率:1000 * batch_size / compute_time + +| 环境 | 芯片型号 | batch_size | seq_len | 吞吐率(fps) | +|-----|-------------|------------|---------|----------| +| NPU | Ascend310P3 | 20 | 512 | 43.84 | +| NPU | Ascend310P3 | 50 | 512 | 44.23 | +| GPU | NVIDIA A10 | 20 | 512 | 46.43 | +| GPU | NVIDIA A10 | 50 | 512 | 49.16 | + +说明:Atlas 300I Duo 推理卡为单卡双芯,比较吞吐率时需要×2 + +| 环境 | 芯片型号 | batch_size | seq_len | 吞吐率(fps) | +|-----|-------------|------------|---------|----------| +| NPU | Ascend910B4 | 20 | 512 | 144.02 | +| NPU | Ascend910B4 | 50 | 512 | 135.82 | +| GPU | NVIDIA L40S | 20 | 512 | 119.75 | +| GPU | NVIDIA L40S | 50 | 512 | 113.42 | + +### 模型推理精度 + +精度验证NPU环境使用 `OM` 模型,GPU环境使用 `ONNX` 模型 + +有数据集精度验证选择 [C-MTEB/T2Reranking](https://huggingface.co/datasets/C-MTEB/T2Reranking) 任务,开源模型在该任务下 MAP 分数为 67.28 + +| 环境 | 芯片型号 | MAP(%) | MRR@10(%) | 执行时间(s) | +|-----|-------------|--------|-----------|---------| +| NPU | Ascend310P3 | 67.60 | 77.68 | 4496.25 | +| GPU | Nvidia A10 | 67.61 | 77.66 | 2216.56 | + +| 环境 | 芯片型号 | MAP(%) | MRR@10(%) | 执行时间(s) | +|-----|-------------|--------|-----------|---------| +| NPU | Ascend910B4 | 67.60 | 77.66 | 985.30 | +| GPU | Nvidia L40S | 67.61 | 77.66 | 991.57 | + +说明: + +- MAP:平均精度均值(Mean Average Precision)$MAP = \frac{1}{|U|} \sum_{i=1}^{|U|} hit(i) \times \frac{1}{P_i}$ +- MRR:平均倒数排名(Mean Reciprocal Rank)$MRR = \frac{1}{N} \sum_{i=1}^N \frac{1}{p_i}$ + +无数据集精度验证选择输入 `[[query, positive], [query, negative]]` 文本,`torch.allclose` 满足1%精度 + +| 环境 | 芯片型号 | 推理结果 | +|-----|-------------|--------------------------| +| NPU | Ascend310P3 | tensor([7.5195, 1.3613]) | +| GPU | Nvidia A10 | tensor([7.5152, 1.3654]) | + +| 环境 | 芯片型号 | 推理结果 | +|-----|-------------|--------------------------| +| NPU | Ascend910B4 | tensor([7.5195, 1.3779]) | +| GPU | Nvidia L40S | tensor([7.5140, 1.3697]) | diff --git a/mindie/examples/models/bge/reranker-large/config.json b/mindie/examples/models/bge/reranker-large/config.json new file mode 100644 index 00000000..1363595e --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/config.json @@ -0,0 +1,8 @@ +{ + "default_path": { + "tokenizer_path": "models/pytorch", + "pytorch_model_path": "models/pytorch", + "onnx_model_path": "models/onnx", 
+ "om_model_path": "models/om/bge-reranker-large_Ascend910B4_allow_mix_precision_linux_aarch64.om" + } +} diff --git a/mindie/examples/models/bge/reranker-large/convert.sh b/mindie/examples/models/bge/reranker-large/convert.sh new file mode 100644 index 00000000..480736ac --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/convert.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# 定义模型检查点和保存目录 +onnx_directory="$1" +om_directory="$2" +soc_version=$(python -c "import torch;import torch_npu;print(torch.npu.get_device_name())") + +# 检查是否输入了转换精度参数 +if [ -z "$3" ]; then + precision_mode=mixed_float16 +else + precision_mode="$3" +fi + +# 检查ONNX模型是否存在 +if [ -f "$onnx_directory/model.onnx" ]; then + echo "ONNX model found at $onnx_directory/model.onnx" +else + echo "Error: Unable to find ONNX model." + exit 1 +fi + +# 使用ATC命令对ONNX模型进行转换或优化 +atc --model="$onnx_directory/model.onnx" \ + --framework=5 \ + --output="$om_directory/bge-reranker-large_'$soc_version'_'$precision_mode'" \ + --soc_version="$soc_version" \ + --input_shape="input_ids:-1,-1;attention_mask:-1,-1" \ + --precision_mode_v2="$precision_mode" \ + --modify_mixlist="$om_directory/ops_info.json" + +# 检查ATC命令是否执行成功 +# shellcheck disable=SC2181 +if [ $? -eq 0 ]; then + echo "Model conversion with ATC successful." +else + echo "Error: Failed to convert model with ATC." + exit 1 +fi diff --git a/mindie/examples/models/bge/reranker-large/eval_performance.py b/mindie/examples/models/bge/reranker-large/eval_performance.py new file mode 100644 index 00000000..22748d03 --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/eval_performance.py @@ -0,0 +1,299 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. +import argparse +import json +import logging +import os +import time +from typing import Any, List, Union, Tuple + +import datasets +import numpy as np +import torch +import transformers.tokenization_utils_base +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from optimum.onnxruntime import ORTModelForSequenceClassification +from tqdm import tqdm as progressbar + +from atb_llm.utils.file_utils import safe_open + + +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description='Evaluate LLM.') + parser.add_argument( + '--model_type_or_path', + type=str, + required=True, + help='Specipy model type to load default model or path to the directory containing model file.' + ) + parser.add_argument( + '--input_shape', + type=str, + required=True, + help='Shape of input tensors.' + ) + parser.add_argument( + '--device', + type=int, + default=6, + choices=list(range(8)), + help='Adapt model on device id x.' + ) + parser.add_argument( + '--loop', + type=int, + default=50, + help='Evaluation loops.' 
+ ) + return parser.parse_args() + + +class Model: + def __init__(self, tokenizer_path: str, device_id: int) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + self.device, self.runtime = self.init_runtime(device_id) + + def init_runtime(self, device_id: int) -> Tuple[Union[str, int], Any]: + if self.__class__.__name__.startswith(('PyTorch', 'ONNX')): + try: + import torch_npu + except ImportError: + device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu' + else: + device = 'npu:{}'.format(device_id) + torch_npu.npu.set_device(device_id) + torch.npu.set_compile_mode(jit_compile=False) + return device, 0 + elif self.__class__.__name__.startswith('OM'): + from ais_bench.infer.interface import InferSession + return device_id, InferSession + else: + raise RuntimeError + + def tokenize( + self, + sentences_batch: List[List[str]], + seq_len: int + ) -> transformers.tokenization_utils_base.BatchEncoding: + encoded_inputs = self.tokenizer( + sentences_batch, + padding='max_length', + truncation=True, + return_tensors='pt', + max_length=seq_len + ).to(self.device) + return encoded_inputs + + def encode(self, encoded_inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + # Compute token embedding time + computing_time = self._encode_batched(encoded_inputs) + + return computing_time + + def compute_scores(self, pairs: List[List[str]], batch_size: int, seq_len: int, loop: int) -> dict: + # Tokenize sentences + encoded_inputs = self.tokenize(pairs, seq_len) + + all_computing_time = [] + for _ in progressbar(range(loop), 'Evaluating...'): + computing_time = self.encode(encoded_inputs) + all_computing_time.append(computing_time) + + try: + throughput = 1000 * batch_size / np.mean(all_computing_time) + except ZeroDivisionError as e: + raise RuntimeError('{} because no evaluation results'.format(e)) from e + + scores = { + 'compute_time': { + 'min': np.min(all_computing_time), + 'max': np.max(all_computing_time), + 'mean': np.mean(all_computing_time), + 'median': np.median(all_computing_time), + 'percentile(99%)': np.percentile(all_computing_time, 99) + }, + 'throughput': throughput + } + + return scores + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + """ Returns a list of embeddings for the given sentences. 
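+        This base-class implementation is a placeholder that does no work and returns 0.0;
+        the PyTorchModel, ONNXModel and OMModel subclasses override it to run one forward
+        pass and return the elapsed time in milliseconds.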
+ + Args: + inputs (`BatchEncoding`): List of sentences to encode + + Returns: + `float: Computing time of embeddings for the given sentences + """ + # 规避【华为Python规范】【建议】G.CLS.07 类的方法不需要访问实例时,建议定义为staticmethod或classmethod + _ = self + return 0.0 + + +class PyTorchModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, device_id: int): + super(PyTorchModel, self).__init__(tokenizer_path, device_id) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_path, + local_files_only=True, + trust_remote_code=True + ).half().to(self.device) + self.model.eval() + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + tick = time.time() + with torch.no_grad(): + self.model(**inputs, return_dict=True).logits.view(-1, ).float().cpu() + tock = time.time() + return 1000 * (tock - tick) + + +class ONNXModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, device_id: int): + super(ONNXModel, self).__init__(tokenizer_path, device_id) + self.ort = ORTModelForSequenceClassification.from_pretrained(model_path).to(self.device) + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + tick = time.time() + with torch.inference_mode(): + self.ort(**inputs, return_dict=True).logits.view(-1, ).float().cpu() + tock = time.time() + return 1000 * (tock - tick) + + +class OMModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, device_id: int) -> None: + super(OMModel, self).__init__(tokenizer_path, device_id) + self.session = self.runtime(device_id, model_path) + + def _encode_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> float: + input_ids = inputs.data['input_ids'].numpy().astype(np.int64) + attention_mask = inputs.data['attention_mask'].numpy().astype(np.int64) + + tick = time.time() + self.session.infer(feeds=[input_ids, attention_mask], mode='dymshape', custom_sizes=10000000) + tock = time.time() + + return 1000 * (tock - tick) + + +class PerformanceEvaluator: + def __init__(self, metadata: dict) -> None: + self.metadata = metadata + + # load dataset from HuggingFace hub + self.dataset = datasets.load_dataset( + self.metadata['dataset']['path'].split('.')[-1], + data_files={self.metadata['eval_splits'][0]: self.metadata['dataset']['path']} + ) + self.samples = self.dataset[self.metadata['eval_splits'][0]] + + def __call__( + self, + model: Model, + input_shape: Union[Tuple, List], + loop: int) -> dict: + """This is called during training to evaluate the model. + It returns scores. 
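+        The returned dict mirrors Model.compute_scores: per-loop latency statistics in
+        milliseconds (min/max/mean/median/99th percentile) under 'compute_time', plus
+        'throughput' computed as 1000 * batch_size / mean latency.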
+ + Args: + model (`Model`): the model to evaluate + input_shape (`Union[Tuple[int, int], List[int, int]]`): shape of input tensors + loop (`int`): evaluation loops + """ + return self.compute_performance(model, input_shape, loop) + + def compute_performance( + self, + model: Model, + input_shape: Union[Tuple, List], + loop: int) -> dict: + batch_size, seq_len = input_shape + + pairs = [] + for sample in self.samples: + query = sample['query'] + docs = [] + docs.extend(sample['positive']) + docs.extend(sample['negative']) + for doc in docs: + pairs.append([query, doc]) + pairs = pairs[:batch_size] + + scores = model.compute_scores(pairs, batch_size, seq_len, loop) + + return scores + + +class Evaluation: + def __init__(self, eval_args: argparse.Namespace): + self.input_shape = tuple(map(int, eval_args.input_shape.split(','))) + self.device_id = eval_args.device + self.loop = eval_args.loop + + # dataset metadata + self.metadata = { + 'name': 'T2RerankingLocal', + 'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking', + 'reference': 'https://arxiv.org/abs/2304.03679', + 'dataset': { + 'path': '.cache/huggingface/datasets/C-MTEB/T2Reranking/dev-00000-of-00001-65d96bde8023d9b9.parquet', + 'revision': '76631901a18387f85eaa53e5450019b87ad58ef9', + }, + 'type': 'Reranking', + 'category': 's2p', + 'eval_splits': ['test'], + 'eval_langs': ['zh'], + 'main_score': 'map' + } + + # default model path + with safe_open('config.json', 'r', encoding='utf-8') as reader: + text = reader.read() + default_path = json.loads(text)['default_path'] + pytorch_model_path = self.tokenizer_path = os.path.abspath(default_path['tokenizer_path']) + onnx_model_path = os.path.abspath(default_path['onnx_model_path']) + om_model_path = os.path.abspath(default_path['om_model_path']) + + model_path_map = {'pytorch': pytorch_model_path, 'onnx': onnx_model_path, 'om': om_model_path} + + self.model_type = eval_args.model_type_or_path.removesuffix('/').split('.')[-1].split('/')[-1] + default_model_path = model_path_map.get(self.model_type, 'not exist') + if default_model_path != 'not exist': + self.model_path = ( + eval_args.model_type_or_path + if os.path.isdir(eval_args.model_type_or_path) or os.path.isfile(eval_args.model_type_or_path) + else default_model_path + ) + else: + raise RuntimeError( + 'load model failed because ' + '\'{}\' is not a valid model type or path'.format(eval_args.model_type_or_path) + ) + + def load_model(self) -> Model: + model_map = {'pytorch': PyTorchModel, 'onnx': ONNXModel, 'om': OMModel} + try: + model = model_map[self.model_type]( + tokenizer_path=self.tokenizer_path, + model_path=self.model_path, + device_id=self.device_id + ) + except KeyError as e: + raise RuntimeError('load {} model failed because {}'.format(self.model_type, e)) from e + return model + + def run(self) -> dict: + model = self.load_model() + evaluator = PerformanceEvaluator(self.metadata) + eval_results = evaluator(model, self.input_shape, self.loop) + return eval_results + + +if __name__ == '__main__': + logger = logging.getLogger(__name__) + logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO) + args = get_args() + evaluation = Evaluation(args) + results = evaluation.run() + logging.info(results) diff --git a/mindie/examples/models/bge/reranker-large/eval_precision.py b/mindie/examples/models/bge/reranker-large/eval_precision.py new file mode 100644 index 00000000..b7ca60ea --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/eval_precision.py @@ -0,0 +1,351 @@ +# 
Copyright Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. +import argparse +import json +import logging +import os +from typing import Any, List, Union, Tuple + +import datasets +import numpy as np +import torch +import transformers.tokenization_utils_base +from mteb import MTEB, AbsTaskReranking +from C_MTEB.tasks import ChineseRerankingEvaluator +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from optimum.onnxruntime import ORTModelForSequenceClassification +from tqdm import tqdm as progressbar + +from atb_llm.utils.file_utils import safe_open + + +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description='Evaluate LLM.') + parser.add_argument( + '--model_type_or_path', + type=str, + required=True, + help='Specipy model type to load default model or path to the directory containing model file.' + ) + parser.add_argument( + '--batch_size', + type=int, + default=20, + help='Batch size of dataset for computing.' + ) + parser.add_argument( + '--device', + type=int, + default=6, + choices=list(range(8)), + help='Adapt model on device id x.' + ) + return parser.parse_args() + + +# copied from mteb.evaluation.evaluators.utils.cos_sim +def cos_sim(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j. + + Returns: + Matrix with res[i][j] = cos_sim(a[i], b[j]) + """ + if not isinstance(a, torch.Tensor): + a = torch.tensor(a) + if not isinstance(b, torch.Tensor): + b = torch.tensor(b) + if len(a.shape) == 1: + a = a.unsqueeze(0) + if len(b.shape) == 1: + b = b.unsqueeze(0) + + a_norm = torch.nn.functional.normalize(a, p=2, dim=1) + b_norm = torch.nn.functional.normalize(b, p=2, dim=1) + + # transpose will cause RuntimeError in C_MTEB.tasks.ChineseRerankingEvaluator.compute_metrics_from_biencoder(): + # mat1 and mat2 shapes cannot be multiplied + try: + similarity = torch.mm(a_norm, b_norm.transpose(0, 1)) + except RuntimeError: + similarity = torch.mm(a_norm, b_norm) + + return similarity + + +class ChineseRerankingEvaluatorTweaked(ChineseRerankingEvaluator): + # copied from mteb.evaluation.evaluators.RerankingEvaluator._compute_metrics_instance with similarity_fct->cos_sim + def _compute_metrics_instance( + self, + query_emb: torch.Tensor, + docs_emb: torch.Tensor, + is_relevant: List[bool] + ) -> dict[str, float]: + """Computes metrics for a single instance = (query, positives, negatives) + + Args: + query_emb (`torch.Tensor` of shape `(num_queries, hidden_size)`): Query embedding + if `num_queries` > 0: we take the closest document to any of the queries + docs_emb (`torch.Tensor` of shape `(num_pos+num_neg, hidden_size)`): Candidates documents embeddings + is_relevant (`List[bool]` of length `num_pos+num_neg`): True if the document is relevant + + Returns: + scores (`Dict[str, float]`): + - `mrr`: Mean Reciprocal Rank @ `self.mrr_at_k` + - `ap`: Average Precision + """ + pred_scores = cos_sim(query_emb, docs_emb) + if len(pred_scores.shape) > 1: + pred_scores = torch.amax(pred_scores, dim=0) + + pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order + + mrr = self.mrr_at_k_score(is_relevant, pred_scores_argsort, self.mrr_at_k) + ap = self.ap_score(is_relevant, pred_scores.cpu().tolist()) + return {'mrr': mrr, 'ap': ap} + + +# copied from C_MTEB.tasks.Reranking.evaluate +def evaluate(self, model_for_eval, split: str = 'test', **kwargs: Any) -> dict[str, float]: + if not self.data_loaded: + self.load_data() + + data_split = 
self.dataset[split] + + evaluator = ChineseRerankingEvaluatorTweaked(data_split, **kwargs) + scores = evaluator(model_for_eval) + + return dict(scores) + + +# rewrite +AbsTaskReranking.evaluate = evaluate + + +# custom task +class T2RerankingLocal(AbsTaskReranking): + # 规避【华为Python规范】【要求】G.CLS.08 避免在__init__方法外定义类实例属性 + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.dataset = None + self.data_loaded = None + + @property + def description(self) -> dict: + return { + 'name': 'T2RerankingLocal', + 'description': 'T2Ranking: A large-scale Chinese Benchmark for Passage Ranking', + 'reference': "https://arxiv.org/abs/2304.03679", + 'dataset': { + 'path': '.cache/huggingface/datasets/C-MTEB/T2Reranking/dev-00000-of-00001-65d96bde8023d9b9.parquet', + 'revision': '76631901a18387f85eaa53e5450019b87ad58ef9', + }, + 'type': 'Reranking', + 'category': 's2p', + 'eval_splits': ['test'], + 'eval_langs': ['zh'], + 'main_score': 'map', + } + + def load_data(self, **kwargs) -> None: + if self.data_loaded: + return + + try: + self.dataset = datasets.load_dataset( + 'parquet', + data_files={self.description['eval_splits'][0]: self.description['dataset']['path']} + ) + except KeyError as e: + raise RuntimeError('load dataset failed because {}'.format(e)) from e + else: + self.data_loaded = True + + +# custom model +class Model: + def __init__(self, tokenizer_path: str, batch_size: int, device_id: int) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + self.device, self.runtime = self.init_runtime(device_id) + self.batch_size = batch_size + + def init_runtime(self, device_id: int) -> Tuple[Union[str, int], Any]: + if self.__class__.__name__.startswith(('PyTorch', 'ONNX')): + try: + import torch_npu + except ImportError: + device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu' + else: + device = 'npu:{}'.format(device_id) + torch_npu.npu.set_device(device_id) + torch.npu.set_compile_mode(jit_compile=False) + return device, 0 + elif self.__class__.__name__.startswith('OM'): + from ais_bench.infer.interface import InferSession + return device_id, InferSession + else: + raise RuntimeError + + def encode(self, sentences: List[str]) -> torch.Tensor: + """ Returns a list of embeddings for the given sentences. + Args: + sentences (`List[str]`): List of sentences to encode + + Returns: + `torch.Tensor`: Tensor of embeddings for the given sentences + """ + all_embeddings = [] + + for start_index in progressbar(range(0, len(sentences), self.batch_size)): + sentences_batch = sentences[start_index:start_index + self.batch_size] + # Tokenize sentences + encoded_inputs = self.tokenizer( + sentences_batch, + padding='max_length', + truncation=True, + return_tensors='pt', + max_length=512 + ) + # Compute token embeddings + embeddings = self._encode_or_compute_batched(encoded_inputs) + all_embeddings.extend(embeddings) + + if all_embeddings: + if isinstance(all_embeddings, np.ndarray): + all_embeddings = torch.from_numpy(all_embeddings) + else: + all_embeddings = torch.stack(all_embeddings) + else: + all_embeddings = torch.Tensor() + + return all_embeddings + + def compute_score(self, sentence_pairs: Union[List[List[str]], Tuple[str, str]]) -> List[float]: + """ Returns a list of scores for the given sentence pairs. 
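+        Scores are the reranker's raw relevance logits; when a single pair is passed, the
+        one-element result is returned as a scalar. Illustrative call (hypothetical strings):
+            compute_score([['what is a panda?', 'The giant panda is a bear native to China.'],
+                           ['what is a panda?', 'Paris is the capital of France.']])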
+ Args: + sentence_pairs (`Union[List[List[str]], Tuple[str, str]]`): List of sentences pairs to compute score + + Returns: + `List[float]`: List of scores for the given sentence pairs + """ + if not isinstance(sentence_pairs, list): + raise TypeError('type of `sentence_pairs` is not `list`') + if isinstance(sentence_pairs[0], str): + sentence_pairs = [sentence_pairs] + + all_scores = [] + + for start_index in progressbar(range(0, len(sentence_pairs), self.batch_size), 'Computing'): + pairs_batch = sentence_pairs[start_index:start_index + self.batch_size] + # Tokenize sentences + encoded_inputs = self.tokenizer( + pairs_batch, + padding='max_length', + truncation=True, + return_tensors='pt', + max_length=512 + ).to(self.device) + scores = self._encode_or_compute_batched(encoded_inputs) + all_scores.extend(scores.numpy().tolist()) + + return all_scores[0] if len(all_scores) == 1 else all_scores + + def _encode_or_compute_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> torch.Tensor: + """ Returns a list of embeddings for the given sentences. + + Args: + inputs (`BatchEncoding`): List of sentences to encode + + Returns: + `torch.Tensor`: Tensor of embeddings for the given sentences + """ + # 规避【华为Python规范】【建议】G.CLS.07 类的方法不需要访问实例时,建议定义为staticmethod或classmethod + _ = self + return torch.tensor(0) + + +class PyTorchModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, batch_size: int, device_id: int): + super(PyTorchModel, self).__init__(tokenizer_path, batch_size, device_id) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_path, + local_files_only=True, + trust_remote_code=True + ).half().to(self.device) + self.model.eval() + + def _encode_or_compute_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> torch.Tensor: + with torch.no_grad(): + outputs = self.model(**inputs, return_dict=True).logits.view(-1, ).float().cpu() + return outputs + + +class ONNXModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, batch_size: int, device_id: int): + super(ONNXModel, self).__init__(tokenizer_path, batch_size, device_id) + self.ort = ORTModelForSequenceClassification.from_pretrained(model_path).to(self.device) + + def _encode_or_compute_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> torch.Tensor: + with torch.inference_mode(): + outputs = self.ort(**inputs, return_dict=True).logits.view(-1, ).float().cpu() + return outputs + + +class OMModel(Model): + def __init__(self, tokenizer_path: str, model_path: str, batch_size: int, device_id: int) -> None: + super(OMModel, self).__init__(tokenizer_path, batch_size, device_id) + self.session = self.runtime(device_id, model_path) + + def _encode_or_compute_batched(self, inputs: transformers.tokenization_utils_base.BatchEncoding) -> torch.Tensor: + input_ids = inputs.data['input_ids'].numpy().astype(np.int64) + attention_mask = inputs.data['attention_mask'].numpy().astype(np.int64) + session_outputs = self.session.infer(feeds=[input_ids, attention_mask], mode='dymshape', custom_sizes=10000000) + outputs = torch.from_numpy(session_outputs[0][:, 0]).view(-1, ).float() + return outputs + + +def load_model(model_args: argparse.Namespace) -> Model: + # default model path + with safe_open('config.json', 'r', encoding='utf-8') as reader: + text = reader.read() + default_path = json.loads(text)['default_path'] + pytorch_model_path = tokenizer_path = os.path.abspath(default_path['tokenizer_path']) + onnx_model_path = 
os.path.abspath(default_path['onnx_model_path']) + om_model_path = os.path.abspath(default_path['om_model_path']) + + model_path_map = {'pytorch': pytorch_model_path, 'onnx': onnx_model_path, 'om': om_model_path} + model_map = {'pytorch': PyTorchModel, 'onnx': ONNXModel, 'om': OMModel} + + model_type = model_args.model_type_or_path.removesuffix('/').split('.')[-1].split('/')[-1] + default_model_path = model_path_map.get(model_type, 'not exist') + if default_model_path != 'not exist': + model_path = ( + model_args.model_type_or_path + if os.path.isdir(model_args.model_type_or_path) or os.path.isfile(model_args.model_type_or_path) + else default_model_path + ) + else: + raise RuntimeError( + 'load model failed because ' + '\'{}\' is not a valid model type or path'.format(model_args.model_type_or_path) + ) + try: + model_for_eval = model_map[model_type]( + tokenizer_path=tokenizer_path, + model_path=model_path, + batch_size=model_args.batch_size, + device_id=model_args.device + ) + except KeyError as e: + raise RuntimeError('load {} model failed because {}'.format(model_type, e)) from e + + return model_for_eval + + +if __name__ == '__main__': + logger = logging.getLogger(__name__) + logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO) + args = get_args() + model = load_model(args) + task = ['T2RerankingLocal'] + evaluation = MTEB(tasks=task, task_langs=['zh']) + results = evaluation.run(model) + logging.info(results) diff --git a/mindie/examples/models/bge/reranker-large/models/om/ops_info.json b/mindie/examples/models/bge/reranker-large/models/om/ops_info.json new file mode 100644 index 00000000..854c3bfd --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/models/om/ops_info.json @@ -0,0 +1,16 @@ +{ + "black-list": { + "to-remove": [], + "to-add": [] + }, + "white-list": { + "to-remove": [], + "to-add": [ + "Cast", + "FlattenV2", + "LayerNorm", + "GatherShapes", + "GatherV2" + ] + } +} diff --git a/mindie/examples/models/bge/reranker-large/models/pytorch/config.json b/mindie/examples/models/bge/reranker-large/models/pytorch/config.json new file mode 100644 index 00000000..15c45f15 --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/models/pytorch/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "models/pytorch", + "architectures": [ + "XLMRobertaForSequenceClassification" + ], + "auto_map": { + "AutoConfig": "models/pytorch--configuration_xlm_roberta.XLMRobertaConfig", + "AutoModel": "models/pytorch--modeling_xlm_roberta_ascend.XLMRobertaModel", + "AutoModelForSequenceClassification": "models/pytorch--modeling_xlm_roberta_ascend.XLMRobertaForSequenceClassification" + }, + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "LABEL_0": 0 + }, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "float32", + "transformers_version": "4.30.0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/mindie/examples/models/bge/reranker-large/models/pytorch/configuration_xlm_roberta.py b/mindie/examples/models/bge/reranker-large/models/pytorch/configuration_xlm_roberta.py new file mode 
100644 index 00000000..53771d2a --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/models/pytorch/configuration_xlm_roberta.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" XLM-RoBERTa configuration""" + +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/config.json", + "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/config.json", + "xlm-roberta-large-finetuned-conll02-dutch": ( + "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json" + ), + "xlm-roberta-large-finetuned-conll02-spanish": ( + "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json" + ), + "xlm-roberta-large-finetuned-conll03-english": ( + "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json" + ), + "xlm-roberta-large-finetuned-conll03-german": ( + "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json" + ), +} + + +class XLMRobertaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`XLMRobertaModel`] or a [`TFXLMRobertaModel`]. It + is used to instantiate a XLM-RoBERTa model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the XLMRoBERTa + [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the XLM-RoBERTa model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`XLMRobertaModel`] or [`TFXLMRobertaModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. 
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`XLMRobertaModel`] or + [`TFXLMRobertaModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import XLMRobertaConfig, XLMRobertaModel + + >>> # Initializing a XLM-RoBERTa xlm-roberta-base style configuration + >>> configuration = XLMRobertaConfig() + + >>> # Initializing a model (with random weights) from the xlm-roberta-base style configuration + >>> model = XLMRobertaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "xlm-roberta" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +# Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->XLMRoberta +class XLMRobertaOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) diff --git a/mindie/examples/models/bge/reranker-large/models/pytorch/modeling_xlm_roberta_fa.py b/mindie/examples/models/bge/reranker-large/models/pytorch/modeling_xlm_roberta_fa.py new file mode 100644 index 00000000..a94d491b --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/models/pytorch/modeling_xlm_roberta_fa.py @@ -0,0 +1,1899 @@ +# coding=utf-8 +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
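+#
+# This file is an Ascend-adapted variant of the Hugging Face XLM-RoBERTa implementation:
+# it depends on torch_npu, loads the ATB acceleration library from
+# ATB_SPEED_HOME_PATH/lib/libatb_speed_torch.so, and pre-allocates per-layer key/value
+# caches through KVAttentionManager.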
+"""PyTorch XLM-RoBERTa model.""" + +import json +import os +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch_npu +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN, gelu +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +from atb_speed.common.utils import load_atb_speed + +from .configuration_xlm_roberta import XLMRobertaConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "xlm-roberta-base" +_CONFIG_FOR_DOC = "XLMRobertaConfig" + +XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "xlm-roberta-base", + "xlm-roberta-large", + "xlm-roberta-large-finetuned-conll02-dutch", + "xlm-roberta-large-finetuned-conll02-spanish", + "xlm-roberta-large-finetuned-conll03-english", + "xlm-roberta-large-finetuned-conll03-german", + # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta +] + + +def matrix_is_nd_format(): + soc_version = torch_npu._C._npu_get_soc_version() + return soc_version in [104, 220, 221, 222, 223, 224] + + +def load_acl_transformer(): + acl_transformer_home_path = os.getenv('ATB_SPEED_HOME_PATH', '') + if not acl_transformer_home_path or not os.path.exists(acl_transformer_home_path): + raise RuntimeError('env ATB_SPEED_HOME_PATH not exist, source set_env.sh') + lib_path = os.path.join(acl_transformer_home_path, 'lib/libatb_speed_torch.so') + torch.classes.load_library(lib_path) + + +IS_ND = matrix_is_nd_format() +MASK_INC_DIM = 1 if IS_ND else 16 + +load_acl_transformer() +load_atb_speed() + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + input_ids: torch.Tensor x + padding_idx + past_key_values_length + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
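+    # Worked example (illustrative ids): with padding_idx=1 and input_ids=[[0, 2057, 318, 1, 1]],
+    # mask=[[1, 1, 1, 0, 0]] and cumsum(mask)*mask=[[1, 2, 3, 0, 0]], so the returned position ids
+    # are [[2, 3, 4, 1, 1]]: real tokens count up from padding_idx + 1 while pads stay at padding_idx.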
+ mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +class KVAttentionManager: + def __init__(self, config: XLMRobertaConfig, batch_size: int): + self.nz_dim = 16 + self.is_full = True + self.batch_size = batch_size + self.num_layers = config.num_hidden_layers + self.hidden_size = config.hidden_size + self.max_sequence_length = int(os.getenv('MAX_SEQ_LEN', config.max_position_embeddings - 2)) + self.token_offset = 1 + self.ori_len_list = [] + self.min_cache = None + + if IS_ND: + self.k_cache_input = torch.zeros( + self.num_layers, + self.batch_size, + self.max_sequence_length, + self.hidden_size, + device='npu', + dtype=torch.half + ) + self.v_cache_input = torch.zeros( + self.num_layers, + self.batch_size, + self.max_sequence_length, + self.hidden_size, + device='npu', + dtype=torch.half + ) + else: + self.k_cache_input = torch.zeros( + self.num_layers, + self.batch_size, + self.hidden_size // self.nz_dim, + self.max_sequence_length, + self.nz_dim, + device='npu', + dtype=torch.half + ) + self.v_cache_input = torch.zeros( + self.num_layers, + self.batch_size, + self.hidden_size // self.nz_dim, + self.max_sequence_length, + self.nz_dim, + device='npu', + dtype=torch.half + ) + self.k_cache_input = torch_npu.npu_format_cast(self.k_cache_input, 29) + torch.npu.empty_cache() + self.v_cache_input = torch_npu.npu_format_cast(self.v_cache_input, 29) + torch.npu.empty_cache() + + self.attention_mask_max_full = torch.zeros( + (self.batch_size, self.max_sequence_length, self.max_sequence_length), + device='npu', + dtype=torch.half + ) + self.attention_mask_max_inc = torch.zeros( + (self.batch_size, MASK_INC_DIM, self.max_sequence_length), + device='npu', + dtype=torch.half + ) + + # init attributes in self.init_seq_len_and_token_offset() + self.token_offset_tensor = None + self.seq_len_tensor_inc = None + self.seq_len_list_inc = None + self.seq_len_tensor_full = None + self.seq_len_list_full = None + + @property + def seq_len_list(self): + if self.is_full: + return self.seq_len_list_full + return self.seq_len_list_inc + + @property + def seq_len_tensor(self): + if self.is_full: + return self.seq_len_tensor_full + return self.seq_len_tensor_inc + + @property + def token_offset_list(self): + return [self.token_offset] * self.batch_size + + def init_attention_mask(self): + if IS_ND: + self.attention_mask_max_full.zero_() + self.attention_mask_max_inc.zero_() + else: + self.attention_mask_max_full.zero_() + self.attention_mask_max_inc = torch.zeros( + (self.batch_size, MASK_INC_DIM, self.max_sequence_length), + device='npu', + dtype=torch.half + ) + + def get_attention_mask(self, attention_mask: Optional[torch.Tensor] = None): + if self.is_full: + for i in range(self.batch_size): + self.attention_mask_max_full[i][:self.token_offset, :self.token_offset] = attention_mask[i] + if IS_ND: + return self.attention_mask_max_full + else: + self.attention_mask_max_inc = self.trans_data(self.attention_mask_max_inc, 'inc') + return self.trans_data(self.attention_mask_max_full, 'full') + else: + return self.attention_mask_max_inc + + def init_seq_len_and_token_offset(self, seq_len: int): + self.token_offset = seq_len + self.seq_len_list_full = [self.token_offset] * self.batch_size + self.seq_len_tensor_full = torch.full( + (self.batch_size,), + self.token_offset, + device='npu', + dtype=torch.int32 + ) + self.seq_len_list_inc = [1] * self.batch_size + self.seq_len_tensor_inc 
= torch.full( + (self.batch_size,), + 1, + device='npu', + dtype=torch.int32 + ) + self.token_offset_tensor = torch.full( + (self.batch_size,), + self.token_offset, + device='npu', + dtype=torch.int32 + ) + + def trans_data(self, tensor, trans_type='full'): + if trans_type == 'full': + return torch_npu.npu_format_cast( + tensor.view( + self.batch_size, self.max_sequence_length, self.max_sequence_length // self.nz_dim, self.nz_dim + ).transpose(1, 2).contiguous(), + 29 + ) + else: + return torch_npu.npu_format_cast( + tensor.view( + self.batch_size, self.nz_dim, self.max_sequence_length // self.nz_dim, self.nz_dim + ).transpose(1, 2).contiguous(), + 29 + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->XLMRoberta +class XLMRobertaEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. 
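+                # Positions therefore start at padding_idx + 1 (= 2 with pad_token_id 1), which is why
+                # RoBERTa-style configs use max_position_embeddings = 514 to cover 512 usable positions.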
+ position_ids = create_position_ids_from_input_ids( + input_ids, + self.padding_idx, + past_key_values_length + ) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, + # which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->XLMRoberta +class XLMRobertaSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + try: + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + except ZeroDivisionError: + raise + except Exception as e: + raise RuntimeError from e + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + 
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
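+        # Shapes: query_layer (batch, heads, q_len, head_size) @ key_layer^T (batch, heads, head_size, k_len)
+        # -> attention_scores (batch, heads, q_len, k_len); the scores are scaled by 1/sqrt(head_size) below.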
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + try: + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + except ZeroDivisionError: + raise + except Exception as e: + raise RuntimeError from e + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in XLMRobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
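+        # In eval() mode this dropout is the identity, so inference scores stay deterministic.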
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->XLMRoberta +class XLMRobertaSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->XLMRoberta +class XLMRobertaAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = XLMRobertaSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = XLMRobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate with Roberta->XLMRoberta +class XLMRobertaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] 
+ else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaOutput with Roberta->XLMRoberta +class XLMRobertaOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->XLMRoberta +class XLMRobertaLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = XLMRobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = XLMRobertaAttention(config, position_embedding_type="absolute") + self.intermediate = XLMRobertaIntermediate(config) + self.output = XLMRobertaOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + present_key_value = 0 + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross 
attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->XLMRoberta +class XLMRobertaEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[Union[Tuple[torch.Tensor], torch.Tensor], ...], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->XLMRoberta +class XLMRobertaPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->XLMRoberta +class XLMRobertaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = XLMRobertaConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + _no_split_modules = [] + + def __init__(self, config): + super().__init__(config) + self._keys_to_ignore_on_save = None + self._keys_to_ignore_on_load_missing = None + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! 
+ self._keys_to_ignore_on_save = [ + k + for k in self._keys_to_ignore_on_save + if k not in del_keys_to_ignore + ] + self._keys_to_ignore_on_load_missing = [ + k + for k in self._keys_to_ignore_on_load_missing + if k not in del_keys_to_ignore + ] + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +XLM_ROBERTA_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +XLM_ROBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + XLM_ROBERTA_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA +class XLMRobertaModel(XLMRobertaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + . 
_*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta + def __init__(self, config: XLMRobertaConfig, add_pooling_layer: bool = True): + super().__init__(config) + self.config = config + + self.embeddings = XLMRobertaEmbeddings(config) + self.encoder = XLMRobertaEncoder(config) + + self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + # Initialize Ascend operations + self.hidden_size = config.hidden_size + self.head_size = config.hidden_size // config.num_attention_heads + self.head_num = config.num_attention_heads + self.num_layers = config.num_hidden_layers + self.max_position_embeddings = config.max_position_embeddings + self.batch_size = 0 + self.min_cache = torch.full( + (self.max_position_embeddings, self.max_position_embeddings), + torch.finfo(torch.half).min, + dtype=torch.half, + device='npu' + ) + self.layer_id_list = [ + torch.tensor([i], dtype=torch.int32, device='npu') + for i in range(self.num_layers) + ] + self.acl_param = json.dumps({ + 'headNum': self.head_num, + 'layerNormEps': config.layer_norm_eps, + 'dk': self.head_size, + 'layerNum': self.num_layers + }) + self.lm_head_weight = None + self.ascend_weights = None + self.kv_attention_manager = None + + self.acl_fa_operation = torch.classes.ModelTorch.ModelTorch('bge_reranker_large_FlashAttentionModel') + self.acl_fa_operation.set_param(self.acl_param) + + def init_ascend_weights(self): + weights = [ + self.state_dict()['embeddings.word_embeddings.weight'], + self.state_dict()['embeddings.position_embeddings.weight'], + self.state_dict()['embeddings.token_type_embeddings.weight'], + self.state_dict()['embeddings.LayerNorm.weight'], + self.state_dict()['embeddings.LayerNorm.bias'] + ] + for i in range(self.num_layers): + weights_per_layer = [] + layer_weights_dict = self.encoder.layer[i].state_dict() + weights_per_layer.append(layer_weights_dict['attention.self.query.weight']) + weights_per_layer.append(layer_weights_dict['attention.self.query.bias']) + weights_per_layer.append(layer_weights_dict['attention.self.key.weight']) + weights_per_layer.append(layer_weights_dict['attention.self.key.bias']) + weights_per_layer.append(layer_weights_dict['attention.self.value.weight']) + weights_per_layer.append(layer_weights_dict['attention.self.value.bias']) + weights_per_layer.append(layer_weights_dict['attention.output.dense.weight']) + weights_per_layer.append(layer_weights_dict['attention.output.dense.bias']) + weights_per_layer.append(layer_weights_dict['attention.output.LayerNorm.weight']) + weights_per_layer.append(layer_weights_dict['attention.output.LayerNorm.bias']) + weights_per_layer.append(layer_weights_dict['intermediate.dense.weight']) + weights_per_layer.append(layer_weights_dict['intermediate.dense.bias']) + weights_per_layer.append(layer_weights_dict['output.dense.weight']) + weights_per_layer.append(layer_weights_dict['output.dense.bias']) + weights_per_layer.append(layer_weights_dict['output.LayerNorm.weight']) + weights_per_layer.append(layer_weights_dict['output.LayerNorm.bias']) + weights.extend(weights_per_layer) + + self.ascend_weights = weights + self.acl_fa_operation.set_weight(weights) + + def prepare_inputs_for_ascend( + self, + input_ids: Optional[torch.Tensor], + position_ids: Optional[torch.Tensor], + token_type_ids: Optional[torch.Tensor], + 
attention_mask: Optional[torch.Tensor], + past_key_values: Optional[torch.Tensor] = None + ): + self.kv_attention_manager.is_full = not past_key_values + token_type_ids = token_type_ids.npu() + position_ids = position_ids.npu() + attention_mask = attention_mask.float().half() + + inputs = [ + input_ids, + position_ids, + token_type_ids, + self.kv_attention_manager.k_cache_input, + self.kv_attention_manager.v_cache_input, + self.kv_attention_manager.get_attention_mask(attention_mask), + self.kv_attention_manager.token_offset_tensor, + self.kv_attention_manager.seq_len_tensor, + ] + self.layer_id_list + + return inputs + + def execute_ascend_operator( + self, + input_ids: Optional[torch.Tensor], + position_ids: Optional[torch.Tensor], + token_type_ids: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[torch.Tensor] = None + ): + acl_inputs = self.prepare_inputs_for_ascend( + input_ids, + position_ids, + token_type_ids, + attention_mask, + past_key_values + ) + tmp_param = json.dumps({ + 'tokenOffset': self.kv_attention_manager.token_offset_list, + 'seqLen': self.kv_attention_manager.seq_len_list + }) + acl_model_out = self.acl_fa_operation.execute(acl_inputs, tmp_param) + return acl_model_out + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 + tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if batch_size != self.batch_size: + self.batch_size = batch_size + self.kv_attention_manager = KVAttentionManager(self.config, self.batch_size) + self.kv_attention_manager.min_cache = self.min_cache + + # past_key_values_length + if past_key_values: + past_key_values_length = self.kv_attention_manager.token_offset + self.kv_attention_manager.token_offset += 1 + self.kv_attention_manager.token_offset_tensor += 1 + else: + past_key_values_length = 0 + self.kv_attention_manager.init_attention_mask() + self.kv_attention_manager.init_seq_len_and_token_offset(seq_length) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + # Copied from transformers.models.bert.modeling_xlm_roberta.XLMRobertaEmbeddings.forward + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids( + input_ids, + self.config.pad_token_id, + past_key_values_length + ) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, + # which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing token_type_ids + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
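+        # Note: the standard transformers helper get_extended_attention_mask (used in the else-branch
+        # below) broadcasts the [batch_size, seq_length] padding mask to [batch_size, 1, 1, seq_length]
+        # (adding a causal component for decoder configs) and converts it to an additive bias:
+        # 0.0 for positions to keep, the dtype minimum for masked positions.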
+        # When flash attention (fa) is used, the incremental extended_attention_mask is computed
+        # together with the first token
+        if past_key_values:
+            extended_attention_mask = attention_mask
+        else:
+            self.kv_attention_manager.ori_len_list = attention_mask.sum(dim=-1)
+            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+                attention_mask,
+                input_shape
+            )
+
+        if not self.ascend_weights:
+            self.init_ascend_weights()
+
+        hidden_states = self.execute_ascend_operator(
+            input_ids,
+            position_ids,
+            token_type_ids,
+            extended_attention_mask,
+            past_key_values
+        )
+        sequence_output = hidden_states[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + hidden_states[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output
+        )
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+
+@add_start_docstrings(
+    "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.",
+    XLM_ROBERTA_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with
+# Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
+class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
+    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+
+    def __init__(self, config: XLMRobertaConfig):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
+
+        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
+        self.lm_head = XLMRobertaLMHead(config)
+
+        # The LM head weights require special treatment only when they are tied with the word embeddings
+        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Initialize Ascend
+        self.lm_head_weight = None
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 + tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, XLMRobertaForCausalLM, AutoConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base") + >>> config = AutoConfig.from_pretrained("roberta-base") + >>> config.is_decoder = True + >>> model = XLMRobertaForCausalLM.from_pretrained("roberta-base", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + if not self.lm_head_weight: + self.lm_head_weight = self.state_dict()['lm_head.weight'] + if not IS_ND: + self.lm_head_weight.data = torch_npu.npu_format_cast(self.lm_head_weight.data, 29) + self.roberta.lm_head_weight = self.lm_head_weight + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + return_dict=return_dict, + ) + prediction_scores = outputs[0] + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def 
prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """XLM-RoBERTa Model with a `language modeling` head on top.""", + XLM_ROBERTA_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with +# Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA +class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config: XLMRobertaConfig): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.roberta = XLMRobertaModel(config, add_pooling_layer=False) + self.lm_head = XLMRobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + # Initialize weights and apply final processing + self.post_init() + + # Initialize Ascend + self.lm_head_weight = None + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.1, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ """ + if not self.lm_head_weight: + self.lm_head_weight = self.state_dict()['lm_head.weight'] + if not IS_ND: + self.lm_head_weight.data = torch_npu.npu_format_cast(self.lm_head_weight.data, 29) + self.roberta.lm_head_weight = self.lm_head_weight + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + prediction_scores = outputs[0] + + masked_lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead +class XLMRobertaLMHead(nn.Module): + """Roberta Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + self.bias = self.decoder.bias + + +@add_start_docstrings( + """ + XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """, + XLM_ROBERTA_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with +# Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA +class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config: XLMRobertaConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = XLMRobertaModel(config, add_pooling_layer=False) + self.classifier = XLMRobertaClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="cardiffnlp/twitter-roberta-base-emotion", + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'optimism'", + expected_loss=0.08, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. 
+ """, + XLM_ROBERTA_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with +# Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA +class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config: XLMRobertaConfig): + super().__init__(config) + + self.roberta = XLMRobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + inputs_embeds=flat_inputs_embeds, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(reshaped_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
+ """, + XLM_ROBERTA_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with +# Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA +class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config: XLMRobertaConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = XLMRobertaModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="Jean-Baptiste/roberta-large-ner-english", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", + expected_loss=0.01, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->XLMRoberta +class XLMRobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features): + x = features[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + XLM_ROBERTA_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with +# Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA +class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config: XLMRobertaConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = XLMRobertaModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="deepset/roberta-base-squad2", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.86, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/mindie/examples/models/bge/reranker-large/requirements.txt b/mindie/examples/models/bge/reranker-large/requirements.txt new file mode 100644 index 00000000..e789c7a3 --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/requirements.txt @@ -0,0 +1,4 @@ +optimum==1.18.0 +onnx==1.16.0 +onnxruntime==1.17.1 +transformers==4.33.0 \ No newline at end of file diff --git a/mindie/examples/models/bge/reranker-large/run.py b/mindie/examples/models/bge/reranker-large/run.py new file mode 100644 index 00000000..0cc702bb --- /dev/null +++ b/mindie/examples/models/bge/reranker-large/run.py @@ -0,0 +1,181 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. +import argparse +import logging +import json +import os +import time + +import numpy as np +import torch +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from optimum.onnxruntime import ORTModelForSequenceClassification + +from atb_llm.utils.file_utils import safe_open + +logger = logging.getLogger(__name__) +logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.DEBUG) + + +parser = argparse.ArgumentParser(description='Adapting LLM on Ascend.') +parser.add_argument( + '--model_type_or_path', + type=str, + required=True, + help='Specipy model type to load default model or path to the directory containing model file.' +) +parser.add_argument( + '--device', + type=int, + default=6, + choices=list(range(8)), + help='Adapt model on device id x.' 
+
+# Default model path
+with safe_open('config.json', 'r', encoding='utf-8') as reader:
+    text = reader.read()
+default_path = json.loads(text)['default_path']
+pytorch_model_path = tokenizer_path = os.path.abspath(default_path['tokenizer_path'])
+onnx_model_path = os.path.abspath(default_path['onnx_model_path'])
+om_model_path = os.path.abspath(default_path['om_model_path'])
+
+# Query and passage we want sentence embeddings for
+QUERY = '什么是大熊猫?'
+POSITIVE = '大熊猫(Ailuropoda melanoleuca),属于食肉目熊科的一种哺乳动物,体色为黑白两色。是中国特有物种'
+NEGATIVE = '比熊犬(法语:Bichon Frisé,bichon à poil frisé,意指“白色卷毛的玩赏用小狗”)是一种小型犬品种'
+pairs = [[QUERY, POSITIVE], [QUERY, NEGATIVE]]
+logger.info('query and passage for inference: %s', pairs)
+
+# Load local tokenizer
+tokenizer = AutoTokenizer.from_pretrained(pytorch_model_path)
+
+# Tokenize sentences
+encoded_input = tokenizer(pairs, padding='max_length', return_tensors='pt', max_length=512)
+
+
+def infer_pytorch(model_path: str, device_id: int) -> None:
+    # Set device
+    try:
+        import torch_npu
+    except ImportError:
+        device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu'
+    else:
+        device = 'npu:{}'.format(device_id)
+        torch_npu.npu.set_device(device_id)
+        torch.npu.set_compile_mode(jit_compile=False)
+    # Load model from local
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_path,
+        local_files_only=True,
+        trust_remote_code=True
+    ).half().to(device)
+    model.eval()
+    encoded_input_to_device = encoded_input.to(device)
+    # Compute similarity scores
+    for iters in range(2):
+        with torch.no_grad():
+            start_time = time.time()
+            scores = model(**encoded_input_to_device, return_dict=True).logits.view(-1, ).float()
+            exec_time = time.time() - start_time
+        # The slice expression below evaluates to the English ordinal suffix ('st', 'nd', 'rd' or 'th') for iters + 1
+        logger.info(
+            '%s%s inference time: %.2f ms',
+            iters + 1,
+            'tsnrhtdd'[(iters + 1) % 5 * ((iters + 1) % 100 ^ 15 > 4 > (iters + 1) % 10)::4],
+            exec_time * 1000
+        )
+        logger.info('scores [positive, negative]: %s', scores.cpu())
+    # Free resource
+    if device.startswith('npu'):
+        try:
+            torch.npu.empty_cache()
+        except AttributeError:
+            pass
+    elif device.startswith('cuda'):
+        torch.cuda.empty_cache()
+
+
+def infer_onnx(model_path: str, device_id: int) -> None:
+    # Set device
+    try:
+        import torch_npu
+    except ImportError:
+        device = 'cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu'
+    else:
+        device = 'npu:{}'.format(device_id)
+        torch_npu.npu.set_device(device_id)
+        torch.npu.set_compile_mode(jit_compile=False)
+    # Load model from local
+    ort = ORTModelForSequenceClassification.from_pretrained(model_path).to(device)
+    encoded_input_to_device = encoded_input.to(device)
+    # Compute similarity scores
+    for iters in range(2):
+        with torch.inference_mode():
+            start_time = time.time()
+            scores = ort(**encoded_input_to_device, return_dict=True).logits.view(-1, ).float()
+            exec_time = time.time() - start_time
+        logger.info(
+            '%s%s inference time: %.2f ms',
+            iters + 1,
+            'tsnrhtdd'[(iters + 1) % 5 * ((iters + 1) % 100 ^ 15 > 4 > (iters + 1) % 10)::4],
+            exec_time * 1000
+        )
+        logger.info('scores [positive, negative]: %s', scores.cpu())
+    # Free resource
+    if device.startswith('npu'):
+        try:
+            torch.npu.empty_cache()
+        except AttributeError:
+            pass
+    elif device.startswith('cuda'):
+        torch.cuda.empty_cache()
+
+
+def infer_om(model_path: str, device_id: int) -> None:
+    # Tokenize sentences
+    input_ids = encoded_input.data['input_ids'].numpy().astype(np.int64)
+    attention_mask = encoded_input.data['attention_mask'].numpy().astype(np.int64)
+    # Load model from local
+    from ais_bench.infer.interface import InferSession
+    session = InferSession(device_id, model_path)
+    # Compute similarity scores
+    for iters in range(2):
+        output = session.infer(feeds=[input_ids, attention_mask], mode='dymshape', custom_sizes=10000000)
+        scores = torch.from_numpy(output[0][:, 0]).view(-1, ).float()
+        exec_time = session.summary().exec_time_list[-1]
+        logger.info(
+            '%s%s inference time: %.2f ms',
+            iters + 1,
+            'tsnrhtdd'[(iters + 1) % 5 * ((iters + 1) % 100 ^ 15 > 4 > (iters + 1) % 10)::4],
+            exec_time[1] - exec_time[0]
+        )
+        logger.info('scores [positive, negative]: %s', scores)
+    # Free resource
+    session.free_resource()


def infer(model_type_or_path: str = None, device_id: int = 0) -> None:
+    model_path_map = {'pytorch': pytorch_model_path, 'onnx': onnx_model_path, 'om': om_model_path}
+    model_map = {'pytorch': infer_pytorch, 'onnx': infer_onnx, 'om': infer_om}
+
+    model_type = model_type_or_path.removesuffix('/').split('.')[-1].split('/')[-1]
+    default_model_path = model_path_map.get(model_type, 'not exist')
+    if default_model_path != 'not exist':
+        model_path = (
+            model_type_or_path
+            if os.path.isdir(model_type_or_path) or os.path.isfile(model_type_or_path)
+            else default_model_path
+        )
+    else:
+        raise RuntimeError(
+            'load model failed because '
+            '\'{}\' is not a valid model type or path'.format(model_type_or_path)
+        )
+    try:
+        model_map[model_type](model_path, device_id)
+    except KeyError as e:
+        raise RuntimeError('load {} model failed because {}'.format(model_type, e)) from e
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    infer(args.model_type_or_path, args.device)
diff --git a/mindie/examples/models/bloom/README.md b/mindie/examples/models/bloom/README.md
new file mode 100644
index 00000000..a17872fb
--- /dev/null
+++ b/mindie/examples/models/bloom/README.md
@@ -0,0 +1,138 @@
+# BLOOM
+
+* [BLOOM](https://huggingface.co/bigscience/bloom) (BigScience Large Open-science Open-access Multilingual Language Model)
+* This repository implements BLOOM inference on Ascend NPU hardware.
+
+## Feature Matrix
+
+- The matrix below lists the features supported by each BLOOM model:
+
+| Model (size) | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8 quant | W8A16 quant | KV cache quant | Sparse quant | MOE quant | MindIE Service | TGI | Long sequence |
+|-------------|----------------------------|-----------------------------|------|------|-----------------|-----------------|------------|-------------|----------------|--------------|-----------|----------------|-----|---------------|
+| bloom (176B) | Supports world size 8 | No | Yes | No | No | Yes | No | Yes | No | No | No | No | No | No |
+| bloom-7b1 | Supports world size 1,2,4,8 | Supports world size 1,2,4 | Yes | No | No | Yes | No | No | No | No | No | No | No | No |
+| bloomz-7b1-mt | Supports world size 1,2,4,8 | Supports world size 1,2,4 | Yes | No | No | Yes | No | No | No | No | No | No | No | No |
+
+## Inference Usage
+
+### Path Variables
+
+| Variable | Meaning |
+| ------------- | ------------------------------------------------------------ |
+| `working_dir` | Directory where the acceleration library and the model repository are placed after download |
+| `llm_path` | Path of the model repository. With the prebuilt package it is `${working_dir}/MindIE-LLM/`; with the source code downloaded from gitee it is `${working_dir}/MindIE-LLM/examples/atb_models` |
+| `script_path` | Path of the scripts. The working scripts for the BLOOM family are under `{llm_path}/examples/models/bloom` |
+| `weight_path` | Path of the original HF model weights (`.safetensors` format) |
+
+Weight download links:
+
+* bloom (176b): https://huggingface.co/bigscience/bloom
+* bloomz-7b1-mt: https://huggingface.co/bigscience/bloomz-7b1-mt
+* bloom-7b1: https://huggingface.co/bigscience/bloom-7b1
+
+> When downloading the weights you do not need `pytorch_model.bin.index.json` or the `.bin` files.
+
+When loading the weights, the framework reads `torch_dtype` from the downloaded `config.json`, so you need to manually add `"torch_dtype": "float16"` to `config.json`.
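+
+For reference, the relevant part of `config.json` after the edit might look like this (`torch_dtype` is the added field; the other entry shown is only an illustrative placeholder from the original file):
+
+```json
+{
+  "model_type": "bloom",
+  "torch_dtype": "float16"
+}
+```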
`config.json` 里面补上 `"torch_dtype": "float16"`。
+
+### 环境准备
+
+1、安装 CANN 8.0 的环境,并 `source /path/to/cann/set_env.sh`;
+
+2、使用 Python 3.9 或更高;
+
+3、使用 torch 2.0 或更高版本,并安装对应的 torch_npu;
+
+4、安装依赖:
+
+```shell
+pip install transformers==4.34.0
+pip install accelerate
+```
+
+5、安装 `atb_llm`:
+
+```shell
+cd $llm_path
+python setup.py bdist_wheel
+python -m pip install dist/*.whl --force-reinstall
+```
+
+## BLOOMZ-7B1-MT
+
+### 权重准备
+
+在 Hugging Face 上下载模型权重文件(推荐下载 `.safetensors`,`.bin` 需要转换成 `.safetensors`),权重路径为 `weight_path`。
+
+### PagedAttention模型
+
+进入 `modeltest` 路径下:
+
+```shell
+cd tests/modeltest
+```
+
+进行测试前需要先设置一些环境变量:
+
+```shell
+export HCCL_BUFFSIZE=110
+export PYTHONWARNINGS="ignore"
+export ATB_OPERATION_EXECUTE_ASYNC=1
+export TASK_QUEUE_ENABLE=1
+export ATB_USE_TILING_COPY_STREAM=1
+export ATB_CONTEXT_WORKSPACE_RING=1
+export ATB_LAYER_INTERNAL_TENSOR_REUSE=1
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+```
+
+#### 性能测试
+
+> `$weight_path` 可以是 HuggingFace 原始权重路径,也可以是量化后的模型权重路径(下同)。
+
+```shell
+bash run.sh pa_fp16 performance [[seq_in,seq_out],[seq_in,seq_out]] $batch_size bloom $weight_path $tp
+```
+
+例如:`TP = 8`,`batch_size = 1`:
+
+```shell
+bash run.sh pa_fp16 performance [[256,256],[512,512],[1024,1024],[2048,2048]] 1 bloom /path/to/model 8
+```
+
+#### 下游任务精度测试
+
+```shell
+bash run.sh pa_fp16 full_CEval $n_shot $batch_size bloom $weight_path $tp
+```
+
+例如:`TP = 8`,`batch_size = 1`,`CEval 5-shot`:
+
+```shell
+bash run.sh pa_fp16 full_CEval 5 1 bloom /path/to/model 8
+```
+
+更详细的配置选项请参考:`examples/atb_models/tests/modeltest/README.md`
+
+## BLOOM-7B1
+
+### PagedAttention模型
+
+与 BLOOMZ-7B1-MT PagedAttention 模型测试方式相同。
+
+## BLOOM-176B
+
+### 权重准备
+
+BLOOM-176B 由于权重较大(约 328GB),仅支持 800I A2 机器上进行 TP8 W8A16 推理,首先需要对 HuggingFace 下载的原始权重进行量化:
+
+```shell
+# source CANN包
+source /path/to/cann/set_env.sh
+# 进入模型仓所在路径,详见*路径变量解释-llm_path*
+cd $llm_path
+# {浮点权重路径} 即 HuggingFace 下载的原始权重路径
+python examples/models/bloom/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A16量化权重路径} --w_bit 8 --a_bit 16 --act_method 3 --calib_file ""
+```
+
+### PagedAttention模型
+
+与 BLOOMZ-7B1-MT PagedAttention 模型测试方式相同,只需要将 `{W8A16量化权重路径}` 作为 `$weight_path` 配置即可。
diff --git a/mindie/examples/models/bloom/convert_quant_weights.py b/mindie/examples/models/bloom/convert_quant_weights.py
new file mode 100644
index 00000000..ad5a36a0
--- /dev/null
+++ b/mindie/examples/models/bloom/convert_quant_weights.py
@@ -0,0 +1,76 @@
+# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved.
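+# Usage sketch (paths are placeholders): this mirrors the W8A16 export step in
+# examples/models/bloom/README.md, which calls this script as
+#   python examples/models/bloom/convert_quant_weights.py \
+#       --model_path /path/to/bloom-float-weights \
+#       --save_directory /path/to/bloom-w8a16-weights \
+#       --w_bit 8 --a_bit 16 --act_method 3 --calib_file ""
+# The weights written to --save_directory (plus the copied tokenizer/config
+# files) are then used as $weight_path by the run scripts.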
+ +import os + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from transformers import BloomConfig +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + + +if __name__ == "__main__": + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + + config = BloomConfig.from_pretrained(args.model_path) + + disable_names = [] + if args.a_bit != 16: + # W8A16, W4A16没有回退层 + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + anti_outlier_config = None + if args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method) + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + w_sym=args.w_sym, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + use_kvcache_quant=args.use_kvcache_quant, + ) + + # 默认无校准数据集 + calibration_dataset = None + # 若存在calib_file,则使用calib_file作为校准数据集 + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file) + calibration_dataset = calibration_dataset + quant_weight_generator = Quantifier(args.model_path, quant_config, anti_outlier_config, args.device_type) + quant_weight_generator.tokenizer.pad_token_id = 0 + + tokenized_data = None + if calibration_dataset is not None: + tokenized_data = quant_weight_generator.get_tokenized_data(calibration_dataset) + + quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + #为适配工具稀疏量化传入w_bit=4,a_bit=8暂时修改quant_type + quant_type = f"w{args.w_bit}a{args.a_bit}" + ("s" if (args.co_sparse or args.is_lowbit) else "") + is_sparseCompress = args.w_bit == 4 and args.a_bit == 8 and (args.co_sparse or args.is_lowbit) + if is_sparseCompress: + quant_type = "w8a8s" + modify_config( + args.model_path, args.save_directory, config.torch_dtype, + quant_type, + args.use_kvcache_quant + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/bloom/run_fa.sh b/mindie/examples/models/bloom/run_fa.sh new file mode 100644 index 00000000..0c468851 --- /dev/null +++ b/mindie/examples/models/bloom/run_fa.sh @@ -0,0 +1,37 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
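+# Usage sketch (weight path is a placeholder):
+#   export TP_WORLD_SIZE=8    # set to 1 for single-card execution, otherwise torchrun is used
+#   bash examples/models/bloom/run_fa.sh /path/to/bloom-weights
+# The script only consumes $1 (the model weight path); additional run arguments
+# can be appended via the extra_param variable defined below.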
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export MAX_MEMORY_GB=29 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export HCCL_BUFFSIZE=120 +export HCCL_WHITELIST_DISABLE=1 +export ATB_CONTEXT_WORKSPACE_RING=1 +export ATB_CONTEXT_WORKSPACE_SIZE=2629145600 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=0 +export ATB_LAUNCH_KERNEL_WITH_TILING=0 +export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=1 +export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=0 + +# solve num_blocks < 0 free_memory < 0 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export RESERVED_MEMORY_GB=0 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_fa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi + +# --input_text "Common sense questions and answers\n\nQuestion: Why do we need to learn a new language\nFactual answer:" --max_output_length 32 \ No newline at end of file diff --git a/mindie/examples/models/bloom/run_pa.sh b/mindie/examples/models/bloom/run_pa.sh new file mode 100644 index 00000000..1d9d09b2 --- /dev/null +++ b/mindie/examples/models/bloom/run_pa.sh @@ -0,0 +1,36 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export IS_QUANT=0 +export MAX_MEMORY_GB=29 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export world_size=8 +export MASTER_PORT=20030 +export IS_BF16=false + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 + +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export RESERVED_MEMORY_GB=0 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +export ATB_OPERATION_EXECUTE_ASYNC=1 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$IS_BF16" = true ]; then + extra_param="${extra_param} --is_bf16" +fi + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi diff --git a/mindie/examples/models/chatglm/v2_6b/README.md b/mindie/examples/models/chatglm/v2_6b/README.md new file mode 100644 index 00000000..3de5cb4b --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/README.md @@ -0,0 +1,231 @@ +# ChatGLM2-6B 模型推理指导 + +# 概述 + +- [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B/) 是开源中英双语对话模型 [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) 的第二代版本,在保留了初代模型对话流畅、部署门槛较低等众多优秀特性的基础之上,ChatGLM2-6B有更强大的性能、更长的上下文、更高效的推理和更开放的协议。 +- 此代码仓中实现了一套基于NPU硬件的ChatGLM2推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了ChatGLM2-6B模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化(仅300I DUO支持) | MOE | MindIE | TGI | 长序列 | 
+|-------------|-------------------------|-------------------------|------|------|-----------------|-----------------|---------|--------------|----------|--------|--------|-----|-----|-----|-----| +| ChatGLM2-6B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | 是 | 否 | 否 | 是 | 是 | 否 | 否 | 否 | 是 | 否 | 是 | 是 | 否 | + +- 此模型仓已适配的模型版本 + - [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b/tree/main) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;路径为${llm_path}/examples/models/chatglm/v2_6b | +| weight_path | 模型权重路径 | + +## 权重转换 +- 参考[此README文件](../../../README.md) + +## 量化权重导出 +量化权重可通过msmodelslim(昇腾压缩加速工具)实现。 + +### 环境准备 +环境配置可参考msmodelslim官网:https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/devtools/auxiliarydevtool/modelslim_0002.html + +### 导出w8a8量化权重 +通过`${llm_path}/examples/models/chatglm/v2_6b/quant_chatglm_w8a8.py`文件导出模型的量化权重(注意量化权重不要和浮点权重放在同一个目录下): +```shell +# 必须设置该线程数 +export OMP_NUM_THREADS=48 +python quant_chatglm_w8a8.py --model_path ${浮点权重路径} --save_path ${量化权重保存路径} --dataset_path ${校准数据集路径} +``` +校准数据集从 [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/e84444333b6d434ea7b0/) 获取,解压后,使用解压目录下的 `CEval/val/Other/civil_servant.jsonl` 作为校准数据集。 + +导出量化权重后应生成`quant_model_weight_w8a8.safetensors`和`quant_model_description_w8a8.json`两个文件。 + +### 导出w4a16量化权重 +通过`${llm_path}/examples/models/chatglm/v2_6b/quant_chatglm_w4a16.py`文件导出模型的量化权重(注意量化权重不要和浮点权重放在同一个目录下): +```shell +python quant_chatglm_w4a16.py --model_path ${浮点权重路径} --save_path ${量化权重保存路径} --dataset_path ${校准数据集路径} +``` +校准数据集从 [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/e84444333b6d434ea7b0/) 获取,解压后,使用解压目录下的 `CEval/val/Social_Science/teacher_qualification.jsonl` 作为校准数据集。 + +导出量化权重后应生成`quant_model_weight_w4a16.safetensors`和`quant_model_description_w4a16.json`两个文件。 + +注: + +1.quant_chatglm_w8a8.py和quant_chatglm_w4a16.py文件中已配置好较优的量化策略,导出量化权重时可直接使用,也可修改为其它策略。 + +2.执行脚本生成量化权重时,会在生成的权重路径的config.json文件中添加(或修改)`quantize`字段,值为相应量化方式,当前仅支持`w8a8`和`w4a16`。 + +3.执行完以上步骤后,执行量化模型只需要替换权重路径。 + +4.如果生成权重时遇到`OpenBLAS Warning: Detect OpenMP Loop and this application may hang. 
Please rebuild the library with USE_OPENMP = 1 option`,可通过设置`export OMP_NUM_THREADS=1`来关闭多线程规避。 + +### 导出稀疏量化权重 +执行generate_sparse.sh导出稀疏量化权重(注意量化权重不要和浮点权重放在同一个目录下): +```shell +bash generate_sparse.sh ${浮点权重路径} ${稀疏量化权重保存路径} ${llm_path}/examples/models/chatglm/v2_6b/calib_data.jsonl ${Tensor并行数} +``` + +执行后`${稀疏量化权重保存路径}`下会生成compress目录,使用`${稀疏量化权重保存路径}/compress`目录作为权重目录进行推理。 + +注: + +1.generate_sparse.sh文件中已配置好较优的量化策略,导出量化权重时可直接使用,也可修改为其它策略。 + +2.执行完以上步骤后,执行量化模型只需要替换权重路径为`${稀疏量化权重保存路径}/compress`。 + +3.当在npu上生成稀疏量化权重(即--device_type为npu时)时,注意需要将${浮点权重路径}/modeling_chatglm.py文件168行的@torch.jit.script注释。 + +## 300I DUO 运行操作说明 +- 可开启CPU Performance模式以提高模型推理性能 + + ``` + cpupower frequency-set -g performance + ``` + +### 对话测试 +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_300i_duo.sh ${weight_path} + ``` +- 环境变量说明 + - `export BIND_CPU=1` + - 绑定CPU核心开关 + - 默认进行绑核 + - 若当前机器未设置NUMA或绑核失败,可将 BIND_CPU 设为 0 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../../README.md)的【启动脚本相关环境变量】章节 + - `export TP_WORLD_SIZE=2` + - 指定模型运行时的TP数,即world size + - 默认为单卡双芯 + - 各模型支持的TP数参考“特性矩阵” + - “单卡双芯”运行请指定`TP_WORLD_SIZE`为`2` + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export PYTHONPATH=${llm_path}:$PYTHONPATH` + - 将模型仓路径加入Python查询模块和包的搜索路径中 + - 将${llm_path}替换为实际路径 + - `export INT8_FORMAT_NZ_ENABLE=1` + - 服务化量化场景开启 + - - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + # 内存 + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + # 性能 + export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export HCCL_BUFFSIZE=110 + ``` + + +## 800I A2 运行操作说明 +- 可开启CPU Performance模式以提高模型推理性能 + + ``` + cpupower frequency-set -g performance + ``` +### 对话测试 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_800i_a2_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../../README.md)的【启动脚本相关环境变量】章节 + - `export TP_WORLD_SIZE=1` + - 指定模型运行时的TP数,即world size + - 默认为单卡 + - 各模型支持的TP数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export PYTHONPATH=${llm_path}:$PYTHONPATH` + - 将模型仓路径加入Python查询模块和包的搜索路径中 + - 将${llm_path}替换为实际路径 + - `export IS_BF16=false` + - 是否使用BF16精度进行推理 + - 默认使用FP16 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + # 内存 + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + # 性能 + export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export LCCL_ENABLE_FALLBACK=1 + ``` + + +**运行Paged Attention BF16** +- 暂不支持 + +**运行Paged Attention W8A8量化** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8` + - 若config.json中无此字段,则新增 + +**运行KV cache量化** +- 暂不支持 + +**运行Paged Attention 稀疏量化** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为稀疏量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改稀疏量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8sc` + - 若config.json中无此字段,则新增 +- 注意:压缩算法与硬件强相关,当前仅300I DUO卡支持稀疏量化 + + +## 精度测试 +- 
参考[此README文件](../../../../tests/modeltest/README.md) + +## 性能测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## Web交互 +- 拉起MindIE Service后端 +- 拉起Web后端 + ```shell + # 安装依赖 + pip install -r web_requirements.txt + + # 下载 GitHub 仓库 + git clone https://github.com/THUDM/ChatGLM2-6B.git + cd ChatGLM2-6B + git reset --hard 921d7e9adc69020a19169d1ba4f76c2675a2dd29 + + # 应用适配代码 + git apply ../web_demo.patch + cd .. + python3 ChatGLM2-6B/web_demo.py --model_path ${weight_path} + ``` +- 根据后台显示的IP和端口从浏览器访问 + +## FAQ +- `import torch_npu`遇到`xxx/libgomp.so.1: cannot allocate memory in static TLS block`报错,可通过配置`LD_PRELOAD`解决。 + - 示例:`export LD_PRELOAD=/lib/aarch64-linux-gnu/libgomp.so.1:$LD_PRELOAD` \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/calib_data.jsonl b/mindie/examples/models/chatglm/v2_6b/calib_data.jsonl new file mode 100644 index 00000000..d7d837c8 --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/calib_data.jsonl @@ -0,0 +1,15 @@ +{"id": 0, "inputs_pretokenized": "编写中小学教科书的直接依据是____。\nA. 《中华人民共和国教育法》\nB. 课程计划\nC. 课程标准\nD. 课程表", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 1, "inputs_pretokenized": "下列关于课程的三种文本表现形式说法正确的是____\nA. 课程计划是由当地教育主管部门制订的\nB. 课程标准是依据课程计划制定的\nC. 课程标准的核心是实施建议\nD. 教材编写的基本方式有直线式、螺旋式、交叉式", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 2, "inputs_pretokenized": "悦悦是一名右耳失聪的残疾儿童,活动课上有时会听不清楚周老师所讲的内容,因此经常提问题。对此,周老师应当采取的措施是____。\nA. 给予悦悦更多的帮助和指导\nB. 指导家长带悦悦回家自学\nC. 建议家长将悦悦转到特殊幼儿园\nD. 照顾大多数幼儿,不理会悦悦", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 3, "inputs_pretokenized": "内流河也称“内陆河”,是指没有流入海洋的河流,大多分布在大陆内部干燥地区,上游降水或冰雪融水为其主要补给水源,最终消失于沙漠或注入内陆湖泊。下列中国内流河中,最长的是____。\nA. 塔里木河\nB. 柴达木河\nC. 尼雅河\nD. 疏勒河", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 4, "inputs_pretokenized": "学校规定学生不能烫染头发,但是小文为了彰显个性,在假期把头发染成了棕色。面对小文的情况,教师应该怎样处理?____\nA. 年轻人追求个性是合情合理的,应该宽容对待\nB. 违反学校的校规,应该严格处分\nC. 强制要求小文将头发颜色染回来才可以进校门\nD. 探明小文违反校规的原因,并对其进行劝导和教育", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 5, "inputs_pretokenized": "张老师根据自己班级的情况,为解决班级内部班干部的人际关系问题,建立和谐融洽的班级氛围,自主开发了“和谐人际”的班级课程,这体现了教师____。\nA. 是教育教学的研究者\nB. 是课程的建设者和开发者\nC. 是学生学习的促进者\nD. 是社区型的开放教师", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 6, "inputs_pretokenized": "刘老师工作很负责,学生在学校出现一点问题他就会与家长联系,在与家长沟通时他经常以前辈的姿态对待家长,对家长的教育方式指指点点。刘老师的做法____。\nA. 正确,老师就应该与家长经常沟通\nB. 正确,老师的经验比家长丰富,应该多指导家长\nC. 不正确,教师没有权利指导家长\nD. 不正确,教师应该与家长建立平等的沟通关系,尊重家长的人格", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 7, "inputs_pretokenized": "在古代印度,有一户人家经营一家棉布店销售自己手工制作的衣服。你认为这户人家属于哪个等级?____\nA. 婆罗门\nB. 刹帝利\nC. 吠舍\nD. 首陀罗", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 8, "inputs_pretokenized": "“小型分散,便于开展多种多样的活动,满足学生不同的兴趣、爱好,发展学生的才能,使学生得到更多的学习和锻炼的机会。”这种课外活动的形式是____。\nA. 科技活动\nB. 学科活动\nC. 个人活动\nD. 小组活动", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} +{"id": 9, "inputs_pretokenized": "小红每天晚上临睡前都要多次反复检查自己的书包,确保带齐了第二天需要用的教材和文具。她明知道没有这个必要,但就是控制不住。她可能出现了____。\nA. 抑郁症\nB. 焦虑症\nC. 强迫症\nD. 恐惧症", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 10, "inputs_pretokenized": "国家管理和评价课程的基础是____。\nA. 课程计划\nB. 课程标准\nC. 
教学目标\nD. 教育目的", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 11, "inputs_pretokenized": "儿童坚持性发生明显质变的年龄约在____\nA. 3~4岁\nB. 4~5岁\nC. 5~6岁\nD. 6岁以后", "choices_pretokenized": [" A", " B", " C", " D"], "label": 1, "targets_pretokenized": ["B"]} +{"id": 12, "inputs_pretokenized": "《红楼梦》中人物众多、关系繁杂。为了帮助读者阅读,许多红学爱好者都在网络上发布了自己整理制作的主要人物关系图。这属于____。\nA. 纲要策略\nB. 精细加工策略\nC. 资源管理策略\nD. 监控策略", "choices_pretokenized": [" A", " B", " C", " D"], "label": 0, "targets_pretokenized": ["A"]} +{"id": 13, "inputs_pretokenized": "学期结束时,班主任王老师会对学生思想品德的发展变化情况进行评价。这项工作属于____。\nA. 工作总结\nB. 工作计划\nC. 操行评定\nD. 建立学生档案", "choices_pretokenized": [" A", " B", " C", " D"], "label": 2, "targets_pretokenized": ["C"]} +{"id": 14, "inputs_pretokenized": "人们常说:“教学有法而教无定法。”这反映了教师的劳动具有____。\nA. 连续性\nB. 示范性\nC. 长期性\nD. 创造性", "choices_pretokenized": [" A", " B", " C", " D"], "label": 3, "targets_pretokenized": ["D"]} \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/generate_sparse.sh b/mindie/examples/models/chatglm/v2_6b/generate_sparse.sh new file mode 100644 index 00000000..5b7b310e --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/generate_sparse.sh @@ -0,0 +1,17 @@ +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:False + +disable_names="transformer.encoder.layers.0.mlp.dense_4h_to_h transformer.encoder.layers.1.self_attention.query_key_value transformer.encoder.layers.1.self_attention.dense transformer.encoder.layers.1.mlp.dense_h_to_4h transformer.encoder.layers.1.mlp.dense_4h_to_h transformer.encoder.layers.2.self_attention.query_key_value transformer.encoder.layers.2.self_attention.dense transformer.encoder.layers.2.mlp.dense_h_to_4h transformer.encoder.layers.2.mlp.dense_4h_to_h transformer.encoder.layers.3.self_attention.query_key_value transformer.encoder.layers.3.self_attention.dense transformer.encoder.layers.4.self_attention.query_key_value transformer.encoder.layers.4.self_attention.dense transformer.encoder.layers.5.self_attention.query_key_value transformer.encoder.layers.5.self_attention.dense transformer.encoder.layers.6.self_attention.query_key_value transformer.encoder.layers.6.self_attention.dense transformer.encoder.layers.7.self_attention.query_key_value transformer.encoder.layers.7.self_attention.dense transformer.encoder.layers.8.self_attention.query_key_value transformer.encoder.layers.8.self_attention.dense transformer.encoder.layers.9.self_attention.query_key_value transformer.encoder.layers.9.self_attention.dense transformer.encoder.layers.11.self_attention.query_key_value transformer.encoder.layers.11.self_attention.dense transformer.encoder.layers.14.self_attention.query_key_value transformer.encoder.layers.14.self_attention.dense transformer.encoder.layers.19.self_attention.query_key_value transformer.encoder.layers.19.self_attention.dense transformer.encoder.layers.20.mlp.dense_4h_to_h transformer.encoder.layers.27.mlp.dense_4h_to_h transformer.output_layer" + +weight_path=$1 +w8a8s_weight_path=$2 +w8a8sc_weight_path=${w8a8s_weight_path}/compress +calib_data=$3 +tp_size=$4 + +cd ${ATB_SPEED_HOME_PATH} + +python -m examples.convert.model_slim.quantifier --model_path ${weight_path} --save_directory ${w8a8s_weight_path} --calib_file ${calib_data} --disable_names ${disable_names} --device_type npu --is_lowbit True --w_bit 4 --a_bit 8 + +torchrun --nproc_per_node $tp_size -m examples.convert.model_slim.sparse_compressor --model_path ${w8a8s_weight_path} --save_directory ${w8a8sc_weight_path} + +cp 
$weight_path/modeling_chatglm.py $w8a8sc_weight_path/ \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/quant_chatglm_w4a16.py b/mindie/examples/models/chatglm/v2_6b/quant_chatglm_w4a16.py new file mode 100644 index 00000000..e0727102 --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/quant_chatglm_w4a16.py @@ -0,0 +1,50 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig, AntiOutlier +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig +from examples.models.chatglm.v2_6b.quant_utils \ + import parse_args, get_model_and_tokenizer, get_calib_dataset, copy_config_files, read_dataset + + +disable_names = [ + 'transformer.encoder.layers.0.mlp.dense_4h_to_h', + 'transformer.encoder.layers.1.mlp.dense_4h_to_h', + 'transformer.encoder.layers.2.self_attention.query_key_value', + 'transformer.encoder.layers.2.mlp.dense_4h_to_h', + 'transformer.output_layer' +] + + +def main(): + args = parse_args() + fp16_path = args.model_path # 原始浮点模型路径 + model, tokenizer = get_model_and_tokenizer(fp16_path) + + calib_set = read_dataset(args.dataset_path) + dataset_calib = get_calib_dataset(tokenizer, calib_set[:1]) + + w_sym = True + anti_config = AntiOutlierConfig(a_bit=16, w_bit=4, anti_method="m3", dev_type="cpu", w_sym=w_sym) + anti_outlier = AntiOutlier(model, calib_data=dataset_calib, cfg=anti_config) + anti_outlier.process() + quant_config = QuantConfig( + a_bit=16, + w_bit=4, + disable_names=disable_names, + dev_type='cpu', + w_sym=w_sym, + mm_tensor=False, + is_lowbit=True, + open_outlier=False, + group_size=args.group_size + ) + + calibrator = Calibrator(model, quant_config, calib_data=[], disable_level='L0') + calibrator.run() # 执行PTQ量化校准 + calibrator.save(args.save_path, save_type=["safe_tensor"]) # "safe_tensor"对应safetensors格式权重 + copy_config_files(fp16_path, args.save_path, 'w4a16') + + +if __name__ == '__main__': + main() + \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/quant_chatglm_w8a8.py b/mindie/examples/models/chatglm/v2_6b/quant_chatglm_w8a8.py new file mode 100644 index 00000000..c41f849f --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/quant_chatglm_w8a8.py @@ -0,0 +1,59 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
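+# Usage sketch (paths are placeholders), following the w8a8 export steps in
+# examples/models/chatglm/v2_6b/README.md:
+#   export OMP_NUM_THREADS=48
+#   python quant_chatglm_w8a8.py \
+#       --model_path /path/to/chatglm2-6b \
+#       --save_path /path/to/chatglm2-6b-w8a8 \
+#       --dataset_path /path/to/CEval/val/Other/civil_servant.jsonl
+# Expected outputs in --save_path: quant_model_weight_w8a8.safetensors and
+# quant_model_description_w8a8.json, with "quantize": "w8a8" written into config.json.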
+ +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig +from examples.models.chatglm.v2_6b.quant_utils \ + import parse_args, get_model_and_tokenizer, get_calib_dataset, copy_config_files, read_dataset + + +disable_names = [ + 'transformer.encoder.layers.0.self_attention.query_key_value', + 'transformer.encoder.layers.0.mlp.dense_4h_to_h', + 'transformer.encoder.layers.1.self_attention.query_key_value', + 'transformer.encoder.layers.1.mlp.dense_h_to_4h', + 'transformer.encoder.layers.1.mlp.dense_4h_to_h', + 'transformer.encoder.layers.2.self_attention.query_key_value', + 'transformer.encoder.layers.2.mlp.dense_h_to_4h', + 'transformer.encoder.layers.2.mlp.dense_4h_to_h', + 'transformer.encoder.layers.3.self_attention.query_key_value', + 'transformer.encoder.layers.4.self_attention.query_key_value', + 'transformer.encoder.layers.5.self_attention.query_key_value', + 'transformer.encoder.layers.6.self_attention.query_key_value', + 'transformer.encoder.layers.7.self_attention.query_key_value', + 'transformer.encoder.layers.8.self_attention.query_key_value', + 'transformer.encoder.layers.9.self_attention.query_key_value', + 'transformer.encoder.layers.11.self_attention.query_key_value', + 'transformer.encoder.layers.14.self_attention.query_key_value', + 'transformer.encoder.layers.19.self_attention.query_key_value', + 'transformer.encoder.layers.20.mlp.dense_4h_to_h', + 'transformer.encoder.layers.27.mlp.dense_4h_to_h', + 'transformer.output_layer' +] + +quant_config = QuantConfig( + a_bit=8, + w_bit=8, + disable_names=disable_names, + dev_type='cpu', + act_method=1, + pr=1.0, + w_sym=True, + mm_tensor=False +) + + +def main(): + args = parse_args() + fp16_path = args.model_path # 原始浮点模型路径 + model, tokenizer = get_model_and_tokenizer(fp16_path) + + calib_set = read_dataset(args.dataset_path) + dataset_calib = get_calib_dataset(tokenizer, calib_set) + calibrator = Calibrator(model, quant_config, calib_data=dataset_calib, disable_level='L0') + calibrator.run() # 执行PTQ量化校准 + calibrator.save(args.save_path, save_type=["safe_tensor"]) # "safe_tensor"对应safetensors格式权重 + copy_config_files(fp16_path, args.save_path, 'w8a8') + + +if __name__ == '__main__': + main() + \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/quant_utils.py b/mindie/examples/models/chatglm/v2_6b/quant_utils.py new file mode 100644 index 00000000..6d8d8cfe --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/quant_utils.py @@ -0,0 +1,62 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
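+# Shared helpers for the ChatGLM2/ChatGLM3 quantization scripts
+# (quant_chatglm_w8a8.py and quant_chatglm_w4a16.py):
+#   parse_args              - command-line arguments (--model_path/--save_path/--dataset_path/--group_size)
+#   get_model_and_tokenizer - load the float model and tokenizer on CPU in float32
+#   read_dataset            - read a jsonl calibration file into a list of samples
+#   get_calib_dataset       - tokenize calibration samples into input_ids/position_ids/attention_mask tensors
+#   copy_config_files       - copy config/tokenizer/modeling files and set the "quantize" field in config.json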
+ +import os +import json +import shutil +import argparse +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +from atb_llm.utils.file_utils import safe_open + + +def parse_args(): + parser = argparse.ArgumentParser(description="Creating quant weights for ChatGLM2-6B or ChatGLM3-6B") + parser.add_argument("--model_path", type=str, required=True, help="The path to model float weights") + parser.add_argument("--save_path", type=str, default="./quant_weight_glm", help="The path to save quant weights") + parser.add_argument("--dataset_path", type=str, required=True, help="The dataset path") + parser.add_argument("--group_size", type=int, default=128, help="The group size for w4a16") + + return parser.parse_args() + + +def get_model_and_tokenizer(model_path): + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_path, torch_dtype=torch.float32, + trust_remote_code=True).cpu() + model.eval() + return model, tokenizer + + +def read_dataset(dataset_path): + calib_set = [] + with safe_open(dataset_path, encoding='utf-8') as file: + for line in file: + calib_set.append(json.loads(line)) + return calib_set + + +# 获取校准数据函数定义 +def get_calib_dataset(tokenizer, calib_list, device="cpu"): # device="npu:0" 如果需要使用npu进行量化 + calib_dataset = [] + for calib_data in calib_list: + text = calib_data['inputs_pretokenized'] + inputs = tokenizer([text], return_tensors='pt') + calib_dataset.append([ + inputs.data['input_ids'].to(device), + inputs.data['position_ids'].to(device), + inputs.data['attention_mask'].to(device) + ]) + return calib_dataset + + +def copy_config_files(fp16_path, quant_path, quant_type): + model_files = [f for f in os.listdir(fp16_path) if f.startswith(("config", "tokeniz", "modeling_chatglm.py"))] + for f in model_files: + shutil.copy2(os.path.join(fp16_path, f), os.path.join(quant_path, f)) + with safe_open(os.path.join(quant_path, "config.json"), 'r+', encoding='utf-8') as f: + config = json.load(f) + config['quantize'] = quant_type + f.seek(0) + json.dump(config, f, indent=4) + f.truncate() diff --git a/mindie/examples/models/chatglm/v2_6b/run_300i_duo_pa.sh b/mindie/examples/models/chatglm/v2_6b/run_300i_duo_pa.sh new file mode 100644 index 00000000..a9cdbb4d --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/run_300i_duo_pa.sh @@ -0,0 +1,18 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export BIND_CPU=1 +export ASCEND_RT_VISIBLE_DEVICES=0,1 +export TP_WORLD_SIZE=2 +export MASTER_PORT=20030 + +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export HCCL_BUFFSIZE=110 +export INT8_FORMAT_NZ_ENABLE=1 + +export PYTHONPATH=${llm_path}:$PYTHONPATH +torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1 \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/run_800i_a2_pa.sh b/mindie/examples/models/chatglm/v2_6b/run_800i_a2_pa.sh new file mode 100644 index 00000000..52d9052e --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/run_800i_a2_pa.sh @@ -0,0 +1,27 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
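+# Usage sketch (weight path is a placeholder; see the 800I A2 section of
+# examples/models/chatglm/v2_6b/README.md):
+#   bash run_800i_a2_pa.sh /path/to/chatglm2-6b-weights
+# The defaults below run single-card (TP_WORLD_SIZE=1); for multi-card tensor
+# parallelism raise TP_WORLD_SIZE and expose matching ASCEND_RT_VISIBLE_DEVICES.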
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0 +export TP_WORLD_SIZE=1 +export MASTER_PORT=20030 +export PYTHONPATH=${llm_path}:$PYTHONPATH +export IS_BF16=false + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + +export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export LCCL_ENABLE_FALLBACK=1 + +extra_param="" + +# if [ "$IS_BF16" = true ]; then +# extra_param="${extra_param} --is_bf16" +# fi + +if [ "$TP_WORLD_SIZE" == "1" ]; then python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi \ No newline at end of file diff --git a/mindie/examples/models/chatglm/v2_6b/web_demo.patch b/mindie/examples/models/chatglm/v2_6b/web_demo.patch new file mode 100644 index 00000000..1292893f --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/web_demo.patch @@ -0,0 +1,109 @@ +diff --git a/web_demo.py b/web_demo.py +index 1af24c9..8c0e765 100644 +--- a/web_demo.py ++++ b/web_demo.py +@@ -1,14 +1,23 @@ +-from transformers import AutoModel, AutoTokenizer ++import json ++import argparse ++import requests ++from transformers import AutoTokenizer + import gradio as gr + import mdtex2html +-from utils import load_model_on_gpus + +-tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True) +-model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).cuda() +-# 多显卡支持,使用下面两行代替上面一行,将num_gpus改为你实际的显卡数量 +-# from utils import load_model_on_gpus +-# model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2) +-model = model.eval() ++def parse_args(): ++ parser = argparse.ArgumentParser(description="ChatGLM2-6B/ChatGLM3-6b web demo") ++ parser.add_argument("--model_path", type=str, required=True, help="The path to model weights") ++ parser.add_argument("--mindie_sever_ip", type=str, default="127.0.0.1", help="The IP address of mindie server") ++ parser.add_argument("--mindie_sever_port", type=int, default=1025, help="The port of mindie server") ++ parser.add_argument("--max_new_tokens", type=int, default=512, help="Max new tokens to generate") ++ parser.add_argument("--concurrency", type=int, default=10, help="Concurrency count of web demo") ++ ++ return parser.parse_args() ++ ++ ++args = parse_args() ++tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + + """Override Chatbot.postprocess""" + +@@ -71,6 +80,49 @@ def predict(input, chatbot, max_length, top_p, temperature, history, past_key_va + yield chatbot, history, past_key_values + + ++def build_inputs(tokenizer, query: str): ++ # history由服务化内部自行处理 ++ prompt = tokenizer.build_prompt(query, history=None) ++ return prompt ++ ++ ++def request(input, chatbot, max_length, top_p, temperature, history, past_key_values): ++ chatbot.append((parse_text(input), "")) ++ ++ # 添加prompt格式以支持chat ++ promt = build_inputs(tokenizer, input) ++ ++ response = requests.post( ++ f"http://{args.mindie_sever_ip}:{args.mindie_sever_port}/generate_stream", ++ json={ ++ "inputs": promt, ++ "parameters": { ++ "max_new_tokens": max_length, ++ "do_sample": True, ++ "repetition_penalty": 1.05, ++ "seed": None, ++ "temperature": temperature, ++ # "top_k": 1, ++ "top_p": top_p, ++ "batch_size": 1 ++ }, ++ }, ++ verify=False, stream=True ++ ) ++ ++ generate_text = "" ++ for line in response.iter_lines(): ++ if not line: ++ continue ++ # 
删除字符串开头的'data: ' ++ res = line.decode('utf-8')[6:] ++ # 获取流式生成的文本内容 ++ res_text = json.loads(res).get('token').get('text') ++ generate_text += res_text ++ chatbot[-1] = (parse_text(input), parse_text(generate_text)) ++ yield chatbot, history, past_key_values ++ ++ + def reset_user_input(): + return gr.update(value='') + +@@ -92,17 +144,17 @@ with gr.Blocks() as demo: + submitBtn = gr.Button("Submit", variant="primary") + with gr.Column(scale=1): + emptyBtn = gr.Button("Clear History") +- max_length = gr.Slider(0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True) +- top_p = gr.Slider(0, 1, value=0.8, step=0.01, label="Top P", interactive=True) +- temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True) ++ max_length = gr.Slider(1, args.max_new_tokens, value=args.max_new_tokens, step=1.0, label="Maximum New Tokens", interactive=True) ++ top_p = gr.Slider(0.01, 0.99, value=0.01, step=0.01, label="Top P", interactive=True) ++ temperature = gr.Slider(0.01, 1, value=0.01, step=0.01, label="Temperature", interactive=True) + + history = gr.State([]) + past_key_values = gr.State(None) + +- submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history, past_key_values], ++ submitBtn.click(request, [user_input, chatbot, max_length, top_p, temperature, history, past_key_values], + [chatbot, history, past_key_values], show_progress=True) + submitBtn.click(reset_user_input, [], [user_input]) + + emptyBtn.click(reset_state, outputs=[chatbot, history, past_key_values], show_progress=True) + +-demo.queue().launch(share=False, inbrowser=True) ++demo.queue(concurrency_count=args.concurrency).launch(server_name='0.0.0.0', share=False, inbrowser=True) diff --git a/mindie/examples/models/chatglm/v2_6b/web_requirements.txt b/mindie/examples/models/chatglm/v2_6b/web_requirements.txt new file mode 100644 index 00000000..2f062412 --- /dev/null +++ b/mindie/examples/models/chatglm/v2_6b/web_requirements.txt @@ -0,0 +1,3 @@ +gradio==3.39 +mdtex2html +streamlit diff --git a/mindie/examples/models/chatglm/v3_6b/README.md b/mindie/examples/models/chatglm/v3_6b/README.md new file mode 100644 index 00000000..ce83cf38 --- /dev/null +++ b/mindie/examples/models/chatglm/v3_6b/README.md @@ -0,0 +1,33 @@ +# ChatGLM3-6B 模型推理指导 + +# 概述 + +- ChatGLM3 是智谱AI和清华大学 KEG 实验室联合发布的对话预训练模型。ChatGLM3-6B 是 [ChatGLM3]((https://github.com/THUDM/ChatGLM3)) 系列中的开源模型,在保留了前两代模型对话流畅、部署门槛低等众多优秀特性的基础上,ChatGLM3-6B 有更强大的基础模型、更完整的功能支持、和更全面的开源序列。 +- 此代码仓中实现了一套基于NPU硬件的ChatGLM3-6B推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了ChatGLM3-6B模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE | TGI | 长序列 | +|-------------|-------------------------|-------------------------|------|------|-----------------|-----------------|---------|--------------|----------|--------|--------|-----|-----|-----|-----| +| ChatGLM3-6B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 是 | 否 | 否 | + +- 此模型仓已适配的模型版本 + - [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b) + - [ChatGLM3-6B-32K](https://huggingface.co/THUDM/chatglm3-6b-32k) + - 注:ChatGLM3-6B 推荐使用commit id为 `a5ba5501eb873d40d48bd0983bd2a8dd006bb838` 的模型仓版本 + + +# 使用说明 + +- 参考[此README文件](../../chatglm/v2_6b/README.md) + +## 精度测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## 性能测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## FAQ +- 
`import torch_npu`遇到`xxx/libgomp.so.1: cannot allocate memory in static TLS block`报错,可通过配置`LD_PRELOAD`解决。 + - 示例:`export LD_PRELOAD=/lib/aarch64-linux-gnu/libgomp.so.1:$LD_PRELOAD` \ No newline at end of file diff --git a/mindie/examples/models/chinese_alpaca/README.md b/mindie/examples/models/chinese_alpaca/README.md new file mode 100644 index 00000000..73736e6a --- /dev/null +++ b/mindie/examples/models/chinese_alpaca/README.md @@ -0,0 +1,99 @@ +# README + +[Chinese-LLaMA-Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) 项目开源了中文LLaMA模型和指令精调的Alpaca大模型,以进一步促进大模型在中文NLP社区的开放研究。这些模型在原版LLaMA的基础上扩充了中文词表并使用了中文数据进行二次预训练,进一步提升了中文基础语义理解能力。同时,中文Alpaca模型进一步使用了中文指令数据进行精调,显著提升了模型对指令的理解和执行能力。 + +- 此代码仓中实现了一套基于NPU硬件的Chinese-LLaMA-Alpaca系列模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各Chinese-LLaMA-Alpaca模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE | TGI | +|-------------|-------------------------|-------------------------|------|------|-----------------|-----------------|---------|---------|--------------|----------|--------|--------|-----| +| Chinese-Alpaca-13B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径;若使用编译好的包,则路径为`${working_dir}/`;若使用gitee下载的代码,则路径为`${working_dir}/ModelLink/mindie_ref/mindie_llm/atb_models` | +| script_path | 脚本所在路径;Chinese-Alpaca-13B的工作脚本所在路径为`${llm_path}/examples/models/chinese_alpaca` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** + +- lora权重: [Chinese-Alpaca-Lora-13B](https://pan.baidu.com/s/1wYoSF58SnU9k0Lndd5VEYg?pwd=mm8i) +- 原模型权重: [LLaMA-13B](https://huggingface.co/huggyllama/llama-13b) +> 下载后务必检查压缩包中模型文件的SHA256是否一致,请查看[SHA256.md](https://github.com/ymcui/Chinese-LLaMA-Alpaca/blob/main/SHA256.md) + +**lora权重合并** +- 合并lora权重和原模型权重,请参考[合并教程](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/%E6%89%8B%E5%8A%A8%E6%A8%A1%E5%9E%8B%E5%90%88%E5%B9%B6%E4%B8%8E%E8%BD%AC%E6%8D%A2#%E5%A4%9Alora%E6%9D%83%E9%87%8D%E5%90%88%E5%B9%B6%E9%80%82%E7%94%A8%E4%BA%8Echinese-alpaca-plus) + +**权重转换** +> 若权重中不包含safetensors格式,则执行权重转换步骤,否则跳过 +- 参考[此README文件](../../README.md) + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 将`${llm_path}`加入`PYTHONPATH`搜索目录 + ```shell + export PYTHONPATH=${llm_path}:${PYTHONPATH} + ``` + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 full_CEval 1 llama 
${Chinese-Alpaca-13B权重路径} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${Chinese-Alpaca-13B权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/chinese_alpaca/run_pa.sh b/mindie/examples/models/chinese_alpaca/run_pa.sh new file mode 100644 index 00000000..ac0f784b --- /dev/null +++ b/mindie/examples/models/chinese_alpaca/run_pa.sh @@ -0,0 +1,23 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 +fi \ No newline at end of file diff --git a/mindie/examples/models/codegeex/v2_6b/README.md b/mindie/examples/models/codegeex/v2_6b/README.md new file mode 100644 index 00000000..47e5302c --- /dev/null +++ b/mindie/examples/models/codegeex/v2_6b/README.md @@ -0,0 +1,57 @@ +# CodeGeeX2-6B 模型推理指导 + +# 概述 + +- [CodeGeeX2-6B](https://github.com/THUDM/CodeGeeX2) 是多语言代码生成模型 [CodeGeeX](https://github.com/THUDM/CodeGeeX) ([KDD’23](https://arxiv.org/abs/2303.17568)) 的第二代模型。不同于一代 CodeGeeX(完全在国产华为昇腾芯片平台训练) ,CodeGeeX2 是基于 [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B) 架构加入代码预训练实现,得益于 ChatGLM2 的更优性能,CodeGeeX2 在多项指标上取得性能提升(+107% > CodeGeeX;仅60亿参数即超过150亿参数的 StarCoder-15B 近10%)。 +- 此代码仓中实现了一套基于NPU硬件的CodeGeeX2推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了CodeGeeX2-6B模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE | TGI | 长序列 | +|-------------|-------------------------|-------------------------|------|------|-----------------|-----------------|---------|--------------|----------|--------|--------|-----|-----|-----|-----| +| CodeGeeX2-6B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | 是 | 否 | 否 | 是 | 是 | 否 | 否 | 否 | 否 | 否 | 是 | 是 | 否 | + +- 此模型仓已适配的模型版本 + - [CodeGeeX2-6B](https://huggingface.co/THUDM/codegeex2-6b/tree/main) + + +# 使用说明 + +- 执行推理前需要将权重目录下的config.json中的`torch_dtype`改为`"float16"` +- 除了“量化权重导出”章节,其余均参考[此README文件](../../chatglm/v2_6b/README.md) +## 量化权重导出 +量化权重可通过msmodelslim(昇腾压缩加速工具)实现。 + +### 环境准备 +环境配置可参考msmodelslim官网:https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/devtools/auxiliarydevtool/modelslim_0002.html + +### 导出量化权重 +通过`${llm_path}/examples/models/codegeex/v2_6b/quant_codegeex2_6b_w8a8.py`文件导出模型的量化权重(注意量化权重不要和浮点权重放在同一个目录下): +```shell +python quant_codegeex2_6b_w8a8.py --model_path ${浮点权重路径} --save_path ${量化权重保存路径} --dataset_path ${校准数据集路径} 
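+# Example with placeholder model/save paths; the calibration set follows the note after this block:
+# python quant_codegeex2_6b_w8a8.py --model_path /path/to/codegeex2-6b --save_path /path/to/codegeex2-6b-w8a8 --dataset_path ${llm_path}/tests/modeltest/dataset/full/BoolQ/dev.jsonl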
+``` +校准数据集采用 `${llm_path}/tests/modeltest/dataset/full/BoolQ/dev.jsonl` + +导出量化权重后应生成`quant_model_weight_w8a8.safetensors`和`quant_model_description_w8a8.json`两个文件。 + +注: + +1.quant_codegeex2_6b_w8a8.py文件中已配置好较优的量化策略,导出量化权重时可直接使用,也可修改为其它策略。 + +2.执行脚本生成量化权重时,会在生成的权重路径的config.json文件中添加(或修改)`quantize`字段,值为相应量化方式,当前仅支持`w8a8`。 + +3.执行完以上步骤后,执行量化模型只需要替换权重路径。 + +4.如果生成权重时遇到`OpenBLAS Warning: Detect OpenMP Loop and this application may hang. Please rebuild the library with USE_OPENMP = 1 option`,可通过设置`export OMP_NUM_THREADS=1`来关闭多线程规避。 + + +## 精度测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## 性能测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## FAQ +- `import torch_npu`遇到`xxx/libgomp.so.1: cannot allocate memory in static TLS block`报错,可通过配置`LD_PRELOAD`解决。 + - 示例:`export LD_PRELOAD=/lib/aarch64-linux-gnu/libgomp.so.1:$LD_PRELOAD` \ No newline at end of file diff --git a/mindie/examples/models/codegeex/v2_6b/quant_codegeex2_6b_w8a8.py b/mindie/examples/models/codegeex/v2_6b/quant_codegeex2_6b_w8a8.py new file mode 100644 index 00000000..0c9d086f --- /dev/null +++ b/mindie/examples/models/codegeex/v2_6b/quant_codegeex2_6b_w8a8.py @@ -0,0 +1,93 @@ +import os +import json +import shutil +import argparse + +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + +from atb_llm.utils.file_utils import safe_open + + +def parse_args(): + parser = argparse.ArgumentParser(description="Creating quant weights for CodeGeex2-6B") + parser.add_argument("--model_path", type=str, required=True, help="The path to model float weights") + parser.add_argument("--save_path", type=str, default="./quant_weight_geex", help="The path to save quant weights") + parser.add_argument("--dataset_path", type=str, required=True, help="The dataset path") + + return parser.parse_args() + + +# 获取校准数据函数定义 +def get_calib_dataset(tokenizer, calib_list, device="cpu"): # device="npu:0" 如果需要使用npu进行量化 + calib_dataset = [] + for calib_data in calib_list: + inputs = tokenizer(calib_data, return_tensors='pt') + calib_dataset.append([ + inputs.data['input_ids'].to(device), + inputs.data['position_ids'].to(device), + inputs.data['attention_mask'].to(device) + ]) + return calib_dataset + + +disable_names = ['transformer.encoder.layers.0.self_attention.query_key_value', +'transformer.encoder.layers.0.mlp.dense_4h_to_h', +'transformer.encoder.layers.1.self_attention.query_key_value', +'transformer.encoder.layers.1.mlp.dense_h_to_4h', +'transformer.encoder.layers.1.mlp.dense_4h_to_h', +'transformer.encoder.layers.2.self_attention.query_key_value', +'transformer.encoder.layers.2.mlp.dense_h_to_4h', +'transformer.encoder.layers.2.mlp.dense_4h_to_h', +'transformer.encoder.layers.3.self_attention.query_key_value', +'transformer.encoder.layers.4.self_attention.query_key_value', +'transformer.encoder.layers.5.self_attention.query_key_value', +'transformer.encoder.layers.6.self_attention.query_key_value', +'transformer.encoder.layers.7.self_attention.query_key_value', +'transformer.encoder.layers.8.self_attention.query_key_value', +'transformer.encoder.layers.9.self_attention.query_key_value', +'transformer.encoder.layers.11.self_attention.query_key_value', +'transformer.encoder.layers.17.mlp.dense_4h_to_h', +'transformer.encoder.layers.23.mlp.dense_4h_to_h', +'transformer.encoder.layers.27.mlp.dense_4h_to_h', +'transformer.output_layer'] + +quant_config = QuantConfig( + a_bit=8, + w_bit=8, + disable_names=disable_names, + dev_type='cpu', # 
dev_type="npu", dev_id=0 如果需要使用npu进行量化 + act_method=3, + pr=1.0, + w_sym=True, + mm_tensor=False +) + + +def main(): + args = parse_args() + fp16_path = args.model_path # 原始浮点模型路径 + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=fp16_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=fp16_path, trust_remote_code=True).float().cpu() + + calib_set = [] + with safe_open(args.dataset_path, 'r', encoding='utf-8') as file: + calib_set = file.readlines() + + dataset_calib = get_calib_dataset(tokenizer, calib_set[:5]) + calibrator = Calibrator(model, quant_config, calib_data=dataset_calib, disable_level='L0') + calibrator.run() # 执行PTQ量化校准 + calibrator.save(args.save_path, save_type=["safe_tensor"]) # "safe_tensor"对应safetensors格式权重,"numpy"对应npy格式权重 + + model_files = [f for f in os.listdir(args.model_path) if f.startswith(("config", "tokeniz", "modeling_chatglm.py"))] + for f in model_files: + shutil.copy2(os.path.join(args.model_path, f), os.path.join(args.save_path, f)) + with safe_open(os.path.join(args.save_path, "config.json"), 'r+', encoding='utf-8') as f: + config = json.load(f) + config['quantize'] = 'w8a8' + f.seek(0) + json.dump(config, f, indent=4) + f.truncate() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mindie/examples/models/codellama/README.md b/mindie/examples/models/codellama/README.md new file mode 100644 index 00000000..0d157df7 --- /dev/null +++ b/mindie/examples/models/codellama/README.md @@ -0,0 +1,172 @@ +# README + +- [Code Llama](https://github.com/Meta-Llama/codellama) 是Meta发布的代码生成类大语言模型,在编程任务上具备填充、0-shot指令跟随能力,并支持长序列文本输入,在开源模型中拥有先进的性能。Code Llama 是 Llama 2 的代码专用版本,它是通过在代码数据集上对 Llama 2 进行进一步训练,并在同一数据集上长时间采样更多数据而创建的。从本质上讲,Code Llama 具有更强的编码能力。它可以根据代码和自然语言提示(例如,"给我写一个输出斐波那契数列的函数")生成代码和有关代码的自然语言。它还可用于代码补全和调试。它支持许多当今最流行的编程语言,包括 Python、C++、Java、PHP、Typescript (Javascript)、C#、Bash 等。 + +- 此代码仓中实现了一套基于NPU硬件的Code Llama推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各CodeLlama模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| CodeLlama-7B | 支持world size 1,2,4,8 | 否 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | +| CodeLlama-13B | 支持world size 1,2,4,8 | 否 | 是 | 是 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 是 | 否 | 否 | +| CodeLlama-34B | 支持world size 4,8 | 支持world size 2,4,8 | 是 | 是 | 否 | 是 | 是 | 否 | 否 | 是 | 否 | 是 | 否 | 否 | +| CodeLlama-70B | 支持world size 4,8 | 否 | 是 | 是 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | ATB_Models模型仓所在路径;若使用编译好的包,则路径为`${working_dir}/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models/` | +| script_path | 脚本所在路径;CodeLlama的工作脚本所在路径为`${llm_path}/examples/models/codellama` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [CodeLlama-7B](https://huggingface.co/codellama/CodeLlama-7b-hf) +- [CodeLlama-13B](https://huggingface.co/codellama/CodeLlama-13b-hf) +- [CodeLlama-34B](https://huggingface.co/codellama/CodeLlama-34b-hf) +- [CodeLlama-70B](https://huggingface.co/codellama/CodeLlama-70b-hf) + +**权重转换** +> 
若权重中不包含safetensors格式,则执行权重转换步骤,否则跳过 +- 参考[此README文件](../../README.md) + +**量化权重生成** +> 基于原始的浮点权重,生成量化权重 + +- 设置环境变量 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + # 推荐使用transformers 4.33.0版本进行量化权重转换,执行模型推理时transformers的版本大于等于4.33.0 + pip uninstall transformers -y + pip install transformers=={指定版本} + # NPU多卡量化时关闭虚拟内存 + export PYTORCH_NPU_ALLOC_CONF=expandable_segments:False + # 指定当前机器上可用的逻辑NPU核心 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # 将`${llm_path}`加入`PYTHONPATH`搜索目录 + export PYTHONPATH=${llm_path}:${PYTHONPATH} + ``` +- W8A8量化权重请使用以下指令生成 + - Step 1 + - 修改模型权重config.json中`torch_dtype`字段为`float16` + - Step 2 W8A8量化权重生成 + ```shell + cd ${llm_path}/examples/models/codellama + python convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --act_method 3 --anti_method m2 --device_type npu --calib_file ./humaneval_python.json + ``` + > NPU多卡量化注意事项和环境要求见[此README中的【NPU多卡量化】章节](../../README.md) + +- 稀疏量化权重请使用以下指令生成 + > 稀疏量化方式生成的权重只支持在300I DUO硬件上推理 + - Step 1 + - 修改模型权重config.json中`torch_dtype`字段为`float16` + - Step 2 稀疏量化权重生成 + ```shell + cd ${llm_path}/examples/models/codellama + python convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8S量化权重路径} --w_bit 4 --a_bit 8 --act_method 2 --do_smooth True --use_sigma True --is_lowbit True --device_type npu --calib_file ./humaneval_python.json + ``` + - Step 3:量化权重切分及压缩 + > 运行前需要确保压缩工具编译过 + > + > `cd /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/msmodelslim/pytorch/weight_compression/compress_graph` + > + > `bash build.sh /usr/local/Ascend/ascend-toolkit/latest` + ```shell + torchrun --nproc_per_node {TP数} -m examples.convert.model_slim.sparse_compressor --model_path {W8A8S量化权重路径} --save_directory {W8A8SC量化权重路径} + ``` + > TP数为tensor parallel并行个数 + > 注意:若权重生成时以TP=4进行切分,则运行时也需以TP=4运行 + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行Paged Attention BF16** +- 运行启动脚本 + - 将`${llm_path}`加入`PYTHONPATH`搜索目录 + ```shell + export PYTHONPATH=${llm_path}:${PYTHONPATH} + ``` + - 在${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + export INT8_FORMAT_NZ_ENABLE=1 + ``` +- 300I DUO卡不支持BF16数据类型 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 与“运行Paged Attention BF16”的启动方式相同 +- 环境变量说明 + - 参见“运行Paged Attention BF16”中的环境变量说明 +- 相比于BF16,运行FP16时需修改${weight_path}/config.json中的`torch_dtype`字段,将此字段对应的值修改为`float16` + +**运行Paged Attention W8A8** +- W8A8量化权重生成 +- 运行启动脚本 + - 与“运行Paged Attention BF16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention BF16”中的环境变量说明 +- 相比于BF16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8` + - 若config.json中无此字段,则新增 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export 
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # 运行Paged Attention BF16 + bash run.sh pa_bf16 full_HumanEval 1 codellama ${weight_path} 8 + # 运行Paged Attention FP16 + bash run.sh pa_fp16 full_HumanEval 1 codellama ${weight_path} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # 运行Paged Attention BF16 + bash run.sh pa_bf16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 codellama ${weight_path} 8 + # 运行Paged Attention FP16 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 codellama ${weight_path} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/codellama/convert_quant_weights.py b/mindie/examples/models/codellama/convert_quant_weights.py new file mode 100644 index 00000000..595ca8f5 --- /dev/null +++ b/mindie/examples/models/codellama/convert_quant_weights.py @@ -0,0 +1,84 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import os +import torch + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from atb_llm.models.llama.modeling_llama import LlamaConfig +from atb_llm.utils.log import logger, print_log +from examples.convert.convert_utils import copy_tokenizer_files, modify_config +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier + + +if __name__ == "__main__": + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + + config = LlamaConfig.from_pretrained(args.model_path) + + disable_names = [] + if args.a_bit != 16: + # W8A16, W4A16没有回退层 + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + anti_outlier_config = None + if args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method, dev_type=args.device_type, dev_id=rank) + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + w_sym=args.w_sym, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + ) + + # 默认无校准数据集 + calibration_dataset = None + # 若存在calib_file,则使用calib_file作为校准数据集 + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file, key_name='prompt') + if args.calib_dataset_length <= len(calibration_dataset): + calibration_dataset = calibration_dataset[:args.calib_dataset_length] + print_log(rank, logger.info, f"calib_dataset_length: {args.calib_dataset_length}") + else: + print_log(rank, logger.warning, + f"calib_dataset_length is too large, use default {len(calibration_dataset)}") + quant_weight_generator = Quantifier( + args.model_path, quant_config, anti_outlier_config, + device_type=args.device_type, tokenizer_args={"padding_side": "left"} + ) + quant_weight_generator.tokenizer.pad_token_id = 2 + + tokenized_data = None + if 
calibration_dataset is not None: + dataloader = torch.utils.data.DataLoader(calibration_dataset, batch_size=4) + tokenized_data = quant_weight_generator.get_tokenized_data(dataloader) + + quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + #为适配工具稀疏量化传入w_bit=4,a_bit=8暂时修改quant_type + quant_type = f"w{args.w_bit}a{args.a_bit}" + ("s" if (args.co_sparse or args.is_lowbit) else "") + is_sparseCompress = args.w_bit == 4 and args.a_bit == 8 and (args.co_sparse or args.is_lowbit) + if is_sparseCompress: + quant_type = "w8a8s" + + modify_config( + args.model_path, args.save_directory, config.torch_dtype, + quant_type + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/codellama/humaneval_python.json b/mindie/examples/models/codellama/humaneval_python.json new file mode 100644 index 00000000..5dbd2285 --- /dev/null +++ b/mindie/examples/models/codellama/humaneval_python.json @@ -0,0 +1,7 @@ +{"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} +{"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n"} +{"task_id": "HumanEval/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "entry_point": "truncate_number", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate(3.5) == 0.5\n assert abs(candidate(1.33) - 0.33) < 1e-6\n assert abs(candidate(123.456) - 0.456) < 1e-6\n"} +{"task_id": "HumanEval/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "entry_point": "below_zero", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == False\n assert candidate([1, 2, -3, 1, 2, -3]) == False\n assert candidate([1, 2, -4, 5, 6]) == True\n assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True\n"} +{"task_id": "HumanEval/7", "prompt": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n", "entry_point": "filter_by_substring", "canonical_solution": " return [x for x in strings if substring in x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([], 'john') == []\n assert candidate(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']\n assert candidate(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']\n assert candidate(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']\n"} +{"task_id": "HumanEval/65", "prompt": "\ndef circular_shift(x, shift):\n \"\"\"Circular shift the digits of the integer x, shift the digits right by shift\n and return the result as a string.\n If shift > number of digits, return digits reversed.\n >>> circular_shift(12, 1)\n \"21\"\n >>> circular_shift(12, 2)\n \"12\"\n \"\"\"\n", "entry_point": "circular_shift", "canonical_solution": " s = str(x)\n if shift > len(s):\n return s[::-1]\n else:\n return s[len(s) - shift:] + s[:len(s) - shift]\n", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate(100, 2) == \"001\"\n assert candidate(12, 2) == \"12\"\n assert candidate(97, 8) == \"79\"\n assert candidate(12, 1) == \"21\", \"This prints if this assert fails 1 (good for debugging!)\"\n\n # Check some edge cases that are easy to work out by hand.\n assert candidate(11, 101) == \"11\", \"This prints if this assert fails 2 (also good for debugging!)\"\n\n"} +{"task_id": "HumanEval/79", "prompt": "\ndef decimal_to_binary(decimal):\n \"\"\"You will be given a number in decimal form and your task is to convert it to\n binary format. The function should return a string, with each character representing a binary\n number. 
Each character in the string will be '0' or '1'.\n\n There will be an extra couple of characters 'db' at the beginning and at the end of the string.\n The extra characters are there to help with the format.\n\n Examples:\n decimal_to_binary(15) # returns \"db1111db\"\n decimal_to_binary(32) # returns \"db100000db\"\n \"\"\"\n", "entry_point": "decimal_to_binary", "canonical_solution": " return \"db\" + bin(decimal)[2:] + \"db\"\n", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate(0) == \"db0db\"\n assert candidate(32) == \"db100000db\"\n assert candidate(103) == \"db1100111db\"\n assert candidate(15) == \"db1111db\", \"This prints if this assert fails 1 (good for debugging!)\"\n\n # Check some edge cases that are easy to work out by hand.\n assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n\n"} \ No newline at end of file diff --git a/mindie/examples/models/codellama/run_pa.sh b/mindie/examples/models/codellama/run_pa.sh new file mode 100644 index 00000000..ac0f784b --- /dev/null +++ b/mindie/examples/models/codellama/run_pa.sh @@ -0,0 +1,23 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 +fi \ No newline at end of file diff --git a/mindie/examples/models/codeshell/README.md b/mindie/examples/models/codeshell/README.md new file mode 100644 index 00000000..d4549411 --- /dev/null +++ b/mindie/examples/models/codeshell/README.md @@ -0,0 +1,33 @@ +# CodeShell-7B 模型推理指导 + +# 概述 + +- [CodeShell-7B](https://github.com/WisdomShell/codeshell)是北京大学知识计算实验室联合四川天府银行AI团队研发的多语言代码大模型基座。它拥有70亿参数,经过对五千亿Tokens的训练,并具有8192的上下文窗口长度。CodeShell在权威的代码评估Benchmark(HumanEval与MBPP)上取得了同等规模最好的性能。这个项目为多语言代码处理和理解提供了有力的工具。 +- 此代码仓中实现了一套基于NPU硬件的CodeShell推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了CodeShell-7B模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE | TGI | 长序列 | +|-------------|-------------------------|-------------------------|------|------|-----------------|-----------------|---------|--------------|----------|--------|--------|-----|-----|-----|-----| +| CodeShell-7B | 支持world size 1,2,4,8 | 支持world size 1,2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +- 此模型仓已适配的模型版本 + - [CodeShell-7B](https://huggingface.co/WisdomShell/CodeShell) + + +# 使用说明 + +- 执行推理前需要将权重目录下的config.json中的`torch_dtype`改为`"float16"` +- 修改config.json中的`model_type`改为`"codeshell"` + + +## 精度测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## 性能测试 +- 参考[此README文件](../../../../tests/modeltest/README.md) + +## FAQ +- `import torch_npu`遇到`xxx/libgomp.so.1: cannot allocate memory in static TLS block`报错,可通过配置`LD_PRELOAD`解决。 + - 示例:`export 
LD_PRELOAD=/lib/aarch64-linux-gnu/libgomp.so.1:$LD_PRELOAD` \ No newline at end of file diff --git a/mindie/examples/models/deepseek/README_DeepSeek_Coder.md b/mindie/examples/models/deepseek/README_DeepSeek_Coder.md new file mode 100644 index 00000000..5ce2506f --- /dev/null +++ b/mindie/examples/models/deepseek/README_DeepSeek_Coder.md @@ -0,0 +1,112 @@ +# README + +- [Deepseek]是由一系列代码语言模型组成。提供 1.3B、6.7B、7B 和 33B 的型号尺寸,使用者能够选择最适合其要求的设置。(当前脚本支持1.3B、6.7B、7B和33B) + +- 此代码仓中实现了一套基于NPU硬件的Deepseek-Coder模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各DeepSeek-Coder模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化(仅300I DUO支持) | MOE | MindIE | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| DeepSeek-Coder-1.3B | 支持world size 1,2,4,8 | × | × | √ | √ | √ | × | × | × | × | × | × | × |×| +| DeepSeek-Coder-6.7B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | × | × | × | × | × | × | × |×| +| DeepSeek-Coder-7B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | × | × | × | × | × | × | × |×| +| DeepSeek-Coder-33B | 支持world size 4,8 | × | × | √ | √ | √ | × | × | × | × | × | × | × |×| + +- 此模型仓已适配的模型版本 + - [DeepSeek-Coder系列](https://github.com/deepseek-ai/DeepSeek-Coder) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Deepseek-Coder的工作脚本所在路径为`${llm_path}/examples/models/deepseek` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [Deepseek-Coder-1.3B](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct) +- [Deepseek-Coder-6.7B](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) +- [Deepseek-Coder-7B](https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5) +- [Deepseek-Coder-33B](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +**权重转换** +- 参考[此README文件](../../README.md) + +**量化权重生成** +- 暂不支持 + + +## 推理 + +### 对话测试 +**运行Paged Attention FP16** +- 运行启动脚本 (chat_template接口 transformers版本需求:4.34.0) + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 启动脚本中可设置自定义问题,具体在input_text后面修改即可 +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 
1 deepseek_coder ${deepseek-coder-1.3b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 deepseek_coder ${deepseek-coder-6.7b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 deepseek_coder ${deepseek-coder-7b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 deepseek_coder ${deepseek-coder-33b权重路径} 8 + ``` +- 运行量化权重和BF16时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek_coder ${deepseek-coder-1.3b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek_coder ${deepseek-coder-6.7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek_coder ${deepseek-coder-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek_coder ${deepseek-coder-33b权重路径} 8 + ``` +- 运行量化权重和BF16时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_pa.py`;这个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/deepseek/README_deepseek_llm.md b/mindie/examples/models/deepseek/README_deepseek_llm.md new file mode 100644 index 00000000..60c59743 --- /dev/null +++ b/mindie/examples/models/deepseek/README_deepseek_llm.md @@ -0,0 +1,101 @@ +# README + +- [DeepSeek-LLM](https://github.com/deepseek-ai/deepseek-LLM)从包含2T token的中英文混合数据集中,训练得到7B Base、7B Chat、67B Base与67B Chat四种模型 + +# 支持特性 +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化(仅300I DUO支持) | MOE | MindIE | TGI |长序列| +|------------------|----------------------------|-----------------------------|------|---------------------|-----------------|-----------------|---------|-----------|--------------|------------------------|-----|--------|-----|-----| +| DeepSeek-LLM-7B | 支持world size 1,2,4,8 | 支持world size 1,2,4,8 | √ | × | × | √ | × | × | × | × | × | × | × |× | +| DeepSeek-LLM-67B | 支持world size 8 | × | √ | × | × | √ | × | × | × | × | × | × | × |× | + + +# 使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +| --------------| --------------------------------| +| `working_dir` | 加速库及模型库下载后放置的目录 | +| `llm_path` | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用 gitee 下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| `script_path` | 脚本所在路径;Deepseek-LLM的工作脚本所在路径为`${llm_path}/examples/models/deepseek` | +| `weight_path` | 模型权重路径 | + +## 权重 + +### 权重下载 +- [Deepseek-LLM-7B-Base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base) +- [Deepseek-LLM-7B-Chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat) +- [Deepseek-LLM-67B-Chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-base) +- [Deepseek-LLM-67B-Chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat) + +### 权重转换 +- 当前仅支持加载safetensor格式的权重文件,若权重文件为bin格式,请参考[此README文件](../../README.md) + + +## 基础环境变量 +- 参考[此 README 文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行 Paged Attention FP16** +- 运行启动脚本(`transformers` 版本需求:>=4.35.0) + - 在`${llm_path}`目录下执行以下指令 + 
```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 启动脚本中可设置自定义问题,具体在 input_text 后面修改即可 (默认问题为"Who is the CEO of Google?") +- 启动脚本中可设置自定义输出长度,具体在 max_output_length 后面修改即可(默认长度为 10) +- 若当前所用权重版本为"chat"版本,请将"--is_chat_model"赋值给 extra_param;若当前所用权重版本为"base"版本,可以将空字符串赋值给 extra_param(默认为 chat_model) +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑 NPU 核心,多个核心间使用逗号相连 + - 核心 ID 查阅方式见[此 README 文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于 300I DUO 卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用 20030 端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=1 + export INT8_FORMAT_NZ_ENABLE=1 + ``` + +## 精度测试 +- 参考[此 README 文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 deepseek_llm ${deepseek-llm-7b-base权重路径} 2 + bash run.sh pa_fp16 full_BoolQ 1 deepseek_llm ${deepseek-llm-67b-base权重路径} 8 + ``` + +## 性能测试 +- 参考[此 README 文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek_llm ${deepseek-llm-7b-base权重路径} 2 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek_llm ${deepseek-llm-67b-base权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此 README 文件](../../README.md) +- 对话测试实际执行的 Python 文件为`${llm_path}/examples/run_pa.py`;这个文件的参数说明见[此 README 文件](../../README.md) +- 运行时,需要通过指令`pip list | grep protobuf`确认`protobuf`版本,如果版本高于 3.20.x,请运行指令`pip install protobuf==3.20.0`进行更新 diff --git a/mindie/examples/models/deepseek/README_deepseek_moe.md b/mindie/examples/models/deepseek/README_deepseek_moe.md new file mode 100644 index 00000000..03aa3539 --- /dev/null +++ b/mindie/examples/models/deepseek/README_deepseek_moe.md @@ -0,0 +1,103 @@ +# README + +- [DeepSeekMoE 16B]是具有 16.4B 参数的混合专家(MoE)语言模型。模型主要涉及两个创新策略:专家细分和共享专家。此模型用[DeepSeek 7B]和[Llama2 7B]40%的计算量,就可以得到与其相当的精度结果。(当前脚本支持 16B-Base 和 16B-Chat) +- 此代码仓中实现了一套基于 NPU 硬件的 Deepseek-MoE 模型。配合加速库使用,旨在 NPU 上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各DeepSeek-MoE模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 |KV cache量化 | 稀疏量化(仅300I DUO支持) | MindIE | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|-----------|--------------|--------------------------|--------|-----|-----| +| DeepSeek-MoE-16B-Chat | 支持world size 4,8 | × | √ | × | √ | √ | × | × | × | × | × | √ | × | × | + +# 使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用 gitee 
下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Deepseek-MoE 的工作脚本所在路径为`${llm_path}/examples/models/deepseek` | +| weight_path | 模型权重路径 | + +## 权重 + +**权重下载** + +- [Deepseek-MoE-16B-Base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base) +- [Deepseek-MoE-16B-Chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat) + +**基础环境变量** + +- 参考[此 README 文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行 Paged Attention FP16** + +- 运行启动脚本(transformers 版本需求:4.36.2) + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa_deepseek_moe.sh ${weight_path} + ``` +- 启动脚本中可设置自定义问题,具体在 input_text 后面修改即可 (默认问题为"Who is the CEO of Google?") +- 启动脚本中可设置自定义输出长度,具体在 max_output_length 后面修改即可(默认长度为 10) +- 若当前所用权重版本为"chat"版本,请将"--is_chat_model"赋值给 extra_param;若当前所用权重版本为"base"版本,可以将空字符串赋值给 extra_param(默认为 chat_model) +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑 NPU 核心,多个核心间使用逗号相连 + - 核心 ID 查阅方式见[此 README 文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于 300I DUO 卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用 20030 端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=1 + export INT8_FORMAT_NZ_ENABLE=1 + export ATB_LLM_ENABLE_AUTO_TRANSPOSE=0 + ``` + +## 精度测试 + +- 参考[此 README 文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 deepseek ${deepseek-moe-16b-base权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 deepseek ${deepseek-moe-16b-chat权重路径} 8 + ``` + +## 性能测试 + +- 参考[此 README 文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek ${deepseek-moe-16b-base权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 deepseek ${deepseek-moe-16b-chat权重路径} 8 + ``` + +## FAQ + +- 更多环境变量见[此 README 文件](../../README.md) +- 对话测试实际执行的 Python 文件为`${llm_path}/examples/run_pa.py`;这个文件的参数说明见[此 README 文件](../../README.md) +- 运行时,需要通过指令 pip list | grep protobuf 确认 protobuf 版本,如果版本高于 3.20.x,请运行指令 pip install protobuf==3.20.0 进行更新 diff --git a/mindie/examples/models/deepseek/run_pa.sh b/mindie/examples/models/deepseek/run_pa.sh new file mode 100644 index 00000000..fee4b791 --- /dev/null +++ b/mindie/examples/models/deepseek/run_pa.sh @@ -0,0 +1,26 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
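+# 补充说明(启动示例中的路径仅为示意):在${llm_path}目录下执行 bash examples/models/deepseek/run_pa.sh ${weight_path}
+# 默认extra_param为--is_chat_model,对应chat版本权重;若使用base版本权重,可将extra_param置为空字符串
+# 仅当显式设置TP_WORLD_SIZE=1时以python单进程启动,否则按ASCEND_RT_VISIBLE_DEVICES中的核心个数推导world_size并使用torchrun拉起多进程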
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=1 +export INT8_FORMAT_NZ_ENABLE=1 + + +extra_param="--is_chat_model" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi diff --git a/mindie/examples/models/deepseek/run_pa_deepseek_moe.sh b/mindie/examples/models/deepseek/run_pa_deepseek_moe.sh new file mode 100644 index 00000000..1e57c034 --- /dev/null +++ b/mindie/examples/models/deepseek/run_pa_deepseek_moe.sh @@ -0,0 +1,26 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=1 +export INT8_FORMAT_NZ_ENABLE=1 +export ATB_LLM_ENABLE_AUTO_TRANSPOSE=0 + +extra_param="--is_chat_model" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi diff --git a/mindie/examples/models/gemma/README.md b/mindie/examples/models/gemma/README.md new file mode 100644 index 00000000..790e382d --- /dev/null +++ b/mindie/examples/models/gemma/README.md @@ -0,0 +1,146 @@ +# README + +- [Gemma](https://github.com/google/gemma_pytorch),是由 Google 推出的一系列轻量级最先进的开放模型,采用与Gemini模型相同的研究和技术构建。Gemma模型非常适合各种文本生成任务,包括问答、摘要和推理。 + +- 此代码仓中实现了一套基于NPU硬件的Gemma推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了Gemma模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| Gemma-2B | 支持world size 1,2,4,8 | 支持world size 2,4 | 是 | 是 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | +| Gemma-7B | 支持world size 1,2,4,8 | 支持world size 2,4 | 是 | 是 | 否 | 是 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +- 此模型仓已适配的模型版本 + - [Gemma系列](https://github.com/google/gemma_pytorch) + + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;LLaMa和LLaMa2的工作脚本所在路径为`${llm_path}/examples/models/llama` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- 
[Gemma-2B](https://huggingface.co/google/gemma-2b) +- [Gemma-7B](https://huggingface.co/google/gemma-7b) + + +**权重转换** +- 参考[此README文件](../../README.md) + +**量化权重生成** +- 基于原始的FP16的权重,生成量化权重 +- W8A8 Antioutlier量化权重请使用以下指令生成 + - 当前Gemma-2B/7B支持W8A8 Antioulier量化 + - 设置环境变量 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + ``` + 在\${llm_path}目录下执行以下指令 + - 执行量化脚本 (也可以指定自己的数据集) + ```shell + python examples/models/gemma/convert_w8a8_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --calib_file examples/models/gemma/boolq.jsonl + ``` + + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +**运行Paged Attention BF16** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行BF16时需修改${weight_path}/config.json中的`torch_dtype`字段,将此字段对应的值修改为`bfloat16` +- 300I DUO卡暂不支持BF16特性 + +**运行Paged Attention W8A8** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8` + - 若config.json中无此字段,则新增 + + + +## 精度测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 gemma ${gemma-2b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 gemma ${gemma-7b权重路径} 8 + ``` +- 运行量化权重和BF16时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) + +## 性能测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 gemma ${gemma-2b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 gemma ${gemma-7b权重路径} 8 + + ``` +- 运行量化权重和BF16时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) + + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 +- gemma-7b的开源权重有个bug,在运行时需要改一下源码:..../transformers/models/gemma/modeling_gemma.py 280行 +``` + #attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.reshape(bsz, q_len, 4096) +``` \ No newline at end of file diff --git 
a/mindie/examples/models/gemma/boolq.jsonl b/mindie/examples/models/gemma/boolq.jsonl new file mode 100644 index 00000000..df91dbed --- /dev/null +++ b/mindie/examples/models/gemma/boolq.jsonl @@ -0,0 +1,5 @@ +{"id": 0, "inputs_pretokenized": "Ghost in the Shell -- Animation studio Production I.G has produced several different anime adaptations of Ghost in the Shell, starting with the 1995 film of the same name, telling the story of Section 9's investigation of the Puppet Master. The television series Ghost in the Shell: Stand Alone Complex followed in 2002, telling an alternate story from the manga and first film, featuring Section 9's investigations of government corruption in the Laughing Man and Individual Eleven incidents. A sequel to the 1995 film, Ghost in the Shell 2: Innocence, was released in 2004. In 2006, the film Ghost in the Shell: Stand Alone Complex - Solid State Society retook the story of the television series. 2013 saw the start of the Ghost in the Shell: Arise original video animation (OVA) series, consisting of four parts through mid-2014. The series was recompiled in early 2015 as a television series titled Ghost in the Shell: Arise - Alternative Architecture, airing with an additional two episodes (one part). An animated feature film produced by most of the Arise staff, titled Ghost in the Shell: The New Movie, was released on June 20, 2015. A live-action American film of the same name was released on March 31, 2017.\nQuestion: is ghost in the shell based on the anime?\nAnswer:"} +{"id": 1, "inputs_pretokenized": "The Walking Dead (season 8) -- The eighth season of The Walking Dead, an American post-apocalyptic horror television series on AMC, premiered on October 22, 2017, and concluded on April 15, 2018, consisting of 16 episodes. Developed for television by Frank Darabont, the series is based on the eponymous series of comic books by Robert Kirkman, Tony Moore, and Charlie Adlard. The executive producers are Kirkman, David Alpert, Scott M. Gimple, Greg Nicotero, Tom Luse, and Gale Anne Hurd, with Gimple as showrunner for his fifth and final season. The eighth season received positive reviews from critics. It was nominated for multiple awards and won two, including Best Horror Television Series for the third consecutive year, at the 44th Saturn Awards.\nQuestion: is there gonna be a season 8 of the walking dead?\nAnswer:"} +{"id": 2, "inputs_pretokenized": "Onyx -- Brazilian green onyx was often used as plinths for art deco sculptures created in the 1920s and 1930s. The German sculptor Ferdinand Preiss used Brazilian green onyx for the base on the majority of his chryselephantine sculptures. Green onyx was also used for trays and pin dishes -- produced mainly in Austria -- often with small bronze animals or figures attached.\nQuestion: is there such a thing as green onyx?\nAnswer:"} +{"id": 3, "inputs_pretokenized": "Wachovia -- The acquisition of Wachovia by Wells Fargo was completed on December 31, 2008 after a government-forced sale to avoid Wachovia's failure. 
The Wachovia brand was absorbed into the Wells Fargo brand in a process that lasted three years: on October 15, 2011, the last Wachovia branches in North Carolina were converted to Wells Fargo.\nQuestion: is wells fargo and wachovia the same bank?\nAnswer:"} +{"id": 4, "inputs_pretokenized": "Friday Night Lights (film) -- Friday Night Lights is a 2004 American sports drama film, directed by Peter Berg, which 'dramatized' the coach and players of a high school football team in the Texas city of Odessa that supported and was obsessed with them. The book on which it was based, Friday Night Lights: A Town, a Team, and a Dream (1990) by H.G. Bissinger, followed the story of the 1988 Permian High School Panthers football team as they made a run towards the state championship. A television series of the same name premiered on October 3, 2006 on NBC. The film won the Best Sports Movie ESPY Award and was ranked number 37 on Entertainment Weekly's list of the Best High School Movies.\nQuestion: is friday night lights movie based on a true story?\nAnswer:"} diff --git a/mindie/examples/models/gemma/convert_w8a8_quant_weights.py b/mindie/examples/models/gemma/convert_w8a8_quant_weights.py new file mode 100644 index 00000000..6b3733f2 --- /dev/null +++ b/mindie/examples/models/gemma/convert_w8a8_quant_weights.py @@ -0,0 +1,100 @@ +# Copyright 2023 Huawei Technologies Co.; Ltd +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
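+# 脚本流程补充说明(启动示例中的路径仅为示意):
+# 1. 以float32在CPU上加载浮点权重与tokenizer;
+# 2. 从--calib_file指定的jsonl文件读取校准数据(建议取若干条BoolQ样本);
+# 3. 将各层mlp.down_proj设为回退层,按W8A8配置执行Calibrator量化;
+# 4. 保存量化权重,在输出目录的config.json中写入"quantize": "w8a8",并拷贝tokenizer相关文件。
+# 启动示例:
+#   python examples/models/gemma/convert_w8a8_quant_weights.py \
+#       --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} \
+#       --calib_file examples/models/gemma/boolq.jsonl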
+import os +import json +import shutil +import torch +import torch.utils.data +from atb_llm.utils.log import logger +from atb_llm.utils.file_utils import safe_open +from atb_llm.models.gemma.modeling_gemma import GemmaConfig +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig # 导入量化配置接口 +from examples.convert.model_slim.quantifier import parse_arguments +from examples.convert.model_slim.get_calibration_dataset import load_jsonl + +ANTI_METHOD = "m1" # anti-outlier算法配置 + + +#获取校准数据函数定义 +def get_calib_dataset(_tokenizer, _calib_list): + calib_dataset = [] + for calib_data in _calib_list: + inputs = _tokenizer([calib_data], return_tensors='pt').to('cpu') + calib_dataset.append([inputs.data['input_ids'], None, inputs.data['attention_mask']]) + return calib_dataset + + +def main(): + in_model_path = './gemma' # 浮点权重输入路径 + out_model_path = './gemma-7b-quant' # 量化权重生成路径 + num_layers = 28 # 模型层数 + + if args.model_path: + in_model_path = args.model_path + if args.save_directory: + out_model_path = args.save_directory + + logger.info("--------model is loading--------") + # 加载模型 + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=in_model_path) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=in_model_path, + torch_dtype=torch.float32, use_safetensors=True).cpu() + + # 准备校准数据,建议随机选择50条BoolQ数据作为校准数据 + calib_list = [] + if args.calib_file: + calib_list = load_jsonl(args.calib_file) + + #校准数据获取 + dataset_calib = get_calib_dataset(tokenizer, calib_list) + logger.info("--------get_calib_dataset--------") + # 量化配置 + # anti-outlier配置 无需使用 + + # 配置回退层数 + config = GemmaConfig.from_pretrained(args.model_path) + num_layers = config.num_hidden_layers + disabled_names = [] + disabled_layers = [i for i in range(0, num_layers)] + for i in disabled_layers: + disabled_names.append(f"model.layers.{i}.mlp.down_proj") + # 配置量化参数,并返回量化配置实例 + quant_config = QuantConfig(a_bit=8, w_bit=8, disable_names=disabled_names, dev_type='cpu', + act_method=3, mm_tensor=False) + # 输入加载的原模型、量化配置和校准数据,定义校准 + calibrator = Calibrator(model, quant_config, calib_data=dataset_calib, disable_level='L0') + logger.info("--------calibrator start--------") + #执行量化 + calibrator.run() + + # 保存生成好的量化权重 + calibrator.save(out_model_path, save_type=["numpy", "safe_tensor"]) + # 拷贝浮点模型配置文件 + with safe_open(os.path.join(in_model_path, "config.json"), 'r') as f: + config = json.load(f) + config['quantize'] = 'w8a8' + logger.info("--------calibrator end--------") + + with safe_open(os.path.join(out_model_path, "config.json"), "w") as f: + json.dump(config, f) + shutil.copyfile(os.path.join(in_model_path, "tokenizer_config.json"), + os.path.join(out_model_path, "tokenizer_config.json")) + shutil.copyfile(os.path.join(in_model_path, "tokenizer.json"), os.path.join(out_model_path, "tokenizer.json")) + shutil.copyfile(os.path.join(in_model_path, "tokenizer.model"), os.path.join(out_model_path, "tokenizer.model")) + +if __name__ == "__main__": + args = parse_arguments() + + main() \ No newline at end of file diff --git a/mindie/examples/models/gemma/run_pa.sh b/mindie/examples/models/gemma/run_pa.sh new file mode 100644 index 00000000..2e782e98 --- /dev/null +++ b/mindie/examples/models/gemma/run_pa.sh @@ -0,0 +1,19 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
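+# 补充说明(路径仅为示意):在${llm_path}目录下执行 bash examples/models/gemma/run_pa.sh ${weight_path}
+# 本脚本以--block_size 64启动examples.run_pa;默认仅将0号NPU设为可见,多卡运行时请修改ASCEND_RT_VISIBLE_DEVICES
+# 仅当显式设置TP_WORLD_SIZE=1时以python单进程启动,否则按可见核心个数推导world_size并使用torchrun拉起多进程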
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export LCCL_ENABLE_FALLBACK=1 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param --block_size 64 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param --block_size 64 +fi \ No newline at end of file diff --git a/mindie/examples/models/gpt_neox/README.md b/mindie/examples/models/gpt_neox/README.md new file mode 100644 index 00000000..37cce226 --- /dev/null +++ b/mindie/examples/models/gpt_neox/README.md @@ -0,0 +1,328 @@ +# README + +- GPT-NeoX-20B 是一个 200 亿参数的自回归语言模型,使用 GPT-NeoX 库在 Pile 上训练。它的架构有意类似于 GPT-3,并且与 GPT-J-6B 的架构几乎相同。其训练数据集包含大量英语文本,反映了该模型的通用性质。 +- 此代码仓中实现了一套基于NPU硬件的GPT-NEOX-20B推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 +# 特性矩阵 + +- 此矩阵罗列了GTP-NEOX-20B模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +| ---------------------- |----------------------|---------------------------| ---- |-----| --------------- | --------------- | -------- | --------- | --------- | ------------ | -------------------------- | ---- | ------ | ---- |-----| +| GTP-NEOX-20B | 支持world size 2,4,8 | 支持world size 2, 4 | √ | × | √ | √ | × | × | × | × | × | × | × | × | × | + +# Paged Attention 推理使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +|-------------|------------------------------------------------------------------------------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/ModelLink/`;若使用gitee下载的代码,则路径为`${working_dir}/ModelLink/mindie_ref/mindie_llm/atb_models` | +| script_path | 脚本所在路径。GTP-NEOX系列模型的工作脚本所在路径为${llm_path}/examples/models/gpt_neox | +| weight_path | 模型权重路径 | + +# 权重下载 + +- [GPTNeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b/tree/main) +## 权重转换 + +- 通常情况 Paged Attention 场景下需要.safetensors 格式的权重,如果没有,参考[此README文件](../../README.md)转换 +- 但这里我们的GPTNeoX本身就是safetensors 格式权重,PA场景下无需转换。 +- 注: huggingface上有safetensors类型权重可直接下载 + +## 量化权重生成 +- 暂不支持 +## 操作说明 + +### 推理 + +##### 对话测试 +- 运行Paged Attention FP16 +- 在`${llm_path}`目录下执行以下脚本 + +```shell +bash examples/models/gpt_neox/run_pa.sh ${weight_path} +``` + +根据硬件设备不同请参考下表修改run_pa.sh再运行 + +### run_pa.sh 参数说明 + +| 参数名称 | 含义 | 800I A2推荐值 | 300I DUO推荐值 | +|---------------------------|-------------------------------------|------------|-------------| +| BIND_CPU | 绑定CPU核心开关,默认进行绑核 | 1 | 1 | +| IS_QUANT | 是否启动量化 | 0 | 0 | +| ASCEND_RT_VISIBLE_DEVICES | 使用的硬件卡号,多个卡间使用逗号相连 | 根据实际情况设置 | 根据实际情况设置 | +| MAX_MEMORY_GB | 每张卡上的预计使用的最大显存,若出现显存不足导致的异常,请将该参数改小 | 30 | 40 | +| MASTER_PORT | 卡间通信端口,通常不用修改,有冲突时再改 | | | + +### 运行Paged Attention BF16 +- 暂不支持 +### 运行Paged Attention W8A16 +- 暂不支持 +### 运行Paged Attention W8A8 +- 暂不支持 +### 运行Paged Attention BF16 +- 暂不支持 +### 运行KV cache量化 +- 暂不支持 +### 运行稀疏量化 +- 暂不支持 +### 运行MOE量化 +- 暂不支持 +### 精度测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) +- 示例 +- 310服务器上用2卡4芯跑,910服务器上用8卡 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 full_BoolQ 1 gptneox True ${gptneox-20b权重路径} 8 + ``` 
+注:GPTNeoX为纯英文模型,所以一般只测试BoolQ英文测试集,对于Ceval中文集我们不做测试 + +## 性能测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) +- 示例 +- 310服务器上用2卡4芯跑,910服务器上用8卡 +- export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 gptneox True ${gptneox-20b权重路径} 8 + +# Flash Attention推理使用说明 + +#### 张量并行模型切分(仅在模型需要多卡并行时使用) + +```shell +cp ${script_path}/modeling_gpt_neox_cut.py ${model_path} +cp ${script_path}/modeling_gpt_neox_ascend.py ${model_path} +``` + +修改 ${model_path}里的config.json中的kv对,改成`"AutoModelForCausalLM": "modeling_gpt_neox_cut.GPTNeoXForCausalLM"` + +```text +修改`${script_path}/cut_model_and_run_gpt_neox.sh` +将 `input_dir` 修改为模型所在路径 `${model_path}` +将 `output_dir` 修改为原目录下子目录 `${model_path/part_model}`。模型切分成功后,会自动生成新目录part_model(用户无需新建该文件夹) +将 `world_size_` 修改为期望切分的份数。world_size_=2表示模型切分为2份。 + +``` + +目录结构示例建议 + +``` +--model_path + *.py(模型源文件) + *.json(模型源文件) + *.tiktoken(模型源文件) + *.bin(模型源文件,软链接,部分模型权重为其它格式,如*.safetensors等) + modeling_gpt_neox_ascend.py(加速库modeling) + modeling_gpt_neox_cut.py(权重切分脚本) + configuration_gpt_neox.py(模型配置文件) + --part_model(以双卡为例,权重切分成功后文件夹) + --0 + --1 + ......(其他) +--script_path + cut_model_and_run_gpt_neox.sh + cut_model_util.py + main.py + config.ini + ......(其他) +``` + +执行 + +```shell +cd ${script_path} +bash cut_model_and_run.sh +``` + +切分所需时间较长,切分完成后,将会打印 'Tensor parallelism weights have been successfully saved.'。 + +修改 +${model_path}/part_model/{rank_id}里的config.json中的kv对,改成 + +``` +AutoModelForCausalLM": "modeling_gpt_neox_ascend.GPTNeoXForCausalLM +``` + +# CPU高性能模式 + +可开启CPU Performance模式以提高模型推理性能。 + +``` + +cpupower frequency-set -g performance + +``` + +### 执行推理 + +#### 修改 ${script_path}/config.ini + +[config文件配置参考](../../atb_speed_sdk/README.md) +提示:多卡并行推理时,config.ini中model_path路径为part_model父文件夹。例如: + +``` +# 正确示例: + +model_path=../model + +# 错误示例: + +model_path=../model/part_model +``` + +#### main.py + +提供了demo推理,precision测试,性能测试三种下游任务。 +task_name可选inference、precision、performance。 + +- 单卡 + 修改 ${model_path}里的config.json中的kv对,改成`"AutoModelForCausalLM": "modeling_gpt_neox_ascend.GPTNeoXForCausalLM"` + +```shell +python main.py --task ${task_name} +``` + +注意,由于本模型体量较大,受硬件限制,单卡很可能无法跑起。 + +- 多卡 + +```shell +bash cut_model_and_run.sh ${task_name} +``` + +**注意** +1.docker环境与conda环境有所不同,docker环境中启动模型时需要修改环境变量"ATB_OPERATION_EXECUTE_ASYNC=0"、"TASK_QUEUE_ENABLE=0",否则可能出现算子下发同步失败。 +2.300l DUO暂时不支持lccl,因此在300l DUO上启动模型时需删去环境变量"BACKEND='lccl'" + +**可以使用 MAX_SEQ_LEN 环境变量来设置model支持的最大长度以优化显存占用, 默认使用config里面的max_model_length** +如 + +```shell +MAX_SEQ_LEN=2048 python main.py --task ${task_name} +``` + +或 +修改cut_model_and_run.sh 中的 max_seq_length + +```shell +MAX_SEQ_LEN=2048 bash cut_model_and_run.sh ${task_name} +``` + +如果遇到 + +```text +Traceback (most recent call last): + File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/__init__.py", line 31, in + import torch_npu.npu + File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/__init__.py", line 46, in + from .utils import (is_initialized, _lazy_call, _lazy_init, init, set_dump, + File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/utils.py", line 27, in + import torch_npu._C +ImportError: /root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1: cannot allocate memory in static TLS block +Segmentation fault (core dumped) +``` + +则在命令行前加上`LD_PRELOAD=上面的error路径`。如 + +```shell +LD_PRELOAD=/root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1 MAX_SEQ_LEN=2048 python main.py --task 
${task_name} --is_quant ${is_quant} +``` + +# 附录: + +# 精度测试指南 + +由于gpt-neox是英文模型,选用mmlu数据集进行精度测试 + +## 配置说明 + +参考 [SDK精度测试指南章节](../../atb_speed_sdk/README.md) + +## 运行脚本 + +- 单芯 + +```shell +cd ${script_path} +python main.py --task precision +``` + +- 多芯 + + +```shell +cd ${script_path} +bash cut_model_and_run.sh precision +``` + +结束后在${mmlu_work_dir}/test_result目录下查看测试结果。[双芯结果每个两份,只需看其中一份即可]。 + +| 文件 | 用途 | +|---------------------------|----------------------| +| device0.log | 运行过程日志 | +| cache0.csv | 结果详情,C列为预期答案,D列为测试答案 | +| result_0_classes_acc.json | 测试数据下按不同维度统计准确率 | +| result_0_subject_acc.json | 测试数据下按不同学科统计准确率 | + +**注意:后续重新运行, 需要删除当前目录下生成的test_result文件夹,否则只会读取当前的目录下的测试结果** + +# 性能测试 + +在功能运行正常的基础下,执行以下步骤进行性能测试 + +## 按照推理指导,下载模型及配置路径,并安装atb_speed_sdk + +## 1. 准备 + +参考 [SDK性能测试指南精确打点法章节](../../atb_speed_sdk/README.md) 进行准备 + +## 2. 修改配置文件 + +- 配置config.ini中[performance]属性, 如下: + ``` + model_name=gpt_neox_20b + perf_mode=detail + ``` + +## 3. 执行测试脚本 + +- 单芯 + +```shell +cd ${script_path} +TIMEIT=1 python main.py --task performance +``` + +- 多芯 + 多卡推理,芯片类型区分为300l DUO、800l A2系列。当在800l A2芯片进行多卡推理时,"cut_model_and_run.sh"脚本需修改环境变量"ATB_USE_TILING_COPY_STREAM=0"。 +该环境变量功能是为了解决300l DUO上asynccopy性能慢的问题,与800l A2无关。 + +```shell +cd ${script_path} +TIMEIT=1 bash cut_model_and_run.sh performance +``` + +为了不影响正常使用,将`TIMEIT`设置成1来返回具体的性能测试的值,默认是0 + +### 性能测试结果 + +得到性能测试结果csv `performance_test_npu_${model_name}_xxx.csv` + +### 结果分析 + +| 列名 | 含义 | +|-------------------------------|------------| +| batch_size | batch大小 | +| input_seq_len(Encoding) | 输入长度 | +| output_seq_len(Decoding) | 输出长度 | +| ResponseTime(s) | 总响应时间 | +| forward_first_token_time(ms) | 首token推理时长 | +| forward_next_token_time(ms) | 增量推理时长 | +| pre_next_token_time(ms) | 前处理时长 | +| post_next_token_time_post(ms) | 后处理时长 | + diff --git a/mindie/examples/models/gpt_neox/config.ini b/mindie/examples/models/gpt_neox/config.ini new file mode 100644 index 00000000..63a70395 --- /dev/null +++ b/mindie/examples/models/gpt_neox/config.ini @@ -0,0 +1,33 @@ +[model] +;模型路径 +model_path=/home/lfy/LM_trans/gptneox20b/model/ +;使用的设备号,多卡用逗号分隔,设置多卡,将默认使用并行模式 +device_ids=7 +;并行通信类型,默认是hccl +;parallel_backend=hccl +;日志保存路径,默认是执行脚本所在路径 +log_dir=./log_total + +[ceval] +;精度测试工作路径 +work_dir=/home/lfy/LM_trans/mmlu_test +;测试batch +batch=1 +;测试shot数量 +shot=5 +mode=mmlu + +[performance] +;性能测试模型名称,用于结果文件的命名 +model_name=gpt-neox-20b +;测试的batch size +batch_size=1 +;测试的输入的最大2的幂 +max_len_exp=10 +;测试的输入的最小2的幂 +min_len_exp=5 +;特定用例测试,格式为[[seq_in,seq_out]],注意当设置这个参数时,max_len_exp min_len_exp不生效 +case_pair=[[256,256],[512,512],[1024,1024],[2048,2048]] +;生成的结果文件名称,默认会自动生成,一般不设置 +;save_file_name= +perf_mode=detail diff --git a/mindie/examples/models/gpt_neox/configuration_gpt_neox.py b/mindie/examples/models/gpt_neox/configuration_gpt_neox.py new file mode 100644 index 00000000..4db4e04f --- /dev/null +++ b/mindie/examples/models/gpt_neox/configuration_gpt_neox.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" GPTNeoX model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json", + # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox +} + + +class GPTNeoXConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an + GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the GPTNeoX + [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50432): + Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPTNeoXModel`]. + hidden_size (`int`, *optional*, defaults to 6144): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 44): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 24576): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + rotary_pct (`float`, *optional*, defaults to 0.25): + percentage of hidden dimensions to allocate to rotary embeddings + rotary_emb_base (`int`, *optional*, defaults to 10000) + base for computing rotary embeddings frequency + classifier_dropout (`float`, *optional*, defaults to 0.1): + Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`]. + + The dropout ratio for the hidden layer. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 1e-5): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + use_parallel_residual (`bool`, *optional*, defaults to `True`): + Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training + speedup at large scales (e.g. 20B). 
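+        bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the *beginning-of-sequence* token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the *end-of-sequence* token in the vocabulary.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the input word embeddings with the output (`embed_out`) weights.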
+ Example: + + ```python + >>> from transformers import GPTNeoXConfig, GPTNeoXModel + + >>> # Initializing a GPTNeoX gpt-neox-20b style configuration + >>> configuration = GPTNeoXConfig() + + >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration + >>> model = GPTNeoXModel(configuration) # doctest: +SKIP + + >>> # Accessing the model configuration + >>> configuration = model.config # doctest: +SKIP + ```""" + model_type = "gpt_neox" + + def __init__( + self, + vocab_size=50432, + hidden_size=6144, + num_hidden_layers=44, + num_attention_heads=64, + intermediate_size=24576, + hidden_act="gelu", + rotary_pct=0.25, + rotary_emb_base=10000, + classifier_dropout=0.1, + max_position_embeddings=2048, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + bos_token_id=0, + eos_token_id=2, + tie_word_embeddings=False, + use_parallel_residual=True, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.rotary_pct = rotary_pct + self.rotary_emb_base = rotary_emb_base + self.classifier_dropout = classifier_dropout + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.tie_word_embeddings = tie_word_embeddings + self.use_parallel_residual = use_parallel_residual diff --git a/mindie/examples/models/gpt_neox/cut_model_and_run.sh b/mindie/examples/models/gpt_neox/cut_model_and_run.sh new file mode 100644 index 00000000..5a056ecc --- /dev/null +++ b/mindie/examples/models/gpt_neox/cut_model_and_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +input_dir="/home/lfy/LM_trans/gptneox20b/model" +output_dir="/home/lfy/LM_trans/gptneox20b/model/part_model" +world_size_=2 +task_name=${1-inference} +max_seq_length=2048 + +atb_options="ATB_LAUNCH_KERNEL_WITH_TILING=1 ATB_LAYER_INTERNAL_TENSOR_REUSE=1 PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' HCCL_OP_BASE_FFTS_MODE_ENABLE=1 HCCL_BUFFSIZE=110" +atb_async_options="ATB_OPERATION_EXECUTE_ASYNC=1 TASK_QUEUE_ENABLE=1" +start_cmd="MAX_SEQ_LEN=$max_seq_length torchrun --nproc_per_node $world_size_ --master_port 20001 main.py --task $task_name" +run_cmd="${atb_options} ${atb_async_options} ${start_cmd}" + + +if [[ -d "${output_dir}" ]];then + echo "**********************The gpt-neox-20b part model exists, Now begin to run ...**********************" + eval "${run_cmd}" + +else + echo "The gpt-neox-20b part model is not exists, Now begin to cut ..." + python ./cut_model_util.py --input_path $input_dir --output_path $output_dir --world_size $world_size_ +fi \ No newline at end of file diff --git a/mindie/examples/models/gpt_neox/cut_model_util.py b/mindie/examples/models/gpt_neox/cut_model_util.py new file mode 100644 index 00000000..0b6637a6 --- /dev/null +++ b/mindie/examples/models/gpt_neox/cut_model_util.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
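+# 补充说明:本脚本通常由同目录的cut_model_and_run.sh在part_model目录不存在时自动调用,也可单独执行(路径仅为示意):
+#   python cut_model_util.py --input_path ${model_path} --output_path ${model_path}/part_model --world_size 2
+# 切分逻辑:query_key_value、dense_h_to_4h、embed_out按第0维切分;dense、dense_4h_to_h的weight按第1维切分,bias除以world_size;
+# 其余权重整份复制。切分结果连同配置与tokenizer相关文件一并保存到 ${output_path}/{rank_id} 目录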
+""" +cut model +@create: 2024/1/26 14:53 +@since: 2024/1/26 14:53 +""" + +import os +import argparse +import shutil + +import torch + +from transformers import AutoModelForCausalLM +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +def cut_weights(model, world_size, cut_row_keys, cut_col_keys): + tensor_dict_list = [{} for i in range(world_size)] + for key, tensor in model.state_dict().items(): + key_short = key.split('.')[-2] + cut_tensor_list_t = [] + if key_short in cut_row_keys: + cut_tensor_list = torch.chunk(tensor, world_size, dim=0) + elif key_short in cut_col_keys: + if key.split('.')[-1] == "weight": + cut_tensor_list = torch.chunk(tensor, world_size, dim=1) + else: + cut_tensor_list = [tensor.div(world_size)] * world_size + + else: + cut_tensor_list = [tensor] * world_size + for cut_tensor in cut_tensor_list: + cut_tensor_list_t.append(cut_tensor.clone()) + for j in range(world_size): + tensor_dict_list[j][key] = cut_tensor_list_t[j] + return tensor_dict_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Cut Model weights.") + parser.add_argument( + "--input_path", + default="/home/lfy/LM_trans/gptneox20b/model", + help="Location of Model weights, which contains model folders", + ) + parser.add_argument( + "--output_path", + default='/home/lfy/LM_trans/gptneox20b/model/part_model', + help="Location to write the part weights", + ) + parser.add_argument( + "--world_size", + default=2, + help="world_size", + ) + parser.add_argument( + "--cut_row_keys", + default=['query_key_value', 'dense_h_to_4h', 'embed_out'], + help="cut_row_keys", + ) + parser.add_argument( + "--cut_col_keys", + default=['dense', 'dense_4h_to_h'], + help="cut_col_keys", + ) + + args = parser.parse_args() + + args.world_size = int(args.world_size) + + model_path = args.input_path + origin_model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half() # 都加载模型和权重 + state_dict_list = cut_weights(origin_model, args.world_size, args.cut_row_keys, args.cut_col_keys) + model_config = origin_model.config + model_config.world_size = args.world_size + + creat_model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True) + for i in range(args.world_size): + + creat_model.load_state_dict(state_dict_list[i]) + + target_dir = os.path.join(args.output_path, str(i)) + os.makedirs(target_dir, exist_ok=True) + creat_model.save_pretrained(target_dir) + creat_model.config.auto_map["AutoModelForCausalLM"] = "modeling_gpt_neox_ascend.GPTNeoXForCausalLM" + creat_model.config.save_pretrained(target_dir) + for source_file in ["configuration_gpt_neox.py", "merges.txt", "special_tokens_map.json", "tokenizer.json", + "tokenizer_config.json", + "vocab.json", + "modeling_gpt_neox_ascend.py"]: + shutil.copy(os.path.join(model_path, source_file), target_dir) + + logger.info('Tensor parallelism weights have been successfully saved.') + diff --git a/mindie/examples/models/gpt_neox/main.py b/mindie/examples/models/gpt_neox/main.py new file mode 100644 index 00000000..25631008 --- /dev/null +++ b/mindie/examples/models/gpt_neox/main.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
+""" +gpt-neox-20b +@create: 2024/1/24 19:32 +@since: 2024/1/24 19:32 +""" +import argparse +import os + +from atb_speed.common.config import atb_speed_config +from atb_speed.common.launcher import ParallelLauncher, Launcher +from atb_speed.common.performance.base import PerformanceTest +from atb_speed.common.precision import get_precision_test_cls +from transformers import AutoTokenizer, AutoModelForCausalLM + + +def parse_args(): + parser = argparse.ArgumentParser(description="Adapting LLM on Ascend") + parser.add_argument( + "--task", + type=str, + default='inference', + choices=['inference', 'precision', 'performance'], + help="Specify the task in which to run the script" + ) + args = parser.parse_args() + return args + + +class LMParallel(ParallelLauncher): + """ + 多卡推理launcher + """ + + def init_model(self): + + tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side='left') + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + part_model_path = os.path.join(self.model_path, 'part_model', str(self.local_rank)) + model = AutoModelForCausalLM.from_pretrained(part_model_path, trust_remote_code=True) + model = model.half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +class LM(Launcher): + """ + 单卡推理launcher + """ + + def init_model(self): + """ + 模型初始化 + :return: + """ + + tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side='left') + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + model = AutoModelForCausalLM.from_pretrained(self.model_path, trust_remote_code=True).half().to(self._device) + model.eval() + model.generation_config = self.remove_part_of_generation_config(model.generation_config) + return model, tokenizer + + +def demo_ceval(launcher: Launcher): + """ + :param launcher: + :return: + """ + c_t = get_precision_test_cls()(launcher) + c_t.run() + + +def demo_perf(launcher: Launcher): + """ + :param launcher: + :return: + """ + performance_test = PerformanceTest(launcher) + performance_test.warm_up() + performance_test.run_test() + + +def demo_inference(launcher: Launcher): + """ + :param launcher: + :return: + """ + param_dict = {"max_new_tokens": 64, "do_sample": False, "repetition_penalty": 1.1} + launcher.logger.info("---------------warm-up---------------") + launcher.infer('Hamlet->Shakespeare\nOne Hundred Years of Solitude->', param_dict) + + launcher.logger.info("---------------inference---------------") + launcher.infer('How to learn a new language?', param_dict) + + launcher.logger.info("---------------batch---------------") + query_list = [ + "How to learn a new language?", + 'The CEO of Google is', + ] + launcher.infer_batch(query_list, param_dict) + + +TASK_MAP = { + "inference": demo_inference, + "precision": demo_ceval, + "performance": demo_perf +} + + +def main(): + args = parse_args() + atb_speed_config.init_config("config.ini") + if atb_speed_config.model.device_num > 1: + launcher = LMParallel() + else: + launcher = LM() + TASK_MAP.get(args.task)(launcher) + + +if __name__ == "__main__": + main() diff --git a/mindie/examples/models/gpt_neox/modeling_gpt_neox_ascend.py b/mindie/examples/models/gpt_neox/modeling_gpt_neox_ascend.py new file mode 100644 index 00000000..a1c81e73 --- /dev/null +++ b/mindie/examples/models/gpt_neox/modeling_gpt_neox_ascend.py @@ -0,0 +1,1194 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch GPTNeoX model.""" + +import json +import math +import os +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from configuration_gpt_neox import GPTNeoXConfig +from torch import nn +from torch.nn import CrossEntropyLoss +import torch_npu +from atb_speed.common.timer import Timer +from atb_speed.common.utils import load_atb_speed +from transformers.activations import ACT2FN +from transformers.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "trl-internal-testing/tiny-random-GPTNeoXForCausalLM" +_REAL_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neox-20b" +_CONFIG_FOR_DOC = "GPTNeoXConfig" + +GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "EleutherAI/gpt-neox-20b", + # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox +] + + +def is_nd(): + soc_version = torch_npu._C._npu_get_soc_version() + return soc_version in [104, 220, 221, 222, 223, 224] + + +IS_ND = is_nd() +logger.info(f"IS_ND = {IS_ND}") + + +def get_rank_and_world_size(): + try: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + except RuntimeError: + rank = 0 + world_size = 1 + return rank, world_size + + +RANK, WORLD_SIZE = get_rank_and_world_size() +logger.info(f"RANK = {RANK} | WORLD_SIZE = {WORLD_SIZE}") + + +def load_ascend_transformer(): + atb_speed_home_path = os.environ.get("ATB_SPEED_HOME_PATH") + if atb_speed_home_path is None: + raise RuntimeError( + "env ATB_SPEED_HOME_PATH not exist, source set_env.sh") + lib_path = os.path.join(atb_speed_home_path, "lib/libatb_speed_torch.so") + logger.info(f"load {lib_path}") + torch.classes.load_library(lib_path) + + +load_ascend_transformer() +load_atb_speed() + + +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. 
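+    For example, with `tgt_len=3` and `past_key_values_length=0` the 2-D core of the returned mask is
+
+        [[0, 1, 1],
+         [0, 0, 1],
+         [0, 0, 0]]
+
+    (expanded to `[bsz, 1, tgt_len, tgt_len]`); the 1-entries mark future positions and are later filled with a
+    large negative value in `_prepare_input_attention_mask`.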
+ """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), 1, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + + return mask[None, None, :, :].expand(bsz, 1, tgt_len, past_key_values_length + tgt_len) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + inverted_mask = 1.0 - expanded_mask + + return inverted_mask + + +def _prepare_input_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + combined_attention_mask = None + if input_shape[-1] > 1: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device) + if combined_attention_mask is None: + combined_attention_mask = expanded_attn_mask + else: + combined_attention_mask = combined_attention_mask + expanded_attn_mask + + return combined_attention_mask.masked_fill(combined_attention_mask.to(torch.bool), -20000) + + +class GPTNeoXPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = GPTNeoXConfig + base_model_prefix = "gpt_neox" + supports_gradient_checkpointing = True + _no_split_modules = ["GPTNeoXLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class GPTNeoXAttention(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.world_size = WORLD_SIZE + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_attention_heads + self.num_attention_heads = self.num_attention_heads // self.world_size + + self.rotary_ndims = int(self.head_size * config.rotary_pct) + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + ) + self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.rotary_emb = RotaryEmbedding( + self.rotary_ndims, int(os.getenv("MAX_SEQ_LEN", config.max_position_embeddings)), + base=config.rotary_emb_base + ) + self.register_buffer( + "norm_factor", + torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()), + persistent=False, + ) + self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size // self.world_size) + self.dense = nn.Linear(config.hidden_size // self.world_size, config.hidden_size) + + self.layer_id = layer_id + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + has_layer_past = layer_past is not None + + # Compute QKV + qkv = self.query_key_value(hidden_states) + + # Covert QKV to multiHead shape + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape) + + query = qkv[..., : self.head_size].permute(0, 2, 1, 3) + key = qkv[..., self.head_size: 2 * self.head_size].permute(0, 2, 1, 3) + value = qkv[..., 2 * self.head_size:].permute(0, 2, 1, 3) + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + query_pass = query[..., self.rotary_ndims:] + key_rot = key[..., : self.rotary_ndims] + key_pass = key[..., self.rotary_ndims:] + + # Compute token offset for rotary embeddings (when decoding) + seq_len = key.shape[-2] + if has_layer_past: + seq_len += layer_past[0].shape[-2] + cos, sin = self.rotary_emb(value, seq_len=seq_len) + query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + query = torch.cat((query, query_pass), dim=-1) + key = torch.cat((key, key_pass), dim=-1) + + # Cache QKV values + if has_layer_past: + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + present = (key, value) if use_cache else None + + # Compute attention + 
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Reshape outputs + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) + attn_output = self.dense(attn_output) + + # all reduce + if self.world_size >= 2: + torch.distributed.all_reduce( + attn_output, op=torch.distributed.ReduceOp.SUM) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + @classmethod + def _split_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Splits hidden dim into attn_head_size and num_attention_heads + """ + new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) + tensor = tensor.view(new_shape) + tensor = tensor.permute(0, 2, 1, 3) + return tensor + + @classmethod + def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden dim + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size) + return tensor + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + # compute causal mask from causal mask buffer + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + causal_mask = self.bias[:, :, key_length - query_length: key_length, :key_length] + + query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) + key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) + attn_scores = torch.zeros( + batch_size * num_attention_heads, + query_length, + key_length, + dtype=query.dtype, + device=key.device, + ) + attn_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor), + ) + attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) + + mask_value = torch.finfo(attn_scores.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device) + attn_scores = torch.where(causal_mask, attn_scores, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_scores = attn_scores + attention_mask + + attn_weights = nn.functional.softmax(attn_scores, dim=-1) + attn_weights = attn_weights.to(value.dtype) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + return attn_output, attn_weights + + +def attention_mask_func(attention_scores, ltor_mask): + attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min) + return attention_scores + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings, base=10000, device=None): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. 
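+        # cos/sin tables of shape [1, 1, max_position_embeddings, dim] are precomputed once here;
+        # forward() only rebuilds them when a sequence longer than max_seq_len_cached is requested.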
+ self.max_seq_len_cached = max_position_embeddings + positions = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + # out product of positions and inv_freq: [max_seq_len_cached, dim//2] + freqs = torch.einsum("i,j->ij", positions, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos()[None, None, :, :] + self.sin_cached = emb.sin()[None, None, :, :] + + def forward(self, x, seq_len=None): + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + positions = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) + # out product of positions and inv_freq: [seq_len, dim//2] + freqs = torch.einsum("i,j->ij", positions, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + self.cos_cached = emb.cos()[None, None, :, :] + self.sin_cached = emb.sin()[None, None, :, :] + return self.cos_cached[:, :, :seq_len, ...].to(x.device), self.sin_cached[:, :, :seq_len, ...].to(x.device) + + +class AscendRotaryEmbedding(RotaryEmbedding): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__(dim, max_position_embeddings, base, device) + self.cos_cached = self.cos_cached.squeeze(1).squeeze(0).half() + self.sin_cached = self.sin_cached.squeeze(1).squeeze(0).half() + + def forward(self, x, seq_len=None): + if seq_len > self.max_seq_len_cached: + super().forward(x, seq_len) + self.cos_cached = self.cos_cached.squeeze(1).squeeze(0).half() + self.sin_cached = self.sin_cached.squeeze(1).squeeze(0).half() + if x.device != self.cos_cached.device: + self.cos_cached = self.cos_cached.to(x.device).half() + self.sin_cached = self.sin_cached.to(x.device).half() + return self.cos_cached, self.sin_cached + + +class AttentionMask(nn.Module): + def __init__(self, max_seq_length): + super().__init__() + self.mask_min = -20000 + self._seq_len_cached = max_seq_length + self.attn_mask_inc_cache = torch.full((max_seq_length, max_seq_length), self.mask_min, dtype=torch.half).npu() + self.attn_mask_inc_zeros = torch.full((max_seq_length, max_seq_length), 0, dtype=torch.half).npu() + self.attn_mask_full = None # encoder_mask + self.attn_mask_inc = None # decoder_mask + + def get_attn_mask(self, attention_mask, origin_inputs_count, seq_len: int, batch_size, + dtype: torch.dtype, device: torch.device): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + + if seq_len > self._seq_len_cached: + self._seq_len_cached = seq_len + self.attn_mask_inc_cache = torch.full( + (seq_len, seq_len), self.mask_min).to(dtype).to(device) + self.attn_mask_inc_zeros = torch.full((seq_len, seq_len), 0).to(dtype).to(device) + if self.attn_mask_inc_cache.device != device or self.attn_mask_inc_cache.dtype != dtype: + self.attn_mask_inc_cache = self.attn_mask_inc_cache.to(dtype).to(device) + self.attn_mask_inc_zeros = self.attn_mask_inc_zeros.to(dtype).to(device) + + self.attn_mask_full = torch.full( + (batch_size, self._seq_len_cached, self._seq_len_cached), self.mask_min).to(dtype).to(device) + decoder_masks = [] + for i in range(batch_size): + self.attn_mask_full[i][:seq_len, :seq_len] 
= attention_mask.squeeze(1)[i] + count = origin_inputs_count[i].item() + # left padding, if input has no padding,count will equal seq_len + left_mask = self.attn_mask_inc_cache[:, :seq_len - count] + right_mask = self.attn_mask_inc_zeros[:, :self._seq_len_cached - seq_len + count] + decoder_mask = torch.concat([left_mask, right_mask], dim=-1).unsqueeze(0) + + decoder_masks.append(decoder_mask) + self.attn_mask_inc = torch.concat(decoder_masks, dim=0).to(dtype).to(device) + + return self.attn_mask_full, self.attn_mask_inc + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # q, k: [bs, num_heads, seq_len, rotary_ndims] + # cos, sin: [1, 1, seq_len, rotary_ndims] + gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] + gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) # [bs, 1, seq_len, rotary_ndims] + cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class GPTNeoXMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.world_size = WORLD_SIZE + self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size // self.world_size) + self.dense_4h_to_h = nn.Linear(config.intermediate_size // self.world_size, config.hidden_size) + self.act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dense_4h_to_h(hidden_states) + # all_reduce + if self.world_size >= 2: + torch.distributed.all_reduce( + hidden_states, op=torch.distributed.ReduceOp.SUM) + return hidden_states + + +class GPTNeoXLayer(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.world_size = WORLD_SIZE + self.use_parallel_residual = config.use_parallel_residual + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = GPTNeoXAttention(config, layer_id) + self.mlp = GPTNeoXMLP(config) + self.layer_id = layer_id + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + layer_past: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + ): + attention_layer_outputs = self.attention( + self.input_layernorm(hidden_states), + attention_mask=attention_mask, + position_ids=position_ids, + layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights) + outputs = attention_layer_outputs[1:] + + if self.use_parallel_residual: + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) + hidden_states = mlp_output + attn_output + hidden_states + else: + attn_output = attn_output + hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + 
hidden_states = mlp_output + attn_output + + if use_cache: + outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) + else: + outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) + + return outputs + + +GPT_NEOX_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`~GPTNeoXConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GPT_NEOX_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare GPTNeoX Model transformer outputting raw hidden-states without any specific head on top.", + GPT_NEOX_START_DOCSTRING, +) +class GPTNeoXModel(GPTNeoXPreTrainedModel): + def __init__(self, config: GPTNeoXConfig): + super().__init__(config) + self.config = config + self.rank = RANK + self.rank_size = WORLD_SIZE + self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([GPTNeoXLayer(config, i) for i in range(config.num_hidden_layers)]) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + # for ascend init + self.acl_param_decoder = None + self.acl_param_encoder = None + self.acl_encoder_operation = None + self.acl_decoder_operation = None + self.ascend_weight = [] + self.init_ascend_operations(config) + self.place_holder = torch.ones(1).npu() + self.hidden_size_nz = None + + def init_ascend_operations(self, config: GPTNeoXConfig): + self.acl_param_encoder = json.dumps({ + "headNum": config.num_attention_heads // WORLD_SIZE, + "dk": config.hidden_size // config.num_attention_heads, + "layerNum": config.num_hidden_layers, + "layerNormEps": config.layer_norm_eps, + "rotaryPct": config.rotary_pct, + "isPrefill": True, + "qScale": 1 / math.sqrt(config.hidden_size // config.num_attention_heads), + "rank": self.rank, + "rankSize": self.rank_size, + "backend": os.getenv("BACKEND", "hccl") + }) + + self.acl_param_decoder = json.dumps({ + "headNum": config.num_attention_heads // WORLD_SIZE, + "dk": config.hidden_size // config.num_attention_heads, + "layerNum": config.num_hidden_layers, + "layerNormEps": config.layer_norm_eps, + "rotaryPct": config.rotary_pct, + "isPrefill": True, + "qScale": 1 / math.sqrt(config.hidden_size // config.num_attention_heads), + "rank": self.rank, + "rankSize": self.rank_size, + "backend": os.getenv("BACKEND", "hccl") + }) + + self.max_position_embeddings = int(os.getenv("MAX_SEQ_LEN", config.max_position_embeddings)) + self.num_layers = config.num_hidden_layers + self.num_attention_heads = config.num_attention_heads // WORLD_SIZE + + self.acl_encoder_operation = torch.classes.ModelTorch.ModelTorch("gptneox_20b_FaKvCacheRopeModel") + self.acl_decoder_operation = torch.classes.ModelTorch.ModelTorch("gptneox_20b_FaKvCacheRopeModel") + + self.acl_encoder_operation.set_param(self.acl_param_encoder) + self.acl_decoder_operation.set_param(self.acl_param_decoder) + + rotary_dim = int((config.hidden_size // config.num_attention_heads) * config.rotary_pct) + self.ascend_rotary_embedding = AscendRotaryEmbedding( + rotary_dim, max_position_embeddings=self.max_position_embeddings) + + self.increment_flags = [False] * self.num_layers + self.token_num = 0 + + self.token_offset = None + self.layer_id_input = [] + + self.attention_mask_generator = AttentionMask(self.max_position_embeddings) + self.attention_mask_input = None + self.attention_mask_encoder = None + self.attention_mask_decoder = None + self.origin_inputs_count = None + + self.seq_len_tensor = None + self.seqlen_max = None + + for i in range(self.num_layers): + self.layer_id_input.append(torch.tensor([i], dtype=torch.int32).npu()) + + self.weight_flag = False + self.num_layers = config.num_hidden_layers + self.hidden_size = config.hidden_size + self.nz_dim = 16 + + self.acl_encoder_operation_inputs : list = [None] * (11 + self.num_layers) + self.acl_decoder_operation_inputs : list = [None] * (11 
+ self.num_layers) + self.lm_head_weight = None + self.k_cache_input = None + self.v_cache_input = None + self.batch = 0 + + for i in range(self.num_layers): + self.acl_encoder_operation_inputs[11 + i] = torch.tensor([i], dtype=torch.int32).npu() + self.acl_decoder_operation_inputs[11 + i] = torch.tensor([i], dtype=torch.int32).npu() + + def init_ascend_weight(self): + weights = [self.state_dict()["embed_in.weight"]] + for i in range(self.num_layers): + weights_t = [] + weights_layer = self.layers[i].state_dict() + weights_t.append(weights_layer['input_layernorm.weight']) + weights_t.append(weights_layer['input_layernorm.bias']) + weights_t.append(weights_layer['post_attention_layernorm.weight']) + weights_t.append(weights_layer['post_attention_layernorm.bias']) + weights_t.append(weights_layer['attention.query_key_value.weight']) + weights_t.append(weights_layer['attention.query_key_value.bias']) + weights_t.append(weights_layer['attention.dense.weight']) + weights_t.append(weights_layer['attention.dense.bias']) + weights_t.append(weights_layer['mlp.dense_h_to_4h.weight']) + weights_t.append(weights_layer['mlp.dense_h_to_4h.bias']) + weights_t.append(weights_layer['mlp.dense_4h_to_h.weight']) + weights_t.append(weights_layer['mlp.dense_4h_to_h.bias']) + weights.extend(weights_t) + + weights.append(self.state_dict()["final_layer_norm.weight"]) + weights.append(self.state_dict()["final_layer_norm.bias"]) + weights.append(self.lm_head_weight) + + self.ascend_weight = weights + self.acl_encoder_operation.set_weight(weights) + self.acl_decoder_operation.set_weight(weights) + + def prepare_inputs_for_ascend(self, input_ids, position_ids, seq_length, batch_size, past_key_values=None): + max_seq_len = self.token_num + seq_length + cos_table, sin_table = self.ascend_rotary_embedding(input_ids, max_seq_len) + if not past_key_values or past_key_values[0] is None: + self.token_num = seq_length + self.token_offset[:] = seq_length + self.seq_len_tensor = torch.tensor([seq_length] * batch_size, + dtype=torch.int32, device=input_ids.device) + self.seqlen_max = torch.tensor([self.seq_len_tensor[0] - 1], dtype=torch.int64, device="npu") + self.attention_mask_encoder, self.attention_mask_decoder = self.attention_mask_generator.get_attn_mask( + self.attention_mask_input, + self.origin_inputs_count, + seq_length, + batch_size, + dtype=self.k_cache_input.dtype, + device=self.k_cache_input.device) + + if not IS_ND: + self.attention_mask_encoder = torch_npu.npu_format_cast(self.attention_mask_encoder.view( + batch_size, self.max_position_embeddings, + self.max_position_embeddings // self.nz_dim, self.nz_dim).transpose(1, 2).contiguous(), 29) + self.attention_mask_decoder = torch_npu.npu_format_cast(self.attention_mask_decoder.view( + batch_size, self.max_position_embeddings, + self.max_position_embeddings // self.nz_dim, self.nz_dim).transpose(1, 2).contiguous(), 29) + + self.acl_encoder_operation_inputs[0] = input_ids + self.acl_encoder_operation_inputs[1] = position_ids + self.acl_encoder_operation_inputs[2] = cos_table + self.acl_encoder_operation_inputs[3] = sin_table + self.acl_encoder_operation_inputs[4] = self.attention_mask_encoder + self.acl_encoder_operation_inputs[5] = self.k_cache_input + self.acl_encoder_operation_inputs[6] = self.v_cache_input + self.acl_encoder_operation_inputs[7] = self.token_offset + self.acl_encoder_operation_inputs[8] = self.seq_len_tensor + self.acl_encoder_operation_inputs[9] = self.place_holder + self.acl_encoder_operation_inputs[10] = self.seqlen_max + + acl_param_encoder 
= json.dumps({ + "tokenOffset": [seq_length] * batch_size, + "seqLen": [seq_length] * batch_size + }) + + return self.acl_encoder_operation_inputs, acl_param_encoder + else: + self.token_num = self.token_num + 1 + self.token_offset[:] = self.token_num + self.seq_len_tensor = torch.tensor([1] * batch_size, dtype=torch.int32, device=input_ids.device) + self.seqlen_max = torch.tensor([self.seq_len_tensor[0] - 1], dtype=torch.int64, device="npu") + + self.acl_decoder_operation_inputs[0] = input_ids + self.acl_decoder_operation_inputs[1] = position_ids + self.acl_decoder_operation_inputs[2] = cos_table + self.acl_decoder_operation_inputs[3] = sin_table + self.acl_decoder_operation_inputs[4] = self.attention_mask_decoder + self.acl_decoder_operation_inputs[5] = self.k_cache_input + self.acl_decoder_operation_inputs[6] = self.v_cache_input + self.acl_decoder_operation_inputs[7] = self.token_offset + self.acl_decoder_operation_inputs[8] = self.seq_len_tensor + self.acl_decoder_operation_inputs[9] = self.place_holder + self.acl_decoder_operation_inputs[10] = self.seqlen_max + + acl_param_decoder = json.dumps({ + "tokenOffset": [self.token_num] * batch_size, + "seqLen": [1] * batch_size + }) + + return self.acl_decoder_operation_inputs, acl_param_decoder + + def execute_ascend_operator(self, acl_model, input_ids, position_ids, past_key_values=None): + batch_size, seq_length = input_ids.shape + acl_inputs, acl_param = self.prepare_inputs_for_ascend(input_ids, position_ids, seq_length, batch_size, + past_key_values) + acl_model_out = acl_model.execute(acl_inputs, acl_param) + acl_hidden_state = acl_model_out[0] + return acl_hidden_state + + def get_input_embeddings(self): + return self.embed_in + + def set_input_embeddings(self, value): + self.embed_in = value + + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if batch_size != self.batch: + self.batch = batch_size + + if not IS_ND: + self.hidden_size_nz = math.ceil(self.hidden_size // WORLD_SIZE / self.nz_dim) + self.k_cache_input = torch.zeros(self.num_layers, + 
batch_size, + self.hidden_size_nz, + self.max_position_embeddings, + self.nz_dim, + device=input_ids.device, + dtype=torch.half) + self.v_cache_input = torch.zeros(self.num_layers, + batch_size, + self.hidden_size_nz, + self.max_position_embeddings, + self.nz_dim, + device=input_ids.device, + dtype=torch.half) + self.k_cache_input = torch_npu.npu_format_cast(self.k_cache_input, 29) + torch.npu.empty_cache() + self.v_cache_input = torch_npu.npu_format_cast(self.v_cache_input, 29) + else: + + self.k_cache_input = torch.zeros(self.num_layers, + batch_size, + self.max_position_embeddings, + self.hidden_size // WORLD_SIZE, + device=input_ids.device, + dtype=torch.half) + self.v_cache_input = torch.zeros(self.num_layers, + batch_size, + self.max_position_embeddings, + self.hidden_size // WORLD_SIZE, + device=input_ids.device, + dtype=torch.half) + torch.npu.empty_cache() + self.token_num = 0 + self.token_offset = torch.full((batch_size,), 0, dtype=torch.int32, device=self.k_cache_input.device) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * self.config.num_hidden_layers) + if attention_mask is None: + seq_length_with_past = seq_length + past_length + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=input_ids.device) + + self.origin_inputs_count = attention_mask.sum(dim=-1) + self.attention_mask_input = _prepare_input_attention_mask( + attention_mask, (batch_size, seq_length), self.k_cache_input, past_length) + else: + past_length = self.token_num + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + + ''' + else: + position_ids = position_ids.view(-1, seq_length).long() + + # Attention mask. + if attention_mask is not None: + assert batch_size > 0, "batch_size has to be defined and > 0" + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if inputs_embeds is None: + inputs_embeds = self.embed_in(input_ids) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + + ''' + + if not self.ascend_weight: + self.init_ascend_weight() + + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + is_prefill = True if past_key_values[0] is None else False + + if is_prefill: + model_op = self.acl_encoder_operation + else: + model_op = self.acl_decoder_operation + + hidden_states = self.execute_ascend_operator(model_op, + input_ids, + position_ids, + past_key_values) + + presents = (self.k_cache_input, self.v_cache_input) + + ''' + for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for layer_past + return module(*inputs, use_cache, None, output_attentions) + + return custom_forward + + outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + position_ids, + head_mask[i], + ) + else: + outputs = layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + layer_past=layer_past, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + if output_attentions: + all_attentions = all_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.final_layer_norm(hidden_states) + ''' + + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +@add_start_docstrings( + """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING +) +class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config: GPTNeoXConfig): + super().__init__(config) + + self.gpt_neox = GPTNeoXModel(config) + self.embed_out = nn.Linear(config.hidden_size, config.vocab_size // WORLD_SIZE, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.position_ids_cache = torch.arange(int(os.getenv("MAX_SEQ_LEN", config.max_position_embeddings)), + device='cpu').long().unsqueeze(0).npu() + self.lm_head_weight = None + + def 
get_output_embeddings(self): + return self.embed_out + + def set_output_embeddings(self, new_embeddings): + self.embed_out = new_embeddings + + @add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + @torch.no_grad() + @Timer.timing + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are + only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks that can be used (see + `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b") + >>> config.is_decoder = True + >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + + if self.lm_head_weight is None: + self.lm_head_weight = self.state_dict()["embed_out.weight"] + if not IS_ND: + self.lm_head_weight = torch_npu.npu_format_cast(self.lm_head_weight, 29) + self.gpt_neox.lm_head_weight = self.lm_head_weight + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = outputs[0] + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # we are doing next-token prediction; shift prediction scores and input ids by one + shift_logits = lm_logits[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithPast( + loss=lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + ''' + input_shape = input_ids.shape + + # cut decoder_input_ids if past is used + if past_key_values and past_key_values[0] is not None: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + ) + + return model_inputs + ''' + + input_shape = input_ids.shape + if input_shape[0] != self.position_ids_cache.shape[0]: + self.position_ids_cache = self.position_ids_cache.repeat(input_shape[0], 1) + + position_ids = kwargs.get("position_ids", None) + if position_ids is None: + position_ids = 
self.position_ids_cache[:, :input_shape[1]] + + if past_key_values and past_key_values[0] is not None: + input_ids = input_ids[:, -1:] + position_ids = position_ids[:, -1:] + + model_inputs = {"input_ids": input_ids} + model_inputs.update( + { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + ) + + return model_inputs + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + diff --git a/mindie/examples/models/gpt_neox/modeling_gpt_neox_cut.py b/mindie/examples/models/gpt_neox/modeling_gpt_neox_cut.py new file mode 100644 index 00000000..7c158f2e --- /dev/null +++ b/mindie/examples/models/gpt_neox/modeling_gpt_neox_cut.py @@ -0,0 +1,761 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch GPTNeoX model.""" + +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN +from transformers.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from configuration_gpt_neox import GPTNeoXConfig + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "trl-internal-testing/tiny-random-GPTNeoXForCausalLM" +_REAL_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neox-20b" +_CONFIG_FOR_DOC = "GPTNeoXConfig" + +GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "EleutherAI/gpt-neox-20b", + # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox +] + + +class GPTNeoXPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = GPTNeoXConfig + base_model_prefix = "gpt_neox" + supports_gradient_checkpointing = True + _no_split_modules = ["GPTNeoXLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class GPTNeoXAttention(nn.Module): + def __init__(self, config): + super().__init__() + + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_attention_heads + + # 初始化world_size + self.world_size = 1 + if hasattr(config, 'world_size'): + self.world_size = config.world_size + # 单卡头数 + self.num_attention_heads = self.num_attention_heads // self.world_size + + self.rotary_ndims = int(self.head_size * config.rotary_pct) + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + ) + self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.rotary_emb = RotaryEmbedding( + self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base + ) + self.register_buffer( + "norm_factor", + torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()), + persistent=False, + ) + self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size // self.world_size) + self.dense = nn.Linear(config.hidden_size // self.world_size, config.hidden_size) + + @classmethod + def _split_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Splits hidden dim into attn_head_size and num_attention_heads + """ + new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) + tensor = tensor.view(new_shape) + tensor = tensor.permute(0, 2, 1, 3) + return tensor + + @classmethod + def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden dim + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size) + return tensor + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + has_layer_past = layer_past is not None + + # Compute QKV + qkv = self.query_key_value(hidden_states) + + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape) + + query = qkv[..., : self.head_size].permute(0, 2, 1, 3) + key = qkv[..., self.head_size: 2 * self.head_size].permute(0, 2, 1, 3) + value = qkv[..., 2 * self.head_size:].permute(0, 2, 1, 3) + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + query_pass = query[..., self.rotary_ndims:] + key_rot = key[..., : self.rotary_ndims] + key_pass = key[..., self.rotary_ndims:] + + # 
Compute token offset for rotary embeddings (when decoding) + seq_len = key.shape[-2] + if has_layer_past: + seq_len += layer_past[0].shape[-2] + cos, sin = self.rotary_emb(value, seq_len=seq_len) + query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + query = torch.cat((query, query_pass), dim=-1) + key = torch.cat((key, key_pass), dim=-1) + + # Cache QKV values + if has_layer_past: + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + present = (key, value) if use_cache else None + + # Compute attention + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Reshape outputs + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) + attn_output = self.dense(attn_output) + + # all reduce + if self.world_size >= 2: + torch.distributed.all_reduce( + attn_output, op=torch.distributed.ReduceOp.SUM) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + # compute causal mask from causal mask buffer + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + causal_mask = self.bias[:, :, key_length - query_length: key_length, :key_length] + + query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) + key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) + attn_scores = torch.zeros( + batch_size * num_attention_heads, + query_length, + key_length, + dtype=query.dtype, + device=key.device, + ) + attn_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor), + ) + attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) + + mask_value = torch.finfo(attn_scores.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device) + attn_scores = torch.where(causal_mask, attn_scores, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_scores = attn_scores + attention_mask + + attn_weights = nn.functional.softmax(attn_scores, dim=-1) + attn_weights = attn_weights.to(value.dtype) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + return attn_output, attn_weights + + +def attention_mask_func(attention_scores, ltor_mask): + attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min) + return attention_scores + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings, base=10000, device=None): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. 
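+        # Precompute the cos/sin tables once for every position up to max_position_embeddings;
+        # forward() only rebuilds this cache when a longer sequence length is requested.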
+ self.max_seq_len_cached = max_position_embeddings + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos()[None, None, :, :] + self.sin_cached = emb.sin()[None, None, :, :] + + def forward(self, x, seq_len=None): + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + self.cos_cached = emb.cos()[None, None, :, :] + self.sin_cached = emb.sin()[None, None, :, :] + return self.cos_cached[:seq_len, ...].to(x.device), self.sin_cached[:seq_len, ...].to(x.device) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] + gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) + cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class GPTNeoXMLP(nn.Module): + def __init__(self, config): + super().__init__() + # 设置卡数 + self.world_size = 1 + if hasattr(config, 'world_size'): + self.world_size = config.world_size + # dense_h_to_4h 和 dense_4h_to_h分别行切和列切 + self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size // self.world_size) + self.dense_4h_to_h = nn.Linear(config.intermediate_size // self.world_size, config.hidden_size) + self.act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dense_4h_to_h(hidden_states) + return hidden_states + + +class GPTNeoXLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.use_parallel_residual = config.use_parallel_residual + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = GPTNeoXAttention(config) + # 设置 world_size + self.world_size = 1 + if hasattr(config, 'world_size'): + self.world_size = config.world_size + self.mlp = GPTNeoXMLP(config) + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + layer_past: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + ): + attention_layer_outputs = self.attention( + self.input_layernorm(hidden_states), + attention_mask=attention_mask, + position_ids=position_ids, + 
layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights) + outputs = attention_layer_outputs[1:] + + if self.use_parallel_residual: + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) + # all_reduce + if self.world_size >= 2: + torch.distributed.all_reduce( + mlp_output, op=torch.distributed.ReduceOp.SUM) + + hidden_states = mlp_output + attn_output + hidden_states + else: + attn_output = attn_output + hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + # all_reduce + if self.world_size >= 2: + torch.distributed.all_reduce( + mlp_output, op=torch.distributed.ReduceOp.SUM) + hidden_states = mlp_output + attn_output + + if use_cache: + outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) + else: + outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) + + return outputs + + +GPT_NEOX_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`~GPTNeoXConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GPT_NEOX_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare GPTNeoX Model transformer outputting raw hidden-states without any specific head on top.", + GPT_NEOX_START_DOCSTRING, +) +class GPTNeoXModel(GPTNeoXPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)]) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_in + + def set_input_embeddings(self, value): + self.embed_in = value + + @add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + r""" + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * self.config.num_hidden_layers) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + # Attention mask. + if attention_mask is not None: + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if inputs_embeds is None: + inputs_embeds = self.embed_in(input_ids) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + presents = () if use_cache else None + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for layer_past + return module(*inputs, use_cache, None, output_attentions) + + return custom_forward + + outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + position_ids, + head_mask[i], + ) + else: + outputs = layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + layer_past=layer_past, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + if output_attentions: + all_attentions = all_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.final_layer_norm(hidden_states) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +@add_start_docstrings( + """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING +) +class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + self.world_size = 1 + if hasattr(config, 'world_size'): + self.world_size = config.world_size + self.gpt_neox = GPTNeoXModel(config) + self.embed_out = nn.Linear(config.hidden_size, config.vocab_size // self.world_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.embed_out + + def set_output_embeddings(self, new_embeddings): + self.embed_out = new_embeddings + + @add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + @torch.no_grad() + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. 
The two additional tensors are + only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks that can be used (see + `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b") + >>> config.is_decoder = True + >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + lm_logits = self.embed_out(hidden_states) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # we are doing next-token prediction; shift prediction scores and input ids by one + shift_logits = lm_logits[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithPast( + loss=lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + input_shape = input_ids.shape + + # cut decoder_input_ids if past is used + if past_key_values and past_key_values[0] is not None: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + 
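+                # when a cache is present, keep only the position id of the newest token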
position_ids = position_ids[:, -1].unsqueeze(-1) + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + ) + + return model_inputs + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + diff --git a/mindie/examples/models/gpt_neox/run_pa.sh b/mindie/examples/models/gpt_neox/run_pa.sh new file mode 100644 index 00000000..e3903c51 --- /dev/null +++ b/mindie/examples/models/gpt_neox/run_pa.sh @@ -0,0 +1,18 @@ +#!/bin/bash +export BIND_CPU=1 +export IS_QUANT=0 +export MAX_MEMORY_GB=30 +export ASCEND_RT_VISIBLE_DEVICES=4,5 +export MASTER_PORT=20014 +export TP_WORLD_SIZE=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) +export INT8_FORMAT_NZ_ENABLE=1 +model_path=$1 + +atb_options="ATB_LAUNCH_KERNEL_WITH_TILING=1 ATB_LAYER_INTERNAL_TENSOR_REUSE=1 PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' HCCL_BUFFSIZE=110" +atb_async_options="ATB_OPERATION_EXECUTE_ASYNC=1 TASK_QUEUE_ENABLE=1" +base_cmd="torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $model_path" +run_cmd="${atb_options} ${atb_async_options} ${base_cmd}" + +if [[ -n ${model_path} ]];then + eval "${run_cmd}" +fi \ No newline at end of file diff --git a/mindie/examples/models/internlm/README.md b/mindie/examples/models/internlm/README.md new file mode 100644 index 00000000..3500b225 --- /dev/null +++ b/mindie/examples/models/internlm/README.md @@ -0,0 +1,242 @@ +# README + +- InternLM开源了InternLM系列的多个基础模型和为实际场景量身定制的聊天模型。该系列模型具有以下特点: + + - 它利用数万亿个高质量的代币进行培训,以建立强大的知识库。 + - 一代的internlm-20B支持 8k 上下文窗口长度,InternLM2-7B/20B-Chat有效支持200K超长上下文,可实现更长的输入序列和更强的推理能力。 + - 它为用户提供了一个多功能的工具集,可以灵活地构建自己的工作流程。 + +- 此代码仓中实现了一套基于NPU硬件的Internlm推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 + +- 此矩阵罗列了各Internlm模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|---------------|------------------------|------------------------| ---- |-----| --------------- | --------------- | -------- | --------- | --------- | ------------ | -------------------------- | ---- | ------ | ---- |-----| +| internlm-20B | 支持world size 2,4,8 | 支持world size 2,4,8 | √ | × | × | √ | × | × | × | × | × | × | × | × | × | +| intenlm2-7B | 支持world size 1,2,4,8 | 支持world size 1,2,4,8 | √ | × | × | √ | × | × | × | × | × | × | √ | × | √ | +| intenlm2-20B | 支持world size 2,4,8 | 支持world size 2,4,8 | √ | × | × | √ | × | × | × | × | × | × | √ | × | √ | +# Paged Attention 推理使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +|-------------|--------------------------------------------------------------------------------------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 
模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/ModelLink/`;若使用gitee下载的代码,则路径为`${working_dir}/ModelLink/mindie_ref/mindie_llm/atb_models` | +| script_path | 脚本所在路径。Internlm系列模型的工作脚本所在路径为${llm_path}/examples/models/internlm | +| weight_path | 模型权重路径 +| chat | 是否启用对话模式 + +## 权重 +**权重下载** +- [internlm-20B](https://huggingface.co/internlm/internlm-chat-20b) +- [intenlm2-7B](https://huggingface.co/internlm/internlm2-chat-7b) +- [intenlm2-20B](https://huggingface.co/internlm/internlm2-chat-20b) + +**权重转换** +- Paged Attention 场景下需要.safetensors 格式的权重,如果没有,参考[此README文件](../../README.md)转换 + +**量化权重生成** +- 基于原始的FP16的权重,生成量化权重 +- W8A8 Antioutlier量化权重请使用以下指令生成 + - 暂不支持 + +- W8A8量化权重请使用以下指令生成 + - 暂不支持 + +- W8A16量化权重请使用以下指令生成 + - 暂不支持 + +- 稀疏量化权重请使用以下指令生成 + - 暂不支持 + +**基础环境变量** +- 参考[此README文件](../../../README.md) + + +## 推理 + +### 对话测试 +**运行Flash Attention FP16** +- 其余Internlm模型参考以下运行方式 + - 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_fa.sh ${weight_path} + ``` + - 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20050` + - 设置卡间通信端口 + - 默认使用20050端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export HCCL_BUFFSIZE=120 + export HCCL_WHITELIST_DISABLE=1 + export ATB_CONTEXT_WORKSPACE_RING=1 + export ATB_CONTEXT_WORKSPACE_SIZE=2629145600 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=0 + export ATB_LAUNCH_KERNEL_WITH_TILING=0 + export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=1 + export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=0 + ``` + +**运行Flash Attention BF16** +- 暂不支持 + +**运行Flash Attention W8A8** +- 暂不支持 + +**运行Flash Attention W8A16** +- 暂不支持 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20050` + - 设置卡间通信端口 + - 默认使用20050端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +**运行Paged Attention BF16** +- 暂不支持 + +**运行Paged Attention W8A8** +- 暂不支持 + +**运行Paged Attention W8A16** +- 暂不支持 + +**运行KV cache量化** +- 暂不支持 + +**运行稀疏量化** +- 暂不支持 + +**运行MOE量化** +- 暂不支持 + +**200K长序列** +- 修改模型权重下的config.json + ```json + intenlm2-7B改为 + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + intenlm2-20B改为 + "rope_scaling": { + "factor": 3.0, + "type": "dynamic" + }, + ``` +- 修改run_pa.py文件 `parse_arguments()`函数的参数,max_input_length必须大于文本token数。因为分词原因,文本长度不等于文本token数,通常文本字符数大于文本token数。 +- --input_texts + --input_file + --max_input_length + --max_output_length + ```python + + parser.add_argument( + '--input_texts', + type=str, + nargs='+', + default="text_200K") + parser.add_argument( + 
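+        # --input_file expects a jsonl file; the accepted conversation format is described below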
'--input_file', + type=str, + help='CSV or Numpy file containing tokenized input. Alternative to text input.', + default="./text_200K.jsonl") + parser.add_argument('--max_input_length', type=int, default=210000) + parser.add_argument('--max_output_length', type=int, default=256) + ``` +- 输入32K/64K/128K/192K长序列 + - 使用 --input_texts 参数或者 --input_file 参数。 + * `--input_texts` + * 必须为 str 或 List[str]格式的对话数据 + ``` + """ + 这里是10万字的小说内容 \n总结以上文本内容。 + """ + ``` + * `--input_file` (推荐) + * 目前仅支持jsonl格式文件,每一行必须为List[Dict]格式的按时间顺序排序对话数据 + * 每个Dict字典中需要至少包含"role"和"content"两个字段 + ``` + [{"role": "user", "content": "这里是10万字的小说内容 \n总结以上文本内容。"}] + ``` +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令(后面加一个chat参数) + ```shell + bash ${script_path}/run_pa.sh ${weight_path} chat + ``` + +## 精度测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 internlm ${internlm系列模型权重路径} 8 + bash run.sh pa_fp16 full_CEval 5 1 internlm ${internlm系列模型权重路径} 8 + + internlm_20b, internlm2_7b, internlm2_20b, 已合并为 internlm,模型名称都是 internlm, + 对应于 tests/modeltest/core/internlm_test.py。 + ``` + +## 性能测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[256,256],[512,512],[1024,1024],[2048,2048]] 1 internlm ${internlm系列模型权重路径} 8 + + bash run.sh pa_fp16 performance_maxbs [[256,256],[512,512],[1024,1024],[2048,2048]] [[1,2048],[1,2048],[1,2048],[1,2048]] 50 internlm ${internlm系列模型权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 +- 如果模型生成了 [UNUSED_TOKEN_146]、[UNUSED_TOKEN_145]等特殊字符,升级transformers版本到4.37.1以上。 +- Internlm2模型使用bfloat16完成训练,使用float16进行推理会有一些精度波动,如果logits输出在fp16数据类型1个ulp的最小波动范围内,是正常现象。 \ No newline at end of file diff --git a/mindie/examples/models/internlm/run_pa.sh b/mindie/examples/models/internlm/run_pa.sh new file mode 100644 index 00000000..cfdd0f56 --- /dev/null +++ b/mindie/examples/models/internlm/run_pa.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -ex +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
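+# Launches paged-attention inference for InternLM: $1 is the weight path; pass "chat" as $2 to append --is_chat_model.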
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export BIND_CPU=1 +export IS_QUANT=0 +export RESERVED_MEMORY_GB=3 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +export MASTER_PORT=20036 +export TP_WORLD_SIZE=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) +export INT8_FORMAT_NZ_ENABLE=1 +atb_options="ATB_LAUNCH_KERNEL_WITH_TILING=1 ATB_LAYER_INTERNAL_TENSOR_REUSE=1 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' HCCL_BUFFSIZE=120" +atb_async_options="ATB_OPERATION_EXECUTE_ASYNC=1 TASK_QUEUE_ENABLE=1" +base_cmd="torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1" +if [[ "$2" == "chat" ]]; then + base_cmd+=" --is_chat_model" +fi +run_cmd="${atb_options} ${atb_async_options} ${base_cmd}" +eval "${run_cmd}" diff --git a/mindie/examples/models/llama/README.md b/mindie/examples/models/llama/README.md new file mode 100644 index 00000000..63603981 --- /dev/null +++ b/mindie/examples/models/llama/README.md @@ -0,0 +1,312 @@ +# README + +- [LLaMA(Large Language Model Meta AI)](https://github.com/facebookresearch/llama/tree/llama_v1)和 [LLaMA2(Large Language Model Meta AI 2)](https://github.com/facebookresearch/llama),是由 Meta AI 发布的一个开放且高效的大型基础语言模型,可以通过自然语言交互的方式提供知识、文本生成、语言翻译、语言理解、代码编写和解释等任务。 + +- 此代码仓中实现了一套基于NPU硬件的LLaMa推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各LLaMa模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化(仅300I DUO支持) | MOE | MindIE | TGI |长序列| +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| LLaMa-7B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | × | × | × | × | ×| √ | √|×| +| LLaMa-13B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | × | × | × | × | ×| √ | √|×| +| LLaMa-33B | 支持world size 4,8 | 支持world size 2,4 | √ | √ | √ | √ | × | × | × | √ | ×| × | ×|×| +| LLaMa-65B | 支持world size 8 | × | √ | √ | √ | √ | × | √ | √ | × | ×| √ | ×|×| +| LLaMa2-7B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | √ | × | × | √ | ×| √ | √|×| +| LLaMa2-13B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | √ | × | × | √ | ×| √ | √|×| +| LLaMa2-70B | 支持world size 8 | × | √ | √ | √ | √ | √ | √ | × | × | ×| √ | √|×| + +- 此模型仓已适配的模型版本 + - [LLaMa系列](https://github.com/facebookresearch/llama/tree/llama_v1) + - [LLaMa2系列](https://github.com/facebookresearch/llama/tree/v2) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;LLaMa和LLaMa2的工作脚本所在路径为`${llm_path}/examples/models/llama` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [LLaMa-7B](https://huggingface.co/huggyllama/llama-7b) +- [LLaMa-13B](https://huggingface.co/huggyllama/llama-13b) +- [LLaMa-33B](https://huggingface.co/pinkmanlove/llama-33b-hf/tree/main) +- [LLaMa-65B](https://huggingface.co/huggyllama/llama-65b) +- [LLaMa2-7B](https://huggingface.co/NousResearch/Llama-2-7b-hf) +- [LLaMa2-13B](https://huggingface.co/NousResearch/Llama-2-13b-hf) +- [LLaMa2-70B](https://huggingface.co/NousResearch/Llama-2-70b-hf) + +**权重转换** +- 参考[此README文件](../../README.md) + +**量化权重生成** +- 
基于原始的浮点权重,使用量化工具,将高位浮点数转为低位的定点数。 +- 注意事项: + - `model_path`和`save_directory`请勿使用同一个文件夹,避免浮点权重和量化权重混淆 + - NPU多卡量化注意事项和环境要求见[此README中的【NPU多卡量化】章节](../../README.md) +- W8A8 Antioutlier量化权重请使用以下指令生成 + - LLaMa2-7B/13B推荐使用W8A8 Antioulier量化 + - 执行量化脚本 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + # 生成llama2-7b量化权重,antioutlier使用m1算法配置 + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --disable_level L0 --device_type cpu --anti_method m1 --act_method 1 --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl + # 生成llama2-13b量化权重,antioutlier使用m2算法配置 + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --disable_level L0 --device_type cpu --anti_method m2 --act_method 1 --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl +- W8A8量化权重请使用以下指令生成 + - 大参数量模型LLaMa2-70B推荐使用NPU多卡W8A8量化 + - 执行量化脚本 + ```shell + # 指定当前机器上可用的逻辑NPU核心 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + cd ${llm_path} + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --disable_level L5 --device_type npu --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl + ``` +- W8A16量化权重请使用以下指令生成 + - 当前仅LLaMa-65B、LLaMa2-70B支持W8A16量化 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A16量化权重路径} --w_bit 8 --a_bit 16 --act_method 3 --calib_file "" + ``` +- 稀疏量化权重请使用以下指令生成 + - Step 1 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8S量化权重路径} --w_bit 4 --a_bit 8 --calib_file ${llm_path}/examples/convert/model_slim/teacher_qualification.jsonl --fraction 0.011 --co_sparse True + ``` + Llama33B稀疏量化权重请使用以下指令生成 + ```shell + python examples/models/llama/convert_quant_weight.py --model_path {浮点权重路径} --save_directory {W8A8S量化权重路径,默认为当前路径} --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl --act_method 2 --do_smooth True --use_sigma True --is_lowbit True --co_sparse True --w_bit 4 + ``` + - Step 2:量化权重切分及压缩 + > 运行前需要确保压缩工具编译过 + > + > `cd /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/msmodelslim/pytorch/weight_compression/compress_graph` + > + > `bash build.sh /usr/local/Ascend/ascend-toolkit/latest` + ```shell + torchrun --nproc_per_node {TP数} -m examples.convert.model_slim.sparse_compressor --model_path {W8A8S量化权重路径} --save_directory {W8A8SC量化权重路径} + ``` + - TP数为tensor parallel并行个数 + - 注意:若权重生成时以TP=4进行切分,则运行时也需以TP=4运行 + - 示例 + ```shell + torchrun --nproc_per_node 2 -m examples.convert.model_slim.sparse_compressor --model_path /data1/weights/model_slim/llama2-7b_w8a8s --save_directory /data1/weights/model_slim/llama2-7b_w8a8sc + ``` +- KV cache量化权重请使用以下指令生成 + - 当前仅LLaMa-65B W8A8量化支持搭配KV cache量化 + - 执行量化脚本 + ```shell + # 指定当前机器上可用的逻辑NPU核心 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + cd ${llm_path} + python examples/models/llama/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --disable_level L5 --device_type npu --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl --use_kvcache_quant True + ``` + - 相比于W8A8量化,需额外配置`use_kvcache_quant`参数 + +**LLaMa 33B权重添加Special 
token** +- LLaMa 33B中tokenizer原始的special token为空,需手动将权重文件中的`special_tokens_map.json`文件替换成以下内容 + ```json + { + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "model_max_length": 2048, + "pad_token": null, + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } + } + ``` + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 +**运行Flash Attention FP16** +- 其余LLaMa模型参考以下运行方式 + - 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_fa.sh ${weight_path} + ``` + - 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export LCCL_ENABLE_FALLBACK=1 + export INT8_FORMAT_NZ_ENABLE=1 + ``` + +**运行Flash Attention BF16** +- 暂不支持 + +**运行Flash Attention W8A8** +- 运行启动脚本 + - 与“运行Flash Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Flash Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8` + - 若config.json中无此字段,则新增 + +**运行Flash Attention W8A16** +- 运行启动脚本 + - 与“运行Flash Attention FP16”的启动方式相同 + - `${weight_path}`为W8A16量化权重的路径 +- 环境变量说明 + - 参见“运行Flash Attention FP16”中的环境变量说明 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export LCCL_ENABLE_FALLBACK=1 + export INT8_FORMAT_NZ_ENABLE=1 + ``` + +**运行Paged Attention BF16** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行BF16时需修改${weight_path}/config.json中的`torch_dtype`字段,将此字段对应的值修改为`bfloat16` +- 300I DUO卡暂不支持BF16特性 + +**运行Paged Attention W8A8** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8` + - 若config.json中无此字段,则新增 + +**运行Paged Attention W8A16** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A16量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 + +**运行KV cache量化** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为KV cache量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改KV cache量化权重`${weight_path}/config.json`中的`kv_quant`字段,将此字段对应的值修改为`C8` + - 若config.json中无此字段,则新增 + 
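+The `config.json` edits called for in the quantization sections above (setting `quantize` or `kv_quant`) can be applied with a short helper. The following is only an illustrative sketch: the weight directory is a placeholder, and the field name/value should be taken from the section that matches your weights.
+
+```python
+import json
+import os
+
+weight_path = "/path/to/quantized/weights"  # placeholder for ${weight_path}
+config_file = os.path.join(weight_path, "config.json")
+
+with open(config_file, "r", encoding="utf-8") as f:
+    config = json.load(f)
+
+# KV cache quantization weights: set kv_quant to "C8" (the field is added if missing).
+# For W8A8 weights set config["quantize"] = "w8a8" instead, as described above.
+config["kv_quant"] = "C8"
+
+with open(config_file, "w", encoding="utf-8") as f:
+    json.dump(config, f, ensure_ascii=False, indent=2)
+```
+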
+**运行稀疏量化** +- 运行启动脚本 + - 与“运行Paged Attention FP16”的启动方式相同 + - `${weight_path}`为W8A8量化权重的路径 +- 环境变量说明 + - 参见“运行Paged Attention FP16”中的环境变量说明 +- 相比于FP16,运行量化时需修改W8A8量化权重`${weight_path}/config.json`中的`quantize`字段,将此字段对应的值修改为`w8a8sc` + - 若config.json中无此字段,则新增 +- 注意:压缩算法与硬件强相关,当前仅300I DUO卡支持稀疏量化 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama2-7b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama2-13b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama2-70b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama-7b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama-13b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama-65b权重路径} 8 + ``` +- 运行量化权重和BF16时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama2-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama2-13b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama2-70b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama-7b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama-13b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama-65b权重路径} 8 + ``` +- 运行量化权重和BF16时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) +- 特殊场景说明: 若在性能测试时发现有波动情况,可配置透明大页,提升内存访问性能。该功能请按需开启,对内存占用有一定影响。 +```shell +# 性能测试时,可按需开启透明大页 +echo always > /sys/kernel/mm/transparent_hugepage/enabled +# 关闭透明大页 +echo never > /sys/kernel/mm/transparent_hugepage/enabled +``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/llama/convert_quant_weights.py b/mindie/examples/models/llama/convert_quant_weights.py new file mode 100644 index 00000000..0131e174 --- /dev/null +++ b/mindie/examples/models/llama/convert_quant_weights.py @@ -0,0 +1,85 @@ +import os + + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from atb_llm.models.llama.modeling_llama import LlamaConfig +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + + + + +if __name__ == "__main__": + args = parse_arguments() + rank = int(os.getenv("RANK", "0")) + config = LlamaConfig.from_pretrained(args.model_path) + config_path = os.path.join(args.model_path, "config.json") + + disable_names = [] + if args.a_bit != 16: + # W8A16, W4A16没有回退层 + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + 
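+    # Build an anti-outlier suppression config only when --anti_method is given;
+    # the m3 method additionally carries the bit widths, weight symmetry and device type.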
anti_outlier_config = None + + if args.anti_method == 'm3': + anti_outlier_config = AntiOutlierConfig(a_bit=args.a_bit, w_bit=args.w_bit, + anti_method=args.anti_method, w_sym=args.w_sym, dev_type=args.device_type) + elif args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method) + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + w_sym=args.w_sym, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + use_kvcache_quant=args.use_kvcache_quant, + open_outlier=args.open_outlier, + group_size=args.group_size + ) + + # 默认无校准数据集 + calibration_dataset = None + # 若存在calib_file,则使用calib_file作为校准数据集 + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file) + calibration_dataset = calibration_dataset + quant_weight_generator = Quantifier( + args.model_path, quant_config, anti_outlier_config, + device_type=args.device_type, tokenizer_args={"padding_side": "left"} + ) + quant_weight_generator.tokenizer.pad_token_id = 0 + + tokenized_data = None + if calibration_dataset is not None: + tokenized_data = quant_weight_generator.get_tokenized_data(calibration_dataset) + + quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + #为适配工具稀疏量化传入w_bit=4,a_bit=8暂时修改quant_type + quant_type = f"w{args.w_bit}a{args.a_bit}" + is_sparseCompress = args.w_bit == 4 and args.a_bit == 8 and (args.co_sparse or args.is_lowbit) + if is_sparseCompress: + quant_type = "w8a8s" + modify_config( + args.model_path, args.save_directory, config.torch_dtype, + quant_type, + args.use_kvcache_quant + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/llama/run_fa.sh b/mindie/examples/models/llama/run_fa.sh new file mode 100644 index 00000000..69ba5394 --- /dev/null +++ b/mindie/examples/models/llama/run_fa.sh @@ -0,0 +1,19 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export LCCL_ENABLE_FALLBACK=1 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_fa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_fa --model_path $1 $extra_param +fi \ No newline at end of file diff --git a/mindie/examples/models/llama/run_pa.sh b/mindie/examples/models/llama/run_pa.sh new file mode 100644 index 00000000..18d4b116 --- /dev/null +++ b/mindie/examples/models/llama/run_pa.sh @@ -0,0 +1,19 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
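+# $1 is the weight path; export TP_WORLD_SIZE=1 to run a single process, otherwise torchrun launches one process per NPU in ASCEND_RT_VISIBLE_DEVICES.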
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export LCCL_ENABLE_FALLBACK=1 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param +fi \ No newline at end of file diff --git a/mindie/examples/models/llama3/README.md b/mindie/examples/models/llama3/README.md new file mode 100644 index 00000000..8180b1b7 --- /dev/null +++ b/mindie/examples/models/llama3/README.md @@ -0,0 +1,144 @@ +# README + +- [Llama3(Large Language Model Meta AI 3)](https://github.com/meta-llama/llama3),是由 Meta AI 发布的一个开放且高效的大型基础语言模型,可以通过自然语言交互的方式提供知识、文本生成、语言翻译、语言理解、代码编写和解释等任务。 + +- Llama3当前包含两个参数版本:Llama3-8B和Llama3-70B。相较于Llama2,Llama3支持8K长文本,改进的tokenizer具有128K token的词汇量,可实现更好的性能;同时,Llama3在代码生成等任务上实现了领先,能够进行复杂的推理,更遵循指令并解决很多微妙的问题。 + +- 此代码仓中实现了一套基于NPU硬件的Llama3推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了Llama3模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 |W4A16量化 | KV cache量化 | 稀疏量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|---------|-------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| Llama3-8B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | √ | √ | √ | √ | × | × | × | × | √ | × | × | +| Llama3-70B | 支持world size 8 | × | √ | √ | √ | √ | √ | √ | √ | × | × | √ | × | × | + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | MindIE加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Llama3的工作脚本所在路径为`${llm_path}/examples/models/llama3` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [Llama3-8B](https://huggingface.co/NousResearch/Meta-Llama-3-8B) +- [Llama3-70B](https://huggingface.co/NousResearch/Meta-Llama-3-70b) + + +## 环境准备 +**基础环境配置** +- 参考[此README文件](../../../README.md) +- 检查python依赖库中transformers版本的配置,Llama3要求transformers库最低版本为4.38.2 + ```shell + pip show transformers + # 若transformers版本低于4.38.2,请将transformers更新至4.38.2 + pip install transformers==4.38.2 + ``` +- 300I DUO硬件不支持BF16推理,执行推理前需要将模型配置文件`config.json`中的`torch_dtype`修改为`float16` + +**量化权重生成** +- 基于原始的FP16的权重,生成量化权重 +- W8A8 Antioutlier量化权重请使用以下指令生成 + - Llama3-8B推荐使用W8A8 Antioulier量化 + - 执行量化脚本 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/llama3/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --disable_level L0 --device_type cpu --anti_method m2 --act_method 1 --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl + ``` +- W8A16量化权重请使用以下指令生成 + - 当前仅LLaMa3-70B支持W8A16量化 + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/llama3/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A16量化权重路径} --w_bit 8 --a_bit 16 --act_method 3 --w_sym False --mm_tensor False 
--calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl --anti_method 'm3' + ``` +- W4A16量化权重请使用以下指令生成 + - 当前仅LLaMa3-70B支持W4A16量化 +```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/llama3/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A16量化权重路径} --w_bit 4 --a_bit 16 --w_sym True --mm_tensor False --is_lowbit True --group_size=128 --open_outlier False +``` +## 推理 + +### 对话测试 +**运行Paged Attention BF16** +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + export ATB_LAUNCH_KERNEL_WITH_TILING=0 + ``` +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + # 执行单卡推理 + export ASCEND_RT_VISIBLE_DEVICES=0 + bash ${script_path}/run_pa.sh ${weight_path} ${max_output_length} + # 执行4卡推理 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 + bash ${script_path}/run_pa.sh ${weight_path} ${max_output_length} + # 执行8卡推理 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash ${script_path}/run_pa.sh ${weight_path} ${max_output_length} + ``` + +**运行Paged Attention FP16** +- 环境变量说明 + - 参见“运行Paged Attention BF16”中的环境变量说明 +- 相比于BF16,运行FP16时需修改${weight_path}/config.json中的`torch_dtype`字段,将此字段对应的值修改为`float16` +- 运行启动脚本 + - 与“运行Paged Attention BF16”的启动方式相同 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + # 测试8卡精度 + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama3-8b权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 llama ${llama3-70b权重路径} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + # 测试8卡性能 + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama3-8b权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 llama ${llama3-70b权重路径} 8 + # 测试长序列性能(800I A2 32G支持32k、64k序列长度, 800I A2 64G支持32k、64k、128k、192k、256k序列长度) + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash run.sh pa_fp16 performance [[32768,1024],[65536,1024],[131072,1024],[196608,1024],[262144,1024]] 1 llama ${llama3-70b权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_pa.py`;文件参数说明请见[此README文件](../../README.md) diff --git a/mindie/examples/models/llama3/convert_quant_weights.py b/mindie/examples/models/llama3/convert_quant_weights.py new file mode 100644 index 00000000..69d2f485 --- /dev/null +++ b/mindie/examples/models/llama3/convert_quant_weights.py @@ -0,0 +1,125 @@ +import os +import json + + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from atb_llm.models.llama.modeling_llama import LlamaConfig +from atb_llm.utils.file_utils import safe_open +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from 
examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + + + + +if __name__ == "__main__": + args = parse_arguments() + rank = int(os.getenv("RANK", "0")) + config = LlamaConfig.from_pretrained(args.model_path) + config_path = os.path.join(args.model_path, "config.json") + + disable_names = [] + with safe_open(config_path, 'r') as f: + config_data = json.load(f) + #Llama3 70B 需要进行前5层回退 + if "hidden_size" in config_data and config_data["hidden_size"] == 8192: + disable_names.append("model.layers.0.self_attn.q_proj") + disable_names.append("model.layers.1.self_attn.q_proj") + disable_names.append("model.layers.2.self_attn.q_proj") + disable_names.append("model.layers.3.self_attn.q_proj") + disable_names.append("model.layers.4.self_attn.q_proj") + disable_names.append("model.layers.0.self_attn.k_proj") + disable_names.append("model.layers.1.self_attn.k_proj") + disable_names.append("model.layers.2.self_attn.k_proj") + disable_names.append("model.layers.3.self_attn.k_proj") + disable_names.append("model.layers.4.self_attn.k_proj") + disable_names.append("model.layers.0.self_attn.v_proj") + disable_names.append("model.layers.1.self_attn.v_proj") + disable_names.append("model.layers.2.self_attn.v_proj") + disable_names.append("model.layers.3.self_attn.v_proj") + disable_names.append("model.layers.4.self_attn.v_proj") + disable_names.append("model.layers.0.self_attn.o_proj") + disable_names.append("model.layers.1.self_attn.o_proj") + disable_names.append("model.layers.2.self_attn.o_proj") + disable_names.append("model.layers.3.self_attn.o_proj") + disable_names.append("model.layers.4.self_attn.o_proj") + disable_names.append("model.layers.0.mlp.gate_proj") + disable_names.append("model.layers.1.mlp.gate_proj") + disable_names.append("model.layers.2.mlp.gate_proj") + disable_names.append("model.layers.3.mlp.gate_proj") + disable_names.append("model.layers.4.mlp.gate_proj") + disable_names.append("model.layers.0.mlp.up_proj") + disable_names.append("model.layers.1.mlp.up_proj") + disable_names.append("model.layers.2.mlp.up_proj") + disable_names.append("model.layers.3.mlp.up_proj") + disable_names.append("model.layers.4.mlp.up_proj") + disable_names.append("model.layers.0.mlp.down_proj") + disable_names.append("model.layers.1.mlp.down_proj") + disable_names.append("model.layers.2.mlp.down_proj") + disable_names.append("model.layers.3.mlp.down_proj") + disable_names.append("model.layers.4.mlp.down_proj") + disable_names.append("lm_head") + elif args.a_bit != 16: + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + anti_outlier_config = None + if args.anti_method == 'm3': + anti_outlier_config = AntiOutlierConfig(a_bit=args.a_bit, w_bit=args.w_bit, + anti_method=args.anti_method, w_sym=args.w_sym, dev_type=args.device_type) + elif args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method) + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + w_sym=args.w_sym, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + use_kvcache_quant=args.use_kvcache_quant, + open_outlier=args.open_outlier, + 
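+        # group_size sets the per-group quantization granularity (the README's W4A16 recipe passes --group_size=128)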
group_size=args.group_size + ) + + # 默认无校准数据集 + calibration_dataset = None + # 若存在calib_file,则使用calib_file作为校准数据集 + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file) + calibration_dataset = calibration_dataset + quant_weight_generator = Quantifier( + args.model_path, quant_config, anti_outlier_config, + device_type=args.device_type, tokenizer_args={"padding_side": "left"} + ) + quant_weight_generator.tokenizer.pad_token_id = 0 + + tokenized_data = None + if calibration_dataset is not None: + tokenized_data = quant_weight_generator.get_tokenized_data(calibration_dataset) + + quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + #为适配工具稀疏量化传入w_bit=4,a_bit=8暂时修改quant_type + quant_type = f"w{args.w_bit}a{args.a_bit}" + is_sparseCompress = args.w_bit == 4 and args.a_bit == 8 and (args.co_sparse or args.is_lowbit) + if is_sparseCompress: + quant_type = "w8a8s" + modify_config( + args.model_path, args.save_directory, config.torch_dtype, + quant_type, + args.use_kvcache_quant + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/llama3/run_pa.sh b/mindie/examples/models/llama3/run_pa.sh new file mode 100644 index 00000000..3fb605ec --- /dev/null +++ b/mindie/examples/models/llama3/run_pa.sh @@ -0,0 +1,23 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=1 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export ATB_LAUNCH_KERNEL_WITH_TILING=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$world_size" == "1" ]; then + python -m examples.run_pa --model_path $1 --max_output_length $2 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 --max_output_length $2 $extra_param +fi \ No newline at end of file diff --git a/mindie/examples/models/llava/README.md b/mindie/examples/models/llava/README.md new file mode 100644 index 00000000..9fb5c124 --- /dev/null +++ b/mindie/examples/models/llava/README.md @@ -0,0 +1,115 @@ +# README + +- [LLaVA(Large Language and Vision Assistant)](https://github.com/haotian-liu/LLaVA),是一种多模态大模型,具有强大的图像和文本处理能力,使得它在聊天机器人等场景中具有广泛的应用前景。 在聊天机器人中,LLaVA可以通过解析用户的文字输入,结合图像信息,生成更加生动、准确的回复。 此外,LLaVA还可以根据用户的图像输入,提供相关的文本信息,实现更加智能化的交互。 +- 此代码仓中实现了一套基于NPU硬件的LLaVa推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 +- 支持llavav.1.5 13B 基于llama文本模型的多模态推理 + +# 使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为 `${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为 `${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;llava的工作脚本所在路径为 `${llm_path}/examples/models/llava` | +| weight_path | 模型权重路径 | +| image_path | 图片所在路径 | +|open_clip_path| open_clip权重所在路径 | +## 权重 + +**权重下载** + +- [LLava-13B](https://huggingface.co/llava-hf/llava-1.5-13b-hf/tree/main) + +**基础环境变量** + +-1.Python其他第三方库依赖,参考[requirements_llava.txt](../../../requirements/models/requirements_llava.txt) +-2.参考[此README文件](../../../README.md) 
+-注意:保证先后顺序,否则llava的其余三方依赖会重新安装torch,导致出现别的错误 + + +## 推理 + +### 对话测试 + +**运行Paged Attention FP16** + +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh --run ${weight_path} ${image_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +## 精度测试 + +#### 方案 + +我们采用的精度测试方案是这样的:使用同样的一组图片,分别在 GPU 和 NPU 上执行推理,得到两组图片描述。 再使用 open_clip 模型作为裁判,对两组结果分别进行评分,以判断优劣。 + +#### 实施 + +1. 下载[open_clip 的权重 open_clip_pytorch_model.bin](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/tree/main),并把下载的权重放在open_clip_path目录下 + 下载[测试图片(CoCotest 数据集)](https://cocodataset.org/#download)并随机抽取其中100张图片放入{image_path}目录下 + + +2. GPU上,在{script_path}/precision目录下,运行脚本python run_coco_gpu.py --model_path ${weight_path} --image_path ${image_path},会在{script_path}/precision目录下生成gpu_coco_predict.json文件存储gpu推理结果 + +3. NPU 上,在\${llm_path}目录下执行以下指令: + ```bash + bash ${script_path}/run_pa.sh --precision ${weight_path} ${image_path} + ``` + 运行完成后会在{script_path}生成predict_result.json文件存储npu的推理结果 + +4. 对结果进行评分:分别使用GPU和NPU推理得到的两组图片描述(gpu_coco_predict.json、predict_result.json)作为输入,执行clip_score_llava.py 脚本输出评分结果 +```bash + python examples/models/llava/precision/clip_score_llava.py \ + --model_weights_path {open_clip_path}/open_clip_pytorch_model.bin \ + --image_info {gpu_coco_predict.json 或 predict_result.json的路径} \ + --dataset_path {iamge_path} +``` + + 得分高者精度更优。 + +## 性能测试 + +性能测试时需要在 `${image_path}` 下仅存放一张图片,使用以下命令运行 `run_pa.sh`,会自动输出batchsize为1-10时,输出token长度为 256时的吞吐。910B4上硬件只能跑单batch,如果需要多跑batch,可以尝试用多张卡跑。 + +```shell +bash ${script_path}/run_pa.sh --performance ${weight_path} ${image_path} +``` + +例如在 MindIE-ATB-Models 根目录,可以运行: + +```shell +bash examples/models/llava/run_pa.sh --performance ${weight_path} ${image_path} +``` + +可以在 `examples/models/llava` 路径下找到测试结果。 + +## FAQ +- 在对话测试或者精度测试时,用户如果需要修改输入input_texts,max_batch_size时,可以修改{script_path}/llava.py里的参数,具体可见llava.py +- 更多环境变量见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/llava/llava.py b/mindie/examples/models/llava/llava.py new file mode 100644 index 00000000..d5579df7 --- /dev/null +++ b/mindie/examples/models/llava/llava.py @@ -0,0 +1,370 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
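# llava.py:LLaVA多模态Paged Attention推理入口,由run_pa.sh以三种模式拉起:
#   --run          对${image_path}下的图片执行图文对话推理
#   --precision    将推理结果按 {图片名: 描述} 写入 predict_result.json,供 clip_score_llava.py 评分
#   --performance  按batch size统计吞吐,结果写入 llava_performance.csv
# 以上说明依据同目录README及下方 PERF_FILE / PRED_FILE 常量整理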
+import argparse +import math +import os +import time +import json +from dataclasses import dataclass +from typing import List + +import torch +import torch_npu +from transformers import AutoProcessor + +from atb_llm.utils.cpu_binding import NpuHbmInfo +from atb_llm.utils.env import ENV +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger, print_log +from examples.server.cache import CacheConfig, CacheManager +from examples.server.generate import decode_token, generate_req +from examples.server.request import request_from_text_and_image, MultiModalRequestParams +from examples.run_pa import PARunner, parse_ids + + +STORE_TRUE = "store_true" +PERF_FILE = "./examples/models/llava/llava_performance.csv" +PERF_COLUMNS = "batch, input_len, output_len,ResponseTime(ms),E2E Throughout Average(Tokens/s)\n" +PRED_FILE = "./examples/models/llava/predict_result.json" + + +@dataclass +class MultiModalRequestOut: + req_list:List + batch:int + file_list:List + input_texts:List + + +@dataclass +class InputAttrs: + input_texts:List | None + image_path:str | None + + +class MultiModalPARunner(PARunner): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.init_processer() + self.predict_result = kwargs.get('prediction_result', False) + self.performance = kwargs.get('performance', False) + self.max_prefill_tokens = kwargs.get('max_prefill_tokens', None) + self.warm_up_num_blocks = 0 + self.warm_up_memory = 0 + self.cache_manager = None + self.input_attrs = InputAttrs(kwargs.get('input_texts', None), + kwargs.get('image_path', None)) + + def init_processer(self): + try: + self.processor = AutoProcessor.from_pretrained(self.model_path) + except AssertionError: + self.processor = self.model.tokenizer + + def prepare_request(self, input_texts, image_path, batch_size, max_output_length, current_iter): + file_list = os.listdir(image_path) + if len(input_texts) == 1: + input_texts = [input_texts[0] for _ in range(batch_size)] + if len(file_list) == 1: + req_list = [request_from_text_and_image( + self.processor, + self.model, + MultiModalRequestParams(input_texts[single_batch], + os.path.join(image_path, + file_list[0]), + max_output_length, + self.block_size, + req_idx=single_batch)) + for single_batch in range(batch_size)] + else: + req_list = [request_from_text_and_image( + self.processor, + self.model, + MultiModalRequestParams(input_texts[single_batch], + os.path.join(image_path, + file_list[current_iter * batch_size + + single_batch]), + max_output_length, + self.block_size, + req_idx=single_batch)) + for single_batch in range(batch_size)] + else: + if len(input_texts) != len(file_list): + raise RuntimeError("input_texts length must equal input_images length") + else: + req_list = [request_from_text_and_image( + self.processor, + self.model, + MultiModalRequestParams(input_texts[current_iter * batch_size + + single_batch], + os.path.join(image_path, + file_list[current_iter * batch_size + + single_batch]), + max_output_length, + self.block_size, + req_idx=single_batch)) + for single_batch in range(batch_size)] + print_log(self.rank, logger.debug, f'req_list[0].input_ids: {req_list[0].input_ids}') + return MultiModalRequestOut(req_list, batch_size, file_list, input_texts) + + + def warm_up(self): + input_texts = self.input_attrs.input_texts + image_path = self.input_attrs.image_path + if self.max_prefill_tokens == -1: + self.max_prefill_tokens = self.max_batch_size * (self.max_input_length + self.max_output_length) + print_log(self.rank, logger.info, 
"---------------begin warm_up---------------") + try: + self.warm_up_num_blocks = math.ceil((self.max_input_length + self.max_output_length) / + self.block_size) * self.max_batch_size + except ZeroDivisionError as e: + raise ZeroDivisionError from e + cache_config = CacheConfig(self.warm_up_num_blocks, self.block_size) + self.cache_manager = CacheManager(cache_config, self.model_config) + file_list = os.listdir(image_path) + req_list = [request_from_text_and_image( + self.processor, + self.model, + MultiModalRequestParams(input_texts[0], + os.path.join(image_path, file_list[0]), + self.max_output_length, + self.block_size, + req_idx=single_batch)) + for single_batch in range(self.max_batch_size) + ] + self.model.postprocessor.max_new_tokens = 2 + generate_req(req_list, self.model, self.max_batch_size, self.max_prefill_tokens, self.cache_manager) + self.warm_up_memory = int( + self.max_memory * NpuHbmInfo.get_hbm_usage(self.local_rank, self.world_size, self.model.soc_info.need_nz)) + print_log(self.rank, logger.info, f'warmup_memory(GB): {self.warm_up_memory / (1024 ** 3): .2f}') + print_log(self.rank, logger.info, "---------------end warm_up---------------") + + def infer(self, inputs, batch_size, max_output_length, ignore_eos, is_chat_model=False, **kwargs): + print_log(self.rank, logger.info, "---------------begin inference---------------") + input_texts = inputs.input_texts + image_path = inputs.image_path + if not self.cache_manager: + if self.max_prefill_tokens == -1: + self.max_prefill_tokens = self.max_batch_size * (self.max_input_length + self.max_output_length) + cache_block_size = self.block_size * self.model.num_kv_heads * self.model.head_size + dtype_size = CacheManager.get_dtype_size(self.dtype) + total_cache_size = self.model.num_layers * cache_block_size * 2 * dtype_size + # 1 << 30正好是1G + max_memory = ENV.memory_fraction * self.max_memory \ + if not ENV.max_memory_gb else int(ENV.max_memory_gb) * (1 << 30) + free_memory = max_memory - ENV.reserved_memory_gb * (1 << 30) - ( + self.warm_up_memory if self.warm_up_memory != 0 else self.init_memory) + print_log(self.rank, logger.info, + f"infer max_memory(GB): {max_memory / (1024 ** 3): .2f}, " + f"warm_up_memory(GB): {self.warm_up_memory / (1024 ** 3): .2f}, " + f"free_memory(GB): {free_memory / (1024 ** 3): .2f}") + + num_blocks = int(free_memory // total_cache_size) + print_log(self.rank, logger.info, f"num_blocks: {num_blocks}, free_memory: {free_memory}") + cache_config = CacheConfig(num_blocks, self.block_size) + self.cache_manager = CacheManager(cache_config, self.model_config) + + + self.model.postprocessor.max_new_tokens = max_output_length + all_input_texts = [] + all_generate_text_list = [] + all_token_num_list = [] + e2e_time_all = 0 + file_list = None + batch = None + req_list = None + if not ENV.profiling_enable: + print_log(self.rank, logger.debug, "no profiling") + torch.npu.synchronize() + e2e_start = time.time() + if ignore_eos: + self.model.postprocessor.eos_token_id = [] + max_iters = math.ceil(len(os.listdir(image_path)) / self.max_batch_size) + for current_iter in range(max_iters): + multimodalrequestout = self.prepare_request(input_texts, + image_path, + batch_size, + max_output_length, + current_iter) + req_list = multimodalrequestout.req_list + batch = multimodalrequestout.batch + file_list = multimodalrequestout.file_list + input_texts = multimodalrequestout.input_texts + print_log(self.rank, logger.debug, f'req_list[0].input_ids: {req_list[0].input_ids}') + print_log(self.rank, logger.info, f'current 
iter: {current_iter}') + generate_req(req_list, self.model, self.max_batch_size, self.max_prefill_tokens, self.cache_manager) + generate_text_list, token_num_list = decode_token(req_list, self.tokenizer) + torch.npu.synchronize() + e2e_end = time.time() + e2e_time = e2e_end - e2e_start + e2e_time_all += e2e_time + all_input_texts.extend(input_texts) + all_generate_text_list.extend(generate_text_list) + all_token_num_list.extend(token_num_list) + + else: + print_log(self.rank, logger.debug, "enter profiling") + profiling_path = ENV.profiling_filepath + if not os.path.exists(profiling_path): + os.makedirs(profiling_path, exist_ok=True) + torch.npu.synchronize() + e2e_start = time.time() + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=torch_npu.profiler.ProfilerLevel.Level0, + l2_cache=False, + data_simplification=False + ) + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU + ], + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(profiling_path), + record_shapes=True, + profile_memory=True, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config): + multimodalrequestout = self.prepare_request(input_texts, + image_path, + batch_size, + max_output_length, + current_iter=0) + req_list = multimodalrequestout.req_list + batch = multimodalrequestout.batch + file_list = multimodalrequestout.file_list + input_texts = multimodalrequestout.input_texts + generate_req(req_list, self.model, self.max_batch_size, self.max_prefill_tokens, self.cache_manager) + torch.npu.synchronize() + e2e_end = time.time() + e2e_time = e2e_end - e2e_start + e2e_time_all += e2e_time + all_input_texts.extend(input_texts) + if self.predict_result: + if self.local_rank == 0: + image_answer_pairs = {} + for text_index in range(len(all_input_texts)): + image_answer_pairs[file_list[text_index]] = all_generate_text_list[text_index] + image_answer_pairs = dict(sorted(image_answer_pairs.items())) + if not os.path.exists(PRED_FILE): + with safe_open(PRED_FILE, "w") as f: + json.dump(image_answer_pairs, f) + else: + with safe_open(PRED_FILE, "r") as f: + old_data = json.load(f) + old_data.update(image_answer_pairs) + old_data = dict(sorted(old_data.items())) + with safe_open(PRED_FILE, "w") as f: + json.dump(old_data, f) + if self.performance: + e2e_throughput = batch * max_output_length / (e2e_time_all + 1e-12) + input_len = self.tokenizer([all_input_texts[0]], return_tensors="pt")["input_ids"].flatten().shape[0] + output_len = all_token_num_list[0][1] + e2e_time = e2e_time_all + e2e_throughput = e2e_throughput + if self.local_rank == 0: + if not os.path.exists(PERF_FILE): + file_utils = safe_open(PERF_FILE, 'a') + file_utils.write(PERF_COLUMNS) + file_utils.write(f"{batch}, {input_len}, {output_len}, {e2e_time}, {e2e_throughput}\n") + file_utils.close() + else: + file_utils = safe_open(PERF_FILE, 'a') + file_utils.write(f"{batch}, {input_len}, {output_len}, {e2e_time}, {e2e_throughput}\n") + file_utils.close() + if ENV.token_ids_save_enable: + if self.local_rank == 0: + for idx, req in enumerate(req_list): + input_ids_save_filename = f"input_ids_{idx}.pth" + output_ids_save_filename = f"output_ids_{idx}.txt" + torch.save(req.input_ids.cpu(), + os.path.join(ENV.token_ids_save_folder, input_ids_save_filename)) + output_path = os.path.join(ENV.token_ids_save_folder, output_ids_save_filename) + with 
safe_open(output_path, 'w', encoding='utf-8') as f: + f.write(' '.join(map(str, req_list[i].out_token_list))) + print_log(self.rank, logger.info, "---------------end inference---------------") + return all_generate_text_list, all_token_num_list, e2e_time_all + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', + help="model and tokenizer path", + default='/data/acltransformer_testdata/weights/llava', + ) + parser.add_argument('--image_path', + help="image path", + default="/data/acltransformer_testdata/images/llava", + ) + parser.add_argument( + '--input_texts', + type=str, + nargs='+', + default=["USER: \nWrite an essay about this image, at least 256 words. ASSISTANT:"]) + parser.add_argument( + '--input_ids', + type=parse_ids, + nargs='+', + default=None) + parser.add_argument( + '--prediction_result', + action=STORE_TRUE) + parser.add_argument( + '--performance', + action=STORE_TRUE) + parser.add_argument('--max_position_embeddings', type=int, default=None) + parser.add_argument('--max_input_length', type=int, default=1024) + parser.add_argument('--max_output_length', type=int, default=256) + parser.add_argument('--max_prefill_tokens', type=int, default=-1) + parser.add_argument("--max_batch_size", type=int, default=1) + parser.add_argument("--block_size", type=int, default=128) + + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--ignore_eos', action=STORE_TRUE) + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_arguments() + rank = int(os.getenv("RANK", "0")) + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + input_dict = { + 'rank': rank, + 'world_size': world_size, + 'local_rank': local_rank, + **vars(args) + } + + pa_runner = MultiModalPARunner(**input_dict) + print_log(rank, logger.info, f'pa_runner: {pa_runner}') + + + infer_params = { + "inputs": InputAttrs(args.input_texts, args.image_path), + "batch_size": args.max_batch_size, + "max_output_length": args.max_output_length, + "ignore_eos": args.ignore_eos, + } + pa_runner.warm_up() + generate_texts, token_nums, latency = pa_runner.infer(**infer_params) + + for i, generate_text in enumerate(generate_texts): + length = len(args.input_ids) if args.input_ids else len(args.input_texts) + if i < length: + print_log(rank, logger.info, f'Question[{i}]: {args.input_texts[i]}') + print_log(rank, logger.info, f'Answer[{i}]: {generate_text}') + print_log(rank, logger.info, f'Generate[{i}] token num: {token_nums[i]}') + print_log(rank, logger.info, f"Latency: {latency}") \ No newline at end of file diff --git a/mindie/examples/models/llava/precision/GPU_NPU_result_example.json b/mindie/examples/models/llava/precision/GPU_NPU_result_example.json new file mode 100644 index 00000000..13016220 --- /dev/null +++ b/mindie/examples/models/llava/precision/GPU_NPU_result_example.json @@ -0,0 +1,4 @@ +{ + "/xx/01.jpg": "This image ...", + "/xx/02.jpg": "This image ..." 
+} \ No newline at end of file diff --git a/mindie/examples/models/llava/precision/clip_score_llava.py b/mindie/examples/models/llava/precision/clip_score_llava.py new file mode 100644 index 00000000..0620452c --- /dev/null +++ b/mindie/examples/models/llava/precision/clip_score_llava.py @@ -0,0 +1,121 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os +import time +import open_clip +import torch +import torch_npu +import torch.nn.functional as F + +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger +from PIL import Image + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device", + type=str, + default="0", + help="device for torch.", + ) + parser.add_argument( + "--model_name", + type=str, + default="ViT-H-14", + help="open clip model name", + ) + parser.add_argument( + "--model_weights_path", + type=str, + default="./open_clip_pytorch_model.bin", + help="open clip model weights", + ) + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="./cocoTest/", + help="dataset path for precision test.", + ) + return parser.parse_args() + + +def set_torch_env(device_ids): + torch_npu.npu.set_device(int(device_ids)) + torch.npu.set_compile_mode(jit_compile=False) + + +def clip_score(model_clip, tokenizer, preprocess, model_answer, image_file): + imgs = [] + texts = [] + + img = preprocess(Image.open(image_file)).unsqueeze(0).npu() + imgs.append(img) + text = tokenizer([model_answer]).npu() + texts.append(text) + + img = torch.cat(imgs) # [bs, 3, 224, 224] + text = torch.cat(texts) # [bs, 77] + + with torch.no_grad(): + text_ft = model_clip.encode_text(text).float() + img_ft = model_clip.encode_image(img).float() + score = F.cosine_similarity(img_ft, text_ft).squeeze() + + return score.cpu() + + +def main(): + args = parse_arguments() + set_torch_env(args.device) + + t_b = time.time() + logger.info("Load clip model...") + model_clip, _, preprocess = open_clip.create_model_and_transforms( + args.model_name, pretrained=args.model_weights_path, device=f"npu:{args.device}") + model_clip.eval() + logger.info(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") + + tokenizer = open_clip.get_tokenizer("ViT-H-14") + with safe_open(args.image_info, "r", override_flags=os.O_RDONLY) as f: + image_info = json.load(f) + + t_b = time.time() + + logger.info("Calc clip score...") + all_scores = [] + for image_file, model_answer in image_info.items(): + # 单个图片 单个answer + image_file_path = os.path.join(args.dataset_path, image_file) + logger.info(f"cur image file: {image_file_path}") + image_score = clip_score(model_clip, tokenizer, preprocess, model_answer, image_file_path) + logger.info(f"image_score: {image_score}") + all_scores.append(image_score) + all_scores_mean = torch.mean(torch.tensor(all_scores)) + logger.info(f"平均分:{all_scores_mean=}") + logger.info(f">done. elapsed time: {(time.time() - t_b):.3f} s") + + +if __name__ == '__main__': + main() diff --git a/mindie/examples/models/llava/precision/run_coco_gpu.py b/mindie/examples/models/llava/precision/run_coco_gpu.py new file mode 100644 index 00000000..7ce8e157 --- /dev/null +++ b/mindie/examples/models/llava/precision/run_coco_gpu.py @@ -0,0 +1,74 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. +import argparse +import json +import os +import stat +import torch +from tqdm import tqdm +from PIL import Image +from transformers import LlavaProcessor, LlavaForConditionalGeneration + +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger + +torch.manual_seed(1234) +OUTPUT_JSON_PATH = "./gpu_coco_predict.json" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Demo") + parser.add_argument("--model_path", + required=True, + help="Model and tokenizer path.") + parser.add_argument("--image_path", + required=True, + help="Image path for inference.") + return parser.parse_args() + + +def main(): + device = torch.device('cuda', 0) + args = parse_args() + model_path = args.model_path + image_path = args.image_path + logger.info(f"===== model_path: {model_path}") + logger.info(f"===== image_path: {image_path}") + if os.path.exists(model_path) and os.path.exists(image_path): + images_list = os.listdir(image_path) + processor = LlavaProcessor.from_pretrained(model_path, trust_remote_code=True) + model = LlavaForConditionalGeneration.from_pretrained(model_path, + torch_dtype=torch.float16) + model = model.to(device) + image_answer = {} + for _, img_name in enumerate(tqdm(images_list)): + img_path = os.path.join(image_path, img_name) + image = Image.open(img_path) + prompt = "USER: \nDescribe this image in detail. 
ASSISTANT:" + inputs = processor(text=prompt, images=image, return_tensors="pt").to(device) + image.close() + inputs = inputs.to(model.device) + with torch.no_grad(): + outputs = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=30) + response = processor.decode(outputs.cpu()[0], skip_special_tokens=True) + image_answer[img_name] = response.split("ASSISTANT:")[-1] + sorted_dict = dict(sorted(image_answer.items())) + torch.cuda.empty_cache() + flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL + models = stat.S_IWUSR | stat.S_IRUSR + if not os.path.exists(OUTPUT_JSON_PATH): + with safe_open(OUTPUT_JSON_PATH, "w", override_flags=os.O_WRONLY | os.O_CREAT | os.O_EXCL) as fw: + json.dump(sorted_dict, fw) + else: + with safe_open(OUTPUT_JSON_PATH, "r") as f: + old_data = json.load(f) + old_data.update(sorted_dict) + sorted_dict = dict(sorted(old_data.items())) + with os.fdopen(os.open(OUTPUT_JSON_PATH, flags, models), 'w') as fw: + json.dump(sorted_dict, fw) + logger.info("run run_coco_gpu.py finish! output file: ./gpu_coco_predict.json") + else: + logger.info("model_path or image_path not exist") + + +if __name__ == "__main__": + main() diff --git a/mindie/examples/models/llava/run_pa.sh b/mindie/examples/models/llava/run_pa.sh new file mode 100644 index 00000000..ba87612d --- /dev/null +++ b/mindie/examples/models/llava/run_pa.sh @@ -0,0 +1,81 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0 +export MASTER_PORT=20036 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=1 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +IMAGE_PATH=/data/acltransformer_testdata/images/llava +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ $# -eq 0 ]; then + echo "Error: require parameter. Please refer to README." + exit 1 +fi + + +if [[ ! -z "$1" ]];then + RUN_OPTION=$1 + echo "[RUN_OPTION]: $RUN_OPTION" + shift +fi + +if [[ ! -z "$1" ]];then + MODEL_PATH=$1 + echo "[MODEL_PATH]: $MODEL_PATH" + shift +fi + +if [[ ! 
-z "$1" ]];then + IMAGE_PATH=$1 + echo "[IMAGE_PATH]: $IMAGE_PATH" + shift +fi + +case "${RUN_OPTION}" in + "--run") + extra_param="" + extra_param="${extra_param} --model_path $MODEL_PATH + --image_path $IMAGE_PATH" + if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.models.llava.llava $extra_param + else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.models.llava.llava $extra_param + fi + ;; + "--performance") + for bsz in {1..10}; do + extra_param="" + extra_param="${extra_param} --model_path $MODEL_PATH + --image_path $IMAGE_PATH + --max_batch_size $bsz + --max_output_length 256" + + if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.models.llava.llava $extra_param --performance + else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.models.llava.llava $extra_param --performance + fi + done + + ;; + "--precision") + extra_param="" + extra_param="${extra_param} --model_path $MODEL_PATH + --image_path $IMAGE_PATH" + if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.models.llava.llava $extra_param --prediction_result + else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.models.llava.llava $extra_param --prediction_result + fi + ;; +esac \ No newline at end of file diff --git a/mindie/examples/models/minigpt4/README.md b/mindie/examples/models/minigpt4/README.md new file mode 100644 index 00000000..b47ff5b4 --- /dev/null +++ b/mindie/examples/models/minigpt4/README.md @@ -0,0 +1,421 @@ +# MiniGPT-4 + +## 目录 + +- [概述](#概述) +- [特性矩阵](#特性矩阵) +- [环境准备](#环境准备) + - [路径变量解释](#路径变量解释) + - [python 环境准备](#python 环境准备) + - [其他依赖](#其他依赖) +- [模型文件(源码与权重)准备](#模型文件(源码与权重)准备) + - [模型文件(源码与权重)下载,以及相应的配置修改](#模型文件(源码与权重)下载,以及相应的配置修改) + - [图像处理部分的 om 转换与其他的源码修改](#图像处理部分的 om 转换与其他的源码修改) +- [基本推理](#基本推理) +- [测试](#测试) + - [图像处理时间测试](#图像处理时间测试) + - [精度测试](#精度测试) + - [性能测试](#性能测试) +- [附录](#附录) + - [图像处理部分的 om 转换](#图像处理部分的 om 转换) + - [对源码的其他必要修改](#对源码的其他必要修改) + +## 概述 + +MiniGPT-4 是兼具语言与图像理解能力的多模态模型,使用了先进的大语言模型强化了机器的视觉理解能力。 +具体来说,它结合了大语言模型 Vicuna 和视觉编码器 BLIP-2,具备强大的新型视觉语言能力。 + +## 特性矩阵 + +- 此矩阵罗列了 minigpt4 模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|----------------------------|------|------------------|-----------------|-----------------|---------|-----------|---------|-----------|--------------------------|-----|--------|---|--------| +| minigpt4-7B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | × | √ | × | × | × | × | × | × | × | × | × | × | + +- 此模型仓已适配的模型版本 + - [MiniGPT-4 GitHub仓](https://github.com/Vision-CAIR/MiniGPT-4) + +## 环境准备 + +### 路径变量解释 + +| 变量名 | 含义 | +|-------------|-----------------------| +| work_space | 主工作目录 | +| model_path | 开源权重等必要材料放置在此目录 | + +### python 环境准备 + +参见 `../../../requirements/models/requirements_minigpt4.txt` + +```bash +pip install -r requirements_minigpt4.txt +``` + +此外,还需要安装 `aclruntime` 和 `ais_bench` 这两个三方件(为了支持 om 格式的模型)。请参考 +[这里](https://gitee.com/ascend/tools/tree/master/ais-bench_workload/tool/ais_bench) ,下载并安装。 + +### 其他依赖 + +其他依赖具备一般性,请参考[此README文件](../../../README.md) + +## 模型文件(源码与权重)准备 + +### 模型文件(源码与权重)下载,以及相应的配置修改 + +1. 下载 [MiniGPT-4 的源码](https://github.com/Vision-CAIR/MiniGPT-4)。 + + 下载所得的目录 `MiniGPT-4-main` 即为主工作目录 `${work_space}`。 + +2. 
下载 + [MiniGPT-4 线性层的权重 pretrained_minigpt4_7b.pth](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) + 。 + + 下载完成后,保存到路径`${model_path}/weights_linear/`下。 + + 须修改配置文件`${work_space}/eval_configs/minigpt4_eval.yaml`中关于此路径的配置。 + + line 8 + ```yaml + ckpt: "${model_path}/weights_linear/pretrained_minigpt4_7b.pth" + ``` + +3. 下载 [大语言模型 Vicuna-7b 的权重](https://hf-mirror.com//Vision-CAIR/vicuna-7b/tree/main)。 + + 下载完成后,保存到路径`${model_path}/weights_language/`下。 + + 须修改配置文件`${work_space}/minigpt4/configs/models/minigpt4_vicuna0.yaml`中关于此路径的配置。 + + line 18 + ```yaml + llama_model: "${model_path}/weights_language/" + ``` + +4. 下载处理图像所需的 + [VIT 的权重 eva_vit_g.pth](https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth) + 、 + [Qformer 的权重 blip2_pretrained_flant5xxl.pth](https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth) + 以及 [Bert(bert-base-uncased) 的 Tokenizer](https://hf-mirror.com//bert-base-uncased)。 + + 下载完成后,保存到路径`${model_path}/weights_image/`下。完成后,此路径下的全部文件应是如此: + + ```bash + eva_vit_g.pth + blip2_pretrained_flant5xxl.pth + bert-base-uncased + config.json + tokenizer_config.json + vocab.txt + ``` + + 须进行的配置修改如下: + + 1. `./om_trans/eva_vit_model.py` + + line 55 + ```python + encoder_config = BertConfig.from_pretrained("${model_path}/weights_image/bert-base-uncased") + ``` + + 2. `${work_space}/minigpt4/models/eva_vit.py`, + + line 433 + ```python + state_dict = torch.load("${model_path}/weights_image/eva_vit_g.pth", map_location="cpu") + ``` + + 3. `${work_space}/minigpt4/models/minigpt4.py` + + line 28 + ```python + q_former_model = "${model_path}/weights_image/blip2_pretrained_flant5xxl.pth" + ``` + + line 150 + ```python + q_former_model = cfg.get("q_former_model", "${model_path}/weights_image/blip2_pretrained_flant5xxl.pth") + ``` + + line 89 + ```python + encoder_config = BertConfig.from_pretrained("${model_path}/weights_image/bert-base-uncased") + ``` + +### 图像处理部分的 om 转换与其他的源码修改 + +见[附录](#附录)。 + +## 基本推理 + +1. 修改 `${model_path}/weights_language/config.json`(让 llama 知道我们的输入是 embeds 而非 ids) + + line 24 + + ```json + "skip_word_embedding": true + ``` + +3. 进入`./predict/`,将 `${work_space}`, `${model_path}` 填入 `run_predict.sh` + + line 10, 11 + + ```bash + minigpt_dir="${work_space}" + LLM_model_path="${model_path}/weights_language" + ``` + + 运行此脚本,参考 + + ```bash + bash run_predict.sh + ``` + +## 测试 + +### 图像处理时间测试结果 + +将图像处理部分转换为 om 模型后,图像处理时间约为0.018s;GPU图像处理时间约为1.185s + +### 精度测试 + +#### 方案 + +我们采用的精度测试方案是这样的:使用同样的一组图片,分别在 GPU 和 NPU 上执行推理,得到两组图片描述。 +再使用 open_clip 模型作为裁判,对两组结果分别进行评分,以判断优劣。 + +#### 实施 + +1. 下载 + [open_clip 的权重 open_clip_pytorch_model.bin](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/tree/main), + + 下载 + [测试图片(CoCotest 数据集)](https://cocodataset.org/#download)并随机抽取其中100张图片放入一个文件夹, + + 建议都放置到`./precision`下。 + +2. 收集推理结果。 + 1. GPU 上:收集脚本参考 `./precision/run_predict_walk_dir_GPU.py`, + 将其放到`${work_space}`目录下执行,注意脚本传参(主要是`--image-path`和`--output-path`)。 + 2. NPU 上:类似基本推理,只需增加一个参数(图片文件夹的路径)即可 + ```bash + bash run_predict.sh 图片文件夹的路径 + ``` + 收集的结果应是类似 `./precision/GPU_NPU_result_example.json` 的形式。 + +3. 对结果进行评分:执行脚本 `./precision/clip_score_minigpt4.py`,参考命令: + ```bash + python clip_score_minigpt4.py --image_info GPU_NPU_result_example.json(这个替换成你的实际路径) + ``` + 得分高者精度更优。 + +### 性能测试 + +#### 方案 + +我们基于 `../../../examples/models/llama/run_fa.sh`,略微修改运行逻辑,得到我们的性能测试脚本 `./performance/run_performance.sh`。 + +#### 实施 + +1. 
修改 `${model_path}/weights_language/config.json`(让 llama 仍走完整的计算逻辑) + + line 24 + + ```json + "skip_word_embedding": false + ``` + +2. 将 `${model_path}` 填入 `./performance/run_performance.sh` + + line 8 + + ```bash + LLM_model_path="${model_path}/weights_language" + ``` + + 并按需设置测试参数(参考 line 10-20)。此脚本支持自动摸高。 + + 运行此脚本,参考 + + ```bash + bash run_performance.sh + ``` + +## 附录 + +### 图像处理部分的 om 转换 + +#### 概述 + +MiniGPT-4 的图像处理部分的逻辑是固定的,且在每次推理中只执行一次,比较适合转换为 om 离线模型,以提高运行性能。 + +整个过程分为三步。 + +第一步,使用 `torch.onnx.export` 把需要转换的计算逻辑制作成一个 onnx 格式的模型。 + +第二步,使用昇腾 ATC 工具将上述 onnx 模型转换为 om 模型。 + +第三步,修改 MiniGPT-4 源码,接入转换所得的 om 模型。 + +#### onnx 转换 + +1. 首先,识别出图像处理部分的逻辑。即原始代码中`minigpt4.py`的第 125 行的 + `image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)` + 及其配套代码。将这一部分单独写成一个文件(即`./om_trans/eva_vit_model.py`)。 + 将它拷贝到`${work_space}/minigpt4/models`目录下。 + +2. 基于这部分代码,使用 `torch.onnx.export` 将相应的权重转换为 onnx 格式,详见 `./om_trans/onnx_model_export.py`。 + 运行该文件,即可得到 onnx 模型。 + 参考运行命令: + ```bash + python onnx_model_export.py --onnx-model-dir onnx模型的输出路径 --image-path ${work_space}/examples_v2/office.jpg + ``` + 提示:`onnx_model_export.py`脚本需要 import `${work_space}/minigpt4` 下的模块, + 为确保能 import 成功,可以 cd 到 `${work_space}` 下再运行此脚本,也可以把 `${work_space}` 加入 `PYTHONPATH`。 + +#### om 转换 + +om 转换需使用昇腾 ATC 工具,参考 +[这里](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/inferapplicationdev/atctool/atctool_000005.html) + +1. 环境准备:安装 CANN 并 source 相应的环境变量; + +2. 模型转换:参考快速入门中 onnx 网络模型转换成离线模型的章节,或参考执行下面的转换命令 + (要进入到已转换好的 onnx 模型目录中去执行上述命令,否则会找不到权重文件): + ```bash + atc --model=eva_vit_g.onnx --framework=5 --output=${output_path}/eva_vit_g --soc_version=Ascend910B4(按实际) --input_shape="input:1,3,224,224" + ``` + 转换完成后,将所得的 om 模型保存到路径`${model_path}/weights_image/eva_vit_g.om`。 + +#### 接入转换所得的 om 模型 + +1. 将 `./om_trans/image_encoder.py` 拷贝到 `${work_space}/minigpt4/models` 目录下。 + +2. 修改 `${work_space}/minigpt4/models/minigpt_base.py` 文件,具体如下: + + 1. 导入图像 om 模型推理类 + + line 13 + ```python + from minigpt4.models.image_encoder import ImageEncoderOM + ``` + + 2. 新增如下代码,初始化加载 om 模型 + + line 40 + ```python + self.image_encoder = ImageEncoderOM("${model_path}/weights_image/eva_vit_g.om", device_8bit) + ``` + + 3. 删除原来的图像处理代码 + + line 51 + ```python + self.visual_encoder, self.ln_vision = self.init_vision_encoder( + vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision, freeze_vit + ) + ``` + + 4. 修正(源码多写了一次`.model`) + + line 312 + ```python + if hasattr(self.llama_model.base_model, 'model'): ## lora wrapped model + embeds = self.llama_model.base_model.model.embed_tokens(token_ids) + ``` + + +3. 修改 `${work_space}/minigpt4/models/minigpt4.py` 文件,具体如下: + + 1. 在原文件的第 62 行和 70 行,将`self.visual_encoder.num_features`修改为 VisionTransformer 类的入参 embed_dim 的固定值 1408. + + line 62 + ```python + self.Qformer, self.query_tokens = self.init_Qformer(num_query_token, 1408, freeze_qformer) + ``` + + line 70 + ```python + img_f_dim = 1408 * 4 + ``` + + 2. 图像 embedding 的计算不再走原始逻辑,改用转换后的 om 模型进行计算 + + line 125 + ```python + image_embeds = torch.tensor(self.image_encoder.image_encoder_om.infer(image.cpu().numpy())[0]).to(device) + ``` + +### 对源码的其他必要修改 + +1. 修改 `${work_space}/minigpt4/models/base_model.py` 文件,具体如下: + + 修改的目的是改用来自昇腾模型库的 llama_model。 + + 1. 删除不必要的三方件引入(训练才需要) + + 删除 line 17 + ```python + from peft import ( + LoraConfig, + get_peft_model, + prepare_model_for_int8_training, + ) + ``` + + 2. 
改用来自昇腾模型库的 LlamaForCausalLM 类 + + 将 line 26 + ```python + from minigpt4.models.modeling_llama import LlamaForCausalLM + ``` + 替换为 + ```python + from atb_llm.models.llama.causal_llama import LlamaForCausalLM + from atb_llm.runner import ModelRunner + ``` + + 3. 重写 `init_llm(...)` 方法 + + 重写 line 171 的方法为 + ```python + def init_llm(cls, llama_model_path, low_resource=False, low_res_device=0, lora_r=0, + **lora_kargs): + logging.info('Loading LLAMA') + llama_tokenizer = LlamaTokenizer.from_pretrained(llama_model_path, use_fast=False) + llama_tokenizer.pad_token = "$$" + + rank = int(os.getenv("RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + llama_model_runner = ModelRunner(llama_model_path, rank=rank, world_size=world_size, + is_flash_causal_lm=False) + llama_model_runner.load_weights() + llama_model = llama_model_runner.model + for name, param in llama_model.named_parameters(): + param.requires_grad = False + + logging.info('Loading LLAMA Done') + return llama_model, llama_tokenizer + ``` + +3. 修改 `${work_space}/minigpt4/datasets/data_utils.py` 文件,具体如下: + + 删除不必要的三方件引入及其使用。 + + 删除 line 18, 19, 29 + ```python + import decord + + from decord import VideoReader + + decord.bridge.set_bridge("torch") + ``` + +4. 修改 `${work_space}/eval_configs/minigpt4_eval.yaml` 文件,具体如下: + + 由于无法使用 CUDA 的 8 位优化器,需将`low_resource`参数值设置为`False`。 + + 修改 line 6 + ```yaml + low_resource: False + ``` diff --git a/mindie/examples/models/minigpt4/om_trans/eva_vit_model.py b/mindie/examples/models/minigpt4/om_trans/eva_vit_model.py new file mode 100644 index 00000000..78684d59 --- /dev/null +++ b/mindie/examples/models/minigpt4/om_trans/eva_vit_model.py @@ -0,0 +1,106 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +import logging +import torch +import torch.nn as nn +from minigpt4.models.eva_vit import create_eva_vit_g +from minigpt4.models.Qformer import BertConfig, BertLMHeadModel +from minigpt4.models.base_model import BaseModel + + +class MiniGPT4ImageEmbedding(BaseModel): + def __init__( + self, + vit_model="eva_clip_g", + img_size=224, + drop_path_rate=0, + use_grad_checkpoint=False, + freeze_vit=True, + ): + super().__init__() + + # Load VIT model + self.visual_encoder, self.ln_vision = self.init_vision_encoder( + vit_model, img_size, drop_path_rate, use_grad_checkpoint, freeze_vit + ) + + @classmethod + def init_vision_encoder( + cls, model_name, img_size, drop_path_rate, use_grad_checkpoint, freeze + ): + logging.info('Loading VIT') + + if not model_name == "eva_clip_g": + logging.error('vit model must be eva_clip_g for current version of MiniGPT-4') + + visual_encoder = create_eva_vit_g(img_size, drop_path_rate, use_grad_checkpoint, 'fp32') + + ln_vision = LayerNorm(visual_encoder.num_features) + if freeze: + for _, param in visual_encoder.named_parameters(): + param.requires_grad = False + visual_encoder = visual_encoder.eval() + visual_encoder.train = disabled_train + for _, param in ln_vision.named_parameters(): + param.requires_grad = False + ln_vision = ln_vision.eval() + ln_vision.train = disabled_train + logging.info("freeze vision encoder") + + logging.info('Loading VIT Done') + return visual_encoder, ln_vision + + @classmethod + def init_q_former(cls, num_query_token, vision_width, freeze): + encoder_config = BertConfig.from_pretrained("${model_path}/weights_image/bert-base-uncased") + encoder_config.encoder_width = vision_width + # insert cross-attention layer every other block + encoder_config.add_cross_attention = True + 
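        # cross_attention_freq=2 即每隔一层插入一次cross-attention(对应上方英文注释);
        # query_length 与可学习query token数量 num_query_token 保持一致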
encoder_config.cross_attention_freq = 2 + encoder_config.query_length = num_query_token + q_former = BertLMHeadModel(config=encoder_config) + query_tokens = nn.Parameter( + torch.zeros(1, num_query_token, encoder_config.hidden_size) + ) + query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range) + + q_former.cls = None + q_former.bert.embeddings.word_embeddings = None + q_former.bert.embeddings.position_embeddings = None + for layer in q_former.bert.encoder.layer: + layer.output = None + layer.intermediate = None + + if freeze: + for _, param in q_former.named_parameters(): + param.requires_grad = False + q_former = q_former.eval() + q_former.train = disabled_train + query_tokens.requires_grad = False + + return q_former, query_tokens + + def forward(self, image): + device = image.device + if len(image.shape) > 4: + image = image.reshape(-1, *image.shape[-3:]) + + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) + + return image_embeds + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode does not change anymore.""" + return self + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + diff --git a/mindie/examples/models/minigpt4/om_trans/image_encoder.py b/mindie/examples/models/minigpt4/om_trans/image_encoder.py new file mode 100644 index 00000000..3c0ce441 --- /dev/null +++ b/mindie/examples/models/minigpt4/om_trans/image_encoder.py @@ -0,0 +1,9 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +from ais_bench.infer.interface import InferSession + + +class ImageEncoderOM: + def __init__(self, model_path, device): + self.image_encoder_om = InferSession(device, model_path) # read local om_file + diff --git a/mindie/examples/models/minigpt4/om_trans/onnx_model_export.py b/mindie/examples/models/minigpt4/om_trans/onnx_model_export.py new file mode 100644 index 00000000..ecd10e2d --- /dev/null +++ b/mindie/examples/models/minigpt4/om_trans/onnx_model_export.py @@ -0,0 +1,67 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
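# onnx_model_export.py:将 MiniGPT4ImageEmbedding(EVA-ViT 图像编码 + ln_vision)的前向计算导出为ONNX模型,
# 供后续使用ATC工具转换为om离线模型(见README附录“图像处理部分的 om 转换”)。
# 脚本依赖 ${work_space}/minigpt4 下的模块,需在 ${work_space} 下运行或将其加入 PYTHONPATH。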
+ +import os +import argparse + +from PIL import Image +import torch + +from atb_llm.utils.log import logger + +from minigpt4.common.registry import registry +from minigpt4.models.eva_vit_model import MiniGPT4ImageEmbedding + + +def parse_args(): + parser = argparse.ArgumentParser(description="load Model weights and run.") + parser.add_argument( + "--onnx-model-dir", + type=str, + default="./transfer_model_onnx", + help="Location of Model weights, which contains model folders", + ) + parser.add_argument( + "--image-path", + type=str, + default="../test_image/01.jpg", + help="Location image path", + ) + return parser.parse_args() + + +if __name__ == '__main__': + + args = parse_args() + + image_path = args.image_path + onnx_model_dir = args.onnx_model_dir + + if not os.path.exists(onnx_model_dir): + os.makedirs(onnx_model_dir) + + onnx_model_path = os.path.join(onnx_model_dir, "eva_vit_g.onnx") + logger.info('onnx_model_path:', onnx_model_path) + + model = MiniGPT4ImageEmbedding() + + vis_processor = registry.get_processor_class("blip2_image_eval").from_config() + + raw_image = Image.open(image_path).convert('RGB') + image = vis_processor(raw_image).unsqueeze(0) + logger.info('input size:', image.size()) # input size: torch.Size([1, 224, 224, 3]) + + # onnx model export + torch.onnx.export( + model, # model being run + image, # model input (or a tuple for multiple inputs) + onnx_model_path, # where to save the model (can be a file or file-like object) + export_params=True, # store the trained parameter weights inside the model file + opset_version=13, # the ONNX version to export the model to + do_constant_folding=False, # whether to execute constant folding for optimization + input_names=['input'], # the model's input names + output_names=['output'], # the model's output names + dynamic_axes={'input': {0: 'batch'}}, + ) + + logger.info("====== export onnx model successfully! ======") + diff --git a/mindie/examples/models/minigpt4/performance/run_performance.sh b/mindie/examples/models/minigpt4/performance/run_performance.sh new file mode 100644 index 00000000..90c42cbe --- /dev/null +++ b/mindie/examples/models/minigpt4/performance/run_performance.sh @@ -0,0 +1,52 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +# 此脚本的手动配置参数 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +LLM_model_path="${model_path}/weights_language" + +case=256 +bsz_base=600 + +#case=512 +#bsz_base=300 + +#case=1024 +#bsz_base=150 + +#case=2048 +#bsz_base=75 + +# 注册 python 环境变量 +export PYTHONPATH="${ATB_SPEED_HOME_PATH}:${PYTHONPATH}" + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export HCCL_BUFFSIZE=120 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +for ((bsz = bsz_base; bsz < bsz_base + 20; bsz++)); do + extra_param="" + extra_param="${extra_param} --max_position_embeddings $((case * 2)) + --max_input_length $case + --max_output_length $case + --batch_size $bsz" + + echo ${extra_param} + + if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_fa --model_path $LLM_model_path $extra_param + else + world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) + 1)) + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_fa --model_path $LLM_model_path $extra_param + fi + if [ $? 
-ne 0 ]; then + exit + fi +done diff --git a/mindie/examples/models/minigpt4/precision/GPU_NPU_result_example.json b/mindie/examples/models/minigpt4/precision/GPU_NPU_result_example.json new file mode 100644 index 00000000..41bf64b4 --- /dev/null +++ b/mindie/examples/models/minigpt4/precision/GPU_NPU_result_example.json @@ -0,0 +1,4 @@ +{ + "/xx/01.jpg": "This image ...", + "/xx/02.jpg": "This image ..." +} diff --git a/mindie/examples/models/minigpt4/precision/clip_score_minigpt4.py b/mindie/examples/models/minigpt4/precision/clip_score_minigpt4.py new file mode 100644 index 00000000..8a0a02a0 --- /dev/null +++ b/mindie/examples/models/minigpt4/precision/clip_score_minigpt4.py @@ -0,0 +1,103 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +import argparse +import json +import os +import time +from PIL import Image + +import open_clip +import torch +import torch_npu +import torch.nn.functional as F + +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device", + type=str, + default="0", + help="device for torch.", + ) + parser.add_argument( + "--model_name", + type=str, + default="ViT-H-14", + help="open clip model name", + ) + parser.add_argument( + "--model_weights_path", + type=str, + default="./open_clip_pytorch_model.bin", + help="open clip model weights", + ) + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + return parser.parse_args() + + +def set_torch_env(device_ids): + torch_npu.npu.set_device(int(device_ids)) + torch.npu.set_compile_mode(jit_compile=False) + + +def clip_score(model_clip, tokenizer, preprocess, model_answer, image_file): + imgs = [] + texts = [] + + img = preprocess(Image.open(image_file)).unsqueeze(0).npu() + imgs.append(img) + text = tokenizer([model_answer]).npu() + texts.append(text) + + img = torch.cat(imgs) # [bs, 3, 224, 224] + text = torch.cat(texts) # [bs, 77] + + with torch.no_grad(): + text_ft = model_clip.encode_text(text).float() + img_ft = model_clip.encode_image(img).float() + score = F.cosine_similarity(img_ft, text_ft).squeeze() + + return score.cpu() + + +def main(): + args = parse_arguments() + set_torch_env(args.device) + + t_b = time.time() + logger.info("Load clip model...") + model_clip, _, preprocess = open_clip.create_model_and_transforms( + args.model_name, pretrained=args.model_weights_path, device=f"npu:{args.device}") + model_clip.eval() + logger.info(f">done. elapsed time: {(time.time() - t_b):.3f} s") + + tokenizer = open_clip.get_tokenizer("ViT-H-14") + with safe_open(args.image_info, "r", override_flags=os.O_RDONLY) as f: + image_info = json.load(f) + + t_b = time.time() + logger.info("Calc clip score...") + all_scores = [] + for image_file, model_answer in image_info.items(): + # 单个图片 单个answer + logger.info(f"cur image file: {image_file}") + image_score = clip_score(model_clip, tokenizer, preprocess, model_answer, image_file) + logger.info(f"{image_score=}") + all_scores.append(image_score) + all_scores_mean = torch.mean(torch.tensor(all_scores)) + logger.info(f"平均分:{all_scores_mean=}") + logger.info(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") + + +if __name__ == '__main__': + main() + diff --git a/mindie/examples/models/minigpt4/precision/run_predict_walk_dir_gpu.py b/mindie/examples/models/minigpt4/precision/run_predict_walk_dir_gpu.py new file mode 100644 index 00000000..1e323376 --- /dev/null +++ b/mindie/examples/models/minigpt4/precision/run_predict_walk_dir_gpu.py @@ -0,0 +1,101 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + +import argparse +import json +import os +import stat +import torch +from transformers import StoppingCriteriaList + +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger + +from minigpt4.common.config import Config +from minigpt4.common.registry import registry +from minigpt4.conversation.conversation import Chat, CONV_VISION_Vicuna0, StoppingCriteriaSub + + +def parse_args(): + parser = argparse.ArgumentParser(description="Demo") + parser.add_argument("--gpu-id", type=int, default=0, + help="Specify the gpu to load the model.") + parser.add_argument("--cfg-path", default="eval_configs/minigpt4_eval.yaml", + help="Path to configuration file.") + parser.add_argument("--image-path", required=True, + help="Image path for inference.") + parser.add_argument("--output-path", required=True, + help="Output path of inference.") + parser.add_argument("--options", nargs="+", + help="override some settings in the used config, the key-value pair " + "in xxx=yyy format will be merged into config file (deprecate), " + "change to --cfg-options instead.", + ) + return parser.parse_args() + + +def traverse_img_dir(img_dir, res_file_dir): + # 判断目标目录是否存在 + if not os.path.exists(img_dir): + logger.info("目标目录不存在!") + return + if not os.path.exists(res_file_dir): + os.mkdir(res_file_dir) + + input_text = "Describe this image in detail." 
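    # 遍历图片目录:对每张图片以固定prompt执行一次MiniGPT-4图文推理,
    # 将结果按 {图片路径: 描述文本} 汇总为JSON(格式参见 GPU_NPU_result_example.json)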
+ + for root, _, files in os.walk(img_dir): + if not files: + continue + res_dict = {} + for file in files: + image_path = os.path.join(root, file) + logger.info("文件路径:", image_path) + chat_state = CONV_VISION_Vicuna0.copy() + img_list = [] + llm_message = chat.upload_img(image_path, chat_state, img_list) + logger.info(f"{llm_message=}") + chat.encode_img(img_list) + logger.info(f"===== image_list: {img_list}") + logger.info(f"===== chat_state: {chat_state.messages}") + chat.ask(input_text, chat_state) + llm_message = chat.answer(conv=chat_state, + img_list=img_list, + num_beams=1, + temperature=0.1, + max_new_tokens=300, + max_length=2000)[0] + logger.info(f"MiniGPT4 Answer: {llm_message}") + res_dict[image_path] = llm_message + logger.info(f"已生成 {len(res_dict)} 条记录 from {root}") + + flags = os.O_WRONLY | os.O_CREAT + mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH + with safe_open(args.results_save_path, "w", permission_mode=mode, override_flags=flags) as f: + json.dump(res_dict, f) + logger.info("-----ALL DONE-----") + + +if __name__ == '__main__': + # Model Initialization + logger.info('Initializing Chat') + args = parse_args() + cfg = Config(args) + + model_config = cfg.model_cfg + model_config.device_8bit = args.gpu_id + model_cls = registry.get_model_class(model_config.arch) + model = model_cls.from_config(model_config).to(f'cuda:{args.gpu_id}') + + vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train + vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) + + stop_words_ids = [[835], [2277, 29937]] + stop_words_ids = [torch.tensor(ids).to(device=f'cuda:{args.gpu_id}') for ids in stop_words_ids] + stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) + + chat = Chat(model, vis_processor, device=f'cuda:{args.gpu_id}', stopping_criteria=stopping_criteria) + logger.info('Initialization Finished') + + # Model Inference + traverse_img_dir(img_dir=args.image_path, res_file_dir=args.output_path) + diff --git a/mindie/examples/models/minigpt4/predict/make_embeds.py b/mindie/examples/models/minigpt4/predict/make_embeds.py new file mode 100644 index 00000000..0b85de76 --- /dev/null +++ b/mindie/examples/models/minigpt4/predict/make_embeds.py @@ -0,0 +1,83 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
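# make_embeds.py:run_predict.sh 的第一步(step 1/2)。
# 加载MiniGPT-4(图像编码部分使用om模型),将图片与固定prompt编码为 inputs_embeds 并逐张保存为 .pt 文件;
# 第二步由 examples.run_fa 读取这些 .pt 文件完成文本生成。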
+ +import argparse +import os +import torch +import torch_npu +from transformers import StoppingCriteriaList + +from atb_llm.utils.log import logger + +from minigpt4.common.config import Config +from minigpt4.common.registry import registry +from minigpt4.conversation.conversation import Chat, CONV_VISION_Vicuna0, StoppingCriteriaSub + + +def parse_args(): + parser = argparse.ArgumentParser(description="Demo") + parser.add_argument("--npu_id", type=int, default=0, + help="Specify the npu to work on.") + parser.add_argument("--cfg_path", type=str, required=True, + help="Path to configuration file.") + parser.add_argument("--image_path", type=str, required=True, + help="Image path(directory or file) for inference.") + parser.add_argument("--inputs_embeds_dir", type=str, required=True, + help="Directory of .pt files containing inputs_embeds.") + parser.add_argument("--options", nargs="+", + help="override some settings in the used config, the key-value pair " + "in xxx=yyy format will be merged into config file (deprecate), " + "change to --cfg-options instead.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + cfg = Config(args) + + torch_npu.npu.set_device(args.npu_id) + torch.npu.set_compile_mode(jit_compile=False) + + logger.info('----- Chat Initialization Begins ... -----') + model_config = cfg.model_cfg + model_config.device_8bit = args.npu_id + model_cls = registry.get_model_class(model_config.arch) + model = model_cls.from_config(model_config).to(f'npu:{args.npu_id}') + + vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train + vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) + + stop_words_ids = [[835], [2277, 29937]] # 835:###, 2277:##, 29937:# + stop_words_ids = [torch.tensor(ids).to(device=f'npu:{args.npu_id}') for ids in stop_words_ids] + stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) + + chat = Chat(model, vis_processor, stopping_criteria=stopping_criteria, device=f'npu:{args.npu_id}') + logger.info('----- Chat Initialization Finished! -----') + + logger.info('----- inputs_embeds Making Begins ... -----') + if not os.path.isdir(args.inputs_embeds_dir): + os.mkdir(args.inputs_embeds_dir) + + if not os.path.isdir(args.image_path): + image_path_list = [args.image_path] + else: + image_path_list = [os.path.join(args.image_path, _) for _ in os.listdir(args.image_path)] + + input_text = "Describe this image in detail." + + for image_path in sorted(image_path_list): + chat_state = CONV_VISION_Vicuna0.copy() + img_list = [] + chat.upload_img(image_path, chat_state, img_list) + chat.encode_img(img_list) + chat.ask(input_text, chat_state) + inputs_embeds = chat.answer_prepare(conv=chat_state, img_list=img_list)["inputs_embeds"] + logger.info(f"{inputs_embeds=}") + inputs_embeds_file_path = os.path.join(args.inputs_embeds_dir, f"{os.path.basename(image_path)}.pt") + torch.save(inputs_embeds, inputs_embeds_file_path) + logger.info('----- inputs_embeds .pt file Saved! -----') + logger.info(f"{inputs_embeds_file_path=}") + + logger.info(f'----- inputs_embeds Making All Finished! Total: {len(image_path_list)} -----') + + diff --git a/mindie/examples/models/minigpt4/predict/run_predict.sh b/mindie/examples/models/minigpt4/predict/run_predict.sh new file mode 100644 index 00000000..7e831bf8 --- /dev/null +++ b/mindie/examples/models/minigpt4/predict/run_predict.sh @@ -0,0 +1,93 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. 
All rights reserved. + +# 此脚本的手动配置参数 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +flag_save_log=false + +minigpt_dir="${work_space}" +LLM_model_path="${model_path}/weights_language" + +min_length=1 +max_new_tokens=300 +stop_words_ids="[[835],[2277,29937]]" # 注意里面不能有空格 +do_sample=False +num_beams=1 +top_p=0.9 +temperature=0.1 +repetition_penalty=1.05 +length_penalty=1 + +# 此脚本的自动配置参数 +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) + 1)) +cur_time=$(date +"%Y-%m-%d_%H-%M-%S") +inputs_embeds_dir="inputs_embeds_dir_${cur_time}" +results_save_path="results_save_path_${cur_time}.json" + +# 注册 python 环境变量 +export PYTHONPATH="${minigpt_dir}:${PYTHONPATH}" +export PYTHONPATH="${ATB_SPEED_HOME_PATH}:${PYTHONPATH}" + +# step 1/2: 图文输入 → LLM_inputs(单进程,调用 minigpt) +params_1="" +params_1="${params_1} --cfg_path ${minigpt_dir}/eval_configs/minigpt4_eval.yaml" +if [ "${1}" != "" ]; then + params_1="${params_1} --image_path ${1}" +else + params_1="${params_1} --image_path ${minigpt_dir}/examples_v2/office.jpg" +fi +params_1="${params_1} --inputs_embeds_dir ${inputs_embeds_dir}" +python -m make_embeds ${params_1} +if [ $? -ne 0 ]; then + exit +fi + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export HCCL_BUFFSIZE=120 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +# 打印加速库、算子库日志 +if [ "${flag_save_log}" = true ]; then + export ATB_LOG_LEVEL=INFO + export ATB_LOG_TO_STDOUT=1 + export ATB_LOG_TO_FILE=1 + + export ASDOPS_LOG_LEVEL=INFO + export ASDOPS_LOG_TO_STDOUT=1 + export ASDOPS_LOG_TO_FILE=1 + + export TASK_QUEUE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=0 + export ATB_STREAM_SYNC_EVERY_KERNEL_ENABLE=1 +fi + +# step 2/2: LLM_inputs → 文字输出(可多进程,调用 llama) +params_2="" + +params_2="${params_2} --model_path ${LLM_model_path} + --results_save_path ${results_save_path}" + +params_2="${params_2} --inputs_embeds_dir ${inputs_embeds_dir} + --min_length ${min_length} + --max_output_length ${max_new_tokens} + --stop_words_ids ${stop_words_ids} + --do_sample ${do_sample} + --num_beams ${num_beams} + --top_p ${top_p} + --temperature ${temperature} + --repetition_penalty ${repetition_penalty} + --length_penalty ${length_penalty}" + +if [ "${TP_WORLD_SIZE}" == "1" ]; then + python -m examples.run_fa ${params_2} +else + torchrun --nproc_per_node ${world_size} --master_port ${MASTER_PORT} -m examples.run_fa ${params_2} +fi + diff --git a/mindie/examples/models/mistral/README.md b/mindie/examples/models/mistral/README.md new file mode 100644 index 00000000..103d0cb5 --- /dev/null +++ b/mindie/examples/models/mistral/README.md @@ -0,0 +1,129 @@ +# README + +- [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) 为 Mistral 7B v0.2 Base Model 的指令调优版本。该模型在2023年9月首次发布,在多个基准测试中表现优异,被评价为同尺寸级别中最优秀的模型之一。 + +- 此代码仓中实现了一套基于NPU硬件的Mistral推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了Mistral-7B-Instruct-v0.2模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE | MindIE | TGI | 长序列 | 
+|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----|-----| +| Mistral-7B-Instruct-v0.2 | 支持world size 1,2,4,8 | 支持world size 1,2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +- 此模型仓已适配的模型版本 + - Mistral-7B-Instruct-v0.2 (transformers==4.36.0) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Mistral的工作脚本所在路径为`${llm_path}/examples/models/mistral` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** + +- [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) + +**权重转换** +- 参考[此README文件](../../README.md) + +**量化权重生成** + +- 基于原始的BF16的权重,生成量化权重 + +- W8A8 Antioutlier量化权重请使用以下指令生成 + + - 执行量化脚本 + + ``` + # 指定当前机器上可用的逻辑NPU核心 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + cd ${llm_path} + python examples/models/mistral/convert_quant_weights.py \ + --model_path {浮点权重路径} \ + --save_directory {W8A8量化权重路径} \ + --w_bit 8 --a_bit 8 \ + --disable_level L0 \ + --anti_method m1 \ + --act_method 2 \ + --device_type npu \ + --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl + ``` + + - 注意:`model_path`和`save_directory`请勿使用同一个文件夹,避免浮点权重和量化权重混淆 + + - 修改量化权重的 config.json 文件 + + ``` + torch_dtype:float16 + ``` + +**基础环境变量** + +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export USE_REFACTOR=true` + - 是否使用新版模型组图 + - 默认使用 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 mistral True ${mistral-7b-instruct-v0.2权重路径} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 mistral True ${mistral-7b-instruct-v0.2权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_pa.py`;该文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/mistral/convert_quant_weights.py b/mindie/examples/models/mistral/convert_quant_weights.py new file 
mode 100644 index 00000000..3c221e11 --- /dev/null +++ b/mindie/examples/models/mistral/convert_quant_weights.py @@ -0,0 +1,67 @@ +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. + +import os + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from atb_llm.models.llama.modeling_llama import LlamaConfig +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +if __name__ == "__main__": + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + + config = LlamaConfig.from_pretrained(args.model_path) + + disable_names = [] + if args.a_bit != 16: + # W8A16没有回退层 + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + anti_outlier_config = None + if args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method, dev_type="npu") + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + ) + + # 默认无校准数据集 + calibration_dataset = None + # 若存在calib_file,则使用calib_file作为校准数据集 + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file) + calibration_dataset = calibration_dataset + quant_weight_generator = Quantifier(args.model_path, quant_config, anti_outlier_config, args.device_type) + quant_weight_generator.tokenizer.pad_token_id = 0 + + tokenized_data = None + if calibration_dataset is not None: + tokenized_data = quant_weight_generator.get_tokenized_data(calibration_dataset) + + quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + modify_config( + args.model_path, args.save_directory, config.torch_dtype, + f"w{args.w_bit}a{args.a_bit}" + ("s" if args.co_sparse else "") + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/mistral/input.jsonl b/mindie/examples/models/mistral/input.jsonl new file mode 100644 index 00000000..2f26c5a1 --- /dev/null +++ b/mindie/examples/models/mistral/input.jsonl @@ -0,0 +1 @@ +[{"role": "user", "content": "What is your favourite condiment?"},{"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},{"role": "user", "content": "Do you have mayonnaise recipes?"}] \ No newline at end of file diff --git a/mindie/examples/models/mistral/run_pa.sh b/mindie/examples/models/mistral/run_pa.sh new file mode 100644 index 00000000..a3892c52 --- /dev/null +++ b/mindie/examples/models/mistral/run_pa.sh @@ -0,0 +1,29 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
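+# Usage: bash examples/models/mistral/run_pa.sh ${weight_path}
+# Runs chat-mode Paged Attention inference with the prompts in ./examples/models/mistral/input.jsonl.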
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + + +torchrun \ +--nproc_per_node $world_size \ +--master_port $MASTER_PORT \ +-m examples.run_pa \ +--model_path $1 \ +--is_chat_model \ +--input_file ./examples/models/mistral/input.jsonl \ +--max_output_length 500 \ +$extra_param diff --git a/mindie/examples/models/mixtral/README.md b/mindie/examples/models/mixtral/README.md new file mode 100644 index 00000000..c59c6b52 --- /dev/null +++ b/mindie/examples/models/mixtral/README.md @@ -0,0 +1,110 @@ +# README + +- [Mixtral 8x7B]是一个稀疏混合专家(SMoE)语言模型。对于每个 token 输入,在每一层,路由网络(router network)在 8 个专家中选择 2 个专家进行计算。所以每次运行时,模型只会激活 47B 参数中的 13B。在不损失精度的前提下,提高了模型的性能。[Mixtral 8x22B]的总参数量为 141B,激活参数量为 39B。(当前脚本支持 8x7B-Base,8x7B-Chat,8x22B-Base,和 8x22B-Chat) +- 此代码仓中实现了一套基于 NPU 硬件的 Mixtral-MoE 模型。配合加速库使用,旨在 NPU 上获得极致的推理性能。 + +# 特性矩阵 +- 矩此阵罗列了各Mixtral模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MindIE | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|---------|----------|---------------|--------------------------|--------|-----|-------| +| Mixtral-8x7B-Instruct | 支持world size 8 | × | √ | × | √ | √ | × | × | × | × | × | × | × | × | × | +| Mixtral-8x22B-Instruct | 支持world size 8 | × | √ | × | √ | √ | × | × | × | × | × | × | × | × | + +# 使用说明 + +## 路径变量解释 + +| 变量名 | 含义 | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用 gitee 下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Mixtral-MoE 的工作脚本所在路径为`${llm_path}/examples/models/mixtral` | +| weight_path | 模型权重路径 | + +## 权重 + +**权重下载** + +- [Mixtral-8x7B-Base](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) +- [Mixtral-8x7B-Chat/Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) +- [Mixtral-8x22B-Base](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) +- [Mixtral-8x22B-Chat/Instruct](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) + +**基础环境变量** + +- 参考[此 README 文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行 Paged Attention FP16** + +- 运行启动脚本(Mixtral_8x7B transformers 版本需求:4.36.0.dev0 | Mixtral_8x22B transformers 版本需求:4.39.0) + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 启动脚本中可设置自定义问题,具体在 input_text 后面修改即可 (默认问题为"Who is the CEO of Google?") +- 启动脚本中可设置自定义输出长度,具体在 max_output_length 后面修改即可(默认长度为 10) +- 若当前所用权重版本为"chat"版本,请将"--is_chat_model"赋值给 extra_param;若当前所用权重版本为"base"版本,可以将空字符串赋值给 extra_param(默认为 chat_model) +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑 NPU 核心,多个核心间使用逗号相连 + - 核心 ID 查阅方式见[此 README 文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于 
300I DUO 卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用 20030 端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=1 + export INT8_FORMAT_NZ_ENABLE=1 + export ATB_LLM_ENABLE_AUTO_TRANSPOSE=0 + ``` + +## 精度测试 + +- 参考[此 README 文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 mixtral ${mixtral-8x7b-base权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 mixtral ${mixtral-8x7b-instruct权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 mixtral ${mixtral-8x22b-base权重路径} 8 + bash run.sh pa_fp16 full_BoolQ 1 mixtral ${mixtral-8x22b-instruct权重路径} 8 + ``` + +## 性能测试 + +- 参考[此 README 文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 mixtral ${mixtral-8x7b-base权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 mixtral ${mixtral-8x7b-instruct权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 mixtral ${mixtral-8x22b-base权重路径} 8 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 mixtral ${mixtral-8x22b-instruct权重路径} 8 + ``` + +## FAQ + +- 更多环境变量见[此 README 文件](../../README.md) +- 对话测试实际执行的 Python 文件为`${llm_path}/examples/run_pa.py`;这个文件的参数说明见[此 README 文件](../../README.md) +- 运行时,需要通过指令 pip list | grep protobuf 确认 protobuf 版本,如果版本高于 3.20.x,请运行指令 pip install protobuf==3.20.0 进行更新 diff --git a/mindie/examples/models/mixtral/run_pa.sh b/mindie/examples/models/mixtral/run_pa.sh new file mode 100644 index 00000000..a13ba163 --- /dev/null +++ b/mindie/examples/models/mixtral/run_pa.sh @@ -0,0 +1,25 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=1 +export INT8_FORMAT_NZ_ENABLE=1 +export ATB_LLM_ENABLE_AUTO_TRANSPOSE=0 + +extra_param="--is_chat_model" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 $extra_param +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --input_texts "Who is the CEO of Google?" 
--max_output_length 10 --model_path $1 $extra_param +fi diff --git a/mindie/examples/models/phi3/README.md b/mindie/examples/models/phi3/README.md new file mode 100755 index 00000000..f07758db --- /dev/null +++ b/mindie/examples/models/phi3/README.md @@ -0,0 +1,88 @@ +# Phi-3 模型推理指导 + +# 概述 + +- [Phi-3](https://github.com/microsoft/Phi-3CookBook) 是 Microsoft 开发的一系列开放式 AI 模型。Phi-3 模型是一个功能强大、成本效益高的小语言模型 (SLM),在各种语言、推理、编码和数学基准测试中,在同级别参数模型中性能表现优秀。为开发者构建生成式人工智能应用程序时提供了更多实用的选择。 +- 此代码仓中实现了一套基于NPU硬件的 Phi-3 推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了 Phi-3 模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE | TGI | 长序列 | +| --------------- | -------------------------- | --------------------------- | ---- | ---- | --------------- | --------------- | -------- | --------- | --------- | ------------ | -------- | ------- | ------ | ---- | ------ | +| Phi-3-mini-128k | 支持world size 1,2,4,8 | 支持world size 1,2,4,8 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + + +# 使用说明 + +- 执行推理前需要将权重目录下的config.json中的`torch_dtype`改为`"float16"` + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | ATB_Models模型仓所在路径;若使用编译好的包,则路径为`${working_dir}/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models/` | +| script_path | 脚本所在路径; Yi系列模型的工作脚本所在路径为`${llm_path}/examples/models/phi3` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/bb5bf1e4001277a606e11debca0ef80323e5f824) 模型仓近期更新,需要下载 commit id 为 bb5bf1e4001277a606e11debca0ef80323e5f824 的权重(建议直接在 huggingface 先切换 commit id,再下载)。 + +## 推理 + +### 对话测试 +**运行Page Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export HCCL_BUFFSIZE=120 + export HCCL_WHITELIST_DISABLE=1 + export ATB_CONTEXT_WORKSPACE_RING=1 + export ATB_CONTEXT_WORKSPACE_SIZE=2629145600 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=0 + export ATB_LAUNCH_KERNEL_WITH_TILING=0 + export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=1 + export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=0 + + ``` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0 + export MAX_MEMORY_GB=29 + # 运行Paged Attention FP16 + bash run.sh pa_fp16 full_CEval 5 1 phi3 ${weight_path} 1 + ``` +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0 + export MAX_MEMORY_GB=29 + # 运行Paged Attention FP16 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 phi3 ${weight_path} 1 \ No newline at end of file diff --git a/mindie/examples/models/phi3/run_pa.sh b/mindie/examples/models/phi3/run_pa.sh 
new file mode 100755 index 00000000..dcaf1715 --- /dev/null +++ b/mindie/examples/models/phi3/run_pa.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. + +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 +TP_WORLD_SIZE=1 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=1 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +extra_param="--is_chat_model" +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "${TP_WORLD_SIZE}" == "1" ]; then + python -m examples.run_pa --model_path $1 ${extra_param} +else + torchrun --nproc_per_node ${world_size} --master_port ${MASTER_PORT} -m examples.run_pa --model_path $1 ${extra_param} +fi \ No newline at end of file diff --git a/mindie/examples/models/qwen/README.md b/mindie/examples/models/qwen/README.md new file mode 100644 index 00000000..1d9653e9 --- /dev/null +++ b/mindie/examples/models/qwen/README.md @@ -0,0 +1,338 @@ +# README + +- 千问(qwen)语言大模型是阿里巴巴集团推出的大型语言模型,具备强大的自然语言处理能力,能够理解和生成文本,应用于智能客服、内容生成、问答系统等多个场景,助力企业智能化升级。 + +# 特性矩阵 + +- 此处罗列QWen模型各版本支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|--------------------| -------------------------- | --------------------------- | ---- | ---- | --------------- | --------------- | -------- | --------- | --------- | ------------ | -------------------------- | ---- | ------ | ---- |-----| +| Qwen-7B | 支持world size 1,2,4,8 | 支持world size 1 | √ | × | √ | √ | √ | × | × | × | × | × | √ | × | √ | +| Qwen-14B | 支持world size 2,4,8 | 支持world size 1,2 | √ | × | √ | √ | √ | × | × | × | × | × | √ | × | √ | +| QWen-72B | 支持world size 8 | × | √ | × | √ | √ | × | √ | × | × | × | × | √ | × | √ | +| Qwen1.5-0.5B | 支持world size 1,2,4,8 | 支持world size 1 | √ | × | √ | √ | × | × | × | × | × | × | √ | × | √ | +| Qwen1.5-1.8B | 支持world size 1,2,4,8 | 支持world size 1 | √ | × | √ | √ | × | × | × | × | × | × | √ | × | √ | +| Qwen1.5-4B | 支持world size 1,2,4 | 支持world size 1 | √ | × | √ | √ | × | × | × | × | × | × | √ | × | √ | +| Qwen1.5-7B | 支持world size 1,2,4,8 | 支持world size 1 | √ | × | √ | √ | × | × | × | × | × | × | √ | × | √ | +| Qwen1.5-14B | 支持world size 2,4,8 | 支持world size 1,2 | √ | × | √ | √ | √ | × | √ | × | √ | × | √ | × | √ | +| Qwen1.5-32B | 支持world size 4,8 | 支持world size 1,2 | √ | × | √ | √ | √ | × | × | × | × | × | √ | × | √ | +| Qwen1.5-72B | 支持world size 8 | × | √ | × | √ | √ | × | √ | √ | × | × | × | √ | × | √ | +| Qwen1.5-MoE-A2.7B | 支持world size 4 | × | √ | × | √ | √ | × | × | × | × | × | √ | √ | × | √ | + +注:表中所示支持的world size为对话测试可跑通的配置,实际运行时还需考虑输入序列长度带来的显存占用。 + +## 原始权重 + +| 模型 | huggingface下载链接 | +| ----------------- | ------------------------------------------------------- | +| Qwen-7B | https://huggingface.co/Qwen/Qwen-7B/tree/main | +| Qwen-14B | https://huggingface.co/Qwen/Qwen-14B/tree/main | +| QWen-72B | https://huggingface.co/Qwen/Qwen-72B/tree/main | +| Qwen1.5-14B | https://huggingface.co/Qwen/Qwen1.5-14B/tree/main | +| Qwen-14B-Chat | https://huggingface.co/Qwen/Qwen-14B-Chat/tree/main | +| Qwen-72B-Chat | 
https://huggingface.co/Qwen/Qwen-72B-Chat/tree/main | +| Qwen1.5-0.5B-Chat | https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat/tree/main | +| Qwen1.5-4B-Chat | https://huggingface.co/Qwen/Qwen1.5-4B-Chat/tree/main | +| Qwen1.5-7B | https://huggingface.co/Qwen/Qwen1.5-7B/tree/main | +| Qwen1.5-14B-Chat | https://huggingface.co/Qwen/Qwen1.5-14B-Chat/tree/main | +| Qwen1.5-32B-Chat | https://huggingface.co/Qwen/Qwen1.5-32B-Chat/tree/main | +| Qwen1.5-72B | https://huggingface.co/Qwen/Qwen1.5-72B/tree/main | +| Qwen1.5-MoE-A2.7B-Chat | https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat/tree/main | + +# 版本配套 +| 模型版本 | transformers版本 | +| -------- | ---------------- | +| Qwen | 4.30.2、4.32.0 | +| Qwen1.5 | 4.37.0、4.37.2 | + +# Paged Attention 推理使用说明 + +注意: +- Qwen模型权重所在路径中的config.json文件需添加字段`torch_dtype`,例如`"torch_dtype": "float16"` +- Qwen1.5模型权重所在路径中的config.json文件需将字段`torch_dtype`的值修改为"float16",例如`"torch_dtype": "float16"` +- 执行量化推理时,须在量化权重所在路径的config.json文件中添加字段`quantize`,值为当前量化权重的量化方式,例如`"quantize": "w8a8"`、`"quantize": "w8a16"` +- QWen-14B执行2K以上(QWen-7B为8K以上)长序列推理时需增加环境变量`LONG_SEQ_ENABLE=1`。长序列推理过程具有更多计算节点,因此相比于短序列,推理性能将有下降。 + +## 路径变量解释 + +| 变量名称 | 含义 | +| ----------- | ------------------------------------------------------------ | +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径。QWen系列模型的工作脚本所在路径为`${llm_path}/examples/models/qwen` | +| weight_path | 模型权重路径 | + +## 权重格式转换 + +Paged Attention 场景需要.safetensors格式的权重,如果没有,参考[此README文件](../../README.md)转换 +注:huggingface官网给出的QWen模型权重为.safetensors格式 + +## 量化权重导出 +量化权重可通过msmodelslim(昇腾压缩加速工具)实现。 + +#### 环境准备 +环境配置可参考[此README文件](../../../README.md) + +- 设置环境变量 + +```shell +# 设置CANN包的环境变量 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +需要安装 CANN(已包含msmodelslim工具) 以及 pytorch 和 pytorch-npu +以及相关的python库 + +```shell +pip install transformers # transformers版本应根据Qwen版本确定,配套关系见‘版本配套’ +pip install accelerate==0.27.2 +pip install scipy==1.11.4 +pip install tiktoken==0.5.2 +pip install einops==0.7.0 +pip install transformers_stream_generator==0.0.4 +``` + +#### 导出量化权重 +##### qwen系列 +- 通过`${llm_path}/examples/models/qwen/quant_qwen_14b_w8a8.py`和`${llm_path}/examples/models/qwen/quant_qwen_72b_w8a16.py`文件导出目标模型的量化权重(注意量化权重不要和浮点权重放在同一个目录下): + ```shell + python quant_qwen_14b_w8a8.py ${浮点权重路径} ${量化权重保存路径} + ``` + 导出量化权重后应生成`quant_model_weight_w8a8.safetensors`和`quant_model_description_w8a8.json`两个文件。 + +- 注意: + - quant_qwen_14b_w8a8.py、quant_qwen_72b_w8a16.py分别为Qwen1-14B模型、Qwen1-72B模型已配置好的较优的量化策略。导出量化权重时可直接使用,也可修改为其它策略。 + - **_72b模型较大,导出量化权重使用DataFree形式需要花费半小时,使用LabelFree需要耗费4小时左右_** + ```python + calibrator = Calibrator( + model, + quant_config, + calib_data=None, # w8a16 支持精度无损的data-free形式,设置calib_data=None;label-free需要传入校准集,设置calib_data=dataset_calib,权重生成时间较长 + disable_level='L0' # 自动回退等级,根据精度损失程度增加不量化的层(L0~L5,L0为不回退,精度损失明显时可适当提升等级) + ) + ``` + - qwen系列目前有`qwen-14b、qwen-72b`支持量化,具体支持情况请参考‘特性矩阵’ +##### qwen1.5系列W8A8量化 +- W8A8量化权重请使用以下指令生成 + - 当前支持NPU分布式W8A8量化 + - 执行量化脚本 + ```shell + # 指定当前机器上可用的逻辑NPU核心 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + cd ${llm_path} + python examples/models/qwen/convert_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} --w_bit 8 --a_bit 8 --disable_level L0 --device_type npu --calib_file ${llm_path}/examples/convert/model_slim/boolq.jsonl + ``` +- 注意 + -`model_path`和`save_directory`请勿使用同一个文件夹,避免浮点权重和量化权重混淆 + + - accelerate三方件版本需>=0.28.0 + - 
qwen1.5系列目前有`qwen1.5-14b、qwen1.5-32b`支持量化,具体支持情况请参考‘特性矩阵’ + +- 稀疏量化权重请使用以下指令生成 + + - Step 1 + + ```shell + # 设置CANN包的环境变量 + source /usr/local/Ascend/ascend-toolkit/set_env.sh + cd ${llm_path} + python examples/models/qwen/convert_quant_weights_14b.py --model_path {浮点权重路径} --save_directory {W8A8S量化权重路径} --w_bit 4 --a_bit 8 --calib_file ${llm_path}/atb_llm/models/qwen2/cn_en.jsonl --fraction 0.011 --co_sparse True --device_type cpu --do_smooth False --use_sigma True --is_lowbit True + ``` + + Step 2:量化权重切分及压缩 + + > 运行前需要确保压缩工具编译过 + > + > `cd /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/msmodelslim/pytorch/weight_compression/compress_graph` + > + > `bash build.sh /usr/local/Ascend/ascend-toolkit/latest` + + ```shell + torchrun --nproc_per_node {TP数} -m examples.convert.model_slim.sparse_compressor --model_path {W8A8S量化权重路径} --save_directory {W8A8SC量化权重路径} + ``` + + - TP数为tensor parallel并行个数 + + - 注意:若权重生成时以TP=4进行切分,则运行时也需以TP=4运行 + + - 示例 + + ```shell + torchrun --nproc_per_node 2 -m examples.convert.model_slim.sparse_compressor --model_path /data1/weights/model_slim/Qwen-14b_w8a8s --save_directory /data1/weights/model_slim/Qwen-14b_w8a8sc + ``` +##### qwen1.5系列W8A16与W4A16量化 +- 目录`${llm_path}/examples/models/qwen/`下的quant_qwen2_72b_w8a16_fast.py、quant_qwen2_72b_w4a16_64.py分别为Qwen1.5-72B-W8A16模型、Qwen1.5-72B-W4A16模型已配置好的较优的量化策略。导出量化权重时可直接使用,也可修改为其它策略。 +- 通过 `${llm_path}/examples/models/qwen/quant_qwen2_72b_w8a16_fast.py` 和 `${llm_path}/examples/models/qwen/quant_qwen2_72b_w4a16_64.py` 脚本导出Qwen1.5-72B模型W8A16和W4A16的量化权重(注意量化权重不要和浮点权重放在同一个目录下)。命令如下: + ```shell + python quant_qwen2_72b_w8a16_fast.py ${浮点权重路径} ${量化权重保存路径} + + python quant_qwen2_72b_w4a16_64.py ${浮点权重路径} ${量化权重保存路径} + ``` + 例: + ```shell + python quant_qwen2_72b_w8a16_fast.py /data1/models/Qwen1p5_72B /data1/models/Qwen1p5_72B_W8A16 + + python quant_qwen2_72b_w4a16_64.py /data1/models/Qwen1p5_72B /data1/models/Qwen1p5_72B_W4A16 + ``` +- 导出量化权重后生成`quant_model_weight_w8a16.safetensors`和`quant_model_description_w8a16.json`两个文件(对于W4A16量化为`quant_model_weight_w4a16.safetensors`和`quant_model_description_w4a16.json`两个文件)。模型浮点权重中的其他文件(除safetensors文件外)需要手工拷贝到目标量化文件夹中。 +- 在量化权重保存路径中的config.json文件中添加"quantize"字段。对于W8A16量化,"quantize"字段的值为"w8a16";对于W4A16量化,"quantize"字段的值为"w4a16"。 + + +## 推理 + +### 对话测试 + +量化权重生成路径下可能缺少一些必要文件(与转换量化权重时使用的cann版本有关),若启动量化推理失败,请将config.json等相关文件复制到量化权重路径中,可执行以下指令进行复制: +```shell +cp ${浮点权重路径}/*.py ${量化权重路径} +cp ${浮点权重路径}/*.json ${量化权重路径} +cp ${浮点权重路径}/*.tiktoken ${量化权重路径} +``` + +启动量化推理时,请在权重路径的config.json文件中添加(或修改)`torch_dtype`字段,例如`"torch_dtype": "float16"`。 + +启动量化推理时,请在权重路径的config.json文件中添加(或修改)`quantize`字段,值为相应量化方式,例如`"quantize": "w8a8"`、`"quantize": "w8a16"` + +在`${llm_path}`目录执行以下指令 + +```shell +bash examples/models/qwen/run_pa.sh -m ${weight_path} +``` + +注: + +1.推理支持浮点和量化,若启动浮点推理则在`${weight_path}`中传入浮点权重路径,若启动量化则传入量化权重路径 + +2.同时支持Qwen和Qwen1.5模型推理,若启动Qwen模型推理时在`${weight_path}`中传入Qwen权重路径,若启动Qwen1.5模型推理时则在`${weight_path}`中传入Qwen1.5权重路径 + +3.Qwen1.5系列部分chat模型(Qwen1.5-4B-Chat、Qwen1.5-32B-Chat)需要开启chat模式才能正常输出。 +执行: + +```shell +bash examples/models/qwen/run_pa.sh -m ${weight_path} -c true +``` + +4.启动qwen需要安装三方依赖tiktoken,若环境中没有该依赖可使用以下命令安装: + +```shell +pip install tiktoken +``` + +根据硬件设备不同请参考下表修改run_pa.sh再运行 + +5.运行Qwen1.5-MoE-A2.7B-Chat时,需写入环境变量 +```shell +export ATB_LLM_ENABLE_AUTO_TRANSPOSE=0 +``` + +### run_pa.sh 参数说明(需要到脚本中修改) + +| 参数名称 | 含义 | 800I A2推荐值 | 300I DUO推荐值 | +| ------------------------- | ----------------------------------------- | ---------------- | ---------------- | +| BIND_CPU | 
绑定CPU核心开关,默认进行绑核 | 1 | 1 | +| ASCEND_RT_VISIBLE_DEVICES | 使用的硬件卡号,多个卡间使用逗号相连 | 根据实际情况设置 | 根据实际情况设置 | +| RESERVED_MEMORY_GB | 保留内存,通常未加速库需要的内存+通信内存 | 3 | 3 | +| MASTER_PORT | 卡间通信端口,通常不用修改,有冲突时再改 | | | + +注:暂不支持奇数卡并行 + +**运行MOE量化** + +- 待补充 + +**运行KV cache量化** + +- 待补充 + +## 精度测试 + +- 参考[此README文件](../../../tests/modeltest/README.md) + +示例: + +```shell +bash run.sh pa_fp16 full_BoolQ 1 qwen /data1/models/qwen2/qwen_quant_test/ 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen-7b权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen-14b权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen-72b权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-14b权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen-14b-chat权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen-72b-chat权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-0.5b-chat权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-4b-chat权重路径} 4 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-7b权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-14b-chat权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-32b-chat权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-72b权重路径} 8 +bash run.sh pa_fp16 full_BoolQ 1 qwen ${Qwen1.5-MoE-A2.7B-Chat权重路径} 8 +``` + + + +## 性能测试 + +- 进入以下路径 + ```shell + ${llm_path}/tests/modeltest + ``` +- 运行指令 + ```shell + bash run.sh pa_fp16 [performance|full_CEval|full_BoolQ] ([case_pair]) [batch_size] qwen [weight_dir] [chip_num] ([max_position_embedding/max_sequence_length]) + ``` + +- 环境变量释义 + +1. HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 + +这两个会影响性能,开启了变慢,但是会变成确定性计算,不开会变快,所以设置为0。 + +2. HCCL_BUFFSIZE=120 + +这个会影响hccl显存,需要设置,基本不影响性能。 + +3. ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + +这个是显存优化,需要开,小batch、短序列场景不开更好。 + +示例: + + ```shell + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen-7b权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen-14b权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen-72b权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-14b权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen-14b-chat权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen-72b-chat权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 + ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-0.5b-chat权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 + ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-4b-chat权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen 
${Qwen1.5-7b权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-14b-chat权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-32b-chat权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-72b权重路径} 8 + HCCL_DETERMINISTIC=0 LCCL_DETERMINISTIC=0 HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 qwen ${Qwen1.5-MoE-A2.7B-Chat权重路径} 8 + ``` + +- 参考[此README文件](../../../tests/modeltest/README.md) + +# Flash Attention推理使用说明 + +路径变量和权重转换等均与Paged Attention相同。 + +## 推理 + +### 对话测试 + +在`${llm_path}`目录执行以下指令 + +```shell +bash examples/models/qwen/run_fa.sh ${weight_path} +``` + +注: +- 除启动命令外,其他操作与执行PA相同 +- 暂不支持chat模式。部分chat模型输出可能存在异常,如qwen1.5-32b-chat,若出现上述情况,请优先使用PA +- QWen-14B执行2K以上(QWen-7B为8K以上)长序列推理时需增加环境变量`LONG_SEQ_ENABLE=1`(暂不支持多batch长序列推理)。长序列推理过程具有更多计算节点,因此相比于短序列,推理性能将有下降。 +- qwen1.5部分Chat模型(4B、32B)fa暂不支持chat推理,请优先使用pa。如需使用fa请将输入改造成续写的样式,如:`What's deep learning?`改写成`Deep learning is` diff --git a/mindie/examples/models/qwen/convert_quant_weights.py b/mindie/examples/models/qwen/convert_quant_weights.py new file mode 100644 index 00000000..08a7f235 --- /dev/null +++ b/mindie/examples/models/qwen/convert_quant_weights.py @@ -0,0 +1,64 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import os +import torch + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from atb_llm.models.qwen2.modeling_qwen2 import Qwen2Config +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +if __name__ == "__main__": + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + + disable_names = [] + if args.a_bit != 16: + # W8A16没有回退层 + config = Qwen2Config.from_pretrained(args.model_path) + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + anti_outlier_config = None + if args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method) + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + ) + + calibration_dataset = None + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file) + calibration_dataset = calibration_dataset + quant_weight_generator = Quantifier(args.model_path, quant_config, anti_outlier_config, args.device_type) + quant_weight_generator.tokenizer.pad_token_id = 0 + + tokenized_data = None + if calibration_dataset is not None: + tokenized_data = quant_weight_generator.get_tokenized_data(calibration_dataset) + + 
quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + modify_config( + args.model_path, args.save_directory, torch.float16, + f"w{args.w_bit}a{args.a_bit}" + ("s" if args.co_sparse else "") + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/qwen/convert_quant_weights_14b.py b/mindie/examples/models/qwen/convert_quant_weights_14b.py new file mode 100644 index 00000000..f6e751f3 --- /dev/null +++ b/mindie/examples/models/qwen/convert_quant_weights_14b.py @@ -0,0 +1,66 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import os + +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig + +from atb_llm.models.llama.modeling_llama import LlamaConfig +from examples.convert.model_slim.get_calibration_dataset import load_jsonl +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +if __name__ == "__main__": + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + + config = LlamaConfig.from_pretrained(args.model_path) + + disable_names = [] + if args.a_bit != 16: + # W8A16没有回退层 + num_layers = config.num_hidden_layers + disable_names = [f"model.layers.{layer}.mlp.down_proj" for layer in range(num_layers)] + disable_names.append("lm_head") + + anti_outlier_config = None + if args.anti_method: + anti_outlier_config = AntiOutlierConfig(anti_method=args.anti_method) + + quant_config = QuantConfig( + a_bit=args.a_bit, + w_bit=args.w_bit, + disable_names=disable_names, + act_method=args.act_method, + mm_tensor=False, + dev_type=args.device_type, + dev_id=rank, + pr=1.0, + fraction=args.fraction, + co_sparse=args.co_sparse, + do_smooth=args.do_smooth, + use_sigma=args.use_sigma, + sigma_factor=args.sigma_factor, + is_lowbit=args.is_lowbit, + ) + + # 默认无校准数据集 + calibration_dataset = None + # 若存在calib_file,则使用calib_file作为校准数据集 + if args.calib_file: + calibration_dataset = load_jsonl(args.calib_file) + calibration_dataset = calibration_dataset + quant_weight_generator = Quantifier(args.model_path, quant_config, anti_outlier_config, args.device_type) + quant_weight_generator.tokenizer.pad_token_id = 0 + + tokenized_data = None + if calibration_dataset is not None: + tokenized_data = quant_weight_generator.get_tokenized_data(calibration_dataset) + + quant_weight_generator.convert(tokenized_data, args.save_directory, args.disable_level) + modify_config( + args.model_path, args.save_directory, config.torch_dtype, + f"w{args.w_bit}a{args.a_bit}" + ("s" if args.co_sparse else "") + ) + copy_tokenizer_files(args.model_path, args.save_directory) diff --git a/mindie/examples/models/qwen/quant_qwen2_14b_w4a16_64.py b/mindie/examples/models/qwen/quant_qwen2_14b_w4a16_64.py new file mode 100644 index 00000000..4b3c166e --- /dev/null +++ b/mindie/examples/models/qwen/quant_qwen2_14b_w4a16_64.py @@ -0,0 +1,63 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
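+# Usage: python quant_qwen2_14b_w4a16_64.py <fp16_weight_path> <quant_save_path>
+# Data-free W4A16 per-group quantization (group_size=64) executed on CPU; lm_head and
+# the mlp.down_proj of the first 20 layers are kept in floating point (see disable_names).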
+import sys +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + + +def load_tokenizer_and_model(fp16_path): + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=fp16_path, + pad_token='<|extra_0|>', + eos_token='<|endoftext|>', + padding_side='left', + trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=fp16_path, + torch_dtype=torch.float32, trust_remote_code=True + ).cpu() + return tokenizer, model + + +def main(fp16_path, quant_save_path): + tokenizer, model = load_tokenizer_and_model(fp16_path) + + + + disable_names = ['lm_head'] + llama_layers = 20 + disable_idx_lst = list(range(llama_layers)) + for layer_index in disable_idx_lst: + down_proj_name = "model.layers.{}.mlp.down_proj".format(layer_index) + disable_names.append(down_proj_name) + + quant_config = QuantConfig( + w_bit=4, # 权重量化位数 + a_bit=16, # 激活值量化位数 + disable_names=disable_names, # 不做量化的层 + dev_type='cpu', + pr=1.0, # 量化正则百分比 + w_sym=True, # 对称/非对称量化,True为对称量化,False为非对称量化 + mm_tensor=False, # 权重量化粒度,True为per-tensor量化,False为per-channel量化(大模型场景建议False) + is_lowbit=True, + open_outlier=False, + group_size=64 + ) + + calibrator = Calibrator( + model, + quant_config, + calib_data=None, # W4A16量化无需校准 + disable_level='L0' # 自动回退等级,根据精度损失程度增加不量化的层(L0~L5,L0为不回退,精度损失明显时可适当提升等级) + ) + + calibrator.run() # 执行PTQ量化校准 + + calibrator.save(quant_save_path, save_type=["safe_tensor"]) + + +if __name__ == "__main__": + fp16_path_0 = sys.argv[1] + quant_save_path_0 = sys.argv[2] + main(fp16_path_0, quant_save_path_0) diff --git a/mindie/examples/models/qwen/quant_qwen2_72b_w4a16_64.py b/mindie/examples/models/qwen/quant_qwen2_72b_w4a16_64.py new file mode 100644 index 00000000..25606af5 --- /dev/null +++ b/mindie/examples/models/qwen/quant_qwen2_72b_w4a16_64.py @@ -0,0 +1,56 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
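+# Usage: python quant_qwen2_72b_w4a16_64.py <fp16_weight_path> <quant_save_path>
+# Data-free W4A16 per-group quantization (group_size=64) executed on CPU; only lm_head is excluded.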
+import sys +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + + +def load_tokenizer_and_model(fp16_path): + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=fp16_path, + pad_token='<|extra_0|>', + eos_token='<|endoftext|>', + padding_side='left', + trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=fp16_path, + torch_dtype=torch.float32, trust_remote_code=True + ).cpu() + return tokenizer, model + + +def main(fp16_path, quant_save_path): + tokenizer, model = load_tokenizer_and_model(fp16_path) + + disable_names = ['lm_head'] + + quant_config = QuantConfig( + w_bit=4, # 权重量化位数 + a_bit=16, # 激活值量化位数 + disable_names=disable_names, # 不做量化的层 + dev_type='cpu', + pr=1.0, # 量化正则百分比 + w_sym=True, # 对称/非对称量化,True为对称量化,False为非对称量化 + mm_tensor=False, # 权重量化粒度,True为per-tensor量化,False为per-channel量化(大模型场景建议False) + is_lowbit=True, + open_outlier=False, + group_size=64 + ) + + calibrator = Calibrator( + model, + quant_config, + calib_data=None, # W4A16量化无需校准 + disable_level='L0' # 自动回退等级,根据精度损失程度增加不量化的层(L0~L5,L0为不回退,精度损失明显时可适当提升等级) + ) + + calibrator.run() # 执行PTQ量化校准 + + calibrator.save(quant_save_path, save_type=["safe_tensor"]) + + +if __name__ == "__main__": + fp16_path_0 = sys.argv[1] + quant_save_path_0 = sys.argv[2] + main(fp16_path_0, quant_save_path_0) diff --git a/mindie/examples/models/qwen/quant_qwen2_72b_w8a16_fast.py b/mindie/examples/models/qwen/quant_qwen2_72b_w8a16_fast.py new file mode 100644 index 00000000..a45c8388 --- /dev/null +++ b/mindie/examples/models/qwen/quant_qwen2_72b_w8a16_fast.py @@ -0,0 +1,53 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import sys +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + + +def load_tokenizer_and_model(fp16_path): + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=fp16_path, + pad_token='<|extra_0|>', + eos_token='<|endoftext|>', + padding_side='left', + trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=fp16_path, + torch_dtype=torch.float32, trust_remote_code=True + ).cpu() + return tokenizer, model + + +def main(fp16_path, quant_save_path): + tokenizer, model = load_tokenizer_and_model(fp16_path) + + disable_names = ['lm_head'] + + quant_config = QuantConfig( + w_bit=8, # 权重量化位数 + a_bit=16, # 激活值量化位数 + disable_names=disable_names, # 不做量化的层 + dev_type='cpu', + pr=1.0, # 量化正则百分比 + w_sym=True, # 对称/非对称量化,True为对称量化,False为非对称量化 + mm_tensor=False # 权重量化粒度,True为per-tensor量化,False为per-channel量化(大模型场景建议False) + ) + + calibrator = Calibrator( + model, + quant_config, + calib_data=None, # W8A16量化无需校准 + disable_level='L0' # 自动回退等级,根据精度损失程度增加不量化的层(L0~L5,L0为不回退,精度损失明显时可适当提升等级) + ) + + calibrator.run() # 执行PTQ量化校准 + + calibrator.save(quant_save_path, save_type=["safe_tensor"]) + + +if __name__ == "__main__": + fp16_path_0 = sys.argv[1] + quant_save_path_0 = sys.argv[2] + main(fp16_path_0, quant_save_path_0) diff --git a/mindie/examples/models/qwen/quant_qwen_14b_w8a8.py b/mindie/examples/models/qwen/quant_qwen_14b_w8a8.py new file mode 100644 index 00000000..fefbf51f --- /dev/null +++ b/mindie/examples/models/qwen/quant_qwen_14b_w8a8.py @@ -0,0 +1,339 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
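+# Usage: python quant_qwen_14b_w8a8.py <fp16_weight_path> <quant_save_path>
+# W8A8 post-training quantization; the C-Eval style multiple-choice prompts in data_list
+# below are prepared as the calibration set via get_calib_dataset.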
+import sys +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlierConfig, AntiOutlier + + +def load_tokenizer_and_model(fp16_path): + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=fp16_path, + pad_token='<|extra_0|>', + eos_token='<|endoftext|>', + padding_side='left', + trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=fp16_path, + torch_dtype=torch.float32, trust_remote_code=True + ).cpu() + return tokenizer, model + + +def infer(tokenizer, model, query, model_params=None): + """ + 推理代码 + :param query: + :param model_params: + :return: + """ + inputs = tokenizer(query, return_tensors='pt') + inputs = inputs.to(model.device) + with torch.no_grad(): + model_params = model_params if model_params is not None else {} + pred = model.generate(**inputs, **model_params) + output = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) + return output + + +def get_calib_dataset(tokenizer, calib_list): + calib_dataset = [] + for calib_data in calib_list: + inputs = tokenizer(calib_data, return_tensors='pt').to("cpu") + calib_dataset.append([inputs.data['input_ids']]) + return calib_dataset + + +def main(fp16_path, quant_save_path): + tokenizer, model = load_tokenizer_and_model(fp16_path) + + data_list = [ + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法\ + 治国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领\ + 域制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资\ + 规模较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次\ + 的问题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道\ + 理,是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n我国人民代表大\ + 会制度组织和活动的基本原则是____\nA. 人民当家作主的原则\nB. 民主集中制的原则\nC. 在宪法和法律范围内活动的原则\nD. 公平、公正、\ + 公开的原则\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n社会主义和谐社会的核心价值\ + 是____\nA. 以人为本\nB. 以民为本\nC. 社会公平\nD. 公平和正义\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进行\nB\ + . 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国家进\ + 行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围绕____\nA. \ + 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 
推进社会领域制度创新\nAnsw\ + er: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较小\nB. 经营方式\ + 单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。公有制经济的性质\ + 体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执政兴国的第一要\ + 务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n21世纪前10年,我国经济体制必须解决好的\ + 历史课题是____\nA. 实施科教兴国战略和可持续发展战略\nB. 促进国民经济持续快速健康发展\nC. 大多数国有大中型骨干企业建立现代企业制\ + 度\nD. 建立比较完善的社会主义市场经济体制\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n首次以“台湾回到祖国怀抱,\ + 实现统一大业”来代替“解放台湾”的提法的是____\nA. 1978年12月党的十一届三中全会公报\nB. 1979年元旦全国人大常委会发表《告台湾同胞书\ + 》\nC. 1981年9月叶剑英对新华社记者发表的被称为“叶九条”的谈话\nD. 1982年中国共产党十二大政治报告\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n毛泽东思想达到成熟的标志\ + 是____\nA. 新民主主义理论科学体系的形成\nB. 农村包围城市革命道路理论的形成\nC. 新民主主义革命基本经验的提出\nD. 毛泽东军事路线的\ + 完整形成\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进行\nB\ + . 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国家进\ + 行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围绕____\nA. \ + 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制度创\ + 新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题\ + 。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党\ + 执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n建设和谐文化的根本\ + 是____\nA. 坚持马克思主义的指导\nB. 发展科学和教育\nC. 坚持社会主义核心价值体系\nD. 推进文化体制改革\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 
不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党\ + 执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n构建社会主义和谐社会的重\ + 点是____\nA. 坚持以马列主义、毛泽东思想、邓小平理论和“三个代表”重要思想为指导\nB. 民主法制、公平正义、诚信友爱、充满活力、安定有\ + 序、人与自然和谐相处\nC. 解决人民群众最关心、最直接、最现实的利益问题\nD. 到2020年完全实现社会主义和谐社会\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n台湾问题的本质是____\nA. \ + 中国的内政问题\nB. 中国同美国的关系问题\nC. 中国同日本的关系问题\nD. 共产党与国民党的关系问题\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n在中国共产党历史上,最早提\ + 出“马克思主义中国化”这个命题的是____\nA. 李大钊\nB. 陈独\nC. 毛泽东\nD. 张闻天\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n党的十八大提出,面对资源约\ + 束趋紧、环境污染严重、生态系统退化的严峻形势,必须树立尊重自然、顺应自然、保护自然的生态文明理念。人与自然相处时应秉持的首要态度\ + 是____\nA. 尊重自然\nB. 顺应自然\nC. 保护自然\nD. 征服自然\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域制\ + 度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党\ + 执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 
制度上\nAnswer: C\n\n中国革命的特点和优点\ + 是____\nA. 由中国共产党领导的人民战争\nB. 目标是争取民族独立、人民解放,最终实现国家的繁荣富强\nC. 以反帝反封建作为两大革命任\ + 务\nD. 以武装的革命反对武装的反革命\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过的\ + 《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法\ + 治国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规\ + 模较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的\ + 问题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道\ + 理,是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n香港特别行政区\ + 的高度自治权的唯一来源是____\nA. 中央授权\nB. 香港特别行政区本身固有的\nC. 《中英联合声明》\nD. 中央授权之外的剩余权\ + 力\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模\ + 较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问\ + 题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,\ + 是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n我国实施改革的目\ + 的是____\nA. 巩固社会主义制度\nB. 发扬社会主义民主\nC. 调动广大人民群众的积极性\nD. 发展社会主义的生产力\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模\ + 较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问\ + 题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,\ + 是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n新民主主义革命总路\ + 线的核心是____\nA. 无产阶级的领导\nB. 人民大众的参与\nC. 反帝反封建\nD. 反官僚资本主义\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模\ + 较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问\ + 题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,\ + 是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n九届人大二次会议正式\ + 将“依法治国”写入宪法,这一政策的核心是____\nA. 人民当家作主\nB. 民主与法制的结合\nC. 法治代替人治\nD. 
有法可依,有法必依,执法\ + 必严,违法必究\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模\ + 较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问\ + 题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,\ + 是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n在社会主义初级阶段,\ + 非公有制经济是____\nA. 社会主义公有制经济的补充\nB. 社会主义市场经济的重要组成部分\nC. 具有公有性质的经济\nD. 逐步向公有制过渡的\ + 经济\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规\ + 模较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的\ + 问题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理\ + ,是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n过渡时期总路线的\ + 特征是____\nA. 重视工业建设\nB. 强调三大改造\nC. 社会主义建设和社会主义改造同时并举\nD. 尤其重视对资本主义工商业的改\ + 造\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进行\ + \nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治国\ + 家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题\ + 。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是\ + 党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n毛泽东思想开始形成是\ + 在____\nA. 国民革命时期\nB. 土地革命战争时期\nC. 解放战争时期\nD. 抗日战争时期\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通过\ + 的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模\ + 较小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问\ + 题。公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,\ + 是党执政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n当今时代的主题\ + 是____\nA. 战争与革命\nB. 和平与发展\nC. 开放与合作\nD. 和谐与共赢\nAnswer:", + "The following are multiple choice questions (with answers) about mao zedong thought.\n\n中共十八届三中全会通\ + 过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化政治体制改革要紧紧围绕____\nA. 提高科学执政、民主执政、依法执政水平进\ + 行\nB. 坚持党的领导、人民当家作主、依法治国有机统一进行\nC. 推进社会主义民主政治制度化、规范化、程序化进行\nD. 
建设社会主义法治\ + 国家进行\nAnswer: B\n\n中共十八届三中全会通过的《中共中央关于全面深化改革若干重大问题的决定》指出,深化社会体制改革要紧紧围\ + 绕____\nA. 推进基本公共服务均等化\nB. 改革收入分配制度,促进共同富裕\nC. 更好保障和改善民生、促进社会公平正义\nD. 推进社会领域\ + 制度创新\nAnswer: C\n\n个体经济、私营经济都是非公有制经济,但是,个体经济在性质上不同于私营经济,因为个体经济____\nA. 投资规模较\ + 小\nB. 经营方式单一\nC. 主要依靠自己劳动和经营\nD. 不是法人企业\nAnswer: C\n\n公有制经济的性质和实现形式是两个不同层次的问题。\ + 公有制经济的性质体现在____\nA. 组织形式上\nB. 所有权的归属上\nC. 经营方式上\nD. 分配方式上\nAnswer: B\n\n发展是硬道理,是党执\ + 政兴国的第一要务。要把发展的基点放在____\nA. 改革上\nB. 科技上\nC. 创新上\nD. 制度上\nAnswer: C\n\n正式把毛泽东思想确立为党的\ + 指导思想并首次写进党章的是____\nA. 中共六大\nB. 中共七大\nC. 中共八大\nD. 中共十二大\nAnswer:" + ] + dataset_calib = get_calib_dataset(tokenizer, data_list) + + disable_names = [ + 'transformer.h.0.mlp.c_proj', + 'transformer.h.1.mlp.c_proj', + 'transformer.h.2.mlp.c_proj', + 'transformer.h.3.mlp.c_proj', + 'transformer.h.4.mlp.c_proj', + 'transformer.h.5.mlp.c_proj', + 'transformer.h.6.mlp.c_proj', + 'transformer.h.7.mlp.c_proj', + 'transformer.h.8.mlp.c_proj', + 'transformer.h.9.mlp.c_proj', + 'transformer.h.10.mlp.c_proj', + 'transformer.h.11.mlp.c_proj', + 'transformer.h.12.mlp.c_proj', + 'transformer.h.13.mlp.c_proj', + 'transformer.h.14.mlp.c_proj', + 'transformer.h.15.mlp.c_proj', + 'transformer.h.16.mlp.c_proj', + 'transformer.h.17.mlp.c_proj', + 'transformer.h.18.mlp.c_proj', + 'transformer.h.19.mlp.c_proj', + 'transformer.h.20.mlp.c_proj', + 'transformer.h.21.mlp.c_proj', + 'transformer.h.22.mlp.c_proj', + 'transformer.h.23.mlp.c_proj', + 'transformer.h.24.mlp.c_proj', + 'transformer.h.25.mlp.c_proj', + 'transformer.h.26.mlp.c_proj', + 'transformer.h.27.mlp.c_proj', + 'transformer.h.28.mlp.c_proj', + 'transformer.h.29.mlp.c_proj', + 'transformer.h.30.mlp.c_proj', + 'transformer.h.31.mlp.c_proj', + 'transformer.h.32.mlp.c_proj', + 'transformer.h.33.mlp.c_proj', + 'transformer.h.34.mlp.c_proj', + 'transformer.h.35.mlp.c_proj', + 'transformer.h.36.mlp.c_proj', + 'transformer.h.37.mlp.c_proj', + 'transformer.h.38.mlp.c_proj', + 'transformer.h.39.mlp.c_proj', + 'lm_head' + ] + + anti_config = AntiOutlierConfig(anti_method="m2", dev_type="cpu") # dev_type="npu", dev_id=0 如果需要使用npu进行量化 + anti_outlier = AntiOutlier(model, calib_data=dataset_calib, cfg=anti_config) + anti_outlier.process() + + quant_config = QuantConfig( + w_bit=8, # 权重量化位数 + a_bit=8, # 激活值量化位数 + disable_names=disable_names, # 不做量化的层(通常是空list) + dev_type='cpu', + act_method=1, # 激活量化方法,建议方法3(1:min-max;2:histogram;3:自动混合量化) + pr=1.0, # 量化正则百分比,建议0.5 + w_sym=True, # 对称/非对称量化,True为对称量化,False为非对称量化 + mm_tensor=False # 权重量化粒度,True为per-tensor量化,False为per-channel量化(大模型场景建议False) + ) + + calibrator = Calibrator( + model, + quant_config, + calib_data=dataset_calib, + disable_level='L0' # 自动回退等级,根据精度损失程度增加不量化的层(L0~L5,L0为不回退,精度损失明显时可适当提升等级) + ) + + calibrator.run() # 执行PTQ量化校准 + + calibrator.save(quant_save_path, save_type=["safe_tensor"]) + +if __name__ == "__main__": + model_path = sys.argv[1] + quant_weight_save_path = sys.argv[2] + main(model_path, quant_weight_save_path) diff --git a/mindie/examples/models/qwen/quant_qwen_72b_w8a16.py b/mindie/examples/models/qwen/quant_qwen_72b_w8a16.py new file mode 100644 index 00000000..38d67090 --- /dev/null +++ b/mindie/examples/models/qwen/quant_qwen_72b_w8a16.py @@ -0,0 +1,132 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
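+#
+# Minimal usage sketch (paths are placeholders): the script takes the FP16 weight
+# directory as argv[1] and the output directory for the W8A16 weights as argv[2]:
+#     python quant_qwen_72b_w8a16.py /path/to/qwen-72b-fp16 /path/to/qwen-72b-w8a16
+# Calibration and quantization run on CPU (the model is loaded in float32 with
+# dev_type='cpu'), and the quantized weights are saved in safetensor format.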
+import sys +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig + + +def load_tokenizer_and_model(fp16_path): + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=fp16_path, + pad_token='<|extra_0|>', + eos_token='<|endoftext|>', + padding_side='left', + trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=fp16_path, + torch_dtype=torch.float32, trust_remote_code=True + ).cpu() + return tokenizer, model + + +def infer(tokenizer, model, query, model_params=None): + """ + 推理代码 + :param query: + :param model_params: + :return: + """ + inputs = tokenizer(query, return_tensors='pt') + inputs = inputs.to(model.device) + with torch.no_grad(): + model_params = model_params if model_params is not None else {} + pred = model.generate(**inputs, **model_params) + output = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) + return output + + +def get_calib_dataset(tokenizer, calib_list): + calib_dataset = [] + for calib_data in calib_list: + inputs = tokenizer(calib_data, return_tensors='pt').to("cpu") + calib_dataset.append([inputs.data['input_ids']]) + return calib_dataset + + +def main(fp16_path, quant_save_path): + tokenizer, model = load_tokenizer_and_model(fp16_path) + + data_list = [ + ["电子发票有哪些注意事项?"], + ["费用报销需要提供哪些材料?"], + ["微信支付可以支持哪些银行卡?"], + ["简历中应该如何突出重点?"], + ["海外留学需要注意哪些事项?"], + ["云计算对于企业有哪些好处?"], + ["常见的投资方式有哪些?"], + ["什么是股票的基本面分析?"], + ["运动员如何保持良好的竞技状态?"], + ["暴雨天气应该注意哪些安全事项?"], + ["驾照考试一共有几个科目?"], + ["食品安全检测的流程是什么?"], + ["股票交易中的龙头股是什么?"], + ["网络攻击有哪些形式?"], + ["新能源汽车的优势是什么?"], + ["What are the benefits of cloud computing for businesses?"], + ["What documents are required for expense reimbursement?"], + ["How to highlight key points in a resume?"], + ["What should be paid attention to when studying abroad?"], + ["Which banks does WeChat payment support?"], + ["What are the common investment methods?"], + ["What is the process of food safety inspection?"], + ["What is the basic analysis of stock fundamentals?"], + ["How do athletes maintain good athletic performance?"], + ["What safety precautions should be taken in rainy weather?"], + ["What are the subjects of the driver's license exam?"], + ["What are the types of cyber attacks?"], + ["What is the concept of leading stocks in stock trading?"], + ["What should be noted in the use of electronic invoices?"], + ["What are the advantages of new energy vehicles?"], + ["如何有效管理个人财务?"], + ["什么是人工智能的发展趋势?"], + ["如何设计一个用户友好的网站界面?"], + ["为什么要进行环境保护?"], + ["如何预防常见的网络安全漏洞?"], + ["如何培养良好的沟通能力?"], + ["学习一门外语需要多长时间?"], + ["什么是健康的饮食习惯?"], + ["什么是心理健康?如何保持心理健康?"], + ["如何应对工作压力?"], + ["How to effectively manage personal finances?"], + ["What are the development trends of artificial intelligence?"], + ["How to design a user-friendly website interface?"], + ["Why is environmental protection important?"], + ["How to prevent common network security vulnerabilities?"], + ["How to cultivate good communication skills?"], + ["How long does it take to learn a foreign language?"], + ["What are healthy eating habits?"], + ["What is mental health and how to maintain it?"], + ["How to cope with work-related stress?"] + ] + dataset_calib = get_calib_dataset(tokenizer, data_list) + + disable_names = ['lm_head'] + + quant_config = QuantConfig( + w_bit=8, # 权重量化位数 + a_bit=16, # 激活值量化位数 + disable_names=disable_names, # 不做量化的层(通常是空list) + dev_type='cpu', + act_method=3, # 
激活量化方法,建议方法3(1:min-max;2:histogram;3:自动混合量化) + pr=1.0, # 量化正则百分比,建议0.5 + w_sym=False, # 对称/非对称量化,True为对称量化,False为非对称量化 + mm_tensor=False # 权重量化粒度,True为per-tensor量化,False为per-channel量化(大模型场景建议False) + ) + + calibrator = Calibrator( + model, + quant_config, + calib_data=dataset_calib, + disable_level='L0' # 自动回退等级,根据精度损失程度增加不量化的层(L0~L5,L0为不回退,精度损失明显时可适当提升等级) + ) + + calibrator.run() # 执行PTQ量化校准 + + calibrator.save(quant_save_path, save_type=["safe_tensor"]) + + +if __name__ == "__main__": + model_path = sys.argv[1] + quant_weight_save_path = sys.argv[2] + main(model_path, quant_weight_save_path) diff --git a/mindie/examples/models/qwen/qwen.jinja b/mindie/examples/models/qwen/qwen.jinja new file mode 100644 index 00000000..e836bf51 --- /dev/null +++ b/mindie/examples/models/qwen/qwen.jinja @@ -0,0 +1,10 @@ +{%- for message in messages -%} + {%- if loop.first and messages[0]['role'] != 'system' -%} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' -}} + {%- endif -%} + {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' -}} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- '<|im_start|>assistant\n' -}} +{%- endif -%} \ No newline at end of file diff --git a/mindie/examples/models/qwen/run_fa.sh b/mindie/examples/models/qwen/run_fa.sh new file mode 100644 index 00000000..b01e74eb --- /dev/null +++ b/mindie/examples/models/qwen/run_fa.sh @@ -0,0 +1,42 @@ +#!/bin/bash +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export HCCL_BUFFSIZE=120 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) +model_path="" + +function usage(){ + echo "$0 pls. use '-m|--model-path' input model path" + exit -1 +} + +if [[ $# -eq 0 ]];then + usage +fi + +GETOP_ARGS=`getopt -o m: -al model-path: -- "$@"` +eval set -- "${GETOP_ARGS}" +while [ -n "$1" ] +do + case "$1" in + -m|--model-path) model_path=$2;shift 2;; + --) shift;break;; + *) usage;break;; + esac +done + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_fa --model_path $model_path +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_fa --model_path $model_path +fi \ No newline at end of file diff --git a/mindie/examples/models/qwen/run_pa.sh b/mindie/examples/models/qwen/run_pa.sh new file mode 100644 index 00000000..e99bba58 --- /dev/null +++ b/mindie/examples/models/qwen/run_pa.sh @@ -0,0 +1,43 @@ +#!/bin/bash +export BIND_CPU=1 +export RESERVED_MEMORY_GB=3 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=12347 +export TP_WORLD_SIZE=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) +export INT8_FORMAT_NZ_ENABLE=1 +model_path="" +is_chat_model="false" + +function usage(){ + echo "$0 pls. 
use '-m|--model-path' input model path" + exit -1 +} + +if [[ $# -eq 0 ]];then + usage +fi + +GETOP_ARGS=`getopt -o m:c:t: -al model-path:,is-chat-model:,--chat_template: -- "$@"` +eval set -- "${GETOP_ARGS}" +while [ -n "$1" ] +do + case "$1" in + -m|--model-path) model_path=$2;shift 2;; + -c|--is-chat-model) is_chat_model=$2;shift 2;; + -t|--chat_template) chat_template=$2;shift 2;; + --) shift;break;; + *) usage;break;; + esac +done + +atb_options="ATB_LAUNCH_KERNEL_WITH_TILING=1 ATB_LAYER_INTERNAL_TENSOR_REUSE=1 PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1" +atb_async_options="ATB_OPERATION_EXECUTE_ASYNC=1 TASK_QUEUE_ENABLE=1" +base_cmd="torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $model_path" +if [[ ${is_chat_model} = "true" ]];then + base_cmd="${base_cmd} --is_chat_model --chat_template \"${chat_template}\"" +fi +run_cmd="${atb_options} ${atb_async_options} ${base_cmd}" + +if [[ -n ${model_path} ]];then + eval "${run_cmd}" +fi diff --git a/mindie/examples/models/qwen_vl/README.md b/mindie/examples/models/qwen_vl/README.md new file mode 100644 index 00000000..bea7ec32 --- /dev/null +++ b/mindie/examples/models/qwen_vl/README.md @@ -0,0 +1,107 @@ +# README + +- Qwen-VL 是阿里云研发的大规模视觉语言模型(Large Vision Language Model, LVLM)。Qwen-VL 可以以图像、文本、检测框作为输入,并以文本和检测框作为输出。 + + +## 特性矩阵 + +QWen-VL模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|------------------|-----------------|-----------------|---------|-----------|---------|-----------|--------------------------|-----|--------|---|--------| +| Qwen-VL | 支持world size 1,2,4,8 | 支持world size 1,2,4,8 | √ | √ | √ | √ | × | × | × | × | × | × | √ | × | × | + +## 路径变量解释 + +| 变量名 | 含义 | +| ---------- | ------- | +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为 `${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为 `${working_dir}/MindIE-LLM/examples/atb_models` | +| model_path | 模型所在路径。`${llm_path}/examples/models/qwen_vl` | + + +## 权重 + +**权重下载** + +- [Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/files) + +**基础环境变量** + +- Toolkit, MindIE/ATB,ATB-SPEED等,参考[此README文件](../../../README.md) +- Python其他第三方库依赖,参考[requirements_qwen_vl.txt](../../../requirements/models/requirements_qwen_vl.txt) +- Python某些第三方库依赖会重新安装torch,可能导致别的错误,请检查确保 torchvision==0.16.2 + +## 推理 + +**运行Paged Attention FP16** + +- 执行启动脚本 + + 在`${llm_path}`目录下执行以下指令 + ```shell + bash ${model_path}/run_pa.sh + ``` + +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改,详细信息可参考ATB官方文档 + ```shell + export ATB_LAUNCH_KERNEL_WITH_TILING=1 + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' + export HCCL_BUFFSIZE=120 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + ``` + +## 精度测试 + +### 测试方法 +使用同样的一组图片与相同的文本输入,分别在GPU和NPU上执行推理,得到两组图片描述。再使用open_clip模型作为裁判,对两组结果分别进行评分,评分越高越好。 + +### 测试步骤 + 
+- 权重和图片下载 + - 下载open_clip的权重[open_clip_pytorch_model.bin](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/tree/main) + - 下载测试图片[CoCotest 数据集](https://cocodataset.org/#download),随机抽取其中100张图片作为测试数据集 + - 安装open_clip仓库(众多github下载的库可以参照如下方式,快速安装) + ```shell + # 在命令行界面中手动克隆 open_clip 仓库,进入克隆下来的 open_clip 目录 pip 安装 + git clone https://github.com/mlfoundations/open_clip.git + cd open_clip + pip install -e . + ``` + + +- 推理得到两组图片描述 + - GPU推理:run_coco_rst_GPU.py 脚本在GPU上执行如下命令,运行成功后在执行目录下生成 gpu_coco_rst.json文件 + ```shell + python run_coco_rst_GPU.py --model_path {qwenvl权重路径} --image_path {测试数据集路径} + ``` + - NPU推理:执行推理脚本,运行成功后在执行目录下生成 npu_coco_rst.json文件, 注意参数(--dataset_path 测试数据集路径, input_texts 需要NPU与GPU保持一致,默认为 'Generate the caption in English with grounding:') + ```shell + bash examples/models/qwen_vl/run_pa.sh + ``` + + +- 评分 + + 分别使用GPU和NPU推理得到的两组图片描述(gpu_coco_rst.json、npu_coco_rst.json)作为输入,执行clip_score_qwenvl.py 脚本输出评分结果 + 在`${llm_path}`目录下执行: + ```bash + python examples/models/qwen_vl/precision/clip_score_qwenvl.py \ + --model_weights_path {open_clip_pytorch_model.bin 的路径} \ + --image_info {gpu_coco_rst.json 或 npu_coco_rst.json 的路径} \ + --dataset_path {测试数据集路径} + ``` diff --git a/mindie/examples/models/qwen_vl/precision/GPU_NPU_result_example.json b/mindie/examples/models/qwen_vl/precision/GPU_NPU_result_example.json new file mode 100644 index 00000000..ba51fd14 --- /dev/null +++ b/mindie/examples/models/qwen_vl/precision/GPU_NPU_result_example.json @@ -0,0 +1,4 @@ +{ + "01.jpg": "This image ...", + "02.jpg": "This image ..." +} diff --git a/mindie/examples/models/qwen_vl/precision/clip_score_qwenvl.py b/mindie/examples/models/qwen_vl/precision/clip_score_qwenvl.py new file mode 100644 index 00000000..d57a1af8 --- /dev/null +++ b/mindie/examples/models/qwen_vl/precision/clip_score_qwenvl.py @@ -0,0 +1,121 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
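+#
+# This judge script scores each generated caption against its source image with
+# open_clip (ViT-H-14): score = cosine_similarity(encode_image(img), encode_text(caption)),
+# and reports the mean score over all images. A minimal invocation sketch, assuming
+# the default file names from the README (paths are placeholders):
+#     python clip_score_qwenvl.py --model_weights_path ./open_clip_pytorch_model.bin \
+#         --image_info ./npu_coco_rst.json --dataset_path ./cocoTest/
+# Run it once with gpu_coco_rst.json and once with npu_coco_rst.json, then compare
+# the two mean scores.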
+ +import argparse +import json +import os +import time +import open_clip +import torch +import torch_npu +import torch.nn.functional as F + +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger +from PIL import Image + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device", + type=str, + default="0", + help="device for torch.", + ) + parser.add_argument( + "--model_name", + type=str, + default="ViT-H-14", + help="open clip model name", + ) + parser.add_argument( + "--model_weights_path", + type=str, + default="./open_clip_pytorch_model.bin", + help="open clip model weights", + ) + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + parser.add_argument( + "--dataset_path", + type=str, + default="./cocoTest/", + help="dataset path for precision test.", + ) + return parser.parse_args() + + +def set_torch_env(device_ids): + torch_npu.npu.set_device(int(device_ids)) + torch.npu.set_compile_mode(jit_compile=False) + + +def clip_score(model_clip, tokenizer, preprocess, model_answer, image_file): + imgs = [] + texts = [] + + img = preprocess(Image.open(image_file)).unsqueeze(0).npu() + imgs.append(img) + text = tokenizer([model_answer]).npu() + texts.append(text) + + img = torch.cat(imgs) # [bs, 3, 224, 224] + text = torch.cat(texts) # [bs, 77] + + with torch.no_grad(): + text_ft = model_clip.encode_text(text).float() + img_ft = model_clip.encode_image(img).float() + score = F.cosine_similarity(img_ft, text_ft).squeeze() + + return score.cpu() + + +def main(): + args = parse_arguments() + set_torch_env(args.device) + + t_b = time.time() + logger.info("Load clip model...") + model_clip, _, preprocess = open_clip.create_model_and_transforms( + args.model_name, pretrained=args.model_weights_path, device=f"npu:{args.device}") + model_clip.eval() + logger.info(f">done. elapsed time: {(time.time() - t_b):.3f} s") + + tokenizer = open_clip.get_tokenizer("ViT-H-14") + with safe_open(args.image_info, "r", override_flags=os.O_RDONLY) as f: + image_info = json.load(f) + + t_b = time.time() + + logger.info("Calc clip score...") + all_scores = [] + for image_file, model_answer in image_info.items(): + # 单个图片 单个answer + image_file_path = os.path.join(args.dataset_path, image_file) + logger.info(f"cur image file: {image_file_path}") + image_score = clip_score(model_clip, tokenizer, preprocess, model_answer, image_file_path) + logger.info(f"{image_score=}") + all_scores.append(image_score) + all_scores_mean = torch.mean(torch.tensor(all_scores)) + logger.info(f"平均分:{all_scores_mean=}") + logger.info(f">done. elapsed time: {(time.time() - t_b):.3f} s") + + +if __name__ == '__main__': + main() diff --git a/mindie/examples/models/qwen_vl/precision/run_coco_rst_GPU.py b/mindie/examples/models/qwen_vl/precision/run_coco_rst_GPU.py new file mode 100644 index 00000000..e1d3da6a --- /dev/null +++ b/mindie/examples/models/qwen_vl/precision/run_coco_rst_GPU.py @@ -0,0 +1,62 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
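+#
+# GPU-side baseline: captions every image under --image_path with Qwen-VL on CUDA
+# and writes a {file_name: caption} JSON used for the CLIP-score comparison.
+# A minimal invocation sketch (paths are placeholders):
+#     python run_coco_rst_GPU.py --model_path /path/to/Qwen-VL --image_path ./cocoTest/ \
+#         --results_save_path ./gpu_coco_rst.json
+# Note that --results_save_path defaults to ./npu_coco_rst.json, so pass it explicitly
+# as above to produce the gpu_coco_rst.json file name expected by the README.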
+import argparse +import json +import os +import torch +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer +from atb_llm.utils.file_utils import safe_open + +torch.manual_seed(1234) +output_json_path = "./gpu_coco_rst.json" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Demo") + parser.add_argument("--model_path", + required=True, + help="Model and tokenizer path.") + parser.add_argument("--image_path", + required=True, + help="Image path for inference.") + parser.add_argument("--results_save_path", + help="precision test result path", + default="./npu_coco_rst.json") + return parser.parse_args() + + +def main(): + args = parse_args() + model_path = args.model_path + image_path = args.image_path + print(f"===== model_path: {model_path}") + print(f"===== image_path: {image_path}") + if os.path.exists(model_path) and os.path.exists(image_path): + images_list = os.listdir(image_path) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_path, + device_map="cuda", + trust_remote_code=True, + fp16=True).eval() + gpu_rst = {} + for _, img_name in enumerate(tqdm(images_list)): + img_path = os.path.join(image_path, img_name) + query = tokenizer.from_list_format([ + {'image': img_path}, + {'text': 'Generate the caption in English with grounding:'}, + ]) + inputs = tokenizer(query, return_tensors='pt') + inputs = inputs.to(model.device) + pred = model.generate(**inputs) + response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False) + gpu_rst[img_name] = response.split("grounding:")[-1] + sorted_dict = dict(sorted(gpu_rst.items())) + with safe_open(args.results_save_path, "w", override_flags=os.O_WRONLY | os.O_CREAT | os.O_EXCL) as f: + json.dump(sorted_dict, f) + print("run run_coco_rst_GPU.py finish! output file: ./gpu_coco_rst.json") + else: + print("model_path or image_path not exist") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/mindie/examples/models/qwen_vl/run_pa.py b/mindie/examples/models/qwen_vl/run_pa.py new file mode 100644 index 00000000..090fe197 --- /dev/null +++ b/mindie/examples/models/qwen_vl/run_pa.py @@ -0,0 +1,448 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
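+#
+# Paged Attention entry point for Qwen-VL; normally launched through
+# examples/models/qwen_vl/run_pa.sh, which wraps it in torchrun with --model_path,
+# --input_images and --input_texts. For the precision test, passing --dataset_path
+# makes the runner caption every image in that directory and write the sorted
+# {file_name: caption} results to --results_save_path (default ./npu_coco_rst.json).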
+import argparse +import copy +import math +import os +import time +import json + +import torch +import torch_npu +from atb_llm.runner import ModelRunner +from atb_llm.utils.cpu_binding import NpuHbmInfo +from atb_llm.utils.env import ENV +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger, print_log +from examples.server.cache import CacheConfig, ModelConfig, CacheManager +from examples.server.generate import decode_token, generate_req +from examples.server.request import request_from_token + +_IMAGE_START_ID = 151857 +_IMAGE_PLACE_HOLDER = 256 + + +def request_from_text_and_image( + text, image, tokenizer, max_out_length, block_size, req_idx=0 +): + if image is not None: + query_list = [{"image": image}, {"text": text}] + else: + query_list = [{"text": text}] + + query = tokenizer.from_list_format(query_list) + input_ids = tokenizer([query], return_tensors="pt")["input_ids"].flatten() + request = request_from_token(input_ids, max_out_length, block_size, req_idx) + return request + + +class PARunner: + def __init__(self, **kwargs): + self.rank = kwargs.get("rank", "0") + self.local_rank = kwargs.get("local_rank", self.rank) + self.world_size = kwargs.get("world_size", "1") + + self.model_path = kwargs.get("model_path", None) + self.input_text = kwargs.get("input_text", None) + self.max_position_embeddings = kwargs.get("max_position_embeddings", None) + self.max_input_length = kwargs.get("max_input_length", None) + self.max_prefill_tokens = kwargs.get("max_prefill_tokens", None) + self.max_output_length = kwargs.get("max_output_length", None) + self.is_flash_model = kwargs.get("is_flash_model", None) + self.max_batch_size = kwargs.get("max_batch_size", None) + if self.max_prefill_tokens == -1: + self.max_prefill_tokens = self.max_batch_size * ( + self.max_input_length + self.max_output_length + ) + + self.block_size = kwargs.get("block_size", None) + + self.model = ModelRunner( + self.model_path, + rank=self.rank, + world_size=self.world_size, + local_rank=self.local_rank, + max_position_embeddings=self.max_position_embeddings + ) + self.tokenizer = self.model.tokenizer + self.dtype = self.model.dtype + self.quantize = self.model.quantize + self.model.load_weights() + + self.device = self.model.device + self.model_config = ModelConfig( + self.model.num_heads, + self.model.num_kv_heads, + self.model.head_size, + self.model.num_layers, + self.model.device, + self.model.dtype, + self.model.soc_info, + self.quantize + ) + + self.max_memory = NpuHbmInfo.get_hbm_capacity( + self.local_rank, self.world_size, self.model.soc_info.need_nz + ) + self.init_memory = int( + self.max_memory + * NpuHbmInfo.get_hbm_usage( + self.local_rank, self.world_size, self.model.soc_info.need_nz + ) + ) + print_log( + self.rank, + logger.info, + f"hbm_capacity(GB): {self.max_memory / (1024 ** 3)}, " + f"init_memory(GB): {self.init_memory / (1024 ** 3)}", + ) + + self.warm_up_memory = 0 + self.warm_up_num_blocks = 0 + self.cache_manager = None + + def __repr__(self): + return ( + f"PARunner(" + + f"model_path={self.model_path}, " + + f"input_text={self.input_text}, " + + f"max_position_embeddings={self.max_position_embeddings}, " + + f"max_input_length={self.max_input_length}, " + + f"max_output_length={self.max_output_length}, " + + f"max_prefill_tokens={self.max_prefill_tokens}, " + + f"is_flash_model={self.is_flash_model}, " + + f"max_batch_size={self.max_batch_size}, " + + f"dtype={self.dtype}, " + + f"block_size={self.block_size}, " + + f"model_config={self.model_config}, " + + 
f"max_memory={self.max_memory}, " + ) + + def warm_up(self): + all_input_length = self.max_batch_size * self.max_input_length + input_ids_list = ( + [_IMAGE_START_ID] + + [_IMAGE_START_ID + 2] * _IMAGE_PLACE_HOLDER + + [_IMAGE_START_ID + 1] + + [1] * (all_input_length - _IMAGE_PLACE_HOLDER - 2) + ) + input_ids = torch.tensor(input_ids_list, dtype=torch.int64).to(self.device) + position_ids = ( + torch.arange(self.max_input_length, dtype=torch.int32) + .repeat(self.max_batch_size) + .to(self.device) + ) + cu_seqlen_prefill = torch.tensor([1]) + try: + block_num = math.ceil(all_input_length / self.block_size) + except ZeroDivisionError as e: + raise ZeroDivisionError from e + block_tables_tensor = ( + torch.arange(block_num, dtype=torch.int32).view(1, -1).to(self.device) + ) + slots = torch.arange(all_input_length, dtype=torch.int32).to(self.device) + input_lengths_tensor = torch.tensor( + [self.max_input_length] * self.max_batch_size, dtype=torch.int64 + ).to(self.device) + prefill_head_indices = torch.tensor( + [all_input_length - 1], dtype=torch.int64 + ).to(self.device) + print_log(self.rank, logger.info, "---------------begin warm_up---------------") + try: + self.warm_up_num_blocks = ( + math.ceil( + (self.max_input_length + self.max_output_length) / self.block_size + ) + * self.max_batch_size + ) + except ZeroDivisionError as e: + raise ZeroDivisionError from e + cache_config = CacheConfig(self.warm_up_num_blocks, self.block_size) + self.cache_manager = CacheManager(cache_config, self.model_config) + _ = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + is_prefill=cu_seqlen_prefill is not None, + block_tables=block_tables_tensor, + kv_cache=self.cache_manager.kv_cache, + slots=slots, + input_lengths=input_lengths_tensor, + max_seq_len=self.max_input_length, + lm_head_indices=prefill_head_indices, + ) + self.warm_up_memory = int( + self.max_memory + * NpuHbmInfo.get_hbm_usage( + self.local_rank, self.world_size, self.model.soc_info.need_nz + ) + ) + print_log( + self.rank, + logger.info, + f"warmup_memory(GB): {self.warm_up_memory / (1024 ** 3): .2f}", + ) + print_log(self.rank, logger.info, "---------------end warm_up---------------") + + def infer( + self, + input_texts, + input_images, + batch_size, + max_output_length, + ignore_eos, + input_ids=None, + ): + print_log( + self.rank, logger.info, "---------------begin inference---------------" + ) + + if len(input_texts) == 1: + req_list = [ + request_from_text_and_image( + input_texts[0], + None if not input_images else input_images[0], + self.tokenizer, + max_output_length, + self.block_size, + req_idx=i, + ) + for i in range(batch_size) + ] + else: + req_list = [ + request_from_text_and_image( + input_texts[i], + None if not input_images else input_images[i], + self.tokenizer, + max_output_length, + self.block_size, + req_idx=i, + ) + for i in range(len(input_texts)) + ] + + print_log( + self.rank, logger.debug, f"req_list[0].input_ids: {req_list[0].input_ids}" + ) + + if not self.cache_manager: + cache_block_size = ( + self.block_size * self.model.num_kv_heads * self.model.head_size + ) + dtype_size = CacheManager.get_dtype_size(self.dtype) + total_cache_size = self.model.num_layers * cache_block_size * 2 * dtype_size + + max_memory = ( + ENV.memory_fraction * self.max_memory + if not ENV.max_memory_gb + else int(ENV.max_memory_gb) * (1 << 30) + ) + free_memory = ( + max_memory + - ENV.reserved_memory_gb * (1 << 30) + - ( + self.warm_up_memory + if self.warm_up_memory != 0 + else self.init_memory + ) + ) + 
print_log( + self.rank, + logger.info, + f"infer max_memory(GB): {max_memory / (1024 ** 3): .2f}, " + f"warm_up_memory(GB): {self.warm_up_memory / (1024 ** 3): .2f}, " + f"free_memory(GB): {free_memory / (1024 ** 3): .2f}", + ) + + num_blocks = int(free_memory // total_cache_size) + if num_blocks <= 0: + raise ValueError("num_blocks must be positive!") + print_log( + self.rank, + logger.info, + f"num_blocks: {num_blocks}, free_memory: {free_memory}", + ) + cache_config = CacheConfig(num_blocks, self.block_size) + self.cache_manager = CacheManager(cache_config, self.model_config) + + if ENV.benchmark_enable: + self.model.postprocessor.max_new_tokens = 2 + req_list_dummy = copy.deepcopy(req_list) + generate_req( + req_list_dummy, + self.model, + self.max_batch_size, + self.max_prefill_tokens, + self.cache_manager, + ) + + self.model.postprocessor.max_new_tokens = max_output_length + if not ENV.profiling_enable: + print_log(self.rank, logger.debug, "no profiling") + torch.npu.synchronize() + e2e_start = time.time() + generate_req( + req_list, + self.model, + self.max_batch_size, + self.max_prefill_tokens, + self.cache_manager, + ) + _, _ = decode_token(req_list, self.tokenizer) + torch.npu.synchronize() + e2e_end = time.time() + e2e_time = e2e_end - e2e_start + else: + print_log(self.rank, logger.debug, "enter profiling") + profiling_path = ENV.profiling_filepath + if not os.path.exists(profiling_path): + os.makedirs(profiling_path, exist_ok=True) + torch.npu.synchronize() + e2e_start = time.time() + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=torch_npu.profiler.ProfilerLevel.Level0, + l2_cache=False, + data_simplification=False, + ) + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU, + ], + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler( + profiling_path + ), + record_shapes=True, + profile_memory=True, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config, + ) as _: + generate_req( + req_list, + self.model, + self.max_batch_size, + self.max_prefill_tokens, + self.cache_manager, + ) + torch.npu.synchronize() + e2e_end = time.time() + e2e_time = e2e_end - e2e_start + + generate_text_list, token_num_list = decode_token(req_list, self.tokenizer) + print_log(self.rank, logger.info, "---------------end inference---------------") + return generate_text_list, token_num_list, e2e_time + + +def parse_ids(list_str): + return [int(item) for item in list_str.split(",")] + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_path", + help="model and tokenizer path", + default="/data/acltransformer_testdata/weights/llama2/llama-2-70b", + ) + parser.add_argument( + "--input_texts", type=str, nargs="+", default=["What is deeplearning?:"] + ) + parser.add_argument("--input_images", type=str, nargs="+", default=[]) + parser.add_argument("--input_ids", type=parse_ids, nargs="+", default=None) + parser.add_argument( + "--input_file", + type=str, + help="CSV or Numpy file containing tokenized input. 
Alternative to text input.", + default=None, + ) + parser.add_argument("--max_position_embeddings", type=int, default=None) + parser.add_argument("--max_input_length", type=int, default=1024) + parser.add_argument("--max_output_length", type=int, default=65) + parser.add_argument("--max_prefill_tokens", type=int, default=-1) + parser.add_argument("--max_batch_size", type=int, default=1) + parser.add_argument("--block_size", type=int, default=128) + parser.add_argument("--quantize", type=str, default=None) + + parser.add_argument("--is_flash_model", action="store_false") + parser.add_argument("--is_bf16", action="store_true") + + parser.add_argument( + "--num_beams", type=int, help="Use beam search if num_beams >1", default=1 + ) + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--top_k", type=int, default=1) + parser.add_argument("--top_p", type=float, default=0.0) + parser.add_argument("--length_penalty", type=float, default=1.0) + parser.add_argument("--repetition_penalty", type=float, default=1.0) + parser.add_argument("--presence_penalty", type=float, default=0.0) + parser.add_argument("--frequency_penalty", type=float, default=0.0) + parser.add_argument("--ignore_eos", action="store_true") + parser.add_argument("--dataset_path", help="precision test dataset path", default=None) + parser.add_argument("--results_save_path", + help="precision test result path", + default="./npu_coco_rst.json") + + return parser.parse_args() + + +def deal_dataset(dataset_path): + input_images = [] + images_list = os.listdir(dataset_path) + for img_name in images_list: + image_path = os.path.join(dataset_path, img_name) + input_images.append(image_path) + input_texts = ["Please describe this image in detail:"] * len(input_images) + + return input_images, input_texts + + +if __name__ == "__main__": + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + input_dict = { + "rank": rank, + "world_size": world_size, + "local_rank": local_rank, + **vars(args), + } + + # precision test + npu_rst_dict = {} + if args.dataset_path: + args.input_images, args.input_texts = deal_dataset(args.dataset_path) + + pa_runner = PARunner(**input_dict) + print_log(rank, logger.info, f"pa_runner: {pa_runner}") + pa_runner.warm_up() + + generate_texts, token_nums, latency = pa_runner.infer( + args.input_texts, + args.input_images, + args.max_batch_size, + args.max_output_length, + args.ignore_eos, + args.input_ids, + ) + + for i, generate_text in enumerate(generate_texts): + length = len(args.input_ids) if args.input_ids else len(args.input_texts) + inputs = args.input_ids if args.input_ids else args.input_texts + if args.dataset_path: + rst_key = args.input_images[i].split("/")[-1] + npu_rst_dict[rst_key] = generate_text + if i < length: + print_log(rank, logger.info, f"Question[{i}]: {inputs[i]}") + print_log(rank, logger.info, f"Answer[{i}]: {generate_text}") + print_log(rank, logger.info, f"Generate[{i}] token num: {token_nums[i]}") + print_log(rank, logger.info, f"Latency: {latency}") + + if args.dataset_path: + sorted_dict = dict(sorted(npu_rst_dict.items())) + with safe_open(args.results_save_path, "w", override_flags=os.O_WRONLY | os.O_CREAT | os.O_EXCL) as f: + json.dump(sorted_dict, f) + print_log(rank, logger.info, "--------------npu precision test finish--------------") diff --git a/mindie/examples/models/qwen_vl/run_pa.sh b/mindie/examples/models/qwen_vl/run_pa.sh new file 
mode 100644 index 00000000..e100e212 --- /dev/null +++ b/mindie/examples/models/qwen_vl/run_pa.sh @@ -0,0 +1,20 @@ +#!/bin/bash +export BIND_CPU=1 +export RESERVED_MEMORY_GB=3 +export ASCEND_RT_VISIBLE_DEVICES=0 +export MASTER_PORT=20030 +export TP_WORLD_SIZE=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +model_path="/data/Qwen-VL" +atb_options="ATB_LAUNCH_KERNEL_WITH_TILING=1 ATB_LAYER_INTERNAL_TENSOR_REUSE=1 PYTORCH_NPU_ALLOC_CONF='max_split_size_mb:2048' HCCL_BUFFSIZE=120 ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1" +atb_async_options="ATB_OPERATION_EXECUTE_ASYNC=1 TASK_QUEUE_ENABLE=1" +base_cmd="torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT \ + -m examples.models.qwen_vl.run_pa \ + --model_path $model_path \ + --input_images 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg' \ + --input_texts 'Generate the caption in English with grounding:' " +run_cmd="${atb_options} ${atb_async_options} ${base_cmd}" + +if [[ -n ${model_path} ]];then + eval "${run_cmd}" +fi diff --git a/mindie/examples/models/stablelm/run_pa.sh b/mindie/examples/models/stablelm/run_pa.sh new file mode 100644 index 00000000..3eee182e --- /dev/null +++ b/mindie/examples/models/stablelm/run_pa.sh @@ -0,0 +1,21 @@ +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. +export ASCEND_RT_VISIBLE_DEVICES=2,3,4,5 +export MASTER_PORT=20030 + +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 +fi diff --git a/mindie/examples/models/starcoder/README.md b/mindie/examples/models/starcoder/README.md new file mode 100644 index 00000000..e753da60 --- /dev/null +++ b/mindie/examples/models/starcoder/README.md @@ -0,0 +1,171 @@ +# STARCODER README + +StarCoder模型是在The Stack (v1.2)的80+种编程语言上训练的15.5B参数模型,不包括选择退出请求。该模型使用多查询注意力,一个包含8192个令牌的上下文窗口,并在1万亿个令牌上使用填充中间目标进行训练。 + +- 参考实现: +``` +https://huggingface.co/bigcode/starcoder +``` +# 特性矩阵 +- 此矩阵罗列了各starcoder模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化(仅300I DUO支持) | MOE | MindIE | TGI |长序列| +|-------------|----------------------------|-----------------------------|------|------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|---|---| +| starcoder-15.5B | 支持world size 1,2,4,8 | 支持world size 2,4 | √ | × | × | √ | √ | × | × | × | × | √ | √ |×| + +# 使用说明 + +## 权重下载 +- 下载starcoder模型权重,放置到自定义路径下 +``` +https://huggingface.co/bigcode/starcoder/tree/main +``` +- 修改`config.json`中的`model_type`为`starcoder` + +## 权重转换 +- 参考[此README文件](../../README.md) + + +## 量化权重转换(W8A8) +- 去目标文件目录下执行 +``` +python convert_w8a8_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} +``` +- 若要测试HumanEval量化精度并符合与浮点精度保持1%差距,可配置回退层`disabled_names` + + +- 配置 + | 量化类型及精度 | torch_dtype | quantize | + |----------------|-------------|----------| + | FP16 | "float16" | "" | + | BF16 | "bfloat16" | "" | + | W8A8 
| "float16" | "w8a8" | + | W8A16 | "float16" | "w8a16" | + +- 示例 + - starcoder模型使用FP16精度,W8A8量化 + ```json + { + "torch_dtype": "float16", + "quantize": "w8a8", + } + ``` + +## 路径变量解释 +| 变量名 | 含义 | +|--------|---------------------------------------------------------------------------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;starcoder工作脚本所在路径为`${llm_path}/examples/models/starcoder` | +| weight_path | 模型权重路径 + +## 300I DUO 运行操作说明 + +### 对话测试 +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_300i_duo.sh ${weight_path} + ``` +- 环境变量说明 + - `export BIND_CPU=1` + - 绑定CPU核心开关 + - 默认进行绑核 + - 若当前机器未设置NUMA或绑核失败,可将 BIND_CPU 设为 0 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - `export TP_WORLD_SIZE=2` + - 指定模型运行时的TP数,即world size + - 默认为单卡双芯 + - 各模型支持的TP数参考“特性矩阵” + - “单卡双芯”运行请指定`TP_WORLD_SIZE`为`2`,“双卡四芯”运行请指定`TP_WORLD_SIZE`为`4` + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export PYTHONPATH=${llm_path}:$PYTHONPATH` + - 将模型仓路径加入Python查询模块和包的搜索路径中 + - 将${llm_path}替换为实际路径 + +### 对话测试脚本参数说明 +- `--model_path` 模型路径 +- `--input_text` 输入问题 +- `--max_input_length` 最大输入长度 +- `--max_output_length` 最大输出长度 +- `--max_batch_size` 每次运行时固定的batch数量 +- 所有参数可见run_pa.py文件中 + +## 800I A2 运行操作说明 + +### 对话测试 +**运行Flash Attention FP16** +- 暂不支持 + +**运行Flash Attention BF16** +- 暂不支持 + +**运行Paged Attention FP16** + +### 对话测试 +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_800i_a2_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export BIND_CPU=1` + - 绑定CPU核心开关 + - 默认进行绑核 + - 若当前机器未设置NUMA或绑核失败,可将 BIND_CPU 设为 0 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - `export TP_WORLD_SIZE=2` + - 指定模型运行时的TP数,即world size + - 默认为单卡双芯 + - 各模型支持的TP数参考“特性矩阵” + - “单卡双芯”运行请指定`TP_WORLD_SIZE`为`2`,“双卡四芯”运行请指定`TP_WORLD_SIZE`为`4` + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export PYTHONPATH=${llm_path}:$PYTHONPATH` + - 将模型仓路径加入Python查询模块和包的搜索路径中 + - 将${llm_path}替换为实际路径 + +### 对话测试脚本参数说明 +- `--model_path` 模型路径 +- `--input_text` 输入问题 +- `--max_input_length` 最大输入长度 +- `--max_output_length` 最大输出长度 +- `--max_batch_size` 每次运行时固定的batch数量 +- 所有参数可见run_pa.py文件中 + +**运行Paged Attention BF16** +- 待补充 + +**运行W8A8量化** +- 获取量化权重后操作步骤同上 + +**运行KV cache量化** +- 待补充 + +**运行稀疏量化** +- 待补充 + +**运行MOE量化** +- 待补充 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + + + diff --git a/mindie/examples/models/starcoder/convert_w8a8_quant_weights.py b/mindie/examples/models/starcoder/convert_w8a8_quant_weights.py new file mode 100644 index 00000000..5f3d90d2 --- /dev/null +++ b/mindie/examples/models/starcoder/convert_w8a8_quant_weights.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2023 The Bigcode team and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# 导入相关依赖 +import os +import json +import torch +import torch.utils.data +from atb_llm.utils.file_utils import safe_open +from atb_llm.utils.log import logger +from transformers import AutoTokenizer, AutoModelForCausalLM +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig # 导入量化配置接口 +from examples.convert.model_slim.quantifier import parse_arguments +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +#获取校准数据函数定义 +def get_calib_dataset(_tokenizer, _calib_list): + calib_dataset = [] + for calib_data in _calib_list: + inputs = _tokenizer([calib_data], return_tensors='pt').to('cpu') + logger.info(inputs) + calib_dataset.append([inputs.data['input_ids'], None, inputs.data['attention_mask']]) + return calib_dataset + + +# 修改config.json中的model_type +def change_model_type(model_path, model_type): + config_file = os.path.join(model_path, 'config.json') + with safe_open(config_file, 'r', encoding='utf-8') as fr: + config_data = json.load(fr) + config_data['model_type'] = model_type + with safe_open(config_file, "w", override_flags=os.O_WRONLY | os.O_CREAT, encoding='utf-8') as fw: + fw.truncate() + json.dump(config_data, fw, indent=4) + +# for local path +args = parse_arguments() +config_path = os.path.join(args.model_path, 'config.json') +logger.info('changing model_type in config.json...') +change_model_type(args.model_path, 'gpt_bigcode') +logger.info('changing done!') +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=args.model_path) +model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=args.model_path, + torch_dtype=torch.float32).cpu() +logger.info("loading success!") +logger.info("start quant...") + +# 准备校准数据,请根据实际情况修改 +calib_list = [] +with safe_open('humaneval_python.txt', 'r') as file: + for line in file: + calib_list.append(line.strip()) +#校准数据获取 +dataset_calib = get_calib_dataset(tokenizer, calib_list) + +# 量化配置 +# 配置回退层数 +disabled_names = [ + # "transformer.h.0.mlp.c_proj", + # "transformer.h.1.attn.c_attn", + # "transformer.h.1.mlp.c_fc", + # "transformer.h.1.mlp.c_proj", + # "transformer.h.2.attn.c_attn", + # "transformer.h.2.mlp.c_proj", + # "transformer.h.3.attn.c_attn", + # "transformer.h.3.mlp.c_proj", + # "transformer.h.4.attn.c_attn", + # "transformer.h.4.mlp.c_proj", + # "transformer.h.11.attn.c_attn", + # "transformer.h.12.mlp.c_fc", + # "transformer.h.13.mlp.c_fc", + # "transformer.h.14.mlp.c_fc", + # "transformer.h.15.mlp.c_fc", + # "transformer.h.16.mlp.c_fc", + # "transformer.h.17.mlp.c_fc", + # "transformer.h.18.mlp.c_fc", + # "transformer.h.19.mlp.c_fc", + # "transformer.h.20.mlp.c_fc", + # "transformer.h.21.mlp.c_fc", + # "transformer.h.39.attn.c_attn", + # "transformer.h.39.mlp.c_fc", + # "transformer.h.39.mlp.c_proj", + # "lm_head" +] + +# 配置量化参数,并返回量化配置实例 +quant_config = QuantConfig(disable_names=disabled_names, w_bit=8, dev_type='cpu', + act_method=3, pr=1.0, mm_tensor=False) +# 输入加载的原模型、量化配置和校准数据,定义校准 +calibrator = Calibrator(model, quant_config, calib_data=dataset_calib, disable_level='L0') + +#执行量化 +calibrator.run() + +# save()保存模型量化参数 
+calibrator.save(args.save_directory, save_type=["safe_tensor"]) +logger.info("quant weight saved successfully") + +logger.info('changing back model_type in config.json') +change_model_type(args.model_path, 'starcoder') +logger.info('changing done!') +modify_config(args.model_path, args.save_directory, torch.float16, 'w8a8') +copy_tokenizer_files(args.model_path, args.save_directory) +logger.info('All done!') \ No newline at end of file diff --git a/mindie/examples/models/starcoder/humaneval_python.txt b/mindie/examples/models/starcoder/humaneval_python.txt new file mode 100644 index 00000000..f7f617ac --- /dev/null +++ b/mindie/examples/models/starcoder/humaneval_python.txt @@ -0,0 +1,10 @@ +{"task_id": "Python/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(has_close_elements):\n assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert has_close_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\ncheck(has_close_elements)", "text": " Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True", "declaration": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n", "example_test": "def check(has_close_elements):\n assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False\n assert has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True\ncheck(has_close_elements)\n"} +{"task_id": "Python/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(separate_paren_groups):\n assert separate_paren_groups('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert separate_paren_groups('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert separate_paren_groups('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\ncheck(separate_paren_groups)", "text": " Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']", "declaration": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n", "example_test": "def check(separate_paren_groups):\n assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\ncheck(separate_paren_groups)\n"} +{"task_id": "Python/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(truncate_number):\n assert truncate_number(3.5) == 0.5\n assert abs(truncate_number(1.33) - 0.33) < 1e-6\n assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)", "text": " Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5", "declaration": "def truncate_number(number: float) -> float:\n", "example_test": "def check(truncate_number):\n assert truncate_number(3.5) == 0.5\ncheck(truncate_number)\n"} +{"task_id": "Python/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(below_zero):\n assert below_zero([]) == False\n assert below_zero([1, 2, -3, 1, 2, -3]) == False\n assert below_zero([1, 2, -4, 5, 6]) == True\n assert below_zero([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert below_zero([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert below_zero([1, -2, 2, -2, 5, -5, 4, -4]) == True\n\ncheck(below_zero)", "text": " You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True", "declaration": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n", "example_test": "def check(below_zero):\n assert below_zero([1, 2, 3]) == False\n assert below_zero([1, 2, -4, 5]) == True\ncheck(below_zero)\n"} +{"task_id": "Python/4", "prompt": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", "canonical_solution": " mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(mean_absolute_deviation):\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\ncheck(mean_absolute_deviation)", "text": " For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0", "declaration": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n", "example_test": "def check(mean_absolute_deviation):\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\ncheck(mean_absolute_deviation)\n"} +{"task_id": "Python/5", "prompt": "from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n", "canonical_solution": " if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(intersperse):\n assert intersperse([], 7) == []\n assert intersperse([5, 6, 3, 2], 8) == [5, 8, 6, 8, 3, 8, 2]\n assert 
intersperse([2, 2, 2], 2) == [2, 2, 2, 2, 2]\n\ncheck(intersperse)", "text": " Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]", "declaration": "from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n", "example_test": "def check(intersperse):\n assert intersperse([], 4) == []\n assert intersperse([1,2,3], 4) == [1,4,2,4,3]\ncheck(intersperse)\n"} +{"task_id": "Python/6", "prompt": "from typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n", "canonical_solution": " def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(parse_nested_parens):\n assert parse_nested_parens('(()()) ((())) () ((())()())') == [2, 3, 1, 3]\n assert parse_nested_parens('() (()) ((())) (((())))') == [1, 2, 3, 4]\n assert parse_nested_parens('(()(())((())))') == [4]\n\ncheck(parse_nested_parens)", "text": " Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. 
(()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]", "declaration": "from typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n", "example_test": "def check(parse_nested_parens):\n assert parse_nested_parens('(()()) ((())) () ((())()())') == [2, 3, 1, 3]\ncheck(parse_nested_parens)\n"} +{"task_id": "Python/7", "prompt": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n", "canonical_solution": " return [x for x in strings if substring in x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(filter_by_substring):\n assert filter_by_substring([], 'john') == []\n assert filter_by_substring(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']\n assert filter_by_substring(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']\n assert filter_by_substring(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']\n\ncheck(filter_by_substring)", "text": " Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']", "declaration": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n", "example_test": "def check(filter_by_substring):\n assert filter_by_substring([], 'a') == []\n assert filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a') == ['abc', 'bacd', 'array']\ncheck(filter_by_substring)\n"} +{"task_id": "Python/8", "prompt": "from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n \"\"\" For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)\n \"\"\"\n", "canonical_solution": " sum_value = 0\n prod_value = 1\n\n for n in numbers:\n sum_value += n\n prod_value *= n\n return sum_value, prod_value\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(sum_product):\n assert sum_product([]) == (0, 1)\n assert sum_product([1, 1, 1]) == (3, 1)\n assert sum_product([100, 0]) == (100, 0)\n assert sum_product([3, 5, 7]) == (3 + 5 + 7, 3 * 5 * 7)\n assert sum_product([10]) == (10, 10)\n\ncheck(sum_product)", "text": " For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)", "declaration": "from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n", "example_test": "def check(sum_product):\n assert sum_product([]) == (0, 1)\n assert sum_product([1, 2,3,4]) == (10, 24)\ncheck(sum_product)\n"} +{"task_id": "Python/9", "prompt": "from typing import List, Tuple\n\n\ndef rolling_max(numbers: List[int]) -> List[int]:\n \"\"\" From a given list of integers, generate a list of rolling maximum 
element found until given moment\n in the sequence.\n >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n [1, 2, 3, 3, 3, 4, 4]\n \"\"\"\n", "canonical_solution": " running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(rolling_max):\n assert rolling_max([]) == []\n assert rolling_max([1, 2, 3, 4]) == [1, 2, 3, 4]\n assert rolling_max([4, 3, 2, 1]) == [4, 4, 4, 4]\n assert rolling_max([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]\n\ncheck(rolling_max)", "text": " From a given list of integers, generate a list of rolling maximum element found until given moment\n in the sequence.\n >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n [1, 2, 3, 3, 3, 4, 4]", "declaration": "from typing import List, Tuple\n\n\ndef rolling_max(numbers: List[int]) -> List[int]:\n", "example_test": "def check(rolling_max):\n assert rolling_max([1, 2, 3, 2, 3, 4, 2]) == [1, 2, 3, 3, 3, 4, 4]\ncheck(rolling_max)\n"} \ No newline at end of file diff --git a/mindie/examples/models/starcoder/run_300i_duo.sh b/mindie/examples/models/starcoder/run_300i_duo.sh new file mode 100644 index 00000000..d2951f06 --- /dev/null +++ b/mindie/examples/models/starcoder/run_300i_duo.sh @@ -0,0 +1,17 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1 +export TP_WORLD_SIZE=2 +export MASTER_PORT=12343 +export PYTHONPATH=${llm_path}:$PYTHONPATH + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAUNCH_KERNEL_WITH_TILING=1 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export LCCL_ENABLE_FALLBACK=1 +export INT8_FORMAT_NZ_ENABLE=1 + +torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param --max_batch_size 1 --max_input_length 256 --max_output_length 256 \ No newline at end of file diff --git a/mindie/examples/models/starcoder/run_800i_a2_pa.sh b/mindie/examples/models/starcoder/run_800i_a2_pa.sh new file mode 100644 index 00000000..bda7bc17 --- /dev/null +++ b/mindie/examples/models/starcoder/run_800i_a2_pa.sh @@ -0,0 +1,15 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
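+# Usage sketch (hypothetical invocation; the variables below are assumptions, not defined by this script):
+#   export llm_path=/path/to/MindIE-LLM/examples/atb_models   # model-repo root, needed for PYTHONPATH
+#   extra_param=""                                             # optional extra run_pa arguments, may stay empty
+#   bash run_800i_a2_pa.sh ${weight_path}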
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TP_WORLD_SIZE=8 +export MASTER_PORT=12343 +export PYTHONPATH=${llm_path}:$PYTHONPATH + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export LCCL_ENABLE_FALLBACK=1 + +torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param --max_batch_size 1 --max_input_length 256 --max_output_length 256 \ No newline at end of file diff --git a/mindie/examples/models/starcoder2/README.md b/mindie/examples/models/starcoder2/README.md new file mode 100644 index 00000000..825a0dd0 --- /dev/null +++ b/mindie/examples/models/starcoder2/README.md @@ -0,0 +1,165 @@ +# STARCODER2 README + +- [StarCoder2](https://github.com/bigcode-project/starcoder2)是一系列代码生成模型(3B、7B 和 15B),在 [The Stack v2](https://huggingface.co/datasets/bigcode/the-stack-v2) 的 600+ 种编程语言和一些自然语言文本(如 Wikipedia、Arxiv 和 GitHub 问题)上进行了训练 +- 此代码仓目前支持StarCoder2-7B、StarCoder2-15B + +# 支持特性 +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16(仅800I A2支持) | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化(仅300I DUO支持) | MOE | MindIE | TGI | 长序列| +|---------------|----------------------------|-----------------------------|------|---------------------|-----------------|-----------------|---------|-----------|--------------|------------------------|-----|--------|-----|----| +| StarCoder2-7B | 支持world size 1,2,4,8 | × | √ | × | × | √ | × | × | × | × | × | × | × |×| +| StarCoder2-15B | 支持world size 2,4,8 | × | √ | × | × | √ | √ | × | × | × | × | √ | × |×| + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|---------------|-------------------------------------------------| +| `working_dir` | 加速库及模型库下载后放置的目录 | +| `llm_path` | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| `script_path` | 脚本所在路径;StarCoder2的工作脚本所在路径为`${llm_path}/examples/models/starcoder2` | +| `weight_path` | 模型权重路径 | + + +## 权重 +### 权重下载 +- 下载starcoder2模型权重,放置到`${weight_path}`下 + - [StarCoder2-15B](https://huggingface.co/bigcode/starcoder2-15b/tree/main) + - [StarCoder2-7B](https://huggingface.co/bigcode/starcoder2-7b/tree/main) + +### 权重转换 +- 当前仅支持加载safetensor格式的权重文件,若权重文件为bin格式,请参考[此README文件](../../README.md) + +### 量化权重生成(W8A8) +- 当前仅StarCoder2-15B支持W8A8量化 +- 到`${script_path}`路径下,运行`convert_w8a8_quant_weights.py` +```shell +cd ${script_path} +python convert_w8a8_quant_weights.py --model_path {浮点权重路径} --save_directory {W8A8量化权重路径} +``` +- 权重生成后确认模型配置文件,确认`${weight_path}/config.json`文件中的`torch_dtype`和`quantize` + - `torch_dtype`和`quantize`类型用于标识量化类型和精度 + | 量化类型及精度 | torch_dtype | quantize | + |----------------|-------------|----------| + | FP16 | "float16" | "" | + | BF16 | "bfloat16" | "" | + | W8A8 | "float16" | "w8a8" | + | W8A16 | "float16" | "w8a16" | + - 示例 + - starcoder2模型使用FP16精度,W8A8量化 + ```json + { + "torch_dtype": "float16", + "quantize": "w8a8", + } + ``` +- 若要测试HumanEval量化精度并符合与浮点精度保持1%差距,可配置中`convert_w8a8_quant_weights.py`的回退层`disabled_names` +```python +disabled_names = [ + "model.layers.0.mlp.c_proj", + "model.layers.1.mlp.c_proj", + "model.layers.2.mlp.c_proj", + "model.layers.3.mlp.c_proj", + "model.layers.4.mlp.c_proj", + "model.layers.5.mlp.c_proj", + "model.layers.6.mlp.c_proj", + "model.layers.7.mlp.c_proj", + "model.layers.8.mlp.c_proj", + "model.layers.9.mlp.c_proj", + 
"model.layers.10.mlp.c_proj", + "model.layers.11.mlp.c_proj", + "model.layers.12.mlp.c_proj", + "model.layers.13.mlp.c_proj", + "model.layers.14.mlp.c_proj", + "model.layers.15.mlp.c_proj", + "model.layers.16.mlp.c_proj", + "model.layers.17.mlp.c_proj", + "model.layers.18.mlp.c_proj", + "model.layers.19.mlp.c_proj", + "model.layers.20.mlp.c_proj", + "model.layers.21.mlp.c_proj", + "model.layers.22.mlp.c_proj", + "model.layers.23.mlp.c_proj", + "model.layers.24.mlp.c_proj", + "model.layers.25.mlp.c_proj", + "model.layers.26.mlp.c_proj", + "model.layers.27.mlp.c_proj", + "model.layers.28.mlp.c_proj", + "model.layers.29.mlp.c_proj", + "model.layers.30.mlp.c_proj", + "model.layers.31.mlp.c_proj", + "model.layers.32.mlp.c_proj", + "model.layers.33.mlp.c_proj", + "model.layers.34.mlp.c_proj", + "model.layers.35.mlp.c_proj", + "model.layers.36.mlp.c_proj", + "model.layers.37.mlp.c_proj", + "model.layers.38.mlp.c_proj", + "model.layers.39.mlp.c_proj", +] +``` + +## 800I A2 运行操作说明 + +### 对话测试 +**运行Paged Attention FP16** +- 运行启动脚本 + - 在`${llm_path}`目录下执行以下指令 + ```shell + bash ${script_path}/run_800i_a2_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export BIND_CPU=1` + - 绑定CPU核心开关 + - 默认进行绑核 + - 若当前机器未设置NUMA或绑核失败,可将 BIND_CPU 设为 0 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - `export TP_WORLD_SIZE=2` + - 指定模型运行时的TP数,即world size + - 默认为单卡双芯 + - 各模型支持的TP数参考“特性矩阵” + - “单卡双芯”运行请指定`TP_WORLD_SIZE`为2,“双卡四芯”运行请指定`TP_WORLD_SIZE`为4 + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export PYTHONPATH=${llm_path}:$PYTHONPATH` + - 将模型仓路径加入Python查询模块和包的搜索路径中 + - 将`${llm_path}`替换为实际路径 + +**运行W8A8量化** +- 获取量化权重后操作步骤同上 + +### 对话测试脚本参数说明 +- `--model_path` 模型路径 +- `--input_text` 输入问题 +- `--max_input_length` 最大输入长度 +- `--max_output_length` 最大输出长度 +- `--max_batch_size` 每次运行时固定的batch数量 +- 所有参数可见run_pa.py文件中 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) +- 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_HumanEval 1 starcoder2 ${weight_path} 4 + ``` +- 运行量化权重时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) +- 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[256,256],[512,512],[1024,1024],[2048,2048]] 1 starcoder2 ${weight_path} 4 + ``` +- 运行量化权重时需注意`${weight_path}/config.json`中的`quantize`字段和`torch_dtype`字段是否与权重匹配,参考[此README文件](../../README.md) \ No newline at end of file diff --git a/mindie/examples/models/starcoder2/convert_w8a8_quant_weights.py b/mindie/examples/models/starcoder2/convert_w8a8_quant_weights.py new file mode 100644 index 00000000..00ca667f --- /dev/null +++ b/mindie/examples/models/starcoder2/convert_w8a8_quant_weights.py @@ -0,0 +1,58 @@ +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. 
+ +import os +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import QuantConfig +from atb_llm.utils.log import logger +from atb_llm.models.starcoder2.modeling_starcoder2 import Starcoder2Config +from atb_llm.utils.file_utils import safe_open +from examples.convert.model_slim.quantifier import parse_arguments, Quantifier +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +def get_calib_dataset(_tokenizer, _calib_list): + calib_dataset = [] + for calib_data in _calib_list: + inputs = _tokenizer([calib_data], return_tensors='pt') + logger.info(inputs) + calib_dataset.append([inputs.data['input_ids'], None, inputs.data['attention_mask']]) + return calib_dataset + +if __name__ == "__main__": + args = parse_arguments() + disable_names = [] + quant_conf = QuantConfig( + w_bit=args.w_bit, + a_bit=args.a_bit, + disable_names=disable_names, + dev_type=args.device_type, + act_method=args.act_method, + pr=1.0, # randseed + nonuniform=False, + w_sym=args.w_sym, + mm_tensor=False, + co_sparse=args.co_sparse, + fraction=args.fraction, + sigma_factor=args.sigma_factor, + use_sigma=args.use_sigma, + is_lowbit=args.is_lowbit, + do_smooth=args.do_smooth, + use_kvcache_quant=args.use_kvcache_quant + ) + + quantifier = Quantifier(args.model_path, quant_conf) + quantifier.tokenizer.pad_token_id = 0 + calib_list = [] + with safe_open('humaneval_python.txt', 'r') as file: + for line in file: + calib_list.append(line.strip()) + dataset_calib = get_calib_dataset(quantifier.tokenizer, calib_list) + + if not os.path.exists(args.save_directory): + os.makedirs(args.save_directory, exist_ok=True) + + quantifier.convert(dataset_calib, args.save_directory, args.disable_level) + quant_type = f"w{args.w_bit}a{args.a_bit}" + auto_config = Starcoder2Config.from_pretrained(args.model_path) + modify_config(args.model_path, args.save_directory, auto_config.torch_dtype, + quant_type, args.use_kvcache_quant) + copy_tokenizer_files(args.model_path, args.save_directory) \ No newline at end of file diff --git a/mindie/examples/models/starcoder2/humaneval_python.txt b/mindie/examples/models/starcoder2/humaneval_python.txt new file mode 100644 index 00000000..f7f617ac --- /dev/null +++ b/mindie/examples/models/starcoder2/humaneval_python.txt @@ -0,0 +1,10 @@ +{"task_id": "Python/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(has_close_elements):\n assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert has_close_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\ncheck(has_close_elements)", "text": " Check if in given list of 
numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True", "declaration": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n", "example_test": "def check(has_close_elements):\n assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False\n assert has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True\ncheck(has_close_elements)\n"} +{"task_id": "Python/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(separate_paren_groups):\n assert separate_paren_groups('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert separate_paren_groups('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert separate_paren_groups('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\ncheck(separate_paren_groups)", "text": " Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']", "declaration": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n", "example_test": "def check(separate_paren_groups):\n assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\ncheck(separate_paren_groups)\n"} +{"task_id": "Python/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(truncate_number):\n assert truncate_number(3.5) == 0.5\n assert abs(truncate_number(1.33) - 0.33) < 1e-6\n assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)", "text": " Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5", "declaration": "def truncate_number(number: float) -> float:\n", "example_test": "def check(truncate_number):\n assert truncate_number(3.5) == 0.5\ncheck(truncate_number)\n"} +{"task_id": "Python/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(below_zero):\n assert below_zero([]) == False\n assert below_zero([1, 2, -3, 1, 2, -3]) == False\n assert below_zero([1, 2, -4, 5, 6]) == True\n assert below_zero([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert below_zero([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert below_zero([1, -2, 2, -2, 5, -5, 4, -4]) == True\n\ncheck(below_zero)", "text": " You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True", "declaration": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n", "example_test": "def check(below_zero):\n assert below_zero([1, 2, 3]) == False\n assert below_zero([1, 2, -4, 5]) == True\ncheck(below_zero)\n"} +{"task_id": "Python/4", "prompt": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", "canonical_solution": " mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(mean_absolute_deviation):\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\ncheck(mean_absolute_deviation)", "text": " For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0", "declaration": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n", "example_test": "def check(mean_absolute_deviation):\n assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\ncheck(mean_absolute_deviation)\n"} +{"task_id": "Python/5", "prompt": "from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n", "canonical_solution": " if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(intersperse):\n assert intersperse([], 7) == []\n assert intersperse([5, 6, 3, 2], 8) == [5, 8, 6, 8, 3, 8, 2]\n assert intersperse([2, 2, 2], 2) == [2, 2, 2, 2, 2]\n\ncheck(intersperse)", "text": " Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]", "declaration": "from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n", "example_test": "def check(intersperse):\n assert intersperse([], 4) == []\n assert intersperse([1,2,3], 4) == [1,4,2,4,3]\ncheck(intersperse)\n"} +{"task_id": "Python/6", "prompt": "from typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. 
(()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n", "canonical_solution": " def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(parse_nested_parens):\n assert parse_nested_parens('(()()) ((())) () ((())()())') == [2, 3, 1, 3]\n assert parse_nested_parens('() (()) ((())) (((())))') == [1, 2, 3, 4]\n assert parse_nested_parens('(()(())((())))') == [4]\n\ncheck(parse_nested_parens)", "text": " Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]", "declaration": "from typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n", "example_test": "def check(parse_nested_parens):\n assert parse_nested_parens('(()()) ((())) () ((())()())') == [2, 3, 1, 3]\ncheck(parse_nested_parens)\n"} +{"task_id": "Python/7", "prompt": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n", "canonical_solution": " return [x for x in strings if substring in x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(filter_by_substring):\n assert filter_by_substring([], 'john') == []\n assert filter_by_substring(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']\n assert filter_by_substring(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']\n assert filter_by_substring(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']\n\ncheck(filter_by_substring)", "text": " Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']", "declaration": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n", "example_test": "def check(filter_by_substring):\n assert filter_by_substring([], 'a') == []\n assert filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a') == ['abc', 'bacd', 'array']\ncheck(filter_by_substring)\n"} +{"task_id": "Python/8", "prompt": "from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n \"\"\" For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)\n \"\"\"\n", "canonical_solution": " sum_value = 0\n prod_value = 1\n\n for n in numbers:\n sum_value += n\n prod_value *= n\n return sum_value, prod_value\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(sum_product):\n 
assert sum_product([]) == (0, 1)\n assert sum_product([1, 1, 1]) == (3, 1)\n assert sum_product([100, 0]) == (100, 0)\n assert sum_product([3, 5, 7]) == (3 + 5 + 7, 3 * 5 * 7)\n assert sum_product([10]) == (10, 10)\n\ncheck(sum_product)", "text": " For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)", "declaration": "from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n", "example_test": "def check(sum_product):\n assert sum_product([]) == (0, 1)\n assert sum_product([1, 2,3,4]) == (10, 24)\ncheck(sum_product)\n"} +{"task_id": "Python/9", "prompt": "from typing import List, Tuple\n\n\ndef rolling_max(numbers: List[int]) -> List[int]:\n \"\"\" From a given list of integers, generate a list of rolling maximum element found until given moment\n in the sequence.\n >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n [1, 2, 3, 3, 3, 4, 4]\n \"\"\"\n", "canonical_solution": " running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(rolling_max):\n assert rolling_max([]) == []\n assert rolling_max([1, 2, 3, 4]) == [1, 2, 3, 4]\n assert rolling_max([4, 3, 2, 1]) == [4, 4, 4, 4]\n assert rolling_max([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]\n\ncheck(rolling_max)", "text": " From a given list of integers, generate a list of rolling maximum element found until given moment\n in the sequence.\n >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n [1, 2, 3, 3, 3, 4, 4]", "declaration": "from typing import List, Tuple\n\n\ndef rolling_max(numbers: List[int]) -> List[int]:\n", "example_test": "def check(rolling_max):\n assert rolling_max([1, 2, 3, 2, 3, 4, 2]) == [1, 2, 3, 3, 3, 4, 4]\ncheck(rolling_max)\n"} \ No newline at end of file diff --git a/mindie/examples/models/starcoder2/run_800i_a2_pa.sh b/mindie/examples/models/starcoder2/run_800i_a2_pa.sh new file mode 100644 index 00000000..242b5973 --- /dev/null +++ b/mindie/examples/models/starcoder2/run_800i_a2_pa.sh @@ -0,0 +1,15 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
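+# Usage sketch (see the starcoder2 README): run from ${llm_path} with the weight path as the first argument,
+#   bash ${script_path}/run_800i_a2_pa.sh ${weight_path}
+# TP_WORLD_SIZE and ASCEND_RT_VISIBLE_DEVICES below may be adjusted to any world size in the feature matrix;
+# llm_path and extra_param are assumed to be set by the caller (extra_param may be empty).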
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TP_WORLD_SIZE=4 +export MASTER_PORT=12343 +export PYTHONPATH=${llm_path}:$PYTHONPATH + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export LCCL_ENABLE_FALLBACK=1 + +torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --input_text "def print_hello_world()" --model_path $1 $extra_param --max_batch_size 1 --max_input_length 256 --max_output_length 256 \ No newline at end of file diff --git a/mindie/examples/models/telechat/README.md b/mindie/examples/models/telechat/README.md new file mode 100644 index 00000000..7fc89936 --- /dev/null +++ b/mindie/examples/models/telechat/README.md @@ -0,0 +1,97 @@ +# Telechat README + +星辰语义大模型TeleChat是由中国电信人工智能科技有限公司研发训练的大语言模型,采用1.5万亿 Tokens中英文高质量语料进行训练。 + +- 参考实现: + ``` + https://github.com/Tele-AI/Telechat + ``` + +# 特性矩阵 +- 此矩阵罗列了TeleChat模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| Telechat-7B | 否 | 支持world size 2,4 | 是 | 否 | 否 | 是 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | +| Telechat-12B-v2 | 否 | 支持world size 2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;telechat的工作脚本所在路径为${llm_path}/examples/models/telechat | +| weight_path | 模型权重路径 | + +## 权重下载 +- [Telechat-7B](https://huggingface.co/Tele-AI/Telechat-7B/tree/main) +- [Telechat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/) + +## 权重转换 +- 参考[此README文件](../../README.md) + +## 量化权重转换(W8A8) +在`llm_path`目录下执行以下命令行 +``` bash +python examples/models/telechat/convert_w8a8_quant_weights.py --level=L5 --jsonl_path="" --checkpoint_path="" --save_directory="" +``` +- 命令参数说明: + - `--level`:量化回退等级,默认为L5 + - `--jsonl_path`:量化校准集路径 + - `--checkpoint_path`:开源FP16权重路径,即$Telechat_float_path + - `--save_directory`:量化权重保存路径 + +# 服务化推理 + +## 300I DUO 运行操作说明 + +### 对话测试 +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_300i_duo.sh ${weight_path} + ``` +- 环境变量说明 + - `export BIND_CPU=1` + - 绑定CPU核心开关 + - 默认进行绑核 + - 若当前机器未设置NUMA或绑核失败,可将 BIND_CPU 设为 0 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - `export TP_WORLD_SIZE=2` + - 指定模型运行时的TP数,即world size + - 默认为单卡双芯 + - 各模型支持的TP数参考“特性矩阵” + - “单卡双芯”运行请指定`TP_WORLD_SIZE`为`2`,“双卡四芯”运行请指定`TP_WORLD_SIZE`为`4` + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - `export PYTHONPATH=${llm_path}:$PYTHONPATH` + - 将模型仓路径加入Python查询模块和包的搜索路径中 + - 将${llm_path}替换为实际路径 + +### 对话测试脚本参数说明 +- `--model_path` 模型路径 +- `--input_text` 输入问题 +- `--max_input_length` 最大输入长度 +- `--max_output_length` 最大输出长度 +- `--max_batch_size` 每次运行时固定的batch数量 
+- 所有参数可见run_pa.py文件中 + +**运行W8A8量化** +- 获取量化权重后操作步骤同上 + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + + + diff --git a/mindie/examples/models/telechat/convert_w8a8_quant_weights.py b/mindie/examples/models/telechat/convert_w8a8_quant_weights.py new file mode 100644 index 00000000..5552dd95 --- /dev/null +++ b/mindie/examples/models/telechat/convert_w8a8_quant_weights.py @@ -0,0 +1,112 @@ +# Copyright Huawei Technologies Co., Ltd. 2024. All rights reserved. +import argparse +import random +import os + + +import torch +from msmodelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig +from msmodelslim.pytorch.llm_ptq.anti_outlier import AntiOutlier, AntiOutlierConfig +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig + +from atb_llm.utils.log import logger +from atb_llm.utils.file_utils import safe_open +from examples.convert.convert_utils import copy_tokenizer_files, modify_config + + +def inference(_model, _tokenizer, max_new_tokens=32): + test_prompt = "<_user>1+2*3等于几<_bot>" + test_input = _tokenizer(test_prompt, return_tensors="pt") + logger.info("model is inferring...") + _model.eval() + generate_ids = _model.generate(test_input.input_ids.cpu(), + attention_mask=test_input.attention_mask.cpu(), max_new_tokens=max_new_tokens) + res = _tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + for item in res: + logger.info(item) + + +def quant(model, tokenizer, save_path): + # random input + data_num = 100 + calib_list = random.sample(questions, data_num) + torch.save(calib_list, f"calib_list_{args.level}") + + # prepare calib data + calib_data = [] + for text in calib_list: + token_data = tokenizer(text, return_tensors="pt") + calib_data.append([token_data["input_ids"].cpu(), None, token_data["attention_mask"].cpu()]) + + # model to cpu + model.cpu().float().eval() + + inference(model, tokenizer) + + logger.info("--------anti outlier suppression start--------") + anti_config = AntiOutlierConfig(anti_method="m2", dev_type="cpu") + anti_outlier = AntiOutlier(model, calib_data=calib_data, cfg=anti_config, model_type="Llama") + anti_outlier.process() + logger.info("--------anti outlier suppression success--------") + + logger.info("-----------set quant config--------") + quant_config = QuantConfig(w_bit=8, disable_names=[], dev_type='cpu', act_method=3, + pr=1.0, mm_tensor=False, w_hessian=False) + + logger.info("-----------init calibrator--------") + calibrator = Calibrator(model, quant_config, calib_data=calib_data, disable_level=args.level) + + logger.info("-----------calibrator run--------") + calibrator.run(int_infer=True) + + model = calibrator.model + inference(model, tokenizer) + + logger.info("-----------calibrator save--------") + calibrator.save(f"{save_path}", save_type=["safe_tensor"]) + logger.info("--------calibration end----------") + return model + + +def get_args(): + parser = argparse.ArgumentParser( + 'Evaluation', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + group = parser.add_argument_group('EVAL Task Parameters') + group.add_argument( + '--level', type=str) + group.add_argument( + '--jsonl_path', type=str) + group.add_argument( + '--checkpoint_path', type=str) + group.add_argument( + '--save_directory', type=str) + args_input = parser.parse_args() + return args_input + + +if __name__ == '__main__': + args = get_args() + + save_directory = args.save_directory + if not 
os.path.exists(save_directory): + os.makedirs(save_directory, exist_ok=True) + + float_tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path) + config = AutoConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + float_model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, config=config, trust_remote_code=True) + + questions = [] + with safe_open(args.jsonl_path, "r") as f: + for data in f: + questions.append(data["input"]) + + quant_model = quant(float_model, float_tokenizer, save_directory) # anti outlier + ptq + info_msg = f"quant model: {quant_model}" + logger.info(info_msg) + + modify_config(args.checkpoint_path, save_directory, torch.float16, 'w8a8') + copy_tokenizer_files(args.checkpoint_path, save_directory) + + logger.info("All done!") diff --git a/mindie/examples/models/telechat/run_300i_duo.sh b/mindie/examples/models/telechat/run_300i_duo.sh new file mode 100644 index 00000000..d1c33c14 --- /dev/null +++ b/mindie/examples/models/telechat/run_300i_duo.sh @@ -0,0 +1,17 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1 +export TP_WORLD_SIZE=2 +export MASTER_PORT=12888 +export PYTHONPATH=${llm_path}:$PYTHONPATH + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAUNCH_KERNEL_WITH_TILING=1 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export LCCL_ENABLE_FALLBACK=1 +export INT8_FORMAT_NZ_ENABLE=1 + +torchrun --nproc_per_node $TP_WORLD_SIZE --master_port $MASTER_PORT -m examples.run_pa --model_path $1 $extra_param --max_batch_size 1 --max_input_length 256 --max_output_length 256 \ No newline at end of file diff --git a/mindie/examples/models/vicuna/README.md b/mindie/examples/models/vicuna/README.md new file mode 100644 index 00000000..bd18f5cd --- /dev/null +++ b/mindie/examples/models/vicuna/README.md @@ -0,0 +1,100 @@ +# README + +Vicuna是由 LMSYS 发布的基于Llama 2用ShareGPT收集的125K对话集微调的大模型,最长可以支持16K。 + +- 此代码仓中实现了一套基于NPU硬件的Vicuna模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各Vicuna模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| Vicuna-7B-v1.5-16K | 支持world size 1,2,4,8 | 支持world size 2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | +| Vicuna-13B-v1.5-16K | 支持world size 1,2,4,8 | 支持world size 2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径; 若使用编译好的包,则路径为`${working_dir}/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径; Vicuna的工作脚本所在路径为`${llm_path}/examples/models/vicuna` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** + +- [Vicuna-7b-v1.5-16k](https://huggingface.co/lmsys/vicuna-7b-v1.5-16k) +- [Vicuna-13b-v1.5-16k](https://huggingface.co/lmsys/vicuna-13b-v1.5-16k) + +**权重转换** +> 若权重中不包含safetensors格式,则执行权重转换步骤,否则跳过 +- 参考[此README文件](../../README.md) + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行Paged Attention FP16** +- 运行启动脚本 + - 
将`${llm_path}`加入`PYTHONPATH`搜索目录 + ```shell + export PYTHONPATH=${llm_path}:${PYTHONPATH} + ``` + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 vicuna ${weight_path} 8 + bash run.sh pa_fp16 full_CEval 5 1 vicuna ${weight_path} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 vicuna ${weight_path} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/vicuna/run_pa.sh b/mindie/examples/models/vicuna/run_pa.sh new file mode 100644 index 00000000..ac0f784b --- /dev/null +++ b/mindie/examples/models/vicuna/run_pa.sh @@ -0,0 +1,23 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
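+# Usage sketch (see the vicuna README): export PYTHONPATH=${llm_path}:${PYTHONPATH}, then from ${llm_path} run
+#   bash ${script_path}/run_pa.sh ${weight_path}
+# world_size is derived below from ASCEND_RT_VISIBLE_DEVICES; exporting TP_WORLD_SIZE=1 switches to a
+# single-process python launch instead of torchrun.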
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 +fi \ No newline at end of file diff --git a/mindie/examples/models/vlmo/README.md b/mindie/examples/models/vlmo/README.md new file mode 100644 index 00000000..58648841 --- /dev/null +++ b/mindie/examples/models/vlmo/README.md @@ -0,0 +1,264 @@ +# README + +- [VLMo(Unified Vision-Language Pre-Training with Mixture-of-Modality-Experts.)](https://github.com/microsoft/unilm/tree/master/vlmo)是由微软提出的一种多模态 Transformer 模型,Mixture-of-Modality-Experts (MOME),即混合多模态专家。VLMo 相当于是一个混合专家 Transformer 模型。预训练完成后,使用时既可以是双塔结构实现高效的图像文本检索,又可以是单塔结构成为分类任务的多模态编码器。 + +- 此代码仓中实现了一套基于NPU硬件的VLMO推理模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各VLMO模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | W4A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|---------------------------|------|------------------|-----------------|-----------------|---------|-----------|---------|-----------|--------------------------|-----|--------|---|--------| +| VLMO | 支持world size 1,2 | 支持world size 1,2 | √ | × | √ | × | × | × | × | × | × | × | × | × | × | + +- 此模型仓已适配的模型版本 + - [VLMO系列](https://github.com/microsoft/unilm/tree/master/vlmo) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;VLMO的文件所在路径为`${llm_path}/atb_llm/models/vlmo` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** +- [VLMO](https://github.com/microsoft/unilm/tree/master/vlmo) +- 请查看 README.md 下载链接中'Configs'页签下所需测试集的'finetuned weight' +- 分类任务请使用 VQAv2数据集进行评估,检索任务情使用 COCO 数据集进行评估\ + 以VQAv2为例,下载 vlmo_base_patch16_480_vqa.pt + +**权重转换** +- 不涉及 + +**量化权重生成** +- 不涉及 + + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 测试 +**运行Flash Attention FP16** +- VLMO模型参考以下运行方式 + - 安装依赖包 pip install 包名==版本号 + ```shell + | 包名 | 推荐版本 | + |-----------------|--------| + | transformers | 4.33.1 | + | decorator | 5.1.1 | + | sympy | 1.11.1 | + | scipy | 1.11.3 | + | attrs | 23.1.0 | + | sentencepiece | 0.1.99 | + | pytorch_lightning| 1.5.5 | + | Pillow| 10.2.0 | + | tqdm |4.53.0| + | ipdb |0.13.7| + | einops| 0.3.0| + | pyarrow |14.0.1| + | sacred |0.8.5| + | pandas |2.2.0| + | timm |0.4.12| + | torchmetrics| 0.7.3| + | fairscale |0.4.0| + | numpy |1.26.4| + | scipy |1.12.0| + | opencv-python |4.9.0.80| + | opencv-python-headless| 4.9.0.80| + | psutil |5.9.8| + | torchvision |0.16.2| + 如torchvision版本安装失败,则说明需要从Huawei源下载,需要将pip源修改为华为源http://cmc-cd-mirror.rnd.huawei.com/pypi/simple/ + ``` + - 安装torch + - `根据所使用python版本,以及CPU架构,选择对应的包` + ```bash + # 
以安装torch-*-cp39-cp39-manylinux2014_aarch64.whl包为例 + pip install torch-*-cp39-cp39-manylinux2014_aarch64.whl + ``` + - 安装torch_npu + - `选择安装与torch版本以及python版本一致的torch_npu版本` + ```bash + # 安装torch_npu,以torch*对应的python3.9的aarch64版本为例 + tar -zxvf pytorch_v*_py39.tar.gz + pip install torch*_aarch64.whl + ``` + - 路径变量解释 + | 变量名 | 含义 | + |---------------------|----------------------------------------------------------------------| + | model_download_path | 开源权重放置目录 | + | data_download_path| 数据集放置目录 + | llm_path | 加速库及模型库下载后放置目录 | + | model_path | 工作时模型所在的目录,可以和model_download_path相同,但一般模型是公共的,为了避免影响其他用户,单独建一个模型工作目录 | + - 环境准备 + - 下载代码,通过git工具将vlmo代码下载至本地 `${model_path}` 中 + ``` + git clone https://github.com/microsoft/unilm.git + ``` + - 下载模型权重,放置到自定义`${model_download_path}` 下载方式参考上文模型权重下载 + - 下载数据集(请查看 DATA.md 下载指定测试集的数据,并整理成所需目录结构)放置`${data_download_path}`目录\ + 以VQAv2为例,将文件按照文档说明整理为如下格式: + ``` + `${data_download_path}` + ├── train2014 + │ ├── COCO_train2014_000000000009.jpg + | └── ... + ├── val2014 + | ├── COCO_val2014_000000000042.jpg + | └── ... + ├── test2015 + | ├── COCO_test2015_000000000001.jpg + | └── ... + ├── v2_OpenEnded_mscoco_train2014_questions.json + ├── v2_OpenEnded_mscoco_val2014_questions.json + ├── v2_OpenEnded_mscoco_test2015_questions.json + ├── v2_OpenEnded_mscoco_test-dev2015_questions.json + ├── v2_mscoco_train2014_annotations.json + └── v2_mscoco_val2014_annotations.json + ``` + 在 `${model_path}`/unilm/vlmo 目录下新建文件 makearrow.py 内容如下: + ```python + from vlmo.utils.write_vqa import make_arrow + make_arrow('{data_download_path}', '{data_download_path}/vqa_arrow') + ``` + ```python + #对于VQA v2数据集,vlmo的write_vqa脚本不会生成分类结果与答案的映射关系,需要在`${model_path}`/unilm/vlmo/vlmo/utils/write_vqa.py 中最下方手动添加代码进行输出。 + + + # 注意行对齐 + with open(os.path.join(dataset_root, "answer2label.txt"), mode="w", encoding="utf-8") as writer: + for ans in ans2label: + to_json = { + "answer": ans, + "label": ans2label[ans] + } + writer.write("%s\n" % json.dumps(to_json)) + ``` + + 执行该脚本,将会在 vqa_arrow文件夹下生成相应的二进制数据集文件: + ```shell + python makearrow.py + ``` + 生成目录结构如下: + ``` + `${data_download_path}`W + arrow + ├── vqav2_val.arrow + ├── vqav2_trainable_val.arrow + ├── vqav2_train.arrow + ├── vqav2_test.arrow + ├── vqav2_test-dev.arrow + ├── vqav2_test.arrow + └── answer2label.txt + ``` + - 下载Bert 词表 + ``` + https://huggingface.co/google-bert/bert-base-uncased/tree/main + ``` + 在Files and versions 页签中找到 vocab.txt 下载后放入 `${model_download_path}` 中备用。 + - 拷贝文件 + - 将大模型加速库中 vlmo 相关的 文件替换至 model_path 中的指定路径 + ```shell + cd ${llm_path}/pytorch/examples/vlmo/ + cp multiway_transformer.py ${model_path}/unilm/vlmo/vlmo/modules + cp vlmo_module.py ${model_path}/unilm/vlmo/vlmo/modules + cp objectives.py ${model_path}/unilm/vlmo/vlmo/modules + cp vlmo_utils.py ${model_path}/unilm/vlmo/vlmo/modules + cp run_ascend_vqa.py ${model_path}/unilm/vlmo/ + cp run_ascend_vqa.sh ${model_path}/unilm/vlmo/ + cp cut_model_util.py ${model_path}/unilm/vlmo/ + cp cut_ascend_vqa.py ${model_path}/unilm/vlmo/ + cp cut_model_and_run.sh ${model_path}/unilm/vlmo/ + ``` + - 修改配置 + 以VQA v2 task_finetune_vqa_base_image480 微调评估为例。\ + 打开 `${model_path}`/unilm/vlmo/run_ascend_vqa.sh \ + 修改 `` 为 `${model_download_path}`;修改 `` 为 task_finetune_vqa_base_image480 + + 打开 `${model_path}`/unilm/vlmo/run_ascend_vqa.py \ + 修改 `VQA_ARROW_DIR` 路径为 '`${data_download_path}`/arrow' ;修改 `` 为 '`${model_download_path}`/vocab.txt' + 修改 DEVICE_ID 后的值可选择在哪张卡上运行 + - 执行推理 + - 单芯推理 run_ascend_vqa.sh + ```shell + bash run_ascend_vqa.sh + ``` + - 双芯推理 cut_model_and_run.sh + - 
修改双芯推理配置 + 打开 `${model_path}/unilm/vlmo/cut_model_and_run.sh` 修改input_path为`${model_download_path}`;修改 `CONFIG_NAME` 后的值为 task_finetune_vqa_base_image480 + 打开 `${model_path}/unilm/vlmo/cut_ascend_vqa.py` \ + 修改 `VQA_ARROW_DIR` 路径为 '`${data_download_path}`/arrow' ;修改 `` 为 '`${model_download_path}`/vocab.txt'。 + 修改 DEVICE_ID 后的值可选择在哪张卡上运行 + - 第一次执行为切分权重,第二次执行为进行双芯推理。 + ```shell + bash cut_model_and_run.sh + ``` + + + +**运行Flash Attention BF16** +- 暂不支持 + +**运行Flash Attention W8A8** +- 运行启动脚本 +- 暂不支持 + +**运行Flash Attention W8A16** +- 暂不支持 + +**运行Paged Attention FP16** +- 暂不支持 + +**运行Paged Attention BF16** +- 暂不支持 + +**运行Paged Attention W8A8** +- 暂不支持 + +**运行Paged Attention W8A16** +- 暂不支持 + +**运行KV cache量化** +- 待补充 + +**运行稀疏量化** +- 暂不支持 + +**运行MOE量化** +- 待补充 + +## 精度测试 +模型运行完毕后,会在日志中打印出accuracy(正确率/精度数据)mean of cost(平均耗时/性能数据)作为对比参考 + +## 性能测试 +模型运行完毕后,会在日志中打印出accuracy(正确率/精度数据)mean of cost(平均耗时/性能数据)作为对比参考 + +## FAQ +### + +1. ImportError: /root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1: cannot allocate memory in static TLS block + +如果遇到 + +```text +Traceback (most recent call last): + File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/__init__.py", line 31, in + import torch_npu.npu + File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/__init__.py", line 46, in + from .utils import (is_initialized, _lazy_call, _lazy_init, init, set_dump, + File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/utils.py", line 27, in + import torch_npu._C +ImportError: /root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1: cannot allocate memory in static TLS block +Segmentation fault (core dumped) +``` + +则可取消run_inf_ascend_*.sh 脚本中的注释,修改为报错中相应的路径。如 + +```shell +LD_PRELOAD=/root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1:$LD_PRELOAD +``` diff --git a/mindie/examples/models/yi/README.md b/mindie/examples/models/yi/README.md new file mode 100644 index 00000000..1896d760 --- /dev/null +++ b/mindie/examples/models/yi/README.md @@ -0,0 +1,121 @@ +# README + +[Yi系列模型](https://huggingface.co/01-ai) 是由 01.AI 从头开始训练的新一代开源大型语言模型。[Yi系列模型](https://huggingface.co/01-ai) 以双语语言模型为目标,在 3T 多语种语料库上进行训练,已成为全球最强大的 LLM 之一,在语言理解、常识推理、阅读理解等方面展示出良好的前景。 + +- 此代码仓中实现了一套基于NPU硬件的Yi系列模型。配合加速库使用,旨在NPU上获得极致的推理性能。 + +# 特性矩阵 +- 此矩阵罗列了各Yi模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|-------------------------|-------------------------|------|------|-----------------|-----------------|---------|---------|--------------|----------|--------|--------|-----|-----| +| Yi-6B-200K | 支持world size 1,2,4,8 | 支持world size 2,4 | 是 | 是 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | +| Yi-34B | 支持world size 4,8 | 否 | 是 | 是 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | +| Yi-34B-200K | 支持world size 4,8 | 否 | 是 | 是 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 是 | + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | ATB_Models模型仓所在路径;若使用编译好的包,则路径为`${working_dir}/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models/` | +| script_path | 脚本所在路径; Yi系列模型的工作脚本所在路径为`${llm_path}/examples/models/yi` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** + +- [Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K) +- [Yi-34B](https://huggingface.co/01-ai/Yi-34B) +- [Yi-34B-200K](https://huggingface.co/01-ai/Yi-34B-200K) + +**权重转换** +> 
若权重中不包含safetensors格式,则执行权重转换步骤,否则跳过 +- 参考[此README文件](../../README.md) + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 + +**运行Paged Attention BF16** +- 运行启动脚本 + - 将`${llm_path}`加入`PYTHONPATH`搜索目录 + ```shell + export PYTHONPATH=${llm_path}:${PYTHONPATH} + ``` + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +**运行Paged Attention FP16** +- 运行启动脚本 + - 与“运行Paged Attention BF16”的启动方式相同 +- 环境变量说明 + - 参见“运行Paged Attention BF16”中的环境变量说明 +- 相比于BF16,运行FP16时需修改${weight_path}/config.json中的`torch_dtype`字段,将此字段对应的值修改为`float16` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + # 运行Paged Attention BF16 + bash run.sh pa_bf16 full_CEval 5 1 yi ${weight_path} 8 + # 运行Paged Attention FP16 + bash run.sh pa_fp16 full_CEval 5 1 yi ${weight_path} 8 + ``` + +- 长序列 + - 涉及21个数据集,1w多条数据,在800I A2上需要运行20+小时,数据最长为 76839。 + - 长序列精度测试示例 + ```shell + bash run.sh pa_fp16 full_LongBench 1 yi ${weight_path} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + # 运行Paged Attention BF16 + bash run.sh pa_bf16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 yi ${weight_path} 8 + # 运行Paged Attention FP16 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 yi ${weight_path} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/yi/run_pa.sh b/mindie/examples/models/yi/run_pa.sh new file mode 100644 index 00000000..ac0f784b --- /dev/null +++ b/mindie/examples/models/yi/run_pa.sh @@ -0,0 +1,23 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
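+# 以下为使用示意(权重路径为假设的示例值):
+#   bash ${script_path}/run_pa.sh /path/to/Yi-34B
+# 脚本会根据 ASCEND_RT_VISIBLE_DEVICES 中设备的个数计算 world_size;
+# 若在外部设置了 TP_WORLD_SIZE=1,则以单进程方式运行,否则通过 torchrun 多进程拉起。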
+# 参数配置以及启动指令的说明见同级目录下的README.md文件 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MASTER_PORT=20030 + +# 以下环境变量与性能和内存优化相关,通常情况下无需修改 +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 +fi \ No newline at end of file diff --git a/mindie/examples/models/ziya/README.md b/mindie/examples/models/ziya/README.md new file mode 100644 index 00000000..c8e7b556 --- /dev/null +++ b/mindie/examples/models/ziya/README.md @@ -0,0 +1,94 @@ +# README + +- [Ziya-Coding-34B-v1.0](https://huggingface.co/IDEA-CCNL/Ziya-Coding-34B-v1.0) 是IDEA研究院封神榜团队开源的代码大模型。 + + +# 特性矩阵 +- 此矩阵罗列Ziya-Coding模型支持的特性 + +| 模型及参数量 | 800I A2 Tensor Parallelism | 300I DUO Tensor Parallelism | FP16 | BF16 | Flash Attention | Paged Attention | W8A8量化 | W8A16量化 | KV cache量化 | 稀疏量化 | MOE量化 | MindIE Service | TGI | 长序列 | +|-------------|----------------------------|-----------------------------|------|----------------------|-----------------|-----------------|---------|-----------|--------------|--------------------------|-----|--------|-----|-----| +| Ziya-Coding-34B-v1.0 | 支持world size 4,8 | 支持world size 2,4 | 是 | 否 | 否 | 是 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | 否 | + +- 此模型仓已适配的模型版本 + - Ziya-Coding-34B-v1.0 (transformers==4.36.2) + +# 使用说明 + +## 路径变量解释 +| 变量名 | 含义 | +|--------|--------------------------------------------------| +| working_dir | 加速库及模型库下载后放置的目录 | +| llm_path | 模型仓所在路径。若使用编译好的包,则路径为`${working_dir}/MindIE-LLM/`;若使用gitee下载的代码,则路径为`${working_dir}/MindIE-LLM/examples/atb_models` | +| script_path | 脚本所在路径;Ziya-Coding-34B-v1.0的工作脚本所在路径为`${llm_path}/examples/models/ziya` | +| weight_path | 模型权重路径 | + +## 权重 +**权重下载** + +- [Ziya-Coding-34B-v1.0](https://huggingface.co/IDEA-CCNL/Ziya-Coding-34B-v1.0) + +**权重转换** +- 参考[此README文件](../../README.md) + + +**基础环境变量** +- 参考[此README文件](../../../README.md) + +## 推理 + +### 对话测试 +**运行Paged Attention FP16** +- 运行启动脚本 + - 在\${llm_path}目录下执行以下指令 + ```shell + bash ${script_path}/run_pa.sh ${weight_path} + ``` +- 环境变量说明 + - `export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` + - 指定当前机器上可用的逻辑NPU核心,多个核心间使用逗号相连 + - 核心ID查阅方式见[此README文件](../../README.md)的【启动脚本相关环境变量】章节 + - 对于300I DUO卡而言,若要使用单卡双芯,请指定至少两个可见核心;若要使用双卡四芯,请指定至少四个可见核心 + - 各模型支持的核心数参考“特性矩阵” + - `export MASTER_PORT=20030` + - 设置卡间通信端口 + - 默认使用20030端口 + - 目的是为了避免同一台机器同时运行多个多卡模型时出现通信冲突 + - 设置时端口建议范围为:20000-20050 + - 以下环境变量与性能和内存优化相关,通常情况下无需修改 + ```shell + export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 + export INF_NAN_MODE_ENABLE=0 + export ATB_OPERATION_EXECUTE_ASYNC=1 + export TASK_QUEUE_ENABLE=1 + export ATB_CONVERT_NCHW_TO_ND=1 + export LCCL_ENABLE_FALLBACK=1 + export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 + export ATB_CONTEXT_WORKSPACE_SIZE=0 + ``` + +## 精度测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + bash run.sh pa_fp16 full_BoolQ 1 ziya ${Ziya-Coding-34B-v1.0权重路径} 8 + ``` + +## 性能测试 +- 参考[此README文件](../../../tests/modeltest/README.md) + - 示例 + ```shell + cd ${llm_path}/tests/modeltest + 
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MAX_MEMORY_GB=29 + export ATB_LLM_BENCHMARK_ENABLE=1 + bash run.sh pa_fp16 performance [[2048,2048],[1024,1024],[512,512],[256,256]] 1 ziya ${Ziya-Coding-34B-v1.0权重路径} 8 + ``` + +## FAQ +- 更多环境变量见[此README文件](../../README.md) +- 对话测试实际执行的Python文件为`${llm_path}/examples/run_fa.py`和`${llm_path}/examples/run_pa.py`;这两个文件的参数说明见[此README文件](../../README.md) +- 运行时,需要通过指令pip list|grep protobuf确认protobuf版本,如果版本高于3.20.x,请运行指令pip install protobuf==3.20.0进行更新 diff --git a/mindie/examples/models/ziya/run_pa.sh b/mindie/examples/models/ziya/run_pa.sh new file mode 100644 index 00000000..2ba86f45 --- /dev/null +++ b/mindie/examples/models/ziya/run_pa.sh @@ -0,0 +1,21 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +export ASCEND_RT_VISIBLE_DEVICES=2,3,4,5 +export MASTER_PORT=20030 + +export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 +export INF_NAN_MODE_ENABLE=0 +export ATB_OPERATION_EXECUTE_ASYNC=1 +export TASK_QUEUE_ENABLE=1 +export ATB_CONVERT_NCHW_TO_ND=1 +export LCCL_ENABLE_FALLBACK=1 +export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1 +export ATB_CONTEXT_WORKSPACE_SIZE=0 +export INT8_FORMAT_NZ_ENABLE=1 + +world_size=$(($(echo "${ASCEND_RT_VISIBLE_DEVICES}" | grep -o , | wc -l) +1)) + +if [ "$TP_WORLD_SIZE" == "1" ]; then + python -m examples.run_pa --model_path $1 +else + torchrun --nproc_per_node $world_size --master_port $MASTER_PORT -m examples.run_pa --model_path $1 +fi diff --git a/mindie/examples/run_fa.py b/mindie/examples/run_fa.py new file mode 100644 index 00000000..8f8eb646 --- /dev/null +++ b/mindie/examples/run_fa.py @@ -0,0 +1,231 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import argparse +import json +import os +import time +import torch +from transformers import StoppingCriteria, StoppingCriteriaList + +from atb_llm.runner import ModelRunner +from atb_llm.utils.log import logger, print_log +from atb_llm.utils.file_utils import safe_open + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', help="model and tokenizer path") + parser.add_argument( + '--input_text', + type=str, + nargs='+', + default="What's deep learning?") + parser.add_argument( + '--input_file', + type=str, + help='CSV or Numpy file containing tokenized input. 
Alternative to text input.', + default=None) + parser.add_argument('--max_input_length', type=int, default=512) + parser.add_argument('--max_output_length', type=int, default=20) + parser.add_argument('--max_position_embeddings', type=int, default=None) + parser.add_argument("--batch_size", type=int, default=1) + + parser.add_argument('--is_flash_causal_lm', action='store_true') + + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + + parser.add_argument('--inputs_embeds_dir', type=str, default=None, + help='Directory of .pt files containing inputs_embeds.') + parser.add_argument('--min_length', type=int, default=10) + parser.add_argument('--stop_words_ids', type=json.loads, default=None) + parser.add_argument('--do_sample', type=bool, default=False) + parser.add_argument('--results_save_path', type=str, default=None, + help='File path to save inference results.') + + return parser.parse_args() + + +class StoppingCriteriaSub(StoppingCriteria): + def __init__(self, stops=None): + super().__init__() + if stops is None: + stops = [] + self.stops = stops + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs): + for stop in self.stops: + if torch.all(torch.eq(input_ids[:, -len(stop):], stop)).item(): + return True + return False + + +class FARunner: + def __init__(self, **kwargs): + self.rank = kwargs.get('rank', '0') + self.local_rank = kwargs.get('local_rank', self.rank) + self.world_size = kwargs.get('world_size', '1') + + self.model_path = kwargs.get('model_path', None) + self.max_input_length = kwargs.get('max_input_length', None) + self.max_output_length = kwargs.get('max_output_length', None) + self.max_position_embeddings = kwargs.get('max_position_embeddings', None) + self.is_flash_causal_lm = kwargs.get('is_flash_causal_lm', False) + self.batch_size = kwargs.get('batch_size', None) + + self.model = ModelRunner( + self.model_path, rank=self.rank, world_size=self.world_size, + local_rank=self.local_rank, + is_flash_causal_lm=self.is_flash_causal_lm, + max_position_embeddings=self.max_position_embeddings, + ) + self.tokenizer = self.model.tokenizer + self.device = self.model.device + self.dtype = self.model.dtype + self.quantize = self.model.quantize + self.kv_quant = self.model.kv_quant + self.model.load_weights() + + self.skip_word_embedding = False + if hasattr(self.model.model, 'skip_word_embedding'): + self.skip_word_embedding = self.model.model.skip_word_embedding + + def warm_up(self): + print_log(self.rank, logger.info, "---------------begin warm_up---------------") + dummy_input_ids_full = torch.randint( + 0, 32000, [self.batch_size, self.max_input_length], dtype=torch.long).npu() + self.model.generate(inputs=dummy_input_ids_full, do_sample=False, max_new_tokens=10) + print_log(self.rank, logger.info, "---------------end warm_up---------------") + + def infer(self, input_text): + print_log(self.rank, logger.info, "---------------begin inference---------------") + if isinstance(input_text, str): + input_text = [input_text] * self.batch_size + + inputs = self.tokenizer(input_text, return_tensors="pt", padding='max_length', + max_length=self.max_input_length, + truncation=True) + + prefill_start_time = time.time() + with torch.no_grad(): + 
self.model.generate( + inputs=inputs.input_ids.npu(), + attention_mask=inputs.attention_mask.npu(), + max_new_tokens=1 + ) + prefill_end_time = time.time() + + decode_start_time = time.time() + with torch.no_grad(): + generate_ids = self.model.generate( + inputs=inputs.input_ids.npu(), + attention_mask=inputs.attention_mask.npu(), + max_new_tokens=self.max_output_length + ) + decode_end_time = time.time() + + generate_text = self.tokenizer.batch_decode( + generate_ids[:, self.max_input_length:], skip_special_tokens=True, + clean_up_tokenization_spaces=False) + if self.rank == 0: + + logger.info(f'{inputs.input_ids.shape=}') + + input_tokens_num = len(inputs.input_ids[0]) + generate_tokens_num = len(generate_ids[0]) - len(inputs.input_ids[0]) + logger.info(f'Question: {input_text[0]}') + logger.info(f'Answer: {generate_text[0][:-generate_tokens_num]}') + logger.info(f'Input token num: {input_tokens_num}') + logger.info(f'Generate token num: {generate_tokens_num}') + + logger.info("---------------end inference---------------") + + prefill_time = (prefill_end_time - prefill_start_time) + e2e_time = (decode_end_time - decode_start_time) + try: + decode_average_time = (e2e_time - prefill_time) / (self.max_output_length - 1) + except ZeroDivisionError as e: + raise ZeroDivisionError from e + logger.info( + f"Prefill time: {prefill_time * 1000}ms, " + f"Decode average time: {decode_average_time * 1000}ms, " + f"E2E time: {e2e_time}s") + + def infer_from_embeds(self, args): + if rank == 0: + logger.info("---------------begin inference---------------") + + stop_words_ids = [torch.tensor(ids).npu() for ids in args.stop_words_ids] + stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) + + generation_args = { + "inputs_embeds": None, + 'min_length': args.min_length, + 'max_new_tokens': args.max_output_length, + "stopping_criteria": stopping_criteria, + 'do_sample': args.do_sample, + 'num_beams': args.num_beams, + 'top_p': args.top_p, + 'temperature': args.temperature, + 'repetition_penalty': args.repetition_penalty, + 'length_penalty': args.length_penalty, + } + + image_answer_pairs = {} + for inputs_embeds_file_path in sorted([os.path.join(args.inputs_embeds_dir, _) + for _ in os.listdir(args.inputs_embeds_dir)]): + + if not inputs_embeds_file_path.endswith(".pt"): + continue + + if rank == 0: + logger.info(f'NO.{len(image_answer_pairs) + 1}') + logger.info(f'inputs_embeds_file_path: {inputs_embeds_file_path}') + + inputs_embeds = torch.load(inputs_embeds_file_path).npu() + generation_args["inputs_embeds"] = inputs_embeds + + with torch.no_grad(): + generate_ids = self.model.generate(**generation_args) + + output_text = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True) + output_text = output_text.split('###')[0] # remove the stop sign '###' + output_text = output_text.split('Assistant:')[-1].strip() + image_answer_pairs[inputs_embeds_file_path] = output_text + + if rank == 0: + logger.info(f'Answer: {output_text}') + with safe_open(args.results_save_path, "w", encoding='utf-8') as f: + json.dump(image_answer_pairs, f) + logger.info('json dump finished') + + if rank == 0: + logger.info("---------------end inference---------------") + + +if __name__ == '__main__': + arguments = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + input_dict = { + 'rank': rank, + 'world_size': world_size, + 'local_rank': local_rank, + **vars(arguments) + } + + fa_runner 
= FARunner(**input_dict) + + if fa_runner.skip_word_embedding: + fa_runner.infer_from_embeds(arguments) + else: + fa_runner.warm_up() + fa_runner.infer(arguments.input_text) + diff --git a/mindie/examples/run_pa.py b/mindie/examples/run_pa.py new file mode 100644 index 00000000..5d9002ba --- /dev/null +++ b/mindie/examples/run_pa.py @@ -0,0 +1,363 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import argparse +import copy +import json +import math +import os +import time + +import torch +import torch_npu +from atb_llm.runner import ModelRunner +from atb_llm.utils.cpu_binding import NpuHbmInfo +from atb_llm.utils.env import ENV +from atb_llm.utils.log import logger, print_log +from atb_llm.utils.file_utils import safe_open +from examples.server.cache import CacheConfig, ModelConfig, CacheManager +from examples.server.generate import decode_token, generate_req +from examples.server.request import request_from_token + + +class PARunner: + def __init__(self, **kwargs): + self.rank = kwargs.get('rank', '0') + self.local_rank = kwargs.get('local_rank', self.rank) + self.world_size = kwargs.get('world_size', '1') + + self.model_path = kwargs.get('model_path', None) + self.input_text = kwargs.get('input_text', None) + + self.max_batch_size = kwargs.get('max_batch_size', None) + self.max_input_length = kwargs.get('max_input_length', None) + self.max_output_length = kwargs.get('max_output_length', None) + self.max_position_embeddings = kwargs.get('max_position_embeddings', None) + self.max_prefill_tokens = kwargs.get('max_prefill_tokens', None) + + self.block_size = kwargs.get('block_size', None) + self.chat_template = kwargs.get('chat_template', None) + self.is_flash_model = kwargs.get('is_flash_model', None) + + self.model = ModelRunner( + self.model_path, rank=self.rank, world_size=self.world_size, + local_rank=self.local_rank, + max_position_embeddings=self.max_position_embeddings + ) + self.tokenizer = self.model.tokenizer + if self.chat_template: + self.tokenizer.chat_template = self._load_chat_template(self.chat_template) + self.dtype = self.model.dtype + self.quantize = self.model.quantize + self.kv_quant = self.model.kv_quant + self.model.load_weights() + + self.device = self.model.device + self.model_config = ModelConfig(self.model.num_heads, + self.model.num_kv_heads, + self.model.head_size, + self.model.num_layers, + self.model.device, + self.model.dtype, + self.model.soc_info, + self.kv_quant) + + self.max_memory = NpuHbmInfo.get_hbm_capacity(self.local_rank, self.world_size, self.model.soc_info.need_nz) + self.init_memory = int( + self.max_memory * NpuHbmInfo.get_hbm_usage(self.local_rank, self.world_size, self.model.soc_info.need_nz)) + print_log(self.rank, logger.info, f'hbm_capacity(GB): {self.max_memory / (1024 ** 3)}, ' + f'init_memory(GB): {self.init_memory / (1024 ** 3)}') + + self.warm_up_memory = 0 + self.warm_up_num_blocks = 0 + self.cache_manager = None + + def __repr__(self): + return ( + "PARunner(" + + f"model_path={self.model_path}, " + + f"input_text={self.input_text}, " + + f"max_position_embeddings={self.max_position_embeddings}, " + + f"max_input_length={self.max_input_length}, " + + f"max_output_length={self.max_output_length}, " + + f"max_prefill_tokens={self.max_prefill_tokens}, " + + f"is_flash_model={self.is_flash_model}, " + + f"max_batch_size={self.max_batch_size}, " + + f"dtype={self.dtype}, " + + f"block_size={self.block_size}, " + + f"model_config={self.model_config}, " + + f"max_memory={self.max_memory}, " + ) + + 
@staticmethod + def _load_chat_template(chat_template: str): + if os.path.exists(chat_template): + with open(chat_template, "r", encoding="utf-8") as f: + chat_template_content = f.read() + else: + chat_template_content = chat_template + if chat_template_content: + print_log(int(os.getenv("RANK", "0")), logger.info, f"Using chat template:\n{chat_template_content}") + return chat_template_content + + def warm_up(self): + if self.max_prefill_tokens == -1: + self.max_prefill_tokens = self.max_batch_size * (self.max_input_length + self.max_output_length) + all_input_length = self.max_batch_size * self.max_input_length + input_ids = torch.ones(all_input_length, dtype=torch.int64).to(self.device) + position_ids = torch.arange(self.max_input_length, dtype=torch.int32).repeat(self.max_batch_size).to( + self.device) + cu_seqlen_prefill = torch.tensor([1]) + try: + block_num = math.ceil(all_input_length / self.block_size) + except ZeroDivisionError as e: + raise ZeroDivisionError from e + block_tables_tensor = torch.arange(block_num, dtype=torch.int32).view(1, -1).to(self.device) + slots = torch.arange(all_input_length, dtype=torch.int32).to(self.device) + input_lengths_tensor = torch.tensor( + [self.max_input_length] * self.max_batch_size, dtype=torch.int64 + ).to(self.device) + prefill_head_indices = torch.tensor([all_input_length - 1], dtype=torch.int64).to(self.device) + print_log(self.rank, logger.info, "---------------begin warm_up---------------") + try: + self.warm_up_num_blocks = math.ceil((self.max_input_length + self.max_output_length) / + self.block_size) * self.max_batch_size + except ZeroDivisionError as e: + raise ZeroDivisionError from e + cache_config = CacheConfig(self.warm_up_num_blocks, self.block_size) + self.cache_manager = CacheManager(cache_config, self.model_config) + self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + is_prefill=cu_seqlen_prefill is not None, + block_tables=block_tables_tensor, + kv_cache=self.cache_manager.kv_cache, + slots=slots, + input_lengths=input_lengths_tensor, + max_seq_len=self.max_input_length, + lm_head_indices=prefill_head_indices + ) + self.warm_up_memory = int( + self.max_memory * NpuHbmInfo.get_hbm_usage(self.local_rank, self.world_size, self.model.soc_info.need_nz)) + print_log(self.rank, logger.info, f'warmup_memory(GB): {self.warm_up_memory / (1024 ** 3): .2f}') + print_log(self.rank, logger.info, "---------------end warm_up---------------") + + def infer(self, inputs, batch_size, max_output_length, ignore_eos, is_chat_model=False, **kwargs): + print_log(self.rank, logger.info, "---------------begin inference---------------") + if ignore_eos: + self.model.postprocessor.eos_token_id = [] + is_truncation = kwargs.get("truncation", False) + input_ids = self._build_model_inputs(inputs, is_chat_model, is_truncation) + if len(input_ids) == 1: + req_list = [request_from_token(input_ids[0], max_output_length, self.block_size, req_idx=idx) + for idx in range(batch_size)] + else: + req_list = [request_from_token(input_ids_ins, max_output_length, self.block_size, req_idx=idx) + for idx, input_ids_ins in enumerate(input_ids)] + print_log(self.rank, logger.debug, f'req_list[0].input_ids: {req_list[0].input_ids}') + + if not self.cache_manager: + if self.max_prefill_tokens == -1: + self.max_prefill_tokens = self.max_batch_size * (self.max_input_length + self.max_output_length) + cache_block_size = self.block_size * self.model.num_kv_heads * self.model.head_size + dtype_size = CacheManager.get_dtype_size(self.dtype) + 
total_cache_size = self.model.num_layers * cache_block_size * 2 * dtype_size + + max_memory = ENV.memory_fraction * self.max_memory \ + if not ENV.max_memory_gb else int(ENV.max_memory_gb) * (1 << 30) + free_memory = max_memory - ENV.reserved_memory_gb * (1 << 30) - ( + self.warm_up_memory if self.warm_up_memory != 0 else self.init_memory) + print_log(self.rank, logger.info, + f"infer max_memory(GB): {max_memory / (1024 ** 3): .2f}, " + f"warm_up_memory(GB): {self.warm_up_memory / (1024 ** 3): .2f}, " + f"free_memory(GB): {free_memory / (1024 ** 3): .2f}") + + num_blocks = int(free_memory // total_cache_size) + print_log(self.rank, logger.info, f"num_blocks: {num_blocks}, free_memory: {free_memory}") + cache_config = CacheConfig(num_blocks, self.block_size) + self.cache_manager = CacheManager(cache_config, self.model_config) + + if ENV.benchmark_enable: + req_list_dummy = copy.deepcopy(req_list) + self.model.postprocessor.max_new_tokens = 2 + generate_req(req_list_dummy, self.model, self.max_batch_size, self.max_prefill_tokens, self.cache_manager) + + self.model.postprocessor.max_new_tokens = max_output_length + skip_special_tokens = kwargs.get("skip_special_tokens", False) + if not ENV.profiling_enable: + print_log(self.rank, logger.debug, "no profiling") + torch.npu.synchronize() + e2e_start = time.time() + generate_req(req_list, self.model, self.max_batch_size, self.max_prefill_tokens, self.cache_manager) + _, _ = decode_token(req_list, self.tokenizer, skip_special_tokens) + torch.npu.synchronize() + e2e_end = time.time() + e2e_time = e2e_end - e2e_start + else: + print_log(self.rank, logger.debug, "enter profiling") + profiling_path = ENV.profiling_filepath + if not os.path.exists(profiling_path): + os.makedirs(profiling_path, exist_ok=True) + profiler_level = torch_npu.profiler.ProfilerLevel + target_level = "Level" + ENV.profiling_level + if not hasattr(profiler_level, target_level): + raise NotImplementedError(f"target_level: {target_level} is not implemented" + f" in torch_npu.profiler.ProfilerLevel") + actual_profiler_level = getattr(profiler_level, target_level) + torch.npu.synchronize() + e2e_start = time.time() + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=actual_profiler_level, + l2_cache=False, + data_simplification=False + ) + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU + ], + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(profiling_path), + record_shapes=True, + profile_memory=True, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config): + generate_req(req_list, self.model, self.max_batch_size, self.max_prefill_tokens, self.cache_manager) + torch.npu.synchronize() + e2e_end = time.time() + e2e_time = e2e_end - e2e_start + + generate_text_list, token_num_list = decode_token(req_list, self.tokenizer, skip_special_tokens) + if ENV.token_ids_save_enable: + if self.local_rank == 0: + for idx, req in enumerate(req_list): + input_ids_save_filename = f"input_ids_{idx}.pth" + output_ids_save_filename = f"output_ids_{idx}.txt" + torch.save(req.input_ids.cpu(), + os.path.join(ENV.token_ids_save_folder, input_ids_save_filename)) + output_path = os.path.join(ENV.token_ids_save_folder, output_ids_save_filename) + with safe_open(output_path, "w", encoding='utf-8') as f: + f.write(' '.join(map(str, req.out_token_list))) + print_log(self.rank, 
logger.info, "---------------end inference---------------") + return generate_text_list, token_num_list, e2e_time + + def _build_model_inputs(self, inputs, is_chat_model, is_truncation=False): + input_texts, input_ids, input_conversations = [], [], [] + if isinstance(inputs, list) and inputs: + if isinstance(inputs[0], str): + input_texts = inputs + elif isinstance(inputs[0], torch.Tensor): + input_ids = inputs + elif isinstance(inputs[0], list) and inputs[0]: + if isinstance(inputs[0][0], int): + input_ids = inputs + elif isinstance(inputs[0][0], dict): + input_conversations = inputs + if not (input_texts or input_ids or input_conversations): + raise ValueError(f"The inputs of `PARunner.infer` must be as List[str], List[torch.Tensor], List[List[int]]" + f" or List[List[Dict]]. Now the inputs ({inputs}) is not acceptable or is empty.") + if is_chat_model: + if input_conversations: + input_ids = self.model.build_inputs(input_conversations) + elif input_texts: + input_conversations = [[{"role": "user", "content": t}] for t in input_texts] + input_ids = self.model.build_inputs(input_conversations) + else: + print_log(self.rank, logger.warning, "Neither conversations nor input_texts exist, " + "'chat' parameter is not effective.") + elif input_texts: + input_ids = [self.tokenizer([text], return_tensors="pt", truncation=is_truncation)["input_ids"].flatten() + for text in input_texts] + return input_ids + + +def cmd_bool(cmd_arg): + if cmd_arg == "True": + return True + elif cmd_arg == "False": + return False + raise ValueError(f"{cmd_arg} should be a boolean") + + +def parse_ids(list_str): + return [int(item) for item in list_str.split(',')] + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', help="model and tokenizer path") + parser.add_argument( + '--input_texts', + type=str, + nargs='+', + default=["What's deep learning?"]) + parser.add_argument( + '--input_ids', + type=parse_ids, + nargs='+', + default=None) + parser.add_argument( + '--input_file', + type=str, + help='CSV or Numpy file containing tokenized input. 
Alternative to text input.', + default=None) + + parser.add_argument("--max_batch_size", type=int, default=1) + parser.add_argument('--max_input_length', type=int, default=1024) + parser.add_argument('--max_output_length', type=int, default=20) + parser.add_argument('--max_position_embeddings', type=int, default=None) + parser.add_argument('--max_prefill_tokens', type=int, default=-1) + + parser.add_argument("--block_size", type=int, default=128) + parser.add_argument('--chat_template', type=str, default=None) + parser.add_argument('--ignore_eos', action='store_true') + parser.add_argument('--is_chat_model', action='store_true') + parser.add_argument('--is_flash_model', action='store_false') + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_arguments() + + rank = int(os.getenv("RANK", "0")) + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + input_dict = { + 'rank': rank, + 'world_size': world_size, + 'local_rank': local_rank, + **vars(args) + } + + if args.input_ids: + infer_inputs = args.input_ids + else: + infer_inputs = args.input_texts + if args.is_chat_model and args.input_file: + conversations = [] + with open(args.input_file, 'r', encoding='utf-8') as file: + for line in file: + data_line = json.loads(line) + conversations.append(data_line) + infer_inputs = conversations + + pa_runner = PARunner(**input_dict) + print_log(rank, logger.info, f'pa_runner: {pa_runner}') + pa_runner.warm_up() + + infer_params = { + "inputs": infer_inputs, + "batch_size": args.max_batch_size, + "max_output_length": args.max_output_length, + "ignore_eos": args.ignore_eos, + "is_chat_model": args.is_chat_model + } + generate_texts, token_nums, _ = pa_runner.infer(**infer_params) + + length = len(infer_inputs) + for i, generate_text in enumerate(generate_texts): + if i < length: + print_log(rank, logger.info, f'Question[{i}]: {infer_inputs[i]}') + print_log(rank, logger.info, f'Answer[{i}]: {generate_text}') + print_log(rank, logger.info, f'Generate[{i}] token num: {token_nums[i]}') diff --git a/mindie/examples/server/__init__.py b/mindie/examples/server/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindie/examples/server/batch.py b/mindie/examples/server/batch.py new file mode 100644 index 00000000..757fc786 --- /dev/null +++ b/mindie/examples/server/batch.py @@ -0,0 +1,156 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
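+# Batch:将一组 Request 的 input_ids/position_ids/slots 等拼接成一次前向所需的扁平张量;
+# prefill 之后复用同一对象做增量 decode,concatenate 用于合并多个 batch,filter 用于剔除已完成请求并释放其缓存。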
+from typing import List +import torch +from atb_llm.utils.log import logger +from .request import Request + + +class Batch: + req_ids: List[int] + req_list: List[Request] + batch_num: int + + cu_seqlen_prefill: torch.Tensor + batch_input_ids: torch.Tensor + batch_position_ids: torch.Tensor + + batch_block_tables: torch.Tensor + batch_slots_tables: torch.Tensor + batch_slot_indices: torch.Tensor + + context_length: torch.Tensor + max_s: int + lm_head_indices: torch.Tensor + + def __init__(self, req_list: List[Request]): + self.req_list = req_list + self.batch_num = len(req_list) + + self.req_ids = [req.req_id for req in req_list] + input_ids_list = [] + position_ids_list = [] + slot_indices_list = [] + context_length_list = [] + self.max_s = 0 + slot_offset = 0 + + for req in self.req_list: + context_length = req.input_ids.size(0) + input_ids_list.append(req.input_ids) + position_ids = torch.arange(context_length, dtype=torch.long) + position_ids_list.append(position_ids) + slot_indices = position_ids + slot_offset + slot_indices_list.append(slot_indices) + context_length_list.append(context_length) + self.max_s = max(self.max_s, context_length) + slot_offset += req.need_slots + + self.cu_seqlen_prefill = torch.tensor([1]) + self.batch_input_ids = torch.concat(input_ids_list, dim=0) + self.batch_position_ids = torch.concat(position_ids_list, dim=0) + self.batch_block_tables: None | torch.Tensor = None + self.batch_slots_tables: None | torch.Tensor = None + self.batch_slot_indices = torch.concat(slot_indices_list, dim=0) + self.context_length = torch.tensor(context_length_list, dtype=torch.int64) + self.lm_head_indices = torch.cumsum(self.context_length, dim=0) - 1 + + @classmethod + def concatenate(cls, batches: List["Batch"]): + req_ids = [] + req_list = [] + batch_num = 0 + input_ids_list = [batch.batch_input_ids for batch in batches] + position_ids_list = [batch.batch_position_ids for batch in batches] + block_tables_list = [] + slots_tables_list = [batch.batch_slots_tables for batch in batches] + slot_indices_list = [] + context_length_list = [batch.context_length for batch in batches] + max_s = 0 + + max_block = 0 + for batch in batches: + req_ids.extend(batch.req_ids) + req_list.extend(batch.req_list) + batch_num += batch.batch_num + max_s = max(max_s, batch.max_s) + max_block = max(max_block, batch.batch_block_tables.size(1)) + + slot_offset = 0 + for batch in batches: + cur_block = batch.batch_block_tables.size(1) + if cur_block < max_block: + zero = torch.zeros(batch.batch_num, max_block - cur_block, dtype=torch.long) + batch.batch_block_tables = torch.concat([batch.batch_block_tables, zero], dim=-1) + block_tables_list.append(batch.batch_block_tables) + slot_indices_list.append(batch.batch_slot_indices + slot_offset) + slot_offset += batch.batch_slots_tables.size(0) + + batches[0].req_ids = req_ids + batches[0].req_list = req_list + batches[0].batch_num = batch_num + batches[0].batch_input_ids = torch.concat(input_ids_list, dim=0) + batches[0].batch_position_ids = torch.concat(position_ids_list, dim=0) + batches[0].batch_block_tables = torch.concat(block_tables_list, dim=0) + batches[0].batch_slots_tables = torch.concat(slots_tables_list, dim=0) + batches[0].batch_slot_indices = torch.concat(slot_indices_list, dim=0) + batches[0].context_length = torch.concat(context_length_list, dim=0) + batches[0].max_s = max_s + + while len(batches) > 1: + del batches[1] + + def filter(self, postprocessor, cache_manager): + if self.batch_num == 0: + logger.error("batch.batch_num is 0") + raise 
AssertionError + + finish_num = 0 + finish_list = [] + + for i, req in enumerate(self.req_list): + if (postprocessor.stopping_criteria(req.out_token_list)) or \ + len(req.out_token_list) >= postprocessor.max_new_tokens: + cache_manager.free(req) + finish_num += 1 + finish_list.append(i) + + if finish_num == 0: + return 0 + + batch_mask = torch.ones(self.batch_num, dtype=torch.int64) + batch_mask[finish_list] = 0 + remain_batch = batch_mask.nonzero().flatten() + + self.batch_num -= finish_num + if self.batch_num == 0: + return finish_num + + self.batch_input_ids = self.batch_input_ids[remain_batch] + self.batch_position_ids = self.batch_position_ids[remain_batch] + self.batch_block_tables = self.batch_block_tables[remain_batch] + context_length = self.context_length[remain_batch] + self.max_s = int(context_length.max()) + + req_ids = [] + req_list = [] + slots_tables_list = [] + slot_indices_list = [] + + slot_offset = 0 + for i, req in enumerate(self.req_list): + if i in finish_list: + continue + + req_ids.append(req.req_id) + req_list.append(req) + slots_tables_list.append(req.slot_tables) + slot_indices_list.append(int(self.context_length[i]) - 1 + slot_offset) + slot_offset += req.need_slots + + self.req_ids = req_ids + self.req_list = req_list + self.batch_slots_tables = torch.concat(slots_tables_list, dim=0) + self.batch_slot_indices = torch.tensor(slot_indices_list, dtype=torch.long) + self.context_length = context_length + + return finish_num diff --git a/mindie/examples/server/cache.py b/mindie/examples/server/cache.py new file mode 100644 index 00000000..cd929a0e --- /dev/null +++ b/mindie/examples/server/cache.py @@ -0,0 +1,153 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import os +import torch + +from atb_llm.utils.log import logger + + +class CacheConfig: + def __init__(self, num_blocks=1024, block_size=128): + self.num_blocks = int(os.getenv("NUM_BLOCKS", f'{num_blocks}')) + self.block_size = int(os.getenv("BLOCK_SIZE", f'{block_size}')) + + +class ModelConfig: + def __init__(self, num_heads, num_kv_heads, head_size, num_layers, device, dtype, soc_info, kv_quant): + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_size = head_size + self.num_layers = num_layers + self.device = device + self.dtype = dtype + self.soc_info = soc_info + self.kv_quant = kv_quant + + def __repr__(self): + return ( + "ModelConfig(" + + f"num_heads={self.num_heads}, " + + f"num_kv_heads={self.num_kv_heads}, " + + f"head_size={self.head_size}, " + + f"num_layers={self.num_layers}, " + + f"device={self.device}, " + + f"dtype={self.dtype}, " + + f"soc_info={self.soc_info}, " + + f"kv_quant={self.kv_quant}, " + ) + + +class CacheManager: + def __init__(self, cache_config, model_config): + self.block_size = cache_config.block_size + self.num_blocks = cache_config.num_blocks + + self.num_heads = model_config.num_kv_heads + self.head_size = model_config.head_size + self.num_layers = model_config.num_layers + self.device = model_config.device + self.dtype = torch.int8 if model_config.kv_quant is not None else model_config.dtype + self.soc_info = model_config.soc_info + + mem_need = self.num_blocks * self.block_size * self.num_heads * self.head_size * self.num_layers * 2 * \ + self.get_dtype_size(self.dtype) / 1024 / 1024 / 1024 + logger.info(f"kv cache will allocate {mem_need}GB memory") + + if self.soc_info.need_nz: + self.kv_cache = [ + ( + torch.empty( + (self.num_blocks, self.num_heads * self.head_size // 16, self.block_size, 16), + 
dtype=self.dtype, + device=self.device, + ), + torch.empty( + (self.num_blocks, self.num_heads * self.head_size // 16, self.block_size, 16), + dtype=self.dtype, + device=self.device, + ), + ) + for _ in range(self.num_layers) + ] + else: + self.kv_cache = [ + ( + torch.empty( + (self.num_blocks, self.block_size, self.num_heads, self.head_size), + dtype=self.dtype, + device=self.device, + ), + torch.empty( + (self.num_blocks, self.block_size, self.num_heads, self.head_size), + dtype=self.dtype, + device=self.device, + ), + ) + for _ in range(self.num_layers) + ] + + random_block_allocate = os.getenv("RANDOM_BLOCK_ALLOCATE", '0') == '1' + if random_block_allocate: + self.block_map = torch.randperm(self.num_blocks, dtype=torch.long) + self.contrary_block_map = torch.zeros(self.num_blocks, dtype=torch.long) + for i in range(self.num_blocks): + self.contrary_block_map[self.block_map[i]] = i + else: + self.block_map = torch.arange(self.num_blocks, dtype=torch.long) + self.contrary_block_map = torch.arange(self.num_blocks, dtype=torch.long) + + self.free_block_mask = torch.ones(self.num_blocks, dtype=torch.long) + self.total_slots = torch.arange(self.num_blocks * self.block_size, dtype=torch.long) + self.total_slots = self.total_slots.view(self.num_blocks, self.block_size) + + @staticmethod + def get_dtype_size(dtype): + dtype_size_map = {torch.float16: 2, torch.float32: 4, torch.bfloat16: 2, torch.int8: 1} + return dtype_size_map.get(dtype, 2) + + def allocate(self, batch): + total_need_blocks = 0 + max_need_blocks = 0 + for req in batch.req_list: + if req.block_tables: + logger.error(f"req_id: {req.req_id} block has been allocated") + raise AssertionError + + total_need_blocks += req.need_blocks + max_need_blocks = max(max_need_blocks, req.need_blocks) + + free_block_indices = self.free_block_mask.nonzero().flatten() + if free_block_indices.numel() < total_need_blocks: + logger.error(f"Out of available cache blocks: asked {total_need_blocks}, " + f"only {free_block_indices.numel()} free blocks") + raise AssertionError + + allocate_block_indices = free_block_indices[:total_need_blocks] + allocate_blocks = self.block_map[allocate_block_indices] + + block_offset = 0 + block_tables_list = [] + slot_tables_list = [] + for req in batch.req_list: + req.block_tables = allocate_blocks[block_offset:block_offset + req.need_blocks] + req.slot_tables = self.total_slots[req.block_tables].flatten() + block_tables = req.block_tables + if req.need_blocks < max_need_blocks: + block_tables = torch.concat( + [block_tables, torch.zeros(max_need_blocks - req.need_blocks, dtype=torch.long)], dim=0) + block_tables_list.append(block_tables.view(1, -1)) + slot_tables_list.append(req.slot_tables) + block_offset += req.need_blocks + + batch.batch_block_tables = torch.concat(block_tables_list, dim=0) + batch.batch_slots_tables = torch.concat(slot_tables_list, dim=0) + + self.free_block_mask[allocate_block_indices] = 0 + + def free(self, req): + if req.block_tables is not None: + block_indices = self.contrary_block_map[req.block_tables] + self.free_block_mask[block_indices] = 1 + + def get_free_block_num(self): + free_block_indices = self.free_block_mask.nonzero() + return len(free_block_indices) diff --git a/mindie/examples/server/example_text.csv b/mindie/examples/server/example_text.csv new file mode 100644 index 00000000..2673dd66 --- /dev/null +++ b/mindie/examples/server/example_text.csv @@ -0,0 +1,10 @@ +请写一篇300字散文。 +讲个笑话听听。 +用通俗易懂的语言解释一下东方照? +五条直线相交,最多能有多少个交点? 
+北宋时期,上海大陆地区分属华亭县和昆山县,崇明地区属海门县。北宋淳化二年(991年),因松江上游不断淤浅,海岸线东移,大船出入不便,外来船舶只得停泊在松江的一条支流“上海浦”上(其位置在今外滩至十六铺附近的黄浦江)。将这段话译成英文。 +晚上睡不着应该怎么办? +以下天文学常识题目,哪一个是错误的? A.太阳系是指由太阳和围绕着它运行的八大行星、矮行星、卫星、小行星带和彗星组成的一个行星系统。B.卫星是指绕行星或其他天体运动的天体。C.彗星是指太阳系中一种较小的天体,其核心由冰和尘埃组成。D.按一般的天体归类方法,月球属于行星。 +以下物理常识题目,哪一个是错误的?A.在自然环境下,声音在固体中传播速度最快。B.牛顿第一定律:一个物体如果不受力作用,将保持静止或匀速直线运动的状态。C.牛顿第三定律:对于每个作用力,都有一个相等而反向的反作用力。D.声音在空气中的传播速度为1000m/s。 +请帮我撰写一份领导难以拒绝的除夕请假申请。 +用最乙方的语气告诉同事方案做的完全不行。 \ No newline at end of file diff --git a/mindie/examples/server/example_token.csv b/mindie/examples/server/example_token.csv new file mode 100644 index 00000000..1ec5a1c5 --- /dev/null +++ b/mindie/examples/server/example_token.csv @@ -0,0 +1,10 @@ +1,29871,31088,31479,30287,32270,29941,29900,29900,30578,32402,30333,30267 +1,29871,32324,30502,32401,31852,32203,32203,30267 +1,29871,30406,30768,32801,32105,32887,30210,31505,31243,31201,32217,30287,30557,30979,30525,32134,30882 +1,29871,30904,31217,31157,31532,30990,31398,30214,30878,30923,30815,30417,30923,31022,30502,31398,30940,30882 +1,29871,30662,33142,30594,31117,30214,30429,30581,30257,32717,30533,30467,30748,31360,31266,33886,30684,30503,33187,30329,30684,30214,33083,30592,30533,30467,31360,30581,31649,30684,30267,30662,33142,34692,30705,30685,30470,30419,29929,29929,29896,30470,30409,30214,31570,31018,30775,30429,32027,30413,31683,34697,33199,30214,30581,33062,31532,30979,31618,30214,30257,32784,30544,30752,30413,32087,30214,31066,30805,32784,33764,31557,31050,32366,33345,30505,31018,30775,30210,30287,31217,31541,31151,30015,30429,30581,33664,30024,30429,30419,31149,30956,30669,30505,31482,31066,33423,32191,30802,31304,32901,32600,31830,30210,31491,33664,30775,30409,30267,30998,30810,31559,31852,32215,30494,31144,30333,30267 +1,29871,32458,30429,32522,30413,32006,31370,31751,32207,31882,32354,30882 +1,29871,30651,30557,30408,30333,30415,31190,32031,31596,30895,30214,32186,30287,30502,30392,31745,32063,30210,29973,319,29889,30654,31430,31185,30392,31084,31272,30654,31430,30503,32142,32894,32006,32009,31894,30448,30210,31044,30257,30448,30900,30330,34071,30448,30900,30330,32432,30900,30330,30446,30448,30900,32086,30503,35218,30900,31263,30494,30210,30287,30502,30448,30900,31185,31675,30267,29933,29889,32432,30900,30392,31084,32894,30448,30900,31391,31149,31221,30408,30988,31894,30846,30210,30408,30988,30267,29907,29889,35218,30900,30392,31084,30654,31430,31185,30275,30287,31893,32040,30446,30210,30408,30988,30214,31149,32193,30869,31272,32701,30503,33159,33517,31263,30494,30267,29928,29889,31590,30287,32291,30210,30408,30988,32365,30832,30525,30545,30214,30534,31539,31360,30909,30448,30900,30267 +1,29871,30651,30557,30834,30687,31190,32031,31596,30895,30214,32186,30287,30502,30392,31745,32063,30210,29973,29909,29889,30505,30688,31516,32023,32051,30557,30214,32165,30941,30505,32378,30988,30275,31471,32260,31859,30898,30878,32075,30267,29933,29889,32611,32881,30622,30287,30495,32143,29901,30287,30502,30834,30988,30847,30801,30413,32022,31074,30732,30406,30214,30998,30982,31695,32321,31981,31391,33027,31859,31157,31532,31894,30846,30210,31531,31613,30267,29907,29889,32611,32881,30622,30457,30495,32143,29901,30783,30909,31951,30502,30732,30406,31074,30214,30769,30417,30287,30502,30990,31184,31325,31908,31331,30210,31908,30732,30406,31074,30267,29928,29889,32165,30941,30505,30816,32069,30275,30210,31471,32260,31859,30898,30573,29896,29900,29900,29900,29885,29914,29879,30267 
+1,29871,31088,32010,30672,32994,31479,30287,32124,32045,31943,32138,30651,32941,32411,30210,31152,33796,31088,32288,32469,31088,30267 +1,29871,30406,30878,32778,30525,30210,31505,32069,31785,32212,30980,30745,30525,32074,32065,30210,31366,30753,30413,30448,30267 \ No newline at end of file diff --git a/mindie/examples/server/generate.py b/mindie/examples/server/generate.py new file mode 100644 index 00000000..557d9cad --- /dev/null +++ b/mindie/examples/server/generate.py @@ -0,0 +1,227 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +import pandas as pd +import torch + +from atb_llm.utils.env import ENV +from atb_llm.utils.log import logger, print_log +from .batch import Batch + + +def next_token_chooser(logits: torch.Tensor): + return torch.argmax(logits, dim=-1) + + +def generate_token(model, cache_manager, batch: Batch): + input_ids = batch.batch_input_ids.npu() + position_ids = batch.batch_position_ids.npu() + is_prefill = batch.cu_seqlen_prefill is not None + block_tables = batch.batch_block_tables.npu() + kv_cache = cache_manager.kv_cache + slots = batch.batch_slots_tables[batch.batch_slot_indices].npu() + input_lengths = batch.context_length.npu() + lm_head_indices = None if batch.lm_head_indices is None else batch.lm_head_indices.npu() + + logits = model.forward( + input_ids=input_ids, + position_ids=position_ids, + is_prefill=is_prefill, + block_tables=block_tables, + kv_cache=kv_cache, + slots=slots, + input_lengths=input_lengths, + max_seq_len=batch.max_s, + lm_head_indices=lm_head_indices + ) + + if batch.cu_seqlen_prefill is not None and logits.size(0) != batch.batch_num: + if logits.size(0) != batch.lm_head_indices[-1] + 1: + logger.error(f"prefill logits is invalid, batch num: {batch.batch_num}," + + f" total token: {int(batch.lm_head_indices[-1] + 1)}, but logits shape is: {logits.shape}") + raise AssertionError + logits = logits[batch.lm_head_indices] + + ENV.update() + if ENV.logits_save_enable: + import os + if model.rank == 0: + logits_save_filename = "logits_" + str(len(batch.req_list[0].out_token_list)) + ".pth" + torch.save(logits.cpu(), os.path.join(ENV.logits_save_folder, logits_save_filename)) + next_token = next_token_chooser(logits) + next_token_list = next_token.tolist() + + for i, req in enumerate(batch.req_list): + req.out_token_list.append(next_token_list[i]) + + batch.batch_input_ids = next_token.to(torch.int64) + batch.batch_position_ids = batch.context_length.clone().to(torch.long) + if batch.cu_seqlen_prefill is not None: + batch.batch_slot_indices = batch.batch_slot_indices[batch.lm_head_indices] + batch.cu_seqlen_prefill = None + batch.lm_head_indices = None + + batch.batch_slot_indices += 1 + batch.context_length += 1 + batch.max_s += 1 + + return batch.filter(model.postprocessor, cache_manager) + + +def generate_req(req_list, model, max_batch_size, max_prefill_tokens, cache_manager): + req_num = len(req_list) + print_log(model.rank, logger.info, f"------total req num: {req_num}, infer start--------") + + req_idx = 0 + total_req_finished = 0 + generate_batch_size = 0 + max_generate_batch_size = 0 + + generate_batches = [] + prefill_benchmark_timelist = [] + decoder_benchmark_timelist = [] + + while total_req_finished < req_num: + do_generate = True + if req_idx < req_num and generate_batch_size < max_batch_size: + prefill_start = req_idx + free_block = cache_manager.get_free_block_num() + total_need_blocks = 0 + total_prefill_token = 0 + prefill_batch_size = 0 + + while generate_batch_size + prefill_batch_size < 
max_batch_size: + if req_idx >= req_num: + break + cur_need_blocks = req_list[req_idx].need_blocks + cur_context_len = req_list[req_idx].input_length + if total_need_blocks + cur_need_blocks > free_block: + raise Exception(f"req: {req_idx} out of memory, need block:" + + f"{total_need_blocks + cur_need_blocks} is more than free block {free_block}") + if cur_context_len > max_prefill_tokens: + logger.error(f"req: {req_idx} input length: {cur_context_len} is too long," + + f" max_prefill_tokens: {max_prefill_tokens}") + raise AssertionError + if total_prefill_token + cur_context_len > max_prefill_tokens: + do_generate = False + break + total_need_blocks += cur_need_blocks + total_prefill_token += cur_context_len + prefill_batch_size += 1 + req_idx += 1 + + if prefill_batch_size > 0: + batch = Batch(req_list[prefill_start:prefill_start + prefill_batch_size]) + cache_manager.allocate(batch) + if ENV.benchmark_enable: + import time + torch.npu.synchronize() + prefill_start = time.time() + req_finished = generate_token(model, cache_manager, batch) + torch.npu.synchronize() + prefill_end = time.time() + prefill_time = prefill_end - prefill_start + prefill_benchmark_timelist.append(prefill_time) + else: + req_finished = generate_token(model, cache_manager, batch) + + if req_finished != (prefill_batch_size - batch.batch_num): + logger.error("batch filter error") + raise AssertionError + + if batch.batch_num > 0: + generate_batches.append(batch) + generate_batch_size += batch.batch_num + if req_finished > 0: + do_generate = False + total_req_finished += req_finished + + if do_generate: + if len(generate_batches) > 1: + Batch.concatenate(generate_batches) + + if generate_batch_size != generate_batches[0].batch_num: + logger.error(f"batch concatenate error, expect batchnum: {generate_batch_size}," + + f" in fact: {generate_batches[0].batch_num}") + raise AssertionError + + if ENV.benchmark_enable: + import time + torch.npu.synchronize() + decode_start = time.time() + req_finished = generate_token(model, cache_manager, generate_batches[0]) + torch.npu.synchronize() + decode_end = time.time() + decode_time = decode_end - decode_start + decoder_benchmark_timelist.append(decode_time) + else: + req_finished = generate_token(model, cache_manager, generate_batches[0]) + + if req_finished != (generate_batch_size - generate_batches[0].batch_num): + logger.error("batch filter error") + raise AssertionError + if generate_batch_size > max_generate_batch_size: + max_generate_batch_size = generate_batch_size + generate_batch_size = generate_batches[0].batch_num + if generate_batch_size == 0: + del generate_batches[0] + total_req_finished += req_finished + + if model.rank == 0: + print("max_generate_batch_size", max_generate_batch_size) + if ENV.benchmark_enable: + prefill_time = sum(prefill_benchmark_timelist) + e2e_time = sum(prefill_benchmark_timelist) + sum(decoder_benchmark_timelist) + try: + decode_token_time = sum(decoder_benchmark_timelist) / (model.postprocessor.max_new_tokens - 1) + except ZeroDivisionError: + decode_token_time = 0 + + logger.info( + f"Prefill time: {prefill_time * 1000}ms, " + f"Decode token time: {decode_token_time * 1000}ms, " + f"E2E time: {e2e_time * 1000}ms") + batch_size = len(req_list) + input_len = req_list[0].input_length + output_len = model.postprocessor.max_new_tokens + prefill_token_times = ','.join(list(map(str, prefill_benchmark_timelist))) + decode_token_times = ','.join(list(map(str, decoder_benchmark_timelist))) + if model.rank == 0: + import os + benchmark_filepath = 
ENV.benchmark_filepath \ + if ENV.benchmark_filepath else './benchmark_result/benchmark.csv' + benchmark_folder = os.path.dirname(benchmark_filepath) + if not os.path.exists(benchmark_folder): + os.makedirs(benchmark_folder) + stat_data = { + 'batch_size': [batch_size], + 'input_seq_len': [input_len], + 'output_seq_len': [output_len], + 'e2e_time(ms)': [f'{e2e_time * 1000: .2f}'], + 'prefill_time(ms)': [f'{prefill_time * 1000: .2f}'], + 'decoder_token_time(ms)': [f'{decode_token_time * 1000: .2f}'], + 'prefill_count': [len(prefill_benchmark_timelist)], + 'prefill_token_times': [prefill_token_times], + 'decode_token_times': [decode_token_times], + 'max_generate_batch_size': [max_generate_batch_size], + } + df = pd.DataFrame(stat_data) + df.to_csv(benchmark_filepath, index=False) + logger.info('-------------------performance dumped------------------------') + df = df.drop('prefill_token_times', axis=1) + df = df.drop('decode_token_times', axis=1) + print(df.to_markdown(index=False)) + + +def decode_token(req_list, tokenizer, skip_special_tokens=False): + decode_text_list = [] + token_num_list = [] + request_id = 0 + token_num = 0 + for req in req_list: + out_token = len(req.out_token_list) + token_tensor = torch.tensor(req.out_token_list, dtype=torch.int64) + decode_text = tokenizer.decode(token_tensor, skip_special_tokens) + decode_text_list.append(decode_text) + token_num += out_token + token_num_list.append((request_id, token_num)) + request_id += 1 + return decode_text_list, token_num_list diff --git a/mindie/examples/server/request.py b/mindie/examples/server/request.py new file mode 100644 index 00000000..b433880b --- /dev/null +++ b/mindie/examples/server/request.py @@ -0,0 +1,110 @@ +# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
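+# Request:封装单条请求的输入 token 及其 KV cache 块需求。
+# 例如 block_size=128、输入 300 个 token、最大输出 100 个 token 时:
+#   need_blocks = ceil((300 + 100) / 128) = 4,need_slots = 4 * 128 = 512。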
+import math
+from typing import List
+from dataclasses import dataclass
+
+import torch
+
+
+class Request:
+    req_id: int
+
+    input_ids: torch.Tensor
+    input_length: int
+
+    need_blocks: int
+    need_slots: int
+    block_tables: torch.Tensor
+    slot_tables: torch.Tensor
+
+    out_token_list: List[int]
+
+    def __init__(self, max_out_length: int, block_size: int, req_id: int, input_ids: torch.Tensor):
+        self.req_id = req_id
+        self.input_ids = input_ids.flatten()
+
+        self.input_length = self.input_ids.numel()
+
+        # 按输入长度与最大输出长度估算所需的KV cache块数与槽位数
+        try:
+            self.need_blocks = math.ceil((self.input_length + max_out_length) / block_size)
+        except ZeroDivisionError as e:
+            raise ZeroDivisionError from e
+        self.need_slots = self.need_blocks * block_size
+        self.block_tables: None | torch.Tensor = None
+        self.slot_tables: None | torch.Tensor = None
+
+        self.out_token_list = []
+
+
+class MultiModalRequest:
+    def __init__(self, max_out_length: int, block_size: int, req_id: int, input_ids: torch.Tensor):
+        self.req_id = req_id
+        self.input_ids = input_ids
+        self.input_length = self.input_ids.shape[0]
+        try:
+            self.need_blocks = math.ceil((self.input_length + max_out_length) / block_size)
+        except ZeroDivisionError as e:
+            raise ZeroDivisionError from e
+        self.need_slots = self.need_blocks * block_size
+        self.block_tables = None
+        self.slot_tables = None
+        self.out_token_list = []
+
+
+def request_from_token(input_ids, max_out_length, block_size, req_idx=0) -> Request:
+    input_ids = torch.tensor(input_ids, dtype=torch.int64)
+    request = Request(max_out_length, block_size, req_idx, input_ids)
+    return request
+
+
+def request_from_text(text, tokenizer, max_out_length, block_size, req_idx=0) -> Request:
+    input_ids = tokenizer([text], return_tensors="pt")["input_ids"].flatten()
+    request = request_from_token(input_ids, max_out_length, block_size, req_idx)
+    return request
+
+
+@dataclass
+class MultiModalRequestParams:
+    text: str
+    image: str
+    max_out_length: int
+    block_size: int
+    req_idx: int
+
+
+def request_from_text_and_image(processor, model, multimodalparams):
+    text = multimodalparams.text
+    image = multimodalparams.image
+    max_out_length = multimodalparams.max_out_length
+    block_size = multimodalparams.block_size
+    req_idx = multimodalparams.req_idx
+    inputs_embeds = model.model.prepare_prefill_token(text, image, processor)
+    request = MultiModalRequest(max_out_length, block_size, req_idx, inputs_embeds)
+    return request
+
+
+def request_from_token_file(input_path, max_out_length, block_size) -> List[Request]:
+    req_list = []
+    req_idx = 0
+    with open(input_path, 'r') as f:
+        for line in f.readlines():
+            token_str_list = line.split(',')
+            input_ids = []
+            for token_str in token_str_list:
+                input_ids.append(int(token_str))
+            req_list.append(request_from_token(input_ids, max_out_length, block_size, req_idx))
+            req_idx += 1
+    return req_list
+
+
+def request_from_text_file(input_path, tokenizer, max_out_length, block_size) -> List[Request]:
+    req_list = []
+    req_idx = 0
+    with open(input_path, 'r') as f:
+        for line in f.readlines():
+            # 跳过文件末尾不以换行符结尾的行
+            if line[-1] != '\n':
+                continue
+            text = line[:-1]
+            req_list.append(request_from_text(text, tokenizer, max_out_length, block_size, req_idx=req_idx))
+            req_idx += 1
+    return req_list
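+# 典型调用链示意(参数均为示例值,文件路径为假设值):
+#   req_list = request_from_text_file("input.txt", tokenizer, max_out_length=64, block_size=128)
+#   之后由 examples/server/generate.py 中的 generate_req 构造 Batch,
+#   并通过 CacheManager.allocate 为每个请求分配 block_tables 与 slot_tables。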