add kunlun squeeze kernel (#229 )

Co-authored-by: Haojie Wang <haojie0429@gmail.com>
添加 MLU 平台分布式验收脚本 (#223 )
2024-04-28 11:28:28 +08:00 · 2024-04-28 11:24:09 +08:00 · 2024-04-23 15:46:25 +08:00 · 2024-04-07 16:57:07 +08:00 · 2024-04-03 09:56:52 +08:00 · 2024-04-01 14:04:28 +08:00
372 changed files with 17178 additions and 3186 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -14,10 +14,10 @@ env:
  protobuf-version: "3.21.12"
  python-version: "3.10"

-  resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
-  inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
-  densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
-  efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
+  resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
+  inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
+  densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
+  efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx

 jobs:
  build:
--- a/.gitmodules
+++ b/.gitmodules
@ -13,3 +13,6 @@
 [submodule "example"]
 	path = examples/NNmodel
 	url = git@github.com:wanghailu0717/NNmodel.git
+[submodule "examples/distributed/onnxsim_large_model"]
+	path = examples/distributed/onnxsim_large_model
+	url = git@github.com:luchangli03/onnxsim_large_model.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,19 +1,26 @@
-cmake_minimum_required(VERSION 3.17) # FindCUDAToolkit
-include(CMakeDependentOption)
-project(InfiniTensor C CXX)
-
 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
 option(USE_CUDA "Support CUDA GPU" OFF)
 option(USE_BANG "Support BANG MLU" OFF)
+option(USE_KUNLUN "Support KUNLUN XPU" OFF)
 option(USE_INTELCPU "Support INTELCPU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
 option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
+option(BUILD_NNET "Build nnet" OFF)
 option(BUILD_DIST "Build project for distributed running" OFF)
 option(BUILD_TEST "Build tests" OFF)

+if(USE_CUDA)
+    message("CMake 3.18 or higher is required for setting CUDAToolkit")
+    cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
+else()
+    cmake_minimum_required(VERSION 3.17)
+endif()
+
+include(CMakeDependentOption)
+project(InfiniTensor C CXX)
+
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
-cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)

 set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
 # Build Type
@ -23,14 +30,14 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_compile_definitions(DEBUG_MODE)
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
    message("Configuring for Release build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
    add_compile_definitions(NDEBUG)
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
    message("Configuring for RelWithDebInfo build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") 
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
 else()
    message("Build type not specified. Configuring for RelWithDebInfo build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2") 
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
 endif()


@ -46,11 +53,13 @@ endif()

 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
+add_compile_options(-Wno-error=unused-variable)

 find_package(
  Python
  COMPONENTS Interpreter Development
  REQUIRED)
+
 # OpenMP
 find_package(OpenMP)
 if(OpenMP_C_FOUND)
@ -87,16 +96,17 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
 include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)

 # TVM backend
-if(BUILD_TEST_EINNET)
-  if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
-    message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
-  endif()
+if(BUILD_NNET AND BUILD_TEST)
  # TVM and DMLC for invoking TVM packed functions
  include_directories(${TVM_INCLUDE_DIR})
  include_directories(${DMLC_INCLUDE_DIR})
  include_directories(${DLPACK_INCLUDE_DIR})
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
+  if (TVM_INCLUDE_DIR AND DMLC_INCLUDE_DIR AND DLPACK_INCLUDE_DIR) # TVM flags are only added when all three include dirs are set
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
+  else()
+    # message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_NNET AND BUILD_TEST is ON")
+  endif()
 endif()

 if(BUILD_TEST)
@ -110,13 +120,21 @@ if(BUILD_TEST)
  include_directories(3rd-party/googletest/googletest/include)
 endif()

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion


 # Source files
-file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
+file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
+
+if(BUILD_NNET)
+  add_compile_definitions(BUILD_NNET=1)
+  file(GLOB_RECURSE SRC_NNET src/nnet/*.cc)
+  list (APPEND SRC ${SRC_NNET})
+  # For locating resource files
+  set_source_files_properties(src/nnet/test.cc PROPERTIES COMPILE_OPTIONS "-DINFINI_PROJECT_HOME=${CMAKE_CURRENT_SOURCE_DIR}")
+endif()

 if(USE_CUDA)
  file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
@ -128,6 +146,11 @@ if(USE_BANG)
  list (APPEND SRC ${SRC_BANG})
 endif()

+if(USE_KUNLUN)
+  file(GLOB_RECURSE SRC_KUNLUN src/kunlun/*.cc src/kernels/kunlun/*.cc )
+  list (APPEND SRC ${SRC_KUNLUN})
+endif()
+
 if(USE_INTELCPU)
  file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
  list (APPEND SRC ${SRC_INTELCPU})
@ -142,7 +165,7 @@ endif()
 target_link_libraries(InfiniTensor pybind11::embed)

 # TVM backend
-if(BUILD_TEST_EINNET)
+if(BUILD_NNET AND BUILD_TEST AND TVM_LIB_DIR)
  target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
 endif()

@ -240,7 +263,50 @@ if(USE_BANG)
  # BangC Kernels
  ################################################################################

-  target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
+  if (BUILD_DIST)
+    find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
+    target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
+    message(STATUS "Add BUILD_DIST, use CNCL with BANG")
+    add_compile_definitions(INFINI_USE_CNCL=1)
+  else()
+    target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
+  endif()
+endif()
+
+if(USE_KUNLUN)
+  add_compile_definitions(USE_KUNLUN=1)
+  if ((NOT DEFINED KUNLUN_HOME) AND (NOT DEFINED ENV{KUNLUN_HOME})) # neither a CMake definition nor the environment variable exists
+    message(FATAL_ERROR "KUNLUN_HOME is not defined from cmake or env")
+  elseif (DEFINED KUNLUN_HOME) # a CMake-level definition takes precedence over the environment
+          set(KUNLUN_HOME ${KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
+  else() # fall back to the KUNLUN_HOME environment variable
+          set(KUNLUN_HOME $ENV{KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
+  endif()
+  message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
+
+  include_directories("${KUNLUN_HOME}/include/")
+  find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
+  find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
+
+  if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH})) # nothing supplied: query the build host
+    execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
+    set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
+  elseif(DEFINED TARGET_CPU_ARCH) # a CMake-level definition takes precedence over the environment
+    set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
+  else() # fall back to the TARGET_CPU_ARCH environment variable
+    set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
+  endif()
+  message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
+
+  if (BUILD_DIST)
+    message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
+    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+    find_package(XCCL REQUIRED)
+    add_compile_definitions(INFINI_USE_XCCL=1)
+    target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
+  endif()
+  target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
 endif()

 # # Python bindings
@ -267,12 +333,18 @@ if(BUILD_TEST)
  if(BUILD_TEST_CORE)
    build_test(test/core/*.cc)
    build_test(test/operators/*.cc)
+    build_test(test/kernels/nativecpu/*.cc)
    if (USE_CUDA)
      build_test(test/kernels/cuda/*.cc)
      build_test(test/cuda/*.cc)
    endif()
    if (USE_BANG)
      build_test(test/kernels/bang/*.cc)
+      build_test(test/bang/*.cc)
+    endif()
+    if (USE_KUNLUN)
+      build_test(test/kernels/kunlun/*.cc)
+      build_test(test/kunlun/*.cc)
    endif()
    if (USE_INTELCPU)
      build_test(test/kernels/intelcpu/*.cc)
@ -281,7 +353,7 @@ if(BUILD_TEST)
  if(BUILD_TEST_PET)
    build_test(test/pet/*.cc)
  endif()
-  if(BUILD_TEST_EINNET)
+  if(BUILD_NNET AND BUILD_TEST)
    build_test(test/nnet/test_*.cc)

    # Build expression reader
--- a/14
+++ b/14
@ -3,15 +3,19 @@
 TYPE ?= Release
 CUDA ?= OFF
 BANG ?= OFF
+KUNLUN ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
+DIST ?= OFF
+NNET ?= OFF
+DIST ?= OFF
 FORMAT_ORIGIN ?=
 # Docker build options
 DOCKER_NAME ?= infinitensor
 DOCKER_IMAGE_NAME ?= infinitensor
 DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
-DOCKER_RUN_OPTION ?= 
+DOCKER_RUN_OPTION ?=

 # CUDA option.
 ifeq ($(CUDA), ON)
@ -21,12 +25,14 @@ ifeq ($(CUDA), ON)
 	DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
 endif

-
 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
 CMAKE_OPT += -DUSE_CUDA=$(CUDA)
 CMAKE_OPT += -DUSE_BANG=$(BANG)
+CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
 CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
 CMAKE_OPT += -DBUILD_TEST=$(TEST)
+CMAKE_OPT += -DBUILD_DIST=$(DIST)
+CMAKE_OPT += -DBUILD_NNET=$(NNET)

 ifeq ($(INTELCPU), ON)
 	CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
@ -58,7 +64,7 @@ test-api:
 	@echo
 	python3 pyinfinitensor/tests/test_api.py

-docker-build: 
+docker-build:
 	docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .

 docker-run:
@ -69,5 +75,3 @@ docker-start:

 docker-exec:
 	docker exec -it $(DOCKER_IMAGE_NAME) bash
-
-
--- a/README.md
+++ b/README.md
@ -33,13 +33,14 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis

 ## Roadmap

+- [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
 - [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
 - Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
 - Supported hardware
  - ✔ NVIDIA GPU
  - ✔ Cambricon MLU
+  - ✔ Kunlunxin XPU
  - ⬜ Ascend NPU
-  - ⬜ Kunlunxin XPU

 ## Contributor Guide

--- a/cmake/FindCNCL.cmake
+++ b/cmake/FindCNCL.cmake
@ -0,0 +1,76 @@
+SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
+SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
+
+set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
+set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
+set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
+
+if (DEFINED ENV{CNCL_ROOT_DIR}) # test the environment variable itself; `if ($ENV{...})` would evaluate the expanded path as a variable name
+  message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
+endif()
+list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
+# Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
+list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
+
+find_path(CNCL_INCLUDE_DIRS
+  NAMES cncl.h
+  HINTS ${CNCL_INCLUDE_DIR})
+
+if (USE_STATIC_CNCL) # choose the library base name (and, optionally, a versioned suffix) that find_library looks for
+  MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
+  SET(CNCL_LIBNAME "CNCL_static") # NOTE(review): upper-case name, unlike "cncl" in the shared branch — confirm the static archive really is libCNCL_static.a
+  if (CNCL_VERSION)  # Prefer the versioned library if a specific CNCL version is specified
+    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  endif()
+else() # default: shared library
+  SET(CNCL_LIBNAME "cncl")
+  if (CNCL_VERSION)  # Prefer the versioned library if a specific CNCL version is specified
+    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  endif()
+endif()
+
+find_library(CNCL_LIBRARIES
+  NAMES ${CNCL_LIBNAME}
+  HINTS ${CNCL_LIB_DIR})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
+
+if(CNCL_FOUND)  # obtaining CNCL version and some sanity checks
+  set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
+  message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
+  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
+  list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
+  include(CheckCXXSymbolExists)
+  check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED) # NOTE(review): the header is spelled "cncl.h" in find_path and in the generated test program; on a case-sensitive filesystem "CNCL.h" is presumably not found, which would skip the version check below — confirm
+
+  if (CNCL_VERSION_DEFINED)
+    set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
+    file(WRITE ${file} "
+      #include <iostream>
+      #include <cncl.h>
+      int main()
+      {
+        std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
+        int x;
+        CNCLGetVersion(&x);
+        return x == CNCL_VERSION_CODE;
+      }
+")
+    try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
+          RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
+          CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
+          LINK_LIBRARIES ${CNCL_LIBRARIES})
+    if (NOT CNCL_VERSION_MATCHED)
+      message(FATAL_ERROR "Found CNCL header version and library version do not match! \
+(include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
+    endif()
+    message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
+  else()
+    # message(STATUS "CNCL version < 2.3.5-5")
+  endif ()
+  set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
+
+  message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
+  mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
+endif()
--- a/cmake/FindXCCL.cmake
+++ b/cmake/FindXCCL.cmake
@ -0,0 +1,27 @@
+# Find the xccl libraries
+set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
+set(XCCL_LIB_DIR $ENV{KUNLUN_HOME}  CACHE PATH "Folder contains KUNLUN XCCL libraries")
+
+list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
+
+find_path(XCCL_INCLUDE_DIRS # directory that contains xpu/bkcl.h; XCCL_INCLUDE_DIR is tried first
+  NAMES xpu/bkcl.h
+  HINTS ${XCCL_INCLUDE_DIR})
+
+find_library(XCCL_LIBRARIES # XCCL_LIB_DIR defaults to the KUNLUN root, hence the lib64/ prefix in NAMES
+  NAMES lib64/libbkcl.so
+  HINTS ${XCCL_LIB_DIR})
+
+message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
+message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
+
+if (XCCL_FOUND)
+  set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
+  message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
+  list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
+  message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
+  mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
+endif()
--- a/docs/INSTALL_GUIDE_CN.md
+++ b/docs/INSTALL_GUIDE_CN.md
@ -133,6 +133,13 @@
   make install-python BANG=ON
   ```

+   编译 CPU 部分，同时编译昆仑 XPU 部分：
+
+   ```bash
+   export KUNLUN_HOME=/path/to/your/kunlun_home
+   make install-python KUNLUN=ON
+   ```
+
 3. 使用方法

   安装成功后，您就可以使用本项目的 Python 接口进行编码并运行。具体使用方式可以参考项目样例代码 example/Resnet/resnet.py 以及用户使用手册
--- a/docs/SUPPORT_MATRIX_CN.md
+++ b/docs/SUPPORT_MATRIX_CN.md
@ -2,6 +2,7 @@

 ## 目录

+
 - [环境支持](#环境支持)
 - [神经网络支持](#神经网络支持)
 - [技术支持](#技术支持)
@ -19,10 +20,10 @@

 目前已经验证过的神经网络模型有

- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)

 ## 技术支持

--- a/docs/USER_GUIDE_CN.md
+++ b/docs/USER_GUIDE_CN.md
@ -3,9 +3,10 @@
 ## 目录

 - [使用方法](#使用方法)
- [python-前端应用指南](#python-前端应用指南)
-  - [导入-onnx-模型](#导入-onnx-模型)
-  - [导出-onnx-模型](#导出-onnx-模型)
+- [python 前端应用指南](#python-前端应用指南)
+  - [导入 onnx 模型](#导入-onnx-模型)
+  - [优化](#优化)
+  - [导出 onnx 模型](#导出-onnx-模型)
  - [执行推理](#执行推理)
  - [样例代码](#样例代码)
 - [技术支持](#技术支持)
@ -13,7 +14,7 @@

 ## 使用方法

-项目管理功能已写到 [Makefile](Makefile)，支持下列功能：
+项目管理功能已写到 [Makefile](../Makefile)，支持下列功能：

 - 编译项目：`make`/`make build`
 - 清理生成文件：`make clean`
@ -26,6 +27,7 @@
 - `TYPE`：编译模式（`debug`/`release`），默认值为 `release`
 - `CUDA`：是否编译 CUDA 后端，默认为 `OFF`，`ON` 打开
 - `BANG`：是否编译寒武纪后端，默认为 `OFF`，`ON` 打开
+- `KUNLUN`：是否编译昆仑后端，默认为 `OFF`，`ON` 打开
 - `BACKTRACE`：是否启用栈回溯，默认为 `ON`，`OFF` 关闭，建议调试时打开
 - `TEST`：是否编译 `googletest`，默认为 `ON`，`OFF` 关闭，只有 `test-cpp` 时必要

@ -37,10 +39,10 @@

 支持的模型：

- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)

 ```python
 import onnx
@ -95,7 +97,7 @@ for name, tensor in stub.inputs.items():
    print(name, tensor.shape(), tensor)
 ```

-对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：
+对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：

 ```plaintext
 data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
@ -136,7 +138,7 @@ for name, tensor in stub.outputs.items():

 ### 样例代码

-您可以参照[./example/Resnet/resnet.py](./example/ResNet/resnet.py)的样例代码进行了解，并尝试运行。在这个文件中，我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式：
+您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解，并尝试运行。在这个文件中，我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式：

 ```python
 python resnet.py -h
--- a/env.sh
+++ b/env.sh
@ -35,4 +35,4 @@ export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
 # ├── tools
 # ├── version
 # └── XTDK
-export XPU_HOME=/usr/local/xpu
+export KUNLUN_HOME=/usr/local/xpu
--- a/examples/distributed/README.md
+++ b/examples/distributed/README.md
@ -0,0 +1,39 @@
+# 分布式脚本
+
+## 英伟达平台运行方式
+
+#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
+
+使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
+
+```bash
+python run_pytorch.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
+```
+
+会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
+
+#### 2. 运行InfiniTensor分布式脚本
+
+```bash
+python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
+```
+
+## 寒武纪平台运行方式
+
+**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配，具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
+
+#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
+
+使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
+
+```bash
+python run_pytorch_mlu.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./ --type fp32
+```
+
+会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
+
+#### 2. 运行InfiniTensor分布式脚本
+
+```bash
+python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
+```
--- a/examples/distributed/init.py
+++ b/examples/distributed/init.py
--- a/examples/distributed/bang/bang_launch.py
+++ b/examples/distributed/bang/bang_launch.py
@ -0,0 +1,187 @@
+import sys
+sys.path.append('../')
+
+import argparse
+import os
+import time
+import multiprocessing as mp
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+from onnx.external_data_helper import convert_model_to_external_data
+from onnx.shape_inference import infer_shapes_path
+import numpy as np
+from parallel_opt import parallel_model
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
+    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
+    parser.add_argument(
+        "--nproc_per_node", type=int, default=1, help="number of processes per node"
+    )
+    parser.add_argument(
+        "--name", type=str, default="test", help="name of this instance."
+    )
+    parser.add_argument(
+        "--model", type=str, required=True, help="path to the ONNX model file."
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    parser.add_argument("--length", type=int, default=1, help="sequence length.")
+    parser.add_argument(
+        "--gen_std",
+        action="store_true",
+        help="whether to generate the standard results.",
+    )
+    parser.add_argument(
+        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
+    )
+    args = parser.parse_args()
+    print("arg setting: ", args)
+    return (
+        args.num_nodes,
+        args.nproc_per_node,
+        args.name,
+        args.model,
+        args.batch_size,
+        args.length,
+        args.gen_std,
+        args.type,
+    )
+
+
+def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
+    """Run `model` once for its output, then benchmark it; returns the first output as numpy."""
+    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
+    load_inputs(stub, world_size, rank)
+    stub.run()
+    # Only the first output tensor is read back.
+    outputs = next(iter(stub.outputs.values())).copyout_numpy()
+
+    # Benchmark: n warm-up runs, then the average over 2 * n timed runs (monotonic clock).
+    for _ in range(n):
+        stub.run()
+    begin = time.perf_counter()
+    for _ in range(n * 2):
+        stub.run()
+    end = time.perf_counter()
+    avg_time = (end - begin) / (n * 2)
+    print(f"average time: {avg_time}")
+    return outputs
+
+def load_inputs(stub, world_size=1, rank=0):
+    """Copy ./data/input_<i>.npy into the i-th input tensor of `stub`."""
+    for idx, (_name, tensor) in enumerate(stub.inputs.items()):
+        data = np.load(f"./data/input_{idx}.npy")
+        same_shape = all(a == b for a, b in zip(data.shape, tensor.shape()))
+        # On a shape mismatch, feed only this rank's piece of np.hsplit(data, world_size).
+        tensor.copyin_numpy(data if same_shape else np.hsplit(data, world_size)[rank])
+
+
+def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
+    results = np.load(f"./data/output.npy")
+    outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
+    print("outputs abs mean:", abs(outputs).mean())
+    print("max abs diff:", abs(outputs - results).max())
+
+def start_worker(
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
+):
+    dist_name = name + "_dist"
+    model = parallel_model(model, world_size, rank)
+    extern_path = f"./{dist_name}_rank{rank}.pb"
+    if os.path.exists(extern_path):
+        os.remove(extern_path)
+    onnx.save_model(
+        model,
+        f"./{dist_name}_rank{rank}.onnx",
+        save_as_external_data=True,
+        location=extern_path,
+    )
+    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    runtime = backend.BangRuntime(local_rank)
+    # print("init comm")
+    runtime.init_comm(
+        dist_name,
+        world_size,
+        rank,
+    )
+    run_and_compare(name, model, runtime, world_size, rank, data_type)
+
+
+def start_single(name, model, data_type):
+    runtime = backend.BangRuntime(0)
+    run_and_compare(name, model, runtime, data_type=data_type)
+
+def generate_input_output(model):
+    """Generate inputs for `model`, run it on MLU 0 and save inputs/output under ./data/.
+
+    Integer inputs are drawn from {0, 1}, boolean inputs are random, a
+    floating-point input at index 0 is all ones and every other
+    floating-point input is uniform in [0, 1).
+    """
+    os.makedirs("./data", exist_ok=True)
+    runtime = backend.BangRuntime(0)
+    stub = OnnxStub(model, runtime)
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        data = tensor.copyout_numpy()
+        if np.issubdtype(data.dtype, np.integer):
+            data = np.random.randint(0, 2, size=data.shape, dtype=data.dtype)
+        elif data.dtype == np.bool_:
+            data = np.random.randint(0, 2, size=data.shape) > 0
+        elif i == 0:
+            data = np.ones(data.shape).astype(data.dtype)
+        else:
+            data = np.random.rand(*data.shape).astype(data.dtype)
+        tensor.copyin_numpy(data)
+        np.save(f"./data/input_{i}", data)
+    stub.run()
+    time.sleep(0.01)
+    # Only the first output tensor is saved as the reference result.
+    output = next(iter(stub.outputs.values())).copyout_numpy()
+    if np.isnan(output).any():
+        print("Nan in output")
+    np.save("./data/output", output)
+
+
+def main():
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()  # bs and length are parsed but not used below
+    data_type = "default" if data_type == "fp32" else data_type  # "fp32" is passed on as the "default" compute type
+    
+    model = onnx.load(model_path)
+
+    # generate standard output
+    if gen_std:
+        print(f"generate standard data for {name}.")
+        # a small vocabulary size to fit all LLMs.
+        generate_input_output(model)
+        return
+
+    if nproc_per_node == 1:
+        # run single process.
+        # a standalone process could be used to isolate bang (kept below, disabled).
+        print("run model by single MLU.")
+        # p = mp.Process(target=start_single, args=(name, model, data_type))
+        # p.start()
+        # p.join()
+        start_single(name, model, data_type)
+        return
+
+    # run distributed parallel: one worker process per rank.
+    world_size = nnodes * nproc_per_node
+    print(f"run model by {world_size} MLU in parallel.")
+    workers = [
+        mp.Process(
+            target=start_worker,
+            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),  # local rank = rank % nproc_per_node
+        )
+        for rank in range(world_size)
+    ]
+
+    for w in workers:
+        w.start()
+
+    for w in workers:
+        w.join()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/distributed/bang/run_pytorch_mlu.py
+++ b/examples/distributed/bang/run_pytorch_mlu.py
@ -0,0 +1,249 @@
+import argparse
+import torch
+import torch_mlu
+from transformers import BertModel, BertConfig
+from transformers import GPT2Model, GPT2Config
+from transformers import OPTModel, OPTConfig
+from transformers import AlbertModel, AlbertConfig
+from transformers import LlamaModel, LlamaConfig
+import time
+import numpy as np
+import onnx
+import sys
+import os
+from onnx.external_data_helper import convert_model_to_external_data
+from onnxsim import simplify
+
+def parse_args():
+    """Parse the command line of the MLU PyTorch runner.
+
+    Returns:
+        A tuple ``(model, batch_size, length, export_onnx, type)``.
+        ``export_onnx`` is ``None`` when the flag is absent and ``"./"``
+        when the flag is given without a value.
+    """
+    cli = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
+    cli.add_argument(
+        "--model",
+        type=str,
+        choices=["gpt2", "bert", "opt", "llama", "albert"],
+        required=True,
+        help="model type",
+    )
+    cli.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    cli.add_argument("--length", type=int, default=1, help="sequence length.")
+    cli.add_argument(
+        "--export_onnx",
+        type=str,
+        nargs="?",
+        default=None,
+        const="./",
+        help="whether and where to export onnx file",
+    )
+    cli.add_argument(
+        "--type",
+        type=str,
+        choices=["fp32", "fp16", "tf32"],
+        required=True,
+        help="model data type",
+    )
+    opts = cli.parse_args()
+    print("arg setting: ", opts)
+    return opts.model, opts.batch_size, opts.length, opts.export_onnx, opts.type
+
+
+def get_model(modelname):
+    """Load the pretrained model selected by ``modelname``.
+
+    Returns:
+        ``(model, voc_size)`` where ``model`` has been switched to eval mode
+        and ``voc_size`` is the ``vocab_size`` of the matching default config.
+
+    Raises:
+        KeyError: if ``modelname`` is not one of the supported names.
+    """
+    if modelname == "albert":
+        model = AlbertModel.from_pretrained("albert/albert-base-v2")
+        voc_size = AlbertConfig().vocab_size
+    elif modelname == "bert":
+        # erf is not impl by infini
+        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new")
+        voc_size = BertConfig().vocab_size
+    elif modelname == "gpt2":
+        model = GPT2Model.from_pretrained("GPT2")
+        voc_size = GPT2Config().vocab_size
+    elif modelname == "opt":
+        model = OPTModel.from_pretrained("facebook/opt-125m")
+        voc_size = OPTConfig().vocab_size
+    elif modelname == "llama":
+        model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
+        voc_size = LlamaConfig().vocab_size
+    else:
+        raise KeyError(modelname)
+
+    return model.eval(), voc_size
+
+def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
+    """Run ``torch_model`` on the "mlu" device with random token ids and time it.
+
+    The random input is saved to ``./data/input_0`` and the final
+    ``last_hidden_state`` to ``./data/output``. The model is converted to half
+    precision when ``dtype`` is ``"fp16"``.
+    """
+    token_ids = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
+    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
+    np.save("./data/input_0", token_ids)
+    inputs = torch.from_numpy(token_ids).to("mlu")
+    torch_model = torch_model.to("mlu")
+    if dtype == "fp16":
+        torch_model = torch_model.half()
+
+    warmup_iters = 10
+    n_iter = 20
+    with torch.no_grad():
+        # Warm-up passes; they are not part of the measured interval.
+        for _ in range(warmup_iters):
+            outputs = torch_model(inputs)
+        torch.mlu.synchronize()
+
+        begin = time.time()
+        for _ in range(n_iter):
+            torch.mlu.synchronize()
+            outputs = torch_model(inputs)
+            torch.mlu.synchronize()
+        torch.mlu.synchronize()
+        end = time.time()
+
+    avg_time = (end - begin) / n_iter
+    outputs = outputs.last_hidden_state.to("cpu")
+    print("outputs abs mean:", abs(np.array(outputs)).mean())
+    print(f"average time: {avg_time}")
+    # torch.mlu.memory.empty_cache()
+    np.save("./data/output", np.array(outputs))
+    print("Save input & output into ./data.")
+
+
+def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
+    """Export ``model`` to ONNX at ``path`` and simplify the exported graph.
+
+    Args:
+        modelname: model family name; ``"llama"`` selects the large-model
+            simplification path, everything else uses plain onnxsim.
+        model: the torch module to export (moved to the "mlu" device here).
+        data: example input tensor used for tracing (moved to "mlu" here).
+        path: destination ``.onnx`` file.
+        extern: for non-llama models, store tensors in a sibling ``.pb`` file.
+            Ignored for llama, which is always saved with external data.
+        dtype: ``"fp16"`` converts the model to half precision before export.
+
+    Raises:
+        AssertionError: if onnxsim reports a failed check (non-llama path).
+        ValueError: if onnxsim reports a failed check (llama path).
+    """
+    data = data.to("mlu")
+    model = model.to("mlu")
+    if dtype == "fp16":
+        model = model.half()
+    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
+    if modelname != "llama":
+        # use onnxsim to simplify
+        onnx_model = onnx.load(path)
+        onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
+        assert check
+        add_value_info_for_constants(onnx_model)
+        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
+        if extern:
+            # Swap only the file extension; a plain str.replace would also
+            # rewrite ".onnx" occurring in a directory name.
+            extern_path = os.path.splitext(path)[0] + ".pb"
+            if os.path.exists(extern_path):
+                os.remove(extern_path)
+            convert_model_to_external_data(
+                onnx_model,
+                all_tensors_to_one_file=True,
+                # The external-data location is resolved relative to the
+                # directory of the saved model, so pass the bare file name.
+                location=os.path.basename(extern_path),
+                size_threshold=1024,
+                convert_attribute=False,
+            )
+        onnx.save(onnx_model, path)
+    else:
+        # use third party tool to simplify llama
+        # reference: https://github.com/luchangli03/onnxsim_large_model/
+        if "onnxsim_large_model" not in sys.path:
+            sys.path.append("onnxsim_large_model")
+        from onnx_utils import set_onnx_input_shape
+        from compress_model import compress_onnx_model, uncompress_onnx_model
+
+        onnx_model = onnx.load(path)
+        print(f"load model from {path} success")
+
+        # NOTE(review): presumably this strips initializers above 1 MiB so
+        # onnxsim can handle the graph; confirm against the referenced tool.
+        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=1024 * 1024)
+        print("compress model success")
+
+        onnx_model = set_onnx_input_shape(onnx_model, "")
+
+        onnx_model, check = simplify(
+            onnx_model,
+            skipped_optimizers=["eliminate_duplicate_initializer"],
+            tensor_size_threshold="1024KB",
+        )
+        if not check:
+            raise ValueError(f"simplify compressed model {path} failed")
+
+        print("simplify model success")
+
+        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
+        print("uncompress model success")
+
+        add_value_info_for_constants(onnx_model)
+
+        # The simplified model overwrites the exported file in place.
+        onnx.save(onnx_model, path, save_as_external_data=True)
+
+
+def add_value_info_for_constants(model : onnx.ModelProto):
+    """
+    Currently onnx.shape_inference doesn't use the shape of initializers, so add
+    that info explicitly as ValueInfoProtos.
+    Mutates the model.
+    Args:
+        model: The ModelProto to update.
+    Returns:
+        None. The model is modified in place; models with ``ir_version < 4``
+        are left untouched.
+    """
+    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
+    if model.ir_version < 4:
+        return
+
+    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
+        """Add value info for every initializer of ``graph`` and, recursively,
+        of the subgraphs held in its nodes' GRAPH / GRAPHS attributes."""
+        inputs = {i.name for i in graph.input}
+        existing_info = {vi.name: vi for vi in graph.value_info}
+        for init in graph.initializer:
+            # Check it really is a constant, not an input
+            if init.name in inputs:
+                continue
+
+            # The details we want to add
+            elem_type = init.data_type
+            shape = init.dims
+
+            # Get existing or create new value info for this constant
+            vi = existing_info.get(init.name)
+            if vi is None:
+                vi = graph.value_info.add()
+                vi.name = init.name
+
+            # Even though it would be weird, we will not overwrite info even if it doesn't match
+            tt = vi.type.tensor_type
+            if tt.elem_type == onnx.TensorProto.UNDEFINED:
+                tt.elem_type = elem_type
+            if not tt.HasField("shape"):
+                # Ensure we set an empty list if the const is scalar (zero dims)
+                tt.shape.dim.extend([])
+                for dim in shape:
+                    tt.shape.dim.add().dim_value = dim
+
+        # Handle subgraphs
+        for node in graph.node:
+            for attr in node.attribute:
+                # Ref attrs refer to other attrs, so we don't need to do anything
+                if attr.ref_attr_name != "":
+                    continue
+
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    add_const_value_infos_to_graph(attr.g)
+                if attr.type == onnx.AttributeProto.GRAPHS:
+                    for g in attr.graphs:
+                        add_const_value_infos_to_graph(g)
+
+
+    # The helper returns nothing, so this function returns None as well.
+    return add_const_value_infos_to_graph(model.graph)
+
+
+def main():
+    """Configure TF32 usage, optionally export the model to ONNX, then run it.
+
+    The ONNX export is skipped when the target file already exists.
+    """
+    torch.backends.mlu.matmul.allow_tf32 = False
+    torch.backends.cnnl.allow_tf32 = False
+    modelname, batchsize, seqlen, export_path, dtype = parse_args()
+    if dtype != "tf32":
+        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
+    else:
+        torch.backends.mlu.matmul.allow_tf32 = True
+
+    model, voc_size = get_model(modelname)
+    if export_path is not None:
+        onnx_path = os.path.join(export_path, f"{modelname}_{batchsize}_{seqlen}_{dtype}.onnx")
+        if os.path.exists(onnx_path):
+            print("Onnx path exists, skipping export.")
+        else:
+            # All-zero token ids serve as the example input for export.
+            dummy_ids = torch.zeros((batchsize, seqlen), dtype=torch.int)
+            export_onnx(modelname, model, dummy_ids, onnx_path, True, dtype)
+
+    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
+
+if __name__ == "__main__":
+    main()
--- a/examples/distributed/cuda/cuda_launch.py
+++ b/examples/distributed/cuda/cuda_launch.py
@ -5,13 +5,11 @@ import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 from onnx.external_data_helper import convert_model_to_external_data
+from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model


-os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
-
-
 def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
@ -31,6 +29,9 @@ def parse_args():
        action="store_true",
        help="whether to generate the standard results.",
    )
+    parser.add_argument(
+        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
+    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
@ -41,19 +42,22 @@ def parse_args():
        args.batch_size,
        args.length,
        args.gen_std,
+        args.type,
    )


-def run_model(model, runtime, inputs: np.array, n=20):
-    stub = OnnxStub(model, runtime)
-    next(stub.inputs.items().__iter__())[1].copyin_numpy(inputs)
-    stub.tune()
+def run_model(model, runtime, inputs, n=10, data_type = "default"):
+    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
+    for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
+        tensor.copyin_numpy(input)
+    # stub.tune()
    stub.run()
    # get outputs
-    outputs = np.array(next(stub.outputs.items().__iter__())[1].copyout_float())
+    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()

    # bench
-    next(stub.inputs.items().__iter__())[1].copyin_numpy(inputs)
+    for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
+        tensor.copyin_numpy(input)
    begin = time.time()
    for _ in range(n):
        stub.run()
@ -63,32 +67,30 @@ def run_model(model, runtime, inputs: np.array, n=20):
    return outputs


-def run_and_compare(name, model, runtime):
-    data = np.load(f"{name}_inputs.npy")
+def run_and_compare(name, model, runtime, data_type):
+    input_ids = np.load(f"{name}_inputs.npy")
+    position_ids = np.arange(input_ids.shape[-1])
    results = np.load(f"{name}_results.npy")
-    outputs = run_model(model, runtime, data)
-    print("outputs sum:", outputs.sum())
+    outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
+    print("outputs abs mean:", abs(outputs).mean())
    print("max abs diff:", abs(outputs - results).max())
-    print("max rel diff:", abs((outputs - results) / results).max())
-    # assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6)


 def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
 ):
    dist_name = name + "_dist"
    model = parallel_model(model, world_size, rank)
    extern_path = f"./{dist_name}_rank{rank}.pb"
    if os.path.exists(extern_path):
        os.remove(extern_path)
-    convert_model_to_external_data(
+    onnx.save_model(
        model,
-        all_tensors_to_one_file=True,
+        f"./{dist_name}_rank{rank}.onnx",
+        save_as_external_data=True,
        location=extern_path,
-        size_threshold=1024,
-        convert_attribute=False,
    )
-    onnx.save(model, f"./{dist_name}_rank{rank}.onnx")
+    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
    runtime = backend.CudaRuntime(local_rank)
    # print("init comm")
    runtime.init_comm(
@ -96,26 +98,30 @@ def start_worker(
        world_size,
        rank,
    )
-    run_and_compare(name, model, runtime)
+    run_and_compare(name, model, runtime, data_type)


-def start_single(name, model):
+def start_single(name, model, data_type):
    runtime = backend.CudaRuntime(0)
-    run_and_compare(name, model, runtime)
+    run_and_compare(name, model, runtime, data_type)


 def gen_standard(name, model, voc_size, bs, len):
    # generate standard results
-    data = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
-    np.save(f"{name}_inputs", data)
+    input_ids = np.random.randint(0, voc_size, (bs, len))
+    position_ids = np.arange(len)
+    np.save(f"{name}_inputs", input_ids)
    runtime = backend.CudaRuntime(0)
-    outputs = run_model(model, runtime, data, 1)
+    outputs = run_model(model, runtime, (input_ids, position_ids), 1)
+    print("outputs abs mean:", abs(outputs).mean())
    np.save(f"{name}_results", outputs)


 def main():
-    nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
-
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
+    data_type = "default" if data_type == "fp32" else data_type
+    if data_type != "tf32":
+        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
    model = onnx.load(model_path)

    # generate standart output
@ -128,16 +134,18 @@ def main():

    # run single process.
    # use standalone process to isolate cuda.
-    p = mp.Process(target=start_single, args=(name, model))
+    print("run model by single GPU.")
+    p = mp.Process(target=start_single, args=(name, model, data_type))
    p.start()
    p.join()

    # run distributed parallel.
    world_size = nnodes * nproc_per_node
+    print(f"run model by {world_size} GPU in parallel.")
    workers = [
        mp.Process(
            target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model),
+            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
        )
        for rank in range(world_size)
    ]
--- a/examples/distributed/cuda/launch_kvcache.py
+++ b/examples/distributed/cuda/launch_kvcache.py
--- a/examples/distributed/cuda/run_pytorch.py
+++ b/examples/distributed/cuda/run_pytorch.py
@ -0,0 +1,188 @@
+import argparse
+import torch
+from transformers import BertModel, BertConfig
+from transformers import GPT2Model, GPT2Config
+from transformers import OPTModel, OPTConfig
+import time
+import numpy as np
+import onnx
+import os
+from onnx.external_data_helper import convert_model_to_external_data
+from onnxsim import simplify
+
+def parse_args():
+    """Parse the command line of the CUDA PyTorch runner.
+
+    Returns:
+        A tuple ``(model, batch_size, length, export_onnx, type)``.
+        ``export_onnx`` is ``None`` when the flag is absent and ``"./"``
+        when the flag is given without a value; ``type`` defaults to
+        ``"fp32"``.
+    """
+    cli = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
+    cli.add_argument(
+        "--model",
+        type=str,
+        choices=["gpt2", "bert", "opt"],
+        required=True,
+        help="model type",
+    )
+    cli.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    cli.add_argument("--length", type=int, default=1, help="sequence length.")
+    cli.add_argument(
+        "--export_onnx",
+        type=str,
+        nargs="?",
+        default=None,
+        const="./",
+        help="whether and where to export onnx file",
+    )
+    cli.add_argument(
+        "--type",
+        type=str,
+        choices=["fp32", "fp16", "tf32"],
+        default="fp32",
+        help="data type",
+    )
+    opts = cli.parse_args()
+    print("arg setting: ", opts)
+    return opts.model, opts.batch_size, opts.length, opts.export_onnx, opts.type
+
+
+def get_model(modelname):
+    """Load the pretrained model selected by ``modelname``.
+
+    Returns:
+        ``(model, voc_size)`` where ``model`` has been switched to eval mode
+        and ``voc_size`` is the ``vocab_size`` of the matching default config.
+
+    Raises:
+        KeyError: if ``modelname`` is not "bert", "gpt2" or "opt".
+    """
+    if modelname == "bert":
+        # erf is not impl by infini
+        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new")
+        voc_size = BertConfig().vocab_size
+    elif modelname == "gpt2":
+        model = GPT2Model.from_pretrained("gpt2")
+        voc_size = GPT2Config().vocab_size
+    elif modelname == "opt":
+        # Loaded from a local directory relative to the working directory.
+        model = OPTModel.from_pretrained("./opt-125m")
+        voc_size = OPTConfig().vocab_size
+    else:
+        raise KeyError(modelname)
+
+    return model.eval(), voc_size
+
+def run_pytorch(torch_model, voc_size, batchsize, len):
+    """Run ``torch_model`` on CUDA with random token ids and time it.
+
+    The random input is saved as ``test_inputs.npy`` and the final
+    ``last_hidden_state`` (converted to float32) as ``test_results.npy``.
+    """
+    token_ids = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
+    np.save("test_inputs", token_ids)
+    inputs = torch.from_numpy(token_ids).to("cuda")
+    torch_model = torch_model.to("cuda")
+
+    warmup_iters = 10
+    n_iter = 20
+    with torch.no_grad():
+        # Warm-up passes; they are not part of the measured interval.
+        for _ in range(warmup_iters):
+            outputs = torch_model(inputs)
+        torch.cuda.synchronize()
+
+        begin = time.time()
+        for _ in range(n_iter):
+            torch.cuda.synchronize()
+            outputs = torch_model(inputs)
+            torch.cuda.synchronize()
+        torch.cuda.synchronize()
+        end = time.time()
+
+    avg_time = (end - begin) / n_iter
+    outputs = outputs.last_hidden_state.to("cpu")
+    print("outputs abs mean:", abs(np.array(outputs)).mean())
+    print(f"average time: {avg_time}")
+    torch.cuda.memory.empty_cache()
+    np.save("test_results", np.array(outputs, dtype=np.float32))
+    print("Save input & output as test_inputs.npy and test_results.npy")
+
+
+def export_onnx(model, data, path, extern=False):
+    """Export ``model`` to ONNX at ``path``, simplify it and infer shapes.
+
+    Args:
+        model: the torch module to export.
+        data: example input tensor used for tracing.
+        path: destination ``.onnx`` file.
+        extern: when true, tensors are stored in a sibling ``.pb`` file next
+            to ``path`` instead of inside the model file.
+
+    Raises:
+        AssertionError: if onnxsim reports a failed check.
+    """
+    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
+    onnx_model = onnx.load(path)
+    onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
+    assert check
+    add_value_info_for_constants(onnx_model)
+    onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
+    if extern:
+        # Swap only the file extension; a plain str.replace would also
+        # rewrite ".onnx" occurring in a directory name.
+        extern_path = os.path.splitext(path)[0] + ".pb"
+        if os.path.exists(extern_path):
+            os.remove(extern_path)
+        convert_model_to_external_data(
+            onnx_model,
+            all_tensors_to_one_file=True,
+            # The external-data location is resolved relative to the directory
+            # of the saved model, so it must be the bare file name; passing the
+            # full path breaks exports into any directory other than the cwd.
+            location=os.path.basename(extern_path),
+            size_threshold=1024,
+            convert_attribute=False,
+        )
+    onnx.save(onnx_model, path)
+
+def add_value_info_for_constants(model : onnx.ModelProto):
+    """
+    Currently onnx.shape_inference doesn't use the shape of initializers, so add
+    that info explicitly as ValueInfoProtos.
+    Mutates the model.
+    Args:
+        model: The ModelProto to update.
+    Returns:
+        None. The model is modified in place; models with ``ir_version < 4``
+        are left untouched.
+    """
+    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
+    if model.ir_version < 4:
+        return
+
+    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
+        """Add value info for every initializer of ``graph`` and, recursively,
+        of the subgraphs held in its nodes' GRAPH / GRAPHS attributes."""
+        inputs = {i.name for i in graph.input}
+        existing_info = {vi.name: vi for vi in graph.value_info}
+        for init in graph.initializer:
+            # Check it really is a constant, not an input
+            if init.name in inputs:
+                continue
+
+            # The details we want to add
+            elem_type = init.data_type
+            shape = init.dims
+
+            # Get existing or create new value info for this constant
+            vi = existing_info.get(init.name)
+            if vi is None:
+                vi = graph.value_info.add()
+                vi.name = init.name
+
+            # Even though it would be weird, we will not overwrite info even if it doesn't match
+            tt = vi.type.tensor_type
+            if tt.elem_type == onnx.TensorProto.UNDEFINED:
+                tt.elem_type = elem_type
+            if not tt.HasField("shape"):
+                # Ensure we set an empty list if the const is scalar (zero dims)
+                tt.shape.dim.extend([])
+                for dim in shape:
+                    tt.shape.dim.add().dim_value = dim
+
+        # Handle subgraphs
+        for node in graph.node:
+            for attr in node.attribute:
+                # Ref attrs refer to other attrs, so we don't need to do anything
+                if attr.ref_attr_name != "":
+                    continue
+
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    add_const_value_infos_to_graph(attr.g)
+                if attr.type == onnx.AttributeProto.GRAPHS:
+                    for g in attr.graphs:
+                        add_const_value_infos_to_graph(g)
+
+
+    # The helper returns nothing, so this function returns None as well.
+    return add_const_value_infos_to_graph(model.graph)
+
+
+def main():
+    """Configure TF32 usage, optionally export the model to ONNX, then run it.
+
+    The export happens before the optional half-precision conversion, so the
+    exported model is not affected by ``--type fp16``.
+    """
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+    modelname, batchsize, seqlen, export_path, data_type = parse_args()
+    if data_type != "tf32":
+        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
+    else:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    model, voc_size = get_model(modelname)
+    if export_path is not None:
+        onnx_path = os.path.join(export_path, f"{modelname}_{batchsize}_{seqlen}.onnx")
+        # All-zero token ids serve as the example input for export.
+        dummy_ids = torch.zeros((batchsize, seqlen), dtype=torch.int)
+        export_onnx(model, dummy_ids, onnx_path, True)
+
+    if data_type == "fp16":
+        model = model.half()
+    run_pytorch(model, voc_size, batchsize, seqlen)
+
+if __name__ == "__main__":
+    main()
--- a/examples/distributed/kunlun/export_onnx.sh
+++ b/examples/distributed/kunlun/export_onnx.sh
@ -0,0 +1,14 @@
+export HF_ENDPOINT=https://hf-mirror.com
+
+# Sweep configuration: every model is exported for every
+# (batch size, sequence length) combination below.
+models=("bert" "gpt2" "llama")
+batch_size=(1 32)
+seq_len=(100 500)
+# Not used by this script.
+nproc=(1 2 4)
+
+for model in "${models[@]}"; do
+    for bs in "${batch_size[@]}"; do
+        for len in "${seq_len[@]}"; do
+            python run_pytorch.py \
+                --model "$model" \
+                --batch_size "$bs" \
+                --length "$len" \
+                --export_onnx "../models/$model" \
+                --export_only
+        done
+    done
+done
--- a/examples/distributed/kunlun/kunlun_launch.py
+++ b/examples/distributed/kunlun/kunlun_launch.py
@ -0,0 +1,280 @@
+import sys
+sys.path.append('../')
+
+import argparse
+import os
+import time
+import multiprocessing as mp
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+from onnx.external_data_helper import convert_model_to_external_data
+from onnx.shape_inference import infer_shapes_path
+import numpy as np
+from parallel_opt import parallel_model
+from functools import wraps
+
+
+def parse_args():
+    """Parse the launcher's command line and make sure its directories exist.
+
+    Returns:
+        A tuple ``(num_nodes, nproc_per_node, name, model, gen_std,
+        run_single, input_dir, result_dir, internal_model_dir)``.
+    """
+    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
+    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
+    parser.add_argument(
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
+    )
+    parser.add_argument(
+        "--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
+    )
+    parser.add_argument(
+        "--model", type=str, default="", help="path to the ONNX model file."
+    )
+    parser.add_argument(
+        "--gen_std",
+        default=False,
+        action="store_true",
+        help="whether to generate the standard results.",
+    )
+    parser.add_argument(
+        "--run_single",
+        default=False,
+        action="store_true",
+        help="whether run model with single process with standard inputs"
+    )
+    # The three directory options share the same shape and default.
+    dir_options = (
+        ("--input_dir", "path to save model input data"),
+        ("--result_dir", "path to save model standard output"),
+        ("--internal_model_dir", "path to save internal onnx model for parallel run"),
+    )
+    for flag, help_text in dir_options:
+        parser.add_argument(flag, default="./", help=help_text)
+    args = parser.parse_args()
+
+    # check path, mkdir if not exist
+    for directory in (args.input_dir, args.result_dir, args.internal_model_dir):
+        check_exists(directory)
+
+    print("arg setting: ", args)
+    return (
+        args.num_nodes,
+        args.nproc_per_node,
+        args.name,
+        args.model,
+        args.gen_std,
+        args.run_single,
+        args.input_dir,
+        args.result_dir,
+        args.internal_model_dir,
+    )
+
+
+"""
+Utility functions for this script.
+"""
+def check_exists(path: str):
+    """Create directory ``path`` (including parents) unless it already exists.
+
+    ``exist_ok=True`` avoids the check-then-create race of testing
+    ``os.path.exists`` first: another process creating the same directory in
+    between no longer causes a ``FileExistsError``.
+    """
+    os.makedirs(path, exist_ok=True)
+
+def np_assert(base, test, rtol=1e-2, atol=1e-1):
+    """Print the largest absolute element-wise difference of ``base`` and ``test``.
+
+    ``rtol`` and ``atol`` are accepted but currently unused; nothing is
+    asserted.
+    """
+    # np.testing.assert_allclose(test, base, rtol, atol)
+    max_abs_diff = abs(base - test).max()
+    print("max abs diff:", max_abs_diff)
+
+
+"""
+Perf wrapper, run function n times
+then average
+"""
+def perf_it(n):
+    """Decorator factory that benchmarks the decorated function.
+
+    The wrapped function is called ``n`` times as warm-up and then ``n`` more
+    times while being timed. The wrapper prints and returns the average time
+    of the timed calls in seconds; the function's own return value is
+    discarded.
+
+    Args:
+        n: number of warm-up calls and number of timed calls; must be positive.
+
+    Raises:
+        ValueError: if ``n`` is not positive (the average would otherwise
+            divide by zero on the first call).
+    """
+    if n <= 0:
+        raise ValueError(f"perf_it requires a positive repeat count, got {n}")
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            # warmup
+            for _ in range(n):
+                func(*args, **kwargs)
+
+            t_total = 0.0
+            for _ in range(n):
+                # perf_counter is monotonic and high-resolution, unlike
+                # time.time(), which can jump when the system clock changes.
+                t0 = time.perf_counter()
+                func(*args, **kwargs)
+                t_total += time.perf_counter() - t0
+            avg_time = t_total / n
+            print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
+            return avg_time
+        return wrapper
+    return decorator
+
+
+"""
+Run InfiniTensor model with Standard input
+check=True: check with standard output gen by pytorch
+perf=True: run n times to get avg time
+"""
+def run_model(task_name,
+              model,
+              runtime,
+              world_size=1,
+              rank=0,
+              n=10,
+              check=True,
+              perf=True):
+    """Run ``model`` once on ``runtime`` with the saved inputs and return the
+    first output as a numpy array.
+
+    Reads the module-level globals ``input_dir`` and ``result_dir``, which are
+    assigned in ``main``.
+
+    Args:
+        task_name: prefix of the ``.npy`` input/output files; "llama" also
+            turns on ``use_naive_allocator``.
+        model: ONNX model passed to ``OnnxStub``.
+        runtime: runtime object passed to ``OnnxStub``.
+        world_size: number of pieces an input is split into when its shape
+            differs from the tensor's shape.
+        rank: index of the piece this process uses.
+        n: repeat count handed to ``perf_it``.
+        check: compare the output with ``{task_name}_output.npy``.
+        perf: time ``stub.run()`` with ``perf_it``.
+
+    Raises:
+        KeyError: if an input file is missing.
+        AssertionError: if ``check`` is set and the standard output is missing.
+    """
+
+    stub = OnnxStub(model, runtime,
+                    use_naive_allocator=True \
+                    if task_name == "llama" else False)
+
+    # load in Onnx model inputs
+    def load_inputs(stub: OnnxStub):
+        # check exists
+        inputs = []
+        for i, (name, tensor) in enumerate(stub.inputs.items()):
+            input_path = os.path.join(input_dir, \
+                                f"{task_name}_input_{i}.npy")
+            print(input_path)
+            if os.path.exists(input_path):
+                input = np.load(input_path)
+            else :
+                raise KeyError(f"{i} th input of model not exists")
+            # check shape
+            # Dimensions are compared pairwise; zip stops at the shorter shape.
+            if all(x == y for x,y in zip(input.shape, tensor.shape())):
+                tensor.copyin_numpy(input)
+            else:
+                # Shape mismatch: hand this rank its slice of a column-wise
+                # split into world_size pieces.
+                tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+    load_inputs(stub)
+    # stub.tune()
+    stub.run()
+    # NOTE(review): the reason for this short pause is not visible here —
+    # presumably it lets the device finish before copy-out; confirm.
+    time.sleep(0.01)
+    output = next(stub.outputs.values().__iter__()).copyout_numpy()
+
+    # check output results with standard output
+    if check:
+        st_output_path = os.path.join(result_dir, \
+                                f"{task_name}_output.npy")
+        assert os.path.exists(st_output_path) , \
+                    "standard output not exists"
+        st_output = np.load(st_output_path)
+        if np.isnan(output).any():
+            print("Nan in output")
+            # Terminates the process; the perf run below is skipped.
+            exit()
+        np_assert(st_output, output)
+
+    # perf
+    if perf:
+        @perf_it(n)
+        def perf_infinitensor(stub: OnnxStub):
+            stub.run()
+        perf_infinitensor(stub)
+
+    return output
+
+
+"""
+Start a worker in Parallel
+"""
+def start_worker(name: str,
+           world_size: int,
+           rank: int,
+           local_rank: int,
+           model: onnx.ModelProto):
+    """Worker body for one rank of the distributed run.
+
+    Builds this rank's model with ``parallel_model``, saves it under the
+    module-level ``internal_model_dir``, initialises communication on a
+    KUNLUN runtime for ``local_rank`` and runs the model.
+    """
+    dist_name = name + "_dist"
+    # Model for this rank out of world_size parts.
+    shard = parallel_model(model, world_size, rank)
+    shard_path = os.path.join(internal_model_dir, f"{dist_name}_rank{rank}.onnx")
+    onnx.save(shard, shard_path, save_as_external_data=True)
+
+    runtime = backend.KUNLUNRuntime(local_rank)
+    runtime.init_comm(dist_name, world_size, rank)
+    run_model(name, shard, runtime, world_size, rank)
+
+
+"""
+Generate standard input/output with a
+single-card run
+"""
+def gen_standard(task_name: str, model: onnx.ModelProto):
+    """Generate random inputs for ``model``, run it once and save everything.
+
+    Inputs are written to ``{task_name}_input_{i}.npy`` under the module-level
+    ``input_dir`` and the first output to ``{task_name}_output.npy`` under
+    ``result_dir``. The process exits without saving the output if it
+    contains NaN.
+
+    Input values by dtype of the tensor read back from the stub:
+      * integer: random 0/1 values of the same dtype;
+      * bool: random booleans;
+      * anything else: all ones for the first input, uniform random values
+        from ``np.random.rand`` for the others.
+    """
+    runtime = backend.KUNLUNRuntime(0)
+    stub = OnnxStub(model, runtime)
+    # generate random input for model
+    for i, (_name, tensor) in enumerate(stub.inputs.items()):
+        # Read the tensor back only to learn its shape and dtype.
+        input = tensor.copyout_numpy()
+        if np.issubdtype(input.dtype, np.integer):
+            input = np.random.randint(0, 2, size=input.shape, dtype=input.dtype)
+        elif input.dtype == np.bool_:
+            input = np.random.randint(0, 2, size=input.shape) > 0
+        elif i == 0:
+            input = np.ones(input.shape).astype(input.dtype)
+        else:
+            input = np.random.rand(*input.shape).astype(input.dtype)
+        tensor.copyin_numpy(input)
+        np.save(os.path.join(input_dir, f"{task_name}_input_{i}.npy"), input)
+    stub.run()
+    output = next(iter(stub.outputs.values())).copyout_numpy()
+    if np.isnan(output).any():
+        print("Nan in output")
+        exit()
+    np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
+
+
+def main():
+    """Entry point of the KUNLUN launcher.
+
+    Stores the parsed directories in module-level globals, loads the ONNX
+    model and then either generates standard data, runs the model on a single
+    card, or starts one worker process per rank and waits for all of them.
+    """
+    global input_dir, result_dir, internal_model_dir
+
+    (
+        nnodes,
+        nproc_per_node,
+        task_name,
+        model_path,
+        gen_std,
+        run_single,
+        input_dir,
+        result_dir,
+        internal_model_dir,
+    ) = parse_args()
+
+    # load input onnx model
+    model = onnx.load(model_path)
+
+    # generate standard output
+    if gen_std:
+        print("Generate inputs and outputs.")
+        gen_standard(task_name, model)
+        return
+
+    if run_single:
+        print("Run model by one GPU card.")
+        run_model(task_name, model, backend.KUNLUNRuntime(0))
+        return
+
+    # run distributed parallel.
+    world_size = nnodes * nproc_per_node
+    print(f"Run model by {world_size} GPU in parallel.")
+    workers = []
+    for rank in range(world_size):
+        local_rank = rank % nproc_per_node
+        workers.append(
+            mp.Process(
+                target=start_worker,
+                args=(task_name, world_size, rank, local_rank, model),
+            )
+        )
+
+    for proc in workers:
+        proc.start()
+
+    for proc in workers:
+        proc.join()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/distributed/kunlun/launch.sh
+++ b/examples/distributed/kunlun/launch.sh
@ -0,0 +1,36 @@
+export HF_ENDPOINT=https://hf-mirror.com
+
+# Sweep configuration. llama is excluded here:
+# models=("bert" "gpt2" "llama")
+models=("bert" "gpt2")
+batch_size=(1 32)
+seq_len=(100 500)
+nproc=(1 2 4)
+
+# Directory intended for logs; the redirections below are currently disabled.
+results_dir="results"
+
+if [ -d "$results_dir" ]; then
+    echo "directory ./$results_dir exists"
+else
+    mkdir -p "$results_dir"
+    echo "mkdir $results_dir, logs saved there"
+fi
+
+
+for model in "${models[@]}"; do
+    for bs in "${batch_size[@]}"; do
+        for len in "${seq_len[@]}"; do
+            # run pytorch model
+            echo "Run pytorch $model with batch_size=$bs length=$len ."
+            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
+            for n in "${nproc[@]}"; do
+                # run infinitensor
+                # The variable stays inside the double quotes so the message
+                # is a single word-split-safe argument.
+                echo "Run $n parallel infinitensor $model with batch_size=$bs and length=$len ."
+                python kunlun_launch.py --name "$model" --model "../models/${model}/${model}_${bs}_${len}.onnx" --nproc_per_node="$n" # >> results/"$model"_"$bs"_"$len"_infini
+                # delete internal files
+                find ./ -type f -name "*.onnx" -delete
+                find ./ -type f -name "*.pb" -delete
+            done
+            find ./ -type f -name "*.npy" -delete
+        done
+    done
+done
--- a/examples/distributed/kunlun/llama_launch.sh
+++ b/examples/distributed/kunlun/llama_launch.sh
@ -0,0 +1,35 @@
+export HF_ENDPOINT=https://hf-mirror.com
+
+# Llama-only variant of the benchmark sweep: run the PyTorch reference once
+# per batch size / sequence length, then InfiniTensor with each process count.
+# models=("bert" "gpt2" "llama")
+models=("llama")
+batch_size=(1 )
+seq_len=(100 500)
+nproc=(1 2 4)
+
+results_dir="results"
+
+if [ -d "$results_dir" ]; then
+    echo "directory ./$results_dir exists"
+else
+    mkdir -p "$results_dir"
+    echo "mkdir $results_dir, logs saved there"
+fi
+
+
+for model in "${models[@]}"; do
+    for bs in "${batch_size[@]}"; do
+        for len in "${seq_len[@]}"; do
+            # run pytorch model
+            echo "Run pytorch llama with batch_size=$bs and length=$len"
+            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
+            for n in "${nproc[@]}"; do
+                    # run infinitensor
+                    echo "Run infinitensor llama with batch_size=$bs and length=$len and nproc=$n."
+                    python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node="$n"
+                    # delete intermediate files written below the current directory
+                    find ./ -type f -name "*.onnx" -delete
+                    # NOTE(review): "*0c" presumably matches external-weight files produced for llama -- confirm
+                    find ./ -type f -name "*0c" -delete
+            done
+            # the .npy files are removed only after all process counts have run
+            find ./ -type f -name "*.npy" -delete
+        done
+    done
+done
--- a/examples/distributed/kunlun/run_pytorch.py
+++ b/examples/distributed/kunlun/run_pytorch.py
@ -0,0 +1,245 @@
+import argparse
+import torch
+from transformers import BertModel, BertConfig
+from transformers import GPT2Model, GPT2Config
+from transformers import OPTModel, OPTConfig
+from transformers import LlamaModel, LlamaConfig
+import time
+import numpy as np
+import onnx
+import os
+import sys
+from onnx.external_data_helper import convert_model_to_external_data
+from onnxsim import simplify
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+def parse_args():
+    """Parse the command line and return the settings as a flat tuple.
+
+    Returns:
+        (model, batch_size, length, export_onnx, input_dir, result_dir,
+        export_only), in that order. `export_onnx` is None when the flag is
+        absent and "./" when it is given without a value.
+    """
+    cli = argparse.ArgumentParser(
+        description="Run pytorch gpt2/bert/opt and optionally export onnx."
+    )
+    cli.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        choices=["gpt2", "bert", "opt", "llama"],
+        help="model type",
+    )
+    cli.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    cli.add_argument("--length", type=int, default=1, help="sequence length.")
+    # nargs="?" lets the flag appear bare, in which case `const` is used.
+    cli.add_argument(
+        "--export_onnx",
+        type=str,
+        nargs="?",
+        const="./",
+        default=None,
+        help="whether and where to export onnx file",
+    )
+    cli.add_argument("--input_dir", type=str, default="./",
+                     help="path to save pytorch model input data")
+    cli.add_argument("--result_dir", type=str, default="./",
+                     help="path to save pytorch model output data")
+    cli.add_argument("--export_only", action="store_true")
+
+    parsed = cli.parse_args()
+    print("arg setting: ", parsed)
+
+    field_order = (
+        "model",
+        "batch_size",
+        "length",
+        "export_onnx",
+        "input_dir",
+        "result_dir",
+        "export_only",
+    )
+    return tuple(getattr(parsed, field) for field in field_order)
+
+
+def get_model(modelname):
+    """Load the pretrained model for `modelname` and return it in eval mode.
+
+    Returns:
+        (model, voc_size), where voc_size is the vocab_size of the matching
+        default-constructed config class.
+
+    Raises:
+        KeyError: if `modelname` is not one of bert / gpt2 / opt / llama.
+    """
+    if modelname not in ("bert", "gpt2", "opt", "llama"):
+        raise KeyError(modelname)
+
+    if modelname == "bert":
+        # erf is not impl by infini
+        net = BertModel.from_pretrained(
+            "bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new"
+        )
+        vocab = BertConfig().vocab_size
+    elif modelname == "gpt2":
+        net = GPT2Model.from_pretrained("gpt2")
+        vocab = GPT2Config().vocab_size
+    elif modelname == "opt":
+        net = OPTModel.from_pretrained("./opt-125m")
+        vocab = OPTConfig().vocab_size
+    else:
+        net = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
+        vocab = LlamaConfig().vocab_size
+
+    return net.eval(), vocab
+
+def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
+    """Time `torch_model` on random token ids and save its input and output.
+
+    Args:
+        torch_model: model to run; it is moved to the "cuda" device.
+        voc_size: exclusive upper bound for the random token ids.
+        batchsize: first dimension of the generated input.
+        len: second dimension of the generated input (shadows the builtin).
+        model_name: prefix for the saved .npy file names.
+
+    Reads the module-level globals `input_dir` and `result_dir`, which
+    main() assigns before calling this function.
+    """
+    # Random int32 ids in [0, voc_size); saved so other runs can reuse the same input.
+    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
+    np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
+    inputs = torch.from_numpy(data).to("cuda")
+    torch_model = torch_model.to("cuda")
+
+    n_iter = 10
+    # Warm-up: 10 untimed forward passes (count is hard-coded, independent of n_iter).
+    with torch.no_grad():
+        for _ in range(10):
+            outputs = torch_model(inputs)
+    torch.cuda.synchronize()
+    begin = time.time()
+    # Timed loop: synchronize around every forward pass.
+    with torch.no_grad():
+        for _ in range(n_iter):
+            torch.cuda.synchronize()
+            outputs = torch_model(inputs)
+            #
+            torch.cuda.synchronize()
+    torch.cuda.synchronize()
+    end = time.time()
+
+    # Wall-clock average per iteration, in seconds.
+    avg_time = (end - begin) / n_iter
+    # Only the output of the last timed iteration is kept.
+    outputs = outputs.last_hidden_state.to("cpu")
+    print("outputs abs mean:", abs(np.array(outputs)).mean())
+    print(f"average time: {avg_time}")
+    torch.cuda.memory.empty_cache()
+    np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
+                                        np.array(outputs))
+    print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
+
+
+def export_onnx(model_name, model, data, path, extern=False):
+    """Simplify the ONNX file at `path` and write the result back to disk.
+
+    NOTE(review): the torch.onnx.export call below is commented out, so
+    `model` and `data` are unused and the file at `path` must already exist.
+
+    Args:
+        model_name: "llama" selects the large-model path, anything else the
+            plain onnxsim path.
+        model: unused while the export call is disabled.
+        data: unused while the export call is disabled.
+        path: ONNX file to load. Non-llama models are saved back to this
+            path; llama is saved next to it with a ".sim.onnx" suffix.
+        extern: for non-llama models, store tensors in one external ".pb"
+            file beside the model. Not consulted on the llama path.
+
+    Raises:
+        ValueError: if simplifying the compressed llama model fails its check.
+    """
+    # torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
+
+    if model_name != "llama":
+        onnx_model = onnx.load(path)
+        onnx_model, check = simplify(onnx_model,
+                                 skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
+                                 # skipped_optimizers=['fuse_qkv'])
+        assert check
+        add_value_info_for_constants(onnx_model)
+        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
+        if extern:
+            extern_path = path.replace('.onnx', '.pb')
+            # Remove a stale weight file from an earlier run before writing a new one.
+            if os.path.exists(extern_path):
+                os.remove(extern_path)
+            convert_model_to_external_data(
+                onnx_model,
+                all_tensors_to_one_file=True,
+                # location is the bare file name, i.e. relative to the model file
+                location=extern_path.split("/")[-1],
+                size_threshold=1024,
+                convert_attribute=False,
+            )
+        onnx.save(onnx_model, path)
+    else:
+        # Helpers come from the onnxsim_large_model directory, resolved
+        # relative to the current working directory.
+        sys.path.append("onnxsim_large_model")
+        from onnx_utils import set_onnx_input_shape
+        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
+
+        in_model_path = path
+        # Strip the trailing ".onnx" (5 characters) and append ".sim.onnx".
+        out_model_path = in_model_path[:-5] + ".sim.onnx"
+
+        onnx_model = onnx.load(in_model_path)
+        print(f"load model from {in_model_path} success")
+
+        # compress -> simplify -> uncompress; the removed initializers are
+        # kept aside and handed back to uncompress_onnx_model afterwards.
+        size_th_bytes = 1024 * 1024
+        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
+        print("compress model success")
+
+        onnx_model = set_onnx_input_shape(onnx_model, "")
+        tensor_size_threshold = f"1024KB"
+        skipped_optimizers = []
+        skipped_optimizers.append("eliminate_duplicate_initializer")
+        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
+                                    tensor_size_threshold=tensor_size_threshold)
+        if not check:
+            raise ValueError(f"simplify compressed model {in_model_path} failed")
+
+        print(f"simplify model success")
+
+        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
+        print(f"uncompress model success")
+
+        add_value_info_for_constants(onnx_model)
+
+        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
+
+
+def add_value_info_for_constants(model : onnx.ModelProto):
+    """
+    Attach a ValueInfoProto to every initializer that is not also a graph input.
+
+    Currently onnx.shape_inference doesn't use the shape of initializers, so the
+    element type and dims are recorded explicitly. Subgraphs held in node
+    attributes are processed as well. Mutates the model.
+    Args:
+        model: The ModelProto to update.
+    """
+    def annotate_graph(graph : onnx.GraphProto):
+        input_names = set()
+        for graph_input in graph.input:
+            input_names.add(graph_input.name)
+        known_infos = {}
+        for known in graph.value_info:
+            known_infos[known.name] = known
+
+        for const in graph.initializer:
+            # Initializers that are also inputs are not constants; leave them alone.
+            if const.name in input_names:
+                continue
+
+            # Reuse the value info if one exists, otherwise append a new one.
+            info = known_infos.get(const.name)
+            if info is None:
+                info = graph.value_info.add()
+                info.name = const.name
+
+            # Existing entries are never overwritten, even if they disagree
+            # with the initializer.
+            tensor_type = info.type.tensor_type
+            if tensor_type.elem_type == onnx.TensorProto.UNDEFINED:
+                tensor_type.elem_type = const.data_type
+            if tensor_type.HasField("shape"):
+                continue
+            # Ensure we set an empty list if the const is scalar (zero dims)
+            tensor_type.shape.dim.extend([])
+            for extent in const.dims:
+                tensor_type.shape.dim.add().dim_value = extent
+
+        # Recurse into subgraphs carried by node attributes.
+        for node in graph.node:
+            for attr in node.attribute:
+                # Ref attrs point at other attrs; nothing to do for them.
+                if attr.ref_attr_name != "":
+                    continue
+                if attr.type == onnx.AttributeProto.GRAPH:
+                    annotate_graph(attr.g)
+                if attr.type == onnx.AttributeProto.GRAPHS:
+                    for subgraph in attr.graphs:
+                        annotate_graph(subgraph)
+
+    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
+    if model.ir_version >= 4:
+        annotate_graph(model.graph)
+
+
+def main():
+    """Entry point: parse arguments, optionally export ONNX, then run PyTorch."""
+    # run_pytorch() reads these two as module-level globals.
+    global input_dir, result_dir
+
+    (modelname, batchsize, seqlen, export_path,
+     input_dir, result_dir, export_only) = parse_args()
+
+    # pytorch model
+    model, voc_size = get_model(modelname)
+
+    if export_path is not None:
+        os.makedirs(export_path, exist_ok=True)
+        path = os.path.join(export_path, f"{modelname}_{batchsize}_{seqlen}.onnx")
+        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
+        # export pytorch model to onnx model
+        export_onnx(modelname, model, param, path, True)
+        if export_only:
+            return
+
+    run_pytorch(model, voc_size, batchsize, seqlen, modelname)
+
+if __name__ == "__main__":
+    main()
--- a/examples/distributed/onnxsim_large_model
+++ b/examples/distributed/onnxsim_large_model
@ -0,0 +1 @@
+Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
--- a/examples/distributed/parallel_opt.py
+++ b/examples/distributed/parallel_opt.py
@ -11,6 +11,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
    vinfo = {info.name: info for info in model.graph.value_info}
    vinfo.update({info.name: info for info in model.graph.input})
    vinfo.update({info.name: info for info in model.graph.output})
+    output = {info.name: info for info in model.graph.output}
    place: Dict[str, Placement] = {}
    nodes: List[NodeProto] = []

@ -56,7 +57,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
        ndim = len(vinfo[output].type.tensor_type.shape.dim)
        out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial()
        place[node.output[0]] = out_plc
-        
+
    def shard_concat(node: NodeProto):
        # hack for kvcache
        in_plc = place[node.input[1]]
@ -109,12 +110,11 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
                s_dim = 0
            elif in_plc.dim == 2:
                s_dim = 1
-
        assert s_dim != -1
        assert out_dims[s_dim] % tp_world_size == 0, out_dims
        out_dims[s_dim] //= tp_world_size
        # if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
-        # node.input[1] = node.output[0] + "_shape"
+        node.input[1] = node.output[0] + "_shape"
        data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
        place[node.output[0]] = Shard(s_dim)

@ -136,7 +136,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
            place[node.output[0]] = Shard(list(perm).index(plc.dim))

    def shard_node(node: NodeProto):
-        if node.op_type in ["Relu", "Tanh", "Softmax"]:
+        if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
            place[node.output[0]] = place[node.input[0]]
        elif node.op_type in ["Where"]:
            place[node.output[0]] = place[node.input[1]]
@ -154,7 +154,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
            ), f"{place[node.input[0]]} != {place[node.input[1]]}"
            place[node.output[0]] = place[node.input[0]]
        elif node.op_type == "Concat":
-            shard_concat(node)            
+            shard_concat(node)

    def find_successor(op_type: str, idx: int, search_limit: int = 1):
        for node in model.graph.node[idx + 1 : idx + 1 + search_limit]:
@ -175,6 +175,16 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
        if (node.op_type == "MatMul" or node.op_type == "Gemm") and any(
            input in data for input in node.input
        ):
+            # FIXME(constroy): the last MatMul should not be sharded as TP.
+            if (
+                node.output[0] in output
+                or (
+                    index + 1 < len(model.graph.node)
+                    and model.graph.node[index + 1].output[0]
+                )
+                in output
+            ):
+                continue
            groups = 1
            # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
            split_node = find_successor("Split", index, search_limit=2)
@ -218,7 +228,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
    new_input = []
    for info in model.graph.input:
        new_input.append(vinfo[info.name])
-    
+
    graph = helper.make_graph(
        nodes,
        model.graph.name + f"_{tp_rank}",
@ -233,5 +243,5 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
        if tt.HasField("shape"):
            tt.ClearField("shape")
    model = helper.make_model(graph)
-    model = onnx.shape_inference.infer_shapes(model)
+    #model = onnx.shape_inference.infer_shapes(model)
    return model
--- a/examples/python/llama_kvcache_inference.py
+++ b/examples/python/llama_kvcache_inference.py
@ -0,0 +1,145 @@
+import os
+from pyinfinitensor.onnx import OnnxStub, backend
+import numpy as np
+import onnx
+import torch
+from transformers import LlamaModel, LlamaForCausalLM
+from tqdm import tqdm
+import onnx_graphsurgeon as gs
+from onnxsim import simplify
+import argparse
+
+# Command-line options. They are parsed at import time, so `args` and the
+# two path constants below are module-level globals used by every function.
+parser = argparse.ArgumentParser(description='')
+parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
+parser.add_argument('--layer', dest='n_layers', type=int, default=2)
+parser.add_argument('--iter', dest='n_iter', type=int, default=1)
+parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
+# NOTE(review): both path defaults are absolute, machine-specific locations.
+parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str, 
+                    default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
+parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str, 
+                    default="/data1/shared/llama")
+args = parser.parse_args()
+
+# The graph file name encodes batch size and layer count; the weight file
+# name is passed as `location` when the model is saved with external data.
+ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
+ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
+
+def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
+    """Export `model` to ONNX with a populated KV cache, then simplify it.
+
+    Args:
+        model: the PyTorch model to export.
+        ONNX_MODEL_PATH: destination of the ONNX graph (shadows the
+            module-level constant of the same name).
+    """
+    # Run once on a (batchsize, 1024) zero input to obtain past_key_values
+    # that can be fed to the export as example inputs.
+    param = torch.zeros(
+        (args.batchsize, 1024), dtype=torch.long)
+    logits = model(param, past_key_values=None)
+    # The exported graph takes a single token per batch entry plus the cache.
+    param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
+
+    torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
+                                              "position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
+                      do_constant_folding=True,)
+    onnx_model = onnx.load(ONNX_MODEL_PATH)
+    print("simplifing onnx model")
+    onnx_model, check = simplify(onnx_model, skipped_optimizers=[
+                                 'eliminate_duplicate_initializer'])
+    assert check
+    
+    # Weights go to the external file named by the module-level ONNX_WEIGHT_PATH.
+    onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
+    print("simlifing finished.")
+
+
+@gs.Graph.register()
+def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
+    """Replace the subgraph between `inputs` and `outputs` with one AttentionKVCache layer.
+
+    The existing consumers of `inputs` and producers of `outputs` are
+    disconnected, `inputs_added` is appended to `inputs` (in place), and the
+    producers of `outputs_removed` are disconnected as well.
+    """
+    for source in inputs:
+        source.outputs.clear()
+    for sink in outputs:
+        sink.inputs.clear()
+    inputs.extend(inputs_added)
+    for dropped in outputs_removed:
+        dropped.inputs.clear()
+    return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
+
+
+def replace_onnx_with_attention_op():
+    """Rewrite the exported graph so each layer's attention is one AttentionKVCache op.
+
+    Loads ONNX_MODEL_PATH, rewires args.n_layers layers by tensor name, keeps
+    only the first graph output, and saves the result back to the same path.
+    """
+    graph = gs.import_onnx(
+        onnx.load(ONNX_MODEL_PATH))
+    tmap = graph.tensors()
+    for i in range(args.n_layers):
+        # Tensors are looked up by the names found in the exported graph.
+        # NOTE(review): the two "onnx::Concat_*" tensors are presumably the
+        # layer's key/value cache inputs -- confirm against the exported model.
+        inputs = [
+            tmap["onnx::Concat_" + str((i+1)*2)],
+            tmap["onnx::Concat_" + str((i+1)*2+1)],
+            tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
+            tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
+            tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
+        outputs = [
+            tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
+
+        # The second graph input is appended as an extra input of the new op.
+        inputs_added = [graph.inputs[1]]
+        outputs_removed = []
+
+        graph.replace_with_attention(
+            inputs, outputs, inputs_added, outputs_removed)
+        
+    # Keep only the first graph output, then drop the now-unreachable nodes.
+    graph.outputs = [tmap[graph.outputs[0].name]]
+    graph.cleanup(True).toposort()
+    onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
+
+
+if __name__ == "__main__":
+    # Compare InfiniTensor against PyTorch token by token: both receive the
+    # same random token each step and the final logits are compared.
+    kvcache_torch = None
+    torch_model = LlamaForCausalLM.from_pretrained(
+        args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
+
+    n_heads = torch_model.config.num_attention_heads
+    n_dims = torch_model.config.hidden_size // n_heads
+
+    # Export and rewrite the graph only when no cached ONNX file exists.
+    if not os.path.exists(ONNX_MODEL_PATH):
+        print("exporting onnx graph")
+        export_onnx(torch_model, ONNX_MODEL_PATH)
+        replace_onnx_with_attention_op()
+    else:
+        print("will use exsiting onnx graph")
+
+    onnx_model = onnx.load(ONNX_MODEL_PATH)
+    stub = OnnxStub(onnx_model, backend.cuda_runtime())
+
+    count_wrong = 0
+    for i in tqdm(range(0, args.n_max_length)):
+        query = np.random.randint(
+            torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
+        position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
+
+        ####################################
+        # pytorch
+        ####################################
+        # Inference only: without no_grad the growing KV cache would keep an
+        # autograd graph alive across all iterations.
+        with torch.no_grad():
+            outputs_torch = torch_model(
+                torch.tensor(query), past_key_values=kvcache_torch)
+        logit_torch = outputs_torch['logits']
+        kvcache_torch = outputs_torch['past_key_values']
+
+        ####################################
+        # infinitensor
+        ####################################
+        # copyin input
+        (list(stub.inputs.items()))[0][1].copyin_int64(
+            query.reshape(-1).tolist())
+        (list(stub.inputs.items()))[1][1].copyin_int64(
+            position_id.reshape(-1).tolist())
+
+        stub.run()
+
+        ####################################
+        # validation
+        ####################################
+        # copyout output
+        logits_it = np.array((list(stub.outputs.items()))
+                                [0][1].copyout_float())
+
+        # First require the logits to match within tolerance; if they do not,
+        # accept the step as long as the argmax still agrees.
+        try:
+            np.testing.assert_allclose(
+                logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
+        except Exception:
+            try:
+                np.testing.assert_allclose(
+                    np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
+            except Exception:
+                # `Exception` rather than a bare except, so Ctrl-C and
+                # SystemExit still stop the run.
+                count_wrong = count_wrong + 1
+
+    result = "{}/{} failed.".format(count_wrong, args.n_max_length)
+    print(result)
+    del stub
--- a/examples/python/paddle_densenet.py
+++ b/examples/python/paddle_densenet.py
@ -0,0 +1,80 @@
+
+import paddle
+import paddle.vision.transforms as T
+from paddle.vision.datasets import Cifar10
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+import itertools
+
+def run_cifar_train_and_infer():
+    """Train a Paddle DenseNet on Cifar10, export it to ONNX, and evaluate it with InfiniTensor.
+
+    Prints the accuracy measured over up to 10000 test images, fed one at a time.
+    """
+    
+    paddle.device.set_device("gpu")
+
+    transform = T.Compose(
+        [
+            T.Resize(224),
+            T.ToTensor(),
+            T.Normalize(
+                mean=[0.5, 0.5, 0.5],
+                std=[0.5, 0.5, 0.5],
+                to_rgb=True,
+            ),
+        ]
+    )
+    
+    # Download the dataset and initialize the DataSet
+    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
+    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
+
+    # Build the model and initialize the network
+    densenet = paddle.vision.models.DenseNet(num_classes=10)
+    model = paddle.Model(densenet)
+
+    # Configure training: prepare the optimizer, loss function and evaluation metric
+    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()), 
+                paddle.nn.CrossEntropyLoss(),
+                paddle.metric.Accuracy())
+
+    # Train the model
+    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
+    # Evaluate the model
+    model.evaluate(test_dataset, batch_size=64, verbose=1)
+
+    # export to ONNX
+    save_path = 'onnx.save/densenet' # path prefix to save to
+    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the model's input shape and dtype; Tensor or InputSpec are supported, and InputSpec supports dynamic shapes.
+    paddle.onnx.export(densenet, save_path, input_spec=[x_spec], opset_version=11)
+
+    # Load the ONNX model and hand it to InfiniTensor
+    model_path = save_path + ".onnx"
+    onnx_model = onnx.load(model_path)
+    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
+    # From here on `model` refers to the InfiniTensor stub, not the Paddle model.
+    model = gofusion_model
+    model.init()
+
+    # Run inference
+    cifar10_test = Cifar10(
+        mode="test",
+        transform=transform,  # apply transform to every image
+        backend="cv2",  # use OpenCV as image transform backend
+    )
+    batch_size = 1
+    total_size = 0
+    total_acc = 0.0
+    for data in itertools.islice(iter(cifar10_test), 10000):
+        images, labels = data
+        # Feed the flattened image to the first input, run, and read the first output.
+        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
+        model.run()
+        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
+        outputs = paddle.to_tensor(outputs)
+        outputs = paddle.reshape(outputs, (1, 10))
+        labels = paddle.to_tensor(labels)
+        labels = paddle.reshape(labels, (1,1))
+        acc = paddle.metric.accuracy(outputs, labels)
+        total_acc += acc
+        total_size += batch_size
+    print("test acc: {}".format(total_acc.numpy() / total_size))
+
+
+if __name__ == "__main__":
+    run_cifar_train_and_infer()
--- a/examples/python/paddle_inception.py
+++ b/examples/python/paddle_inception.py
@ -0,0 +1,80 @@
+import paddle
+import paddle.vision.transforms as T
+from paddle.vision.datasets import Cifar10
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+import itertools
+
+def run_cifar_train_and_infer():
+    """Train a Paddle InceptionV3 on Cifar10, export it to ONNX, and evaluate it with InfiniTensor.
+
+    Prints the accuracy measured over up to 10000 test images, fed one at a time.
+    """
+    
+    paddle.device.set_device("gpu")
+
+    transform = T.Compose(
+        [
+            T.Resize(224),
+            T.ToTensor(),
+            T.Normalize(
+                mean=[0.5, 0.5, 0.5],
+                std=[0.5, 0.5, 0.5],
+                to_rgb=True,
+            ),
+        ]
+    )
+    
+    # Download the dataset and initialize the DataSet
+    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
+    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
+
+    # Build the model and initialize the network
+    inception = paddle.vision.models.InceptionV3(num_classes=10)
+    model = paddle.Model(inception)
+
+    # Configure training: prepare the optimizer, loss function and evaluation metric
+    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()), 
+                paddle.nn.CrossEntropyLoss(),
+                paddle.metric.Accuracy())
+
+    # Train the model
+    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
+    # Evaluate the model
+    model.evaluate(test_dataset, batch_size=64, verbose=1)
+
+    # export to ONNX
+    save_path = 'onnx.save/inception' # path prefix to save to
+    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the model's input shape and dtype; Tensor or InputSpec are supported, and InputSpec supports dynamic shapes.
+    paddle.onnx.export(inception, save_path, input_spec=[x_spec], opset_version=11)
+
+    # Load the ONNX model and hand it to InfiniTensor
+    model_path = save_path + ".onnx"
+    onnx_model = onnx.load(model_path)
+    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
+    # From here on `model` refers to the InfiniTensor stub, not the Paddle model.
+    model = gofusion_model
+    model.init()
+
+    # Run inference
+    cifar10_test = Cifar10(
+        mode="test",
+        transform=transform,  # apply transform to every image
+        backend="cv2",  # use OpenCV as image transform backend
+    )
+    batch_size = 1
+    total_size = 0
+    total_acc = 0.0
+    for data in itertools.islice(iter(cifar10_test), 10000):
+        images, labels = data
+        # Feed the flattened image to the first input, run, and read the first output.
+        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
+        model.run()
+        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
+        outputs = paddle.to_tensor(outputs)
+        outputs = paddle.reshape(outputs, (1, 10))
+        labels = paddle.to_tensor(labels)
+        labels = paddle.reshape(labels, (1,1))
+        acc = paddle.metric.accuracy(outputs, labels)
+        total_acc += acc
+        total_size += batch_size
+    print("test acc: {}".format(total_acc.numpy() / total_size))
+
+
+
+if __name__ == "__main__":
+    run_cifar_train_and_infer() 
--- a/examples/python/paddle_model_dev.md
+++ b/examples/python/paddle_model_dev.md
@ -0,0 +1,31 @@
+## Description
+
+This document explains how to run the `paddle*.py` examples on your machine. If you run the models on hardware other than NVIDIA GPUs, you may need to make some changes (see the last section).
+
+## What do we do in paddle*.py files?
+
+1. Train and evaluate the model on the Cifar10 dataset
+
+2. Export paddle model to onnx model
+
+3. Load onnx model, infer with InfiniTensor and calculate the inference accuracy
+
+## Command
+
+1. Go to `/examples/python` folder 
+
+2. Run the following command
+   
+   1. ```
+      python paddle_resnet.py
+      python paddle_densenet.py
+      python paddle_inception.py
+      ```
+
+## What should I do if I use another device (MLU, XPU, NPU)?
+
+You need to change this code:
+
+```
+paddle.device.set_device("gpu") # Change gpu to mlu, xpu or npu
+```
--- a/examples/python/paddle_resnet.py
+++ b/examples/python/paddle_resnet.py
@ -0,0 +1,81 @@
+
+import paddle
+import paddle.vision.transforms as T
+from paddle.vision.datasets import Cifar10
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+import itertools
+from paddle.vision.models.resnet import BasicBlock
+
+def run_cifar_train_and_infer():
+    """Train a Paddle ResNet-18 on Cifar10, export it to ONNX, and evaluate it with InfiniTensor.
+
+    Prints the accuracy measured over up to 10000 test images, fed one at a time.
+    """
+    
+    paddle.device.set_device("gpu")
+
+    transform = T.Compose(
+        [
+            T.Resize(224),
+            T.ToTensor(),
+            T.Normalize(
+                mean=[0.5, 0.5, 0.5],
+                std=[0.5, 0.5, 0.5],
+                to_rgb=True,
+            ),
+        ]
+    )
+    
+    # Download the dataset and initialize the DataSet
+    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
+    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
+
+    # Build the model and initialize the network
+    resnet = paddle.vision.models.ResNet(BasicBlock, depth=18, num_classes=10)
+    model = paddle.Model(resnet)
+
+    # Configure training: prepare the optimizer, loss function and evaluation metric
+    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()), 
+                paddle.nn.CrossEntropyLoss(),
+                paddle.metric.Accuracy())
+
+    # Train the model
+    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
+    # Evaluate the model
+    model.evaluate(test_dataset, batch_size=64, verbose=1)
+
+    # export to ONNX
+    save_path = 'onnx.save/resnet' # path prefix to save to
+    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # specify the model's input shape and dtype; Tensor or InputSpec are supported, and InputSpec supports dynamic shapes.
+    paddle.onnx.export(resnet, save_path, input_spec=[x_spec], opset_version=11)
+
+    # Load the ONNX model and hand it to InfiniTensor
+    model_path = save_path + ".onnx"
+    onnx_model = onnx.load(model_path)
+    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
+    # From here on `model` refers to the InfiniTensor stub, not the Paddle model.
+    model = gofusion_model
+    model.init()
+
+    # Run inference
+    cifar10_test = Cifar10(
+        mode="test",
+        transform=transform,  # apply transform to every image
+        backend="cv2",  # use OpenCV as image transform backend
+    )
+    batch_size = 1
+    total_size = 0
+    total_acc = 0.0
+    for data in itertools.islice(iter(cifar10_test), 10000):
+        images, labels = data
+        # Feed the flattened image to the first input, run, and read the first output.
+        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
+        model.run()
+        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
+        outputs = paddle.to_tensor(outputs)
+        outputs = paddle.reshape(outputs, (1, 10))
+        labels = paddle.to_tensor(labels)
+        labels = paddle.reshape(labels, (1,1))
+        acc = paddle.metric.accuracy(outputs, labels)
+        total_acc += acc
+        total_size += batch_size
+    print("test acc: {}".format(total_acc.numpy() / total_size))
+
+
+if __name__ == "__main__":
+    run_cifar_train_and_infer()
--- a/include/bang/bang_common.h
+++ b/include/bang/bang_common.h
@ -2,6 +2,10 @@
 #include "cnnl.h"
 #include "cnrt.h"
 #include "core/common.h"
+#include "core/data_type.h"
+#ifdef INFINI_USE_CNCL
+#include "cncl.h"
+#endif

 #define checkBangError(call)                                                   \
    {                                                                          \
@ -27,4 +31,70 @@ namespace infini {

 using BangPtr = void *;

+// Map an InfiniTensor DataType to the matching cnnlDataType_t.
+// Handled: Float32, Float16, Double, Int8, Int32, UInt8, BFloat16, Int64, Bool.
+// Any other type falls through to IT_TODO_HALT_MSG with the type's name.
+inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
+    if (dataType == DataType::Float32) {
+        return CNNL_DTYPE_FLOAT;
+    }
+    if (dataType == DataType::Float16) {
+        return CNNL_DTYPE_HALF;
+    }
+    if (dataType == DataType::Double) {
+        return CNNL_DTYPE_DOUBLE;
+    }
+    if (dataType == DataType::Int8) {
+        return CNNL_DTYPE_INT8;
+    }
+    if (dataType == DataType::Int32) {
+        return CNNL_DTYPE_INT32;
+    }
+    if (dataType == DataType::UInt8) {
+        return CNNL_DTYPE_UINT8;
+    }
+    if (dataType == DataType::BFloat16) {
+        return CNNL_DTYPE_BFLOAT16;
+    }
+    if (dataType == DataType::Int64) {
+        return CNNL_DTYPE_INT64;
+    }
+    if (dataType == DataType::Bool) {
+        return CNNL_DTYPE_BOOL;
+    }
+    // No return after this point: reached only for unsupported types.
+    IT_TODO_HALT_MSG("Data type " + dataType.toString() +
+                     " not supported in CNNL.");
+}
+
+#ifdef INFINI_USE_CNCL
+// Map an InfiniTensor DataType to the matching cnclDataType_t.
+// Handled: Float32, Float16, Int8, Int16, Int32, UInt8, UInt16, UInt32,
+// BFloat16. Any other type falls through to IT_TODO_HALT_MSG with the
+// type's name.
+inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
+    if (dataType == DataType::Float32) {
+        return cnclFloat32;
+    }
+    if (dataType == DataType::Float16) {
+        return cnclHalf;
+    }
+    if (dataType == DataType::Int8) {
+        return cnclInt8;
+    }
+    if (dataType == DataType::Int16) {
+        return cnclInt16;
+    }
+    if (dataType == DataType::Int32) {
+        return cnclInt32;
+    }
+    if (dataType == DataType::UInt8) {
+        return cnclUint8;
+    }
+    if (dataType == DataType::UInt16) {
+        return cnclUint16;
+    }
+    if (dataType == DataType::UInt32) {
+        return cnclUint32;
+    }
+    if (dataType == DataType::BFloat16) {
+        return cnclBfloat16;
+    }
+    // No return after this point: reached only for unsupported types.
+    IT_TODO_HALT_MSG("Data type " + dataType.toString() +
+                     " not supported in CNCL.");
+}
+#endif
+
 } // namespace infini
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@ -7,16 +7,19 @@ namespace infini {
 class BangRuntimeObj : public RuntimeObj {
  private:
    cnnlHandle_t cnnl;
+    cnrtQueue_t queue;
+    std::unique_ptr<CommunicatorObj> comm;
    BangPtr workspace;
    size_t workspaceSize;
+    mutable size_t cursor;

  public:
-    BangRuntimeObj() : RuntimeObj(Device::BANG) {
+    explicit BangRuntimeObj(int deviceId = 0)
+        : RuntimeObj(Device::BANG, deviceId) {
        cnInit(0);
        CNdev dev;
-        cnDeviceGet(&dev, 0);
+        cnDeviceGet(&dev, deviceId);
        checkBangError(cnrtSetDevice(dev));
-        cnrtQueue_t queue;
        checkBangError(cnrtQueueCreate(&queue));

        checkCnnlError(cnnlCreate(&cnnl));
@ -24,10 +27,12 @@ class BangRuntimeObj : public RuntimeObj {
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
+        cursor = 0;
        workspace = alloc(workspaceSize);
    }
    virtual ~BangRuntimeObj() {
        dealloc(workspace);
+        checkBangError(cnrtQueueDestroy(queue));
        checkCnnlError(cnnlDestroy(cnnl));
    }
    string toString() const override;
@ -45,10 +50,15 @@ class BangRuntimeObj : public RuntimeObj {
    void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
    cnnlHandle_t cnnlHandle() const { return cnnl; }
    BangPtr getWorkspace(size_t size) const {
-        IT_ASSERT(size <= workspaceSize);
-        return workspace;
+        // Hands out the next `size` bytes of the workspace and advances the
+        // cursor; resetWorkspace() makes the whole buffer available again.
+        // The bound is checked by subtraction so a huge `size` cannot wrap
+        // around (cursor never exceeds workspaceSize).
+        IT_ASSERT(size <= workspaceSize - cursor);
+        const size_t offset = cursor;
+        cursor += size;
+        // Arithmetic on `void *` is a compiler extension, so step through a
+        // byte pointer instead.
+        return static_cast<char *>(workspace) + offset;
    }

+    void resetWorkspace() const { cursor = 0; }
+
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
@ -66,10 +76,9 @@ class BangRuntimeObj : public RuntimeObj {
        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                                  CNRT_MEM_TRANS_DIR_PEER2PEER));
    }
-
-    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
-
-    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
+    void initComm(const string &name, int worldSize, int rank) final;
+    CommunicatorObj &getCommunicator() const override { return *comm; }
+    cnrtQueue_t getBangQueue() const { return queue; }

  private:
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
--- a/include/bang/cncl_communicator.h
+++ b/include/bang/cncl_communicator.h
@ -0,0 +1,79 @@
+#pragma once
+#include "bang_common.h"
+#include "core/communicator.h"
+#include <chrono>
+#include <cncl.h>
+#include <cnrt.h>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <mutex>
+#include <thread>
+
+namespace infini {
+
+// Communicator built on CNCL. Rank 0 obtains the clique id and publishes it
+// through the file "./<name>_cncl_id.bin"; every other rank polls for that
+// file (10 s limit) and reads the id from it before the communicator is
+// initialised.
+class CnclCommunicatorObj final : public CommunicatorObj {
+  private:
+    // Heap array holding the single communicator created by this object.
+    cnclComm_t *comms;
+
+  public:
+    // name      : prefix of the rendezvous file
+    // worldSize : number of ranks passed to cnclInitComms
+    // rank      : rank of this process; also selects the device
+    //             (rank % device count)
+    CnclCommunicatorObj(const string &name, int worldSize, int rank)
+        : CommunicatorObj(worldSize, rank) {
+        const std::string filePath("./" + name + "_cncl_id.bin");
+        cnclCliqueId clique_id;
+        if (rank == 0) {
+            CNCL_CHECK(cnclGetCliqueId(&clique_id));
+            // Write into a scratch file first and rename it afterwards:
+            // the other ranks only test for the existence of filePath, so
+            // it must never be visible while still empty or half written.
+            const std::string tmpPath(filePath + ".tmp");
+            {
+                std::ofstream ofs(tmpPath, std::ios::binary);
+                ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
+                ofs.flush();
+                _IT_ASSERT_2(!ofs.fail(), "failed to write CNCL clique id.");
+            }
+            std::filesystem::rename(tmpPath, filePath);
+        } else {
+            auto begin = std::chrono::steady_clock::now();
+            while (!std::filesystem::exists(filePath)) {
+                auto now = std::chrono::steady_clock::now();
+                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
+                             "time limit (10s) exceeded.");
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            }
+            std::ifstream ifs(filePath, std::ios::binary);
+            ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
+            _IT_ASSERT_2(!ifs.fail(), "failed to read CNCL clique id.");
+        }
+
+        // One communicator per process; the scratch lists live on the stack
+        // so nothing leaks if a check below fails.
+        constexpr int num_comms = 1;
+        int dev_list[num_comms];
+        int rank_list[num_comms];
+        uint32_t num_dev = 0;
+        checkBangError(cnrtGetDeviceCount(&num_dev));
+        // Guards the modulo below against a zero device count.
+        _IT_ASSERT_2(num_dev > 0, "no device available for CNCL.");
+
+        for (int i = 0; i < num_comms; i++) {
+            rank_list[i] = rank;
+            dev_list[i] = rank_list[i] % num_dev;
+        }
+
+        comms = new cnclComm_t[num_comms];
+        CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
+                                 worldSize, &clique_id));
+
+        // The id file is no longer needed once initialisation has returned.
+        if (rank == 0) {
+            std::filesystem::remove(filePath);
+        }
+    }
+
+    ~CnclCommunicatorObj() {
+        CNCL_CHECK(cnclDestroyComms(comms, 1));
+        delete[] comms;
+    }
+
+    // Get the actual cnclComm_t
+    cnclComm_t getCnclComm() { return comms[0]; }
+
+    virtual string toString() const final {
+        std::ostringstream oss;
+        oss << "CNCL communicator";
+        return oss.str();
+    }
+};
+
+} // namespace infini
--- a/include/core/common.h
+++ b/include/core/common.h
@ -61,21 +61,35 @@ template <typename T> auto enum_to_underlying(T e) {
 }

 template <typename T> std::string vecToString(const std::vector<T> &vec) {
-    std::string ret;
-    ret.append("[");
-    for (auto d : vec) {
-        ret.append(std::to_string(d));
-        ret.append(",");
+    std::stringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < vec.size(); ++i) {
+        ss << vec.at(i);
+        if (i < vec.size() - 1) {
+            ss << ",";
+        }
    }
-    if (!vec.empty())
-        ret.pop_back();
-    ret.append("]");
-    return ret;
+    ss << "]";
+    return ss.str();
+}
+
+// Formats the `length` elements starting at `st` as "[e0,e1,...]".
+// An empty range yields "[]".
+template <typename T> std::string vecToString(const T *st, size_t length) {
+    std::stringstream out;
+    out << "[";
+    for (size_t idx = 0; idx < length; ++idx) {
+        // The separator precedes every element except the first one.
+        if (idx != 0) {
+            out << ",";
+        }
+        out << st[idx];
+    }
+    out << "]";
+    return out.str();
 }

 double timeit(
    const std::function<void()> &func,
-    const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
-    int timingRounds = 200);
+    const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
+    int timingRounds = 10);

 } // namespace infini
--- a/include/core/graph.h
+++ b/include/core/graph.h
@ -53,6 +53,7 @@ class GraphObj : public Object {
    const TensorVec &getTensors() const { return tensors; }
    const OpVec &getOperators() const { return ops; }
    OpVec getComputeOps() const;
+    Tensor getTensor(int) const;

    /**
     * Sort the nodes in topological order.
@ -64,7 +65,13 @@ class GraphObj : public Object {

    void optimize();

-    void dataMalloc(bool useNaiveAllocator = false);
+    void shape_infer();
+
+    void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
+
+    Tensor cloneKV(Tensor &tensor);
+
+    void freeHeap();

    /**
     * @brief Add an operator and create its outputs. Output tensor arguments
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@ -5,6 +5,10 @@
 #include <cstdint>
 #include <iostream>

+#ifdef USE_CUDA
+#include "cuda/cuda_runtime.h"
+#endif
+
 namespace infini {

 class GraphHandlerObj {
@ -26,10 +30,14 @@ class GraphHandlerObj {
                            int pw, int sh, int sw, int dh, int dw, int oph,
                            int opw);
    Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
-                  Tensor bias, ActType act);
+                  Tensor bias, ActType act,
+                  std::string matmul_compute_type = "default");
    Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
                              Tensor var, Tensor scale, Tensor bias,
                              float momentum, float eps, bool training);
+    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
+                              Tensor bias, float eps, int axis, int stash_type);
+    Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);

    Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
                   int ph, int pw, int sh, int sw, int ceilMode);
@ -45,6 +53,7 @@ class GraphHandlerObj {
    Tensor max(Tensor a, Tensor b, Tensor c);

    Tensor relu(Tensor x, Tensor y);
+    Tensor silu(Tensor x, Tensor y);
    Tensor gelu(Tensor x, Tensor y);
    Tensor sigmoid(Tensor x, Tensor y);
    Tensor hardSigmoid(Tensor x, Tensor y);
@ -63,12 +72,27 @@ class GraphHandlerObj {
                std::optional<float> max);
    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
    Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
+    Tensor resize(Tensor input, Tensor output,
+                  const std::optional<vector<int>> &axes, Tensor sizes,
+                  Tensor scales, Tensor roi, vector<uint32_t> sizes_,
+                  vector<float> scales_, vector<float> roi_, string mode,
+                  string ratioPolicy, string nearestMode,
+                  string coordTransMode);
+    Tensor squeeze(Tensor input, Tensor output, Shape axes);
+    Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
    Tensor concat(TensorVec inputs, Tensor output, int dim);
+    Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
+                            Tensor input_q, Tensor input_k, Tensor input_v,
+                            Tensor position_id, Tensor output_matmul);
+    Tensor RoPE(Tensor pos, Tensor input, Tensor output);
    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
-                    int num_outputs);
+                    std::variant<int, vector<int>> numOrRatio);
    Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
+    Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
    Tensor reduceMean(Tensor data, Tensor reduced,
                      const optional<vector<int>> &axes, bool keepdims);
+    Tensor reduceSum(Tensor data, Tensor reduced,
+                     const optional<vector<int>> &axes, bool keepdims);
    Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
                 const vector<int> &ends, const optional<vector<int>> &axes,
                 const optional<vector<int>> &steps);
@ -77,6 +101,7 @@ class GraphHandlerObj {
    Tensor cast(Tensor input, Tensor output, int to);
    Tensor expand(Tensor input, Tensor output, Shape dims);
    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
+    std::vector<int> getDims(Tensor x) { return x->getDims(); }

    Tensor allReduceSum(Tensor input, Tensor output);
    Tensor allReduceProd(Tensor input, Tensor output);
@ -85,6 +110,13 @@ class GraphHandlerObj {
    Tensor allReduceAvg(Tensor input, Tensor output);
    TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
    Tensor broadcast(Tensor input, Tensor output, int root);
+    Tensor send(Tensor input, int source, int destination, Tensor output);
+    Tensor recv(Tensor output, int source, int destination, Shape dims,
+                int outputType, Tensor input);
+    Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
+                        std::string mode);
+    Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
+               int size);

    //------ modifiers

@ -92,15 +124,31 @@ class GraphHandlerObj {

    inline void optimize() { g->optimize(); }

+    inline void shape_infer() { g->shape_infer(); }
+
+    void change_shape(const vector<int> &shape, int tensorId);
    //------ runtime

-    inline void data_malloc() { g->dataMalloc(); }
+    inline void data_malloc(bool useNaiveAllocator = false,
+                            size_t memPoolSize = 0) {
+        g->dataMalloc(useNaiveAllocator, memPoolSize);
+    }
+
+    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
+
+    inline void free_heap() { g->freeHeap(); }

    inline void tune() { g->getRuntime()->run(g, true); }

    inline void run() { g->getRuntime()->run(g); }

    inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
+
+#ifdef USE_CUDA
+    inline void run_with_cudagraph() {
+        (as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
+    }
+#endif
 };

 } // namespace infini
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@ -2,10 +2,11 @@
 #include "core/common.h"
 #include "core/operator.h"
 #include "core/tensor.h"
+#include "utils/operator_utils.h"
 #include <functional>
 #include <nlohmann/json.hpp>
-using json = nlohmann::json;
 namespace infini {
+using json = nlohmann::json;

 class RuntimeObj; // Forward declaration for Kernel::compute

@ -29,7 +30,6 @@ class Kernel {
  public:
    Kernel() {}
    virtual ~Kernel() {}
-
    /**
     * @param op The operator to be executed.
     * @param record The parameters for kernel execution. If extra parameters
@ -102,11 +102,9 @@ class KernelRegistry {
    }
    Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
        auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(),
-                  "Kernel not found for key {" +
-                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
-                      ", " + std::to_string(std::get<1>(kernelAttrs)) + ", " +
-                      std::get<2>(kernelAttrs).toString() + "}");
+        IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
+                                           get_kernel_attrs_str(kernelAttrs) +
+                                           "}");
        return std::get<0>(it->second);
    }
    const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@ -131,15 +129,16 @@ class CpuKernelWithoutConfig : public Kernel {

 } // namespace infini

-#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt)        \
+#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt)                  \
    namespace infini {                                                         \
    static const bool _CAT(_register_kernel_, cnt) =                           \
-        KernelRegistry::getInstance().registerKernel(                          \
-            KernelAttrs{device, opType, dataType}, new kernel(), name);        \
+        KernelRegistry::getInstance().registerKernel(KernelAttrs{device,       \
+                                                                 opType},      \
+                                                     new kernel(), name);      \
    }

-#define REGISTER_KERNEL(device, opType, dataType, kernel, name)                \
-    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
+#define REGISTER_KERNEL(device, opType, kernel, name)                          \
+    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)

 #define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt)                        \
    namespace infini {                                                         \
--- a/include/core/lazy_allocator.h
+++ b/include/core/lazy_allocator.h
@ -26,14 +26,23 @@ class LazyAllocator {

    size_t weightPeak = 0;

+    size_t heapPeak = 0;
+
    size_t alignment;

+    bool hasMemPool = false;
+
+    size_t memPoolSize = 0;
+
    // pointer to the memory actually allocated
    void *ptr = nullptr;

    // pointer to the weight memory space
    void *weightPtr = nullptr;

+    // memory pool ptr
+    void *memPoolPtr = nullptr;
+
    // // a cache designed for a batch size that has already occurred
    // std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
    // batchsizeToTensorOffset;
@ -68,6 +77,10 @@ class LazyAllocator {

    void init();

+    void setMemPool(size_t memPoolSize);
+
+    bool getMemPoolStatus();
+
    // function: simulate memory allocation
    // arguments：
    //     size: size of memory block to be allocated
@ -76,6 +89,10 @@ class LazyAllocator {

    size_t allocWeight(size_t size);

+    size_t heapAlloc(size_t size);
+
+    void freeHeap();
+
    // function: simulate memory free
    // arguments:
    //     addr: head address offset of memory block to be free
@ -92,6 +109,8 @@ class LazyAllocator {

    void *getWeightPtr();

+    void *getHeapPtr();
+
    void info();

  private:
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@ -21,10 +21,11 @@ struct OpType {
        Add,                // Binary
        And,                // Binary
        ArgMax,             //
-        Asin,               // Binary
-        Asinh,              // Binary
-        Atan,               // Binary
-        Atanh,              // Binary
+        Asin,               // Unary
+        Asinh,              // Unary
+        Atan,               // Unary
+        Atanh,              // Unary
+        AttentionKVCache,   // Fusion
        AveragePool,        // Pool
        BatchNormalization, //
        Bernoulli,          //
@ -150,11 +151,14 @@ struct OpType {
        ReduceSum,       // Reduce
        ReduceSumSquare, // Reduce
        Relu,            // Unary
+        Silu,            // Unary
        Reshape,
        Resize,
        ReverseSequence,
        RoiAlign,
-        Round, // Unary
+        RoPE,    // Fusion
+        Round,   // Unary
+        RMSNorm, // Fusion
        STFT,
        Scan,
        Scatter,
@ -231,6 +235,8 @@ struct OpType {
        AllReduceAvg,
        AllGather,
        Broadcast,
+        Send,
+        Recv,
    } type;

    constexpr OpType(decltype(type) t) : type(t) {}
--- a/include/core/operator.h
+++ b/include/core/operator.h
@ -4,7 +4,7 @@
 #include "core/tensor.h"

 namespace infini {
-using KernelAttrs = std::tuple<Device, OpType::underlying_t, DataType>;
+using KernelAttrs = std::tuple<Device, OpType::underlying_t>;

 struct OpPerfKey {
    HashType hash;
@ -55,8 +55,7 @@ class OperatorObj : public Object {

  public:
    OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
-    virtual optional<vector<Shape>>
-    inferShape(const TensorVec &inputs) const = 0;
+    virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
    virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
    /**
     * @brief Constructs outputs (if requried) and check whether the operator is
@ -91,6 +90,7 @@ class OperatorObj : public Object {
    OpType getOpType() const { return type; }
    // HACK: set correct data type
    DataType getDType() const { return getInputs(0)->getDType(); }
+    DataType getOutDType() const { return getOutput()->getDType(); }
    virtual int numInputs() const = 0;
    virtual int numOutputs() const = 0;

@ -105,7 +105,7 @@ class OperatorObj : public Object {
                           const TensorVec &newOutputs) const = 0;

  protected:
-    optional<vector<Shape>> inferShape() const;
+    optional<vector<Shape>> inferShape();
    vector<DataType> inferDataType() const;

  private:
--- a/include/core/perf_engine.h
+++ b/include/core/perf_engine.h
@ -2,8 +2,8 @@
 #include "core/graph.h"
 #include "core/kernel.h"
 #include <nlohmann/json_fwd.hpp>
-using json = nlohmann::json;
 namespace infini {
+using json = nlohmann::json;

 class PerfEngine {
  public:
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@ -15,6 +15,7 @@ class GraphObj;
 class GraphHandlerObj;
 class RuntimeObj;
 class BlobObj;
+template <typename T> class WorkspaceObj;

 using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
@ -23,6 +24,7 @@ using Graph = Ref<GraphObj>;
 using GraphHandler = Ref<GraphHandlerObj>;
 using Runtime = Ref<RuntimeObj>;
 using Blob = Ref<BlobObj>;
+template <typename T> using Workspace = Ref<WorkspaceObj<T>>;

 using TensorVec = vector<Tensor>;
 using OpVec = vector<Operator>;
@ -30,7 +32,7 @@ using OpLists = list<Operator>;

 using VType = uint32_t;

-enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
+enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
 /***************** Forward declaration end *****************/

 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
@ -72,6 +74,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
    }
    bool isCuda() const { return device == Device::CUDA; }
    bool isBang() const { return device == Device::BANG; }
+    bool isKUNLUN() const { return device == Device::KUNLUN; }
    void copyBlob(const TensorObj *dst, const TensorObj *src) const;
    // TODO: unify these copy APIs
    virtual void copyBlobFromCPU(void *dst, const void *src,
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -4,11 +4,14 @@
 #include "utils/data_convert.h"
 #include <cmath>
 #include <cstring>
+#include <fstream>

 #if USE_CUDA
 #include "cuda/cuda_runtime.h"
 #endif
-
+#if USE_BANG
+#include "bang/bang_runtime.h"
+#endif
 namespace infini {

 // TODO: how to deal with this
@ -31,6 +34,7 @@ class TensorObj : public TensorBaseObj {
    size_t getBytes() const { return _size * dtype.getSize(); }

    Shape getDims() const { return shape; }
+    void setShape(Shape shape_);
    size_t getRank() const { return shape.size(); }
    Shape getStride() const;
    size_t getOffset(const vector<int> &ds) const;
@ -41,8 +45,16 @@ class TensorObj : public TensorBaseObj {
    bool isOutput() const { return tensorType == TensorType::output; }
    bool isOthers() const { return tensorType == TensorType::others; }
    void setWeight() { tensorType = TensorType::weight; }
-    void setInput() { tensorType = TensorType::input; }
-    void setOutput() { tensorType = TensorType::output; }
+    void setInput() {
+        if (!this->isWeight()) {
+            tensorType = TensorType::input;
+        }
+    }
+    void setOutput() {
+        if (!this->isWeight()) {
+            tensorType = TensorType::output;
+        }
+    }
    string tensorTypeToString() const {
        switch (tensorType) {
        case TensorType::weight:
@ -132,6 +144,7 @@ class TensorObj : public TensorBaseObj {
    }

    void printData() const;
+    void dumpData(std::ofstream &ofs) const;
    bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;

    template <typename T> bool equalData(const vector<T> &dataVector) {
@ -180,19 +193,27 @@ class TensorObj : public TensorBaseObj {
    }

    template <typename T>
-    bool equalDataImpl(const T *a, const T *b, size_t size) const {
+    bool equalDataImpl(const T *a, const T *b, size_t size,
+                       double relativeError = 1e-6) const {
        for (size_t i = 0; i < size; ++i) {
            if constexpr (std::is_integral_v<T>) {
                if (a[i] != b[i])
                    return false;
            } else if constexpr (std::is_floating_point_v<T>) {
-                if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
-                    1e-6) {
+                if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
+                    fabs(a[i] - b[i]) > relativeError) {
+                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
+                    return false;
+                } else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
+                           fabs(a[i] - b[i]) /
+                                   std::max(fabs(a[i]), fabs(b[i])) >
+                               relativeError) {
                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                    return false;
                }
-            } else
+            } else {
                static_assert(!sizeof(T), "Unsupported data type");
+            }
        }
        return true;
    }
@ -227,8 +248,8 @@ class TensorObj : public TensorBaseObj {
    //         // std::cerr << "Init beginned " << std::endl;
    // #pragma omp parallel for
    //         for (size_t i = 0; i < iEnd; ++i)
-    //             data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
-    //             10000;
+    //             data[i] = fastrand(random_seed[omp_get_thread_num() *
+    //             16]) % 10000;
    //         // std::cerr << "Init finished" << std::endl;
    //         computed = ComputedFull;
    //         return true;
@ -273,8 +294,8 @@ class TensorObj : public TensorBaseObj {
    //         auto nDim = dims.size();
    //         auto nBroadcastDim = ds.size() - nDim;
    //         for (size_t i = 0; i < nDim; ++i)
-    //             if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
-    //             dims[i])
+    //             if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
+    //             i] >= dims[i])
    //                 return (size_t)-1;
    //         size_t idx = 0;
    //         for (size_t i = 0; i < nDim; ++i)
@ -333,12 +354,14 @@ class TensorObj : public TensorBaseObj {
    //         return (g_seed >> 16) & 0x7FFF;
    //     }

-    //     std::vector<std::vector<int>> const *getSplittingPoints() const {
+    //     std::vector<std::vector<int>> const *getSplittingPoints()
+    //     const {
    //         assert(!splittingPoints.empty());
    //         return &splittingPoints;
    //     }

-    //     bool setSplittingPoints(std::vector<std::vector<int>> value) {
+    //     bool setSplittingPoints(std::vector<std::vector<int>> value)
+    //     {
    //         assert(!value.empty());
    //         splittingPoints = value;
    //         return true;
--- a/include/core/workspace.h
+++ b/include/core/workspace.h
@ -0,0 +1,42 @@
+#pragma once
+#include "core/runtime.h"
+
+namespace infini {
+
+// Hands out consecutive slices of a workspace buffer. T is the pointer type
+// of the buffer; the buffer itself is released by the runtime, not here.
+template <class T> class WorkspaceObj {
+  private:
+    T workspace;           // start of the workspace buffer
+    size_t workspaceSize;  // capacity of the buffer
+    size_t workspaceAlloc; // amount handed out since the last reset
+
+  public:
+    WorkspaceObj(T workspace_, size_t workspaceSize_)
+        : workspace(workspace_), workspaceSize(workspaceSize_),
+          workspaceAlloc(0) {}
+
+    virtual ~WorkspaceObj() {
+        // The runtime deallocates the buffer; only drop the pointer here.
+        workspace = nullptr;
+    }
+
+    size_t getWorkspaceSize() const { return workspaceSize; }
+
+    // Returns the first unused byte of the buffer and marks the following
+    // `size` bytes as used.
+    T getWorkspace(size_t size) {
+        IT_ASSERT(size + workspaceAlloc <= workspaceSize);
+        uint8_t *const base = static_cast<uint8_t *>(workspace);
+        auto slice = (T)(base + workspaceAlloc);
+        workspaceAlloc += size;
+        return slice;
+    }
+
+    // Returns the start of the buffer, e.g. so the runtime can free it.
+    T getWorkspace() { return workspace; }
+
+    // Marks the whole buffer as unused again (called after a kernel ends).
+    void resetWorkspace() { workspaceAlloc = 0; }
+
+    size_t getWorkspaceAlloc() const { return workspaceAlloc; }
+};
+
+} // namespace infini
--- a/include/cuda/cuda_attention_kvcache.h
+++ b/include/cuda/cuda_attention_kvcache.h
@ -0,0 +1,17 @@
+#pragma once
+#include "core/common.h"
+#include <cstdio>
+
+struct AttentionKVCacheMetadata {
+    int dimSize[4];
+    int stride[4];
+};
+
+namespace infini {
+void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
+                              float *input_q, float *input_k, float *input_v,
+                              int *position_id, float *output_matmul,
+                              const AttentionKVCacheMetadata &compMeta,
+                              float *output_O_temp, float *output_sum_temp);
+
+} // namespace infini
--- a/include/cuda/cuda_common.h
+++ b/include/cuda/cuda_common.h
@ -5,6 +5,7 @@
 #include <cuda_profiler_api.h>
 #include <cudnn.h>
 #include <curand.h>
+#include <memory>

 #define checkCudaError(call)                                                   \
    if (auto err = call; err != cudaSuccess)                                   \
@ -111,4 +112,20 @@ inline const char *curandGetErrorString(curandStatus_t error) {

 using CudaPtr = void *;

+// Holder for one static cudaStream_t shared through static accessors.
+// The class cannot be instantiated, copied or moved.
+class CUDAStream {
+  public:
+    CUDAStream(const CUDAStream &) = delete;
+    CUDAStream(CUDAStream &&) = delete;
+    void operator=(const CUDAStream &) = delete;
+    void operator=(CUDAStream &&) = delete;
+
+    // Returns the stream handle currently stored.
+    static cudaStream_t getCurrentStream() { return _stream; }
+    // Stores handle 0 as the current stream.
+    static void Init() { CUDAStream::_stream = 0; }
+    // Creates a new stream and stores its handle.
+    static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
+    // Destroys the stored stream, then falls back to handle 0 so that
+    // getCurrentStream() does not keep returning a destroyed handle.
+    static void destroyStream() {
+        checkCudaError(cudaStreamDestroy(_stream));
+        _stream = 0;
+    }
+
+  private:
+    CUDAStream() {}
+    static cudaStream_t _stream;
+};
+
 } // namespace infini
--- a/include/cuda/cuda_element_wise.h
+++ b/include/cuda/cuda_element_wise.h
@ -1,8 +1,20 @@
 #pragma once

 namespace infini {
-void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
-void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                int c2, int c3);
+void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
+                 int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
+                 int c2, int c3);
+
+void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
+
+void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
 }; // namespace infini
--- a/include/cuda/cuda_expand.h
+++ b/include/cuda/cuda_expand.h
@ -3,7 +3,10 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
-void expandKernel(float *input, float *output, int nDims, int outputsize,
-                  SmallArray inputShape, SmallArray outputShape);
+void expandKernel(int dType, void *input, void *output, int nDims,
+                  int outputsize, SmallArray inputShape,
+                  SmallArray outputShape);

+void expandRowKernel(int dType, void *input, void *output, int n_rows,
+                     int row_len);
 }; // namespace infini
--- a/include/cuda/cuda_layernorm.h
+++ b/include/cuda/cuda_layernorm.h
@ -0,0 +1,17 @@
+#pragma once
+#include "operators/unary.h"
+
+namespace infini {
+void LaynormKernel(const float *input, const float *scale, const float eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   float *output, const float *bias, int biasSize);
+void LaynormKernel(const float *input, const float *scale, const float eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   float *output);
+void LaynormKernel(const half *input, const half *scale, const half eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   half *output, const half *bias, int biasSize);
+void LaynormKernel(const half *input, const half *scale, const half eps,
+                   int size, int scaleSize, const int dimsize, const int stride,
+                   half *output);
+}; // namespace infini
--- a/include/cuda/cuda_pad_slice.h
+++ b/include/cuda/cuda_pad_slice.h
@ -10,10 +10,11 @@ typedef struct {
    int wholeNDim[MAX_DIM];  // dim size after padding or before slicing
    int partNDim[MAX_DIM];   // dim size before padding or after slicing
    int partStride[MAX_DIM]; // stride before padding or after slicing
+    int DType;
 } TransMetaData;

 namespace infini {
-void pad_slice_kernel(float *partData, float *wholeData,
+void pad_slice_kernel(void *partData, void *wholeData,
                      const TransMetaData &metadata, int nDims, int num,
                      bool isPad);
 } // namespace infini
--- a/include/cuda/cuda_rmsnorm.h
+++ b/include/cuda/cuda_rmsnorm.h
@ -0,0 +1,10 @@
+#pragma once
+
+#include "operators/rms_norm.h"
+
+namespace infini {
+
+// RMS-norm kernel entry point; declared here, defined elsewhere.
+// The input, weight and output buffers are passed as untyped pointers
+// together with an integer dType code.
+// NOTE(review): dType presumably selects the element type of the buffers,
+// and num_tokens / hidden_size presumably describe a 2-D layout -- confirm
+// against the implementation.
+void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
+                    int num_tokens, int hidden_size);
+
+}; // namespace infini
--- a/include/cuda/cuda_rope.h
+++ b/include/cuda/cuda_rope.h
@ -0,0 +1,12 @@
+#pragma once
+
+#include "operators/rope.h"
+#include "utils/small_array.h"
+
+namespace infini {
+
+// RoPE kernel entry point; declared here, defined elsewhere.
+// `pos` is an int buffer, while input and output are untyped pointers
+// accompanied by an integer dType code.
+// NOTE(review): the units of size / dim_model / dim_head and of the two
+// stride arguments are not visible from this header -- confirm against the
+// implementation.
+void rope_kernel(int dType, int *pos, void *input, void *output, int size,
+                 int dim_model, int dim_head, int hidden_stride,
+                 int pos_stride);
+
+}; // namespace infini
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@ -14,6 +14,9 @@ class CudaRuntimeObj : public RuntimeObj {
    std::unique_ptr<CommunicatorObj> comm;
    CudaPtr workspace;
    size_t workspaceSize;
+    bool isCudaGraphCreated;
+    cudaGraph_t cudaGraph;
+    cudaGraphExec_t cudaGraphInstance;

  public:
    explicit CudaRuntimeObj(int deviceId = 0)
@ -26,9 +29,16 @@ class CudaRuntimeObj : public RuntimeObj {
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
        workspace = alloc(workspaceSize);
+        isCudaGraphCreated = false;
+        CUDAStream::Init();
    }
    virtual ~CudaRuntimeObj() {
        try {
+            if (isCudaGraphCreated) {
+                checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
+                checkCudaError(cudaGraphDestroy(cudaGraph));
+                CUDAStream::destroyStream();
+            }
            dealloc(workspace);
            checkCudnnError(cudnnDestroy(cudnn));
            checkCublasError(cublasDestroy(cublas));
@ -75,6 +85,8 @@ class CudaRuntimeObj : public RuntimeObj {

    void runWithoutSync(const Graph &graph) const;

+    void runWithCudaGraph(const Graph &graph);
+
    // init communicator
    void initComm(const string &name, int worldSize, int rank) final;

--- a/include/cuda/cuda_softmax.h
+++ b/include/cuda/cuda_softmax.h
@ -0,0 +1,8 @@
+#pragma once
+#include "utils/small_array.h"
+namespace infini {
+// Softmax kernel entry points for float and half buffers; declared here,
+// defined elsewhere.
+// NOTE(review): num_blocks presumably is the launch grid size and
+// dimsize / stride presumably describe the reduced axis -- confirm against
+// the implementation.
+void softmax_kernel(int num_blocks, float *input, float *output, int size,
+                    int dimsize, int stride);
+void softmax_kernel(int num_blocks, half *input, half *output, int size,
+                    int dimsize, int stride);
+} // namespace infini
--- a/include/cuda/cuda_split_concat.h
+++ b/include/cuda/cuda_split_concat.h
@ -3,13 +3,13 @@
 #include <cstdio>

 const int BATCH_SIZE = 32; // parallel tensor number.
-const int DIM_MAX_SIZE = 4;
+const int DIM_MAX_SIZE = 8;

 // Concat operator acts like element tensors composing to one big tensor,and
 // split operator acts like one big tensor being composed by element
 // tensors.
-struct ElementTensorMetadata {
-    float *data[BATCH_SIZE];
+template <typename T> struct ElementTensorMetadata {
+    T *data[BATCH_SIZE];
    int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
                             // the composed tensor.
    int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@ -20,16 +20,17 @@ struct ElementTensorMetadata {
                   data[i], dimBgNo[i], dimSize[i], nElements[i]);
    }
 };
-
-struct ComposedTensorMetadata {
+template <typename T> struct ComposedTensorMetadata {
    int dimSize[DIM_MAX_SIZE];
    int stride[DIM_MAX_SIZE];
-    float *data;
+    T *data;
 };

 namespace infini {
-void split_concat_kernel(const ElementTensorMetadata &eleMeta,
-                         const ComposedTensorMetadata &compMeta, int dim,
+void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
+                         const ComposedTensorMetadata<float> &compMeta, int dim,
+                         int batchSize, int nDims, bool isSplit);
+void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
+                         const ComposedTensorMetadata<half> &compMeta, int dim,
                         int batchSize, int nDims, bool isSplit);
-
 } // namespace infini
--- a/include/cuda/cuda_transpose.h
+++ b/include/cuda/cuda_transpose.h
@ -5,7 +5,7 @@

 namespace infini {

-void transpose_kernel(float *input, float *output, int nDims, int size,
+void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
                      SmallArray strides, SmallArray outputShape);

 }; // namespace infini
--- a/include/cuda/cuda_unary.h
+++ b/include/cuda/cuda_unary.h
@ -3,48 +3,22 @@
 #include "operators/unary.h"

 namespace infini {
-void softmax_kernel(float *input, float *output, size_t num);
-void relu_kernel(float *input, float *output, size_t num);
-void sigmoid_kernel(float *input, float *output, size_t num);
-void tanh_kernel(float *input, float *output, size_t num);
-void abs_kernel(float *input, float *output, size_t num);
-void sqrt_kernel(float *input, float *output, size_t num);
-void neg_kernel(float *input, float *output, size_t num);
-void gelu_kernel(float *input, float *output, size_t num);
-void erf_kernel(float *input, float *output, size_t num);
-void hard_sigmoid_kernel(float *input, float *output, size_t num);
-void hard_swish_kernel(float *input, float *output, size_t num);
+template <typename T> void softmax_kernel(T *input, T *output, size_t num);
+template <typename T> void relu_kernel(T *input, T *output, size_t num);
+template <typename T> void silu_kernel(T *input, T *output, size_t num);
+template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
+template <typename T> void tanh_kernel(T *input, T *output, size_t num);
+template <typename T> void abs_kernel(T *input, T *output, size_t num);
+template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
+template <typename T> void neg_kernel(T *input, T *output, size_t num);
+template <typename T> void gelu_kernel(T *input, T *output, size_t num);
+template <typename T> void erf_kernel(T *input, T *output, size_t num);
+template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
+template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);

-void unary_kernel(const Operator &_op) {
-    auto op = as<UnaryObj>(_op);
-    float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
-    float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
+template <typename INPUT, typename OUTPUT>
+void cast_kernel(INPUT *input, OUTPUT *output, size_t num);

-    size_t num = op->getOutput()->size();
-    if (op->getOpType() == OpType::Softmax)
-        softmax_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Relu)
-        relu_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Sigmoid)
-        sigmoid_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::HardSigmoid)
-        hard_sigmoid_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::HardSwish)
-        hard_swish_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Tanh)
-        tanh_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Abs)
-        abs_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Sqrt)
-        sqrt_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Gelu)
-        gelu_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Neg)
-        neg_kernel(inputData, outputData, num);
-    else if (op->getOpType() == OpType::Erf)
-        erf_kernel(inputData, outputData, num);
-    else
-        IT_TODO_HALT();
-}
+void unary_kernel(const Operator &_op);

 }; // namespace infini
--- a/include/cuda/cuda_utility.h
+++ b/include/cuda/cuda_utility.h
@ -1,11 +1,29 @@
+#pragma once
 #include "core/tensor.h"
+#include "cuda/cuda_common.h"

 namespace infini {

 void cudaPrintFloat(float *x, int len);

-void cudaPrintTensor(const Tensor &tensor) {
-    cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
-}
+void cudaPrintTensor(const Tensor &tensor);

-} // namespace infini
+cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
+cudaDataType cublasDataTypeConvert(DataType);
+
+// Compile-time map from an integer type index to a C++/CUDA element type,
+// exposed as DT_CUDA<index>::t. The primary template is empty, so any index
+// without a specialization below (8, 14, 15 and everything above 16) has no
+// member `t` and fails to compile when used. Indices 0 and 9 both map to
+// bool.
+// NOTE(review): the indices look like ONNX tensor element-type codes
+// (1 = float, 10 = float16, 16 = bfloat16) -- confirm against
+// core/data_type.h.
+// NOTE(review): index 3 maps to plain `char`, whose signedness is
+// implementation-defined; if a signed 8-bit integer is intended,
+// `signed char` would be the portable spelling -- confirm before changing,
+// since existing template instantiations may depend on `char`.
+template <int index> struct DT_CUDA {};
+template <> struct DT_CUDA<0> { using t = bool; };
+template <> struct DT_CUDA<1> { using t = float; };
+template <> struct DT_CUDA<2> { using t = unsigned char; };
+template <> struct DT_CUDA<3> { using t = char; };
+template <> struct DT_CUDA<4> { using t = unsigned short; };
+template <> struct DT_CUDA<5> { using t = short; };
+template <> struct DT_CUDA<6> { using t = int; };
+template <> struct DT_CUDA<7> { using t = long long; };
+template <> struct DT_CUDA<9> { using t = bool; };
+template <> struct DT_CUDA<10> { using t = half; };
+template <> struct DT_CUDA<11> { using t = double; };
+template <> struct DT_CUDA<12> { using t = unsigned int; };
+template <> struct DT_CUDA<13> { using t = unsigned long long; };
+template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
+} // namespace infini
--- a/include/cuda/cuda_where.h
+++ b/include/cuda/cuda_where.h
@ -3,9 +3,15 @@
 #include "utils/small_array.h"

 namespace infini {
+
 void whereKernel(const float *inputX, const float *inputY,
                 const uint8_t *condition, float *output, int nDims,
-                 SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape);
-
+                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
+                 SmallArray conditionShape, SmallArray outputShape, int xSize,
+                 int ySize, int cSize);
+void whereKernel(const half *inputX, const half *inputY,
+                 const uint8_t *condition, half *output, int nDims,
+                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
+                 SmallArray conditionShape, SmallArray outputShape, int xSize,
+                 int ySize, int cSize);
 }; // namespace infini
--- a/include/cuda/gather.h
+++ b/include/cuda/gather.h
@ -1,19 +1,61 @@
 #pragma once
 #include "core/data_type.h"
+#include "core/operator.h"
+#include "operators/gather.h"

 namespace infini {
+// Plain parameter bundle describing one gather operation; the gather kernel
+// declarations in this header take it by value.
+// NOTE: every shape/stride array below has room for exactly 4 entries, so
+// only tensors of rank <= 4 can be described.
 struct GatherMetaData {
+    // Pointer to indices
     void *indexValue;
+    // Type of index values
     DataType indexType;
+    // Type of input and output data
+    DataType dataType;
+    // Axis of the gather operation
     int axis;
+    // Rank of input
     int inNDim;
+    // Rank of output
     int outNDim;
+    // Rank of indices
     int idxNDim;
+    // Shape of output
     int outDim[4];
+    // Shape of indices
     int idxDim[4];
+    // Strides of indices
     int idxStride[4];
+    // Strides of input
     int inStride[4];
 };

-void gather_kernel(float *in, float *out, GatherMetaData metaData, size_t num);
+/**
+ * @brief Fill a GatherMetaData from a gather-family operator.
+ *
+ * The structure is zeroed first, then populated from the operator's data
+ * input (input 0), its index input (input 1) and its output: raw index
+ * pointer, index/data types, axis, the three ranks, the output shape, the
+ * index shape and strides, and the input strides.
+ *
+ * @param metaData Destination structure; fully overwritten.
+ * @param _op Operator to describe; it is cast to GatherBaseObj.
+ *
+ * The shape/stride arrays of GatherMetaData hold a fixed number of entries,
+ * so the ranks are validated before anything is copied into them.
+ */
+inline void initGatherMetaData(GatherMetaData &metaData,
+                               const Ref<OperatorObj> &_op) {
+    memset(&metaData, 0, sizeof(metaData));
+    auto op = as<GatherBaseObj>(_op);
+    Ref<TensorObj> in = op->getInputs(0);
+    Ref<TensorObj> index = op->getInputs(1);
+    Ref<TensorObj> out = op->getOutput();
+    metaData.indexValue = index->getRawDataPtr<void *>();
+    metaData.indexType = index->getDType();
+    metaData.dataType = in->getDType();
+    metaData.axis = op->getAxis();
+    metaData.inNDim = in->getRank();
+    metaData.outNDim = out->getRank();
+    metaData.idxNDim = index->getRank();
+
+    // All four arrays (outDim, idxDim, idxStride, inStride) share the same
+    // capacity; reject larger ranks instead of writing past their end.
+    const int maxNDim = sizeof(metaData.outDim) / sizeof(metaData.outDim[0]);
+    IT_ASSERT(metaData.inNDim <= maxNDim,
+              "Gather: input rank exceeds GatherMetaData capacity");
+    IT_ASSERT(metaData.outNDim <= maxNDim,
+              "Gather: output rank exceeds GatherMetaData capacity");
+    IT_ASSERT(metaData.idxNDim <= maxNDim,
+              "Gather: index rank exceeds GatherMetaData capacity");
+
+    for (int i = 0; i < metaData.outNDim; ++i)
+        metaData.outDim[i] = out->getDims()[i];
+    for (int i = 0; i < metaData.idxNDim; ++i) {
+        metaData.idxDim[i] = index->getDims()[i];
+        metaData.idxStride[i] = index->getStride()[i];
+    }
+    for (int i = 0; i < metaData.inNDim; ++i) {
+        metaData.inStride[i] = in->getStride()[i];
+    }
+}
+template <typename T>
+void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
+
+void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
+                            size_t num);
 } // namespace infini
--- a/include/cuda/softmax.h
+++ b/include/cuda/softmax.h
@ -1,6 +0,0 @@
-#pragma once
-
-namespace infini {
-void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
-                    float *y, int dim, int stride);
-}
--- a/include/kunlun/kunlun_act_type.h
+++ b/include/kunlun/kunlun_act_type.h
@ -0,0 +1,23 @@
+#pragma once
+#include "core/op_type.h"
+#include "kunlun/kunlun_common.h"
+
+namespace infini {
+using KunlunActType = xdnn::Activation_t;
+
+/**
+ * @brief Translate a framework ActType into the matching XDNN activation.
+ *
+ * @param act Activation requested by the operator.
+ * @return The XDNN activation enum value. For an ActType that has no mapping
+ * here, a message is printed to stderr and LINEAR is returned.
+ *
+ * Declared inline because this header defines the function; without it,
+ * including the header from two translation units produces duplicate
+ * symbols at link time.
+ */
+inline KunlunActType parseActType(ActType act) {
+    switch (act) {
+    case ActType::None:
+        return KunlunActType::LINEAR;
+    case ActType::Tanh:
+        return KunlunActType::TANH;
+    case ActType::Sigmoid:
+        return KunlunActType::SIGMOID;
+    case ActType::Relu:
+        // Plain ReLU: RELU6 would additionally clamp the output at 6.
+        return KunlunActType::RELU;
+    default:
+        fprintf(stderr, "Activation Type not support yet!\n");
+        break;
+    }
+    return KunlunActType::LINEAR;
+}
+
+}; // namespace infini
--- a/include/kunlun/kunlun_common.h
+++ b/include/kunlun/kunlun_common.h
@ -0,0 +1,22 @@
+#pragma once
+#include "core/common.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xdnn = baidu::xpu::api;
+
+// Evaluates `call` exactly once and compares the result with XPU_SUCCESS.
+// On mismatch it prints the file, line and the xpu_strerror() text to stderr
+// and terminates the process with exit(EXIT_FAILURE).
+// NOTE(review): the macro expands to a bare { ... } block rather than
+// do { ... } while (0), so `if (c) checkKUNLUNError(x); else ...` does not
+// compile -- keep it out of unbraced if/else bodies.
+#define checkKUNLUNError(call)                                                 \
+    {                                                                          \
+        auto err = call;                                                       \
+        if (XPU_SUCCESS != err) {                                              \
+            fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__,         \
+                    __LINE__, xpu_strerror(err));                              \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }
+
+namespace infini {
+
+using KUNLUNPtr = void *;
+
+} // namespace infini
--- a/include/kunlun/kunlun_kernel_without_config.h
+++ b/include/kunlun/kunlun_kernel_without_config.h
@ -0,0 +1,24 @@
+#pragma once
+#include "core/kernel.h"
+#include "kunlun/kunlun_runtime.h"
+
+namespace infini {
+
+// Base class for KUNLUN kernels that have no tunable configuration:
+// subclasses implement only the two-argument compute().
+class KUNLUNKernelWithoutConfig : public Kernel {
+  public:
+    // Record-taking overload: the PerfRecord is ignored and the call is
+    // forwarded to the two-argument compute().
+    virtual void compute(const Operator &op, const PerfRecord &record,
+                         const RuntimeObj *context) const {
+        compute(op, context);
+    }
+    // The actual kernel body, supplied by each subclass.
+    virtual void compute(const Operator &op,
+                         const RuntimeObj *context) const = 0;
+    // Premise: op is idempotent since it is called multiple times.
+    // Times compute() through timeit(), passing context->sync() as the
+    // second callback, and wraps the measurement in a PerfRecordObj.
+    // NOTE(review): the dynamic_cast result is used without a null check, so
+    // a runtime that is not a KUNLUNRuntimeObj would be dereferenced as
+    // nullptr inside the sync callback -- confirm callers guarantee the type.
+    virtual PerfRecord tune(const Operator &op,
+                            const RuntimeObj *_context) const {
+        auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
+        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
+                                              [&]() { context->sync(); }));
+    }
+};
+
+} // namespace infini
--- a/include/kunlun/kunlun_runtime.h
+++ b/include/kunlun/kunlun_runtime.h
@ -0,0 +1,81 @@
+#pragma once
+#include "core/runtime.h"
+#include "core/workspace.h"
+#include "kunlun/kunlun_common.h"
+#ifdef INFINI_USE_XCCL
+#include "kunlun/xccl_communicator.h"
+#endif
+namespace infini {
+
+/**
+ * @brief Runtime bound to one Kunlun (XPU) device.
+ *
+ * Construction selects the device, creates an XDNN context and allocates a
+ * 2 GB device workspace wrapped in a WorkspaceObj. Destruction releases the
+ * workspace and the context.
+ *
+ * Device selection and the blob copies go through checkKUNLUNError, so a
+ * failing XPU runtime call is reported and terminates the process instead of
+ * being silently ignored.
+ */
+class KUNLUNRuntimeObj : public RuntimeObj {
+  private:
+    xdnn::Context *ctx;
+    std::unique_ptr<CommunicatorObj> comm;
+    Workspace<KUNLUNPtr> workspace;
+
+  public:
+    /// @param deviceId Index of the XPU device this runtime binds to.
+    KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
+        checkKUNLUNError(xpu_set_device(deviceId));
+        ctx = xdnn::create_context();
+        size_t workspaceSize = 2llu << 30; // 2 GB
+        KUNLUNPtr wkspacePtr = alloc(workspaceSize);
+        workspace =
+            make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
+    }
+    virtual ~KUNLUNRuntimeObj() {
+        KUNLUNPtr wkspacePtr = workspace->getWorkspace();
+        dealloc(wkspacePtr);
+        xdnn::destroy_context(ctx);
+    }
+    string toString() const override;
+
+    void run(const Graph &graph, bool tune = false,
+             bool profiling = false) const;
+    void sync() const;
+
+    /// Allocate `size` bytes of device HBM; terminates on failure.
+    KUNLUNPtr alloc(size_t size) override {
+        void *ptr;
+        checkKUNLUNError(
+            xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
+        return ptr;
+    }
+    /// Release device memory obtained from alloc(). The status returned by
+    /// xpu_free is not inspected, because this is also reached from the
+    /// destructor.
+    void dealloc(void *ptr) override { xpu_free(ptr); }
+
+    xdnn::Context *KUNLUNHandle() const { return ctx; }
+    // Get $size workspace by bytes
+    KUNLUNPtr getWorkspace(size_t size) const {
+        auto ret = workspace->getWorkspace(size);
+        return ret;
+    }
+    Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
+
+    /// Copy `bytes` bytes from host memory `src` to device memory `dst`.
+    void copyBlobFromCPU(void *dst, const void *src,
+                         size_t bytes) const override {
+        checkKUNLUNError(xpu_memcpy(dst, const_cast<void *>(src), bytes,
+                                    XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+    }
+    /// Copy `bytes` bytes from device memory `src` to host memory `dst`.
+    void copyBlobToCPU(void *dst, const void *src,
+                       size_t bytes) const override {
+        checkKUNLUNError(xpu_memcpy(dst, const_cast<void *>(src), bytes,
+                                    XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+    }
+    /// Copy `bytes` bytes between two device buffers.
+    void copyBlobInsideRuntime(void *dst, const void *src,
+                               size_t bytes) const override {
+        checkKUNLUNError(xpu_memcpy(dst, const_cast<void *>(src), bytes,
+                                    XPUMemcpyKind::XPU_DEVICE_TO_DEVICE));
+    }
+    void initComm(const string &name, int worldSize, int rank) final;
+
+    // NOTE(review): dereferences `comm` unconditionally; presumably
+    // initComm() must have been called first -- confirm.
+    CommunicatorObj &getCommunicator() const final { return *comm; }
+
+  private:
+    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
+};
+
+} // namespace infini
--- a/include/kunlun/operator_timer.h
+++ b/include/kunlun/operator_timer.h
@ -0,0 +1,10 @@
+#pragma once
+namespace infini {
+namespace opTimer {
+// Performance probes for XDNN convolution and matmul; declared here, defined
+// elsewhere. Both return a double.
+// NOTE(review): the returned value is presumably a measured execution time
+// and `name` presumably a label for the measurement -- confirm the unit and
+// meaning against the implementation.
+double getPerfConvXdnn(int n, int c, int h, int w, int f, int r, int s,
+                       int padh, int padw, int strideh, int stridew,
+                       int dilationh, int dilationw, int group,
+                       const char *name);
+double getPerfMatmulXdnn(int b, int m, int n, int k, const char *name);
+} // namespace opTimer
+} // namespace infini
--- a/include/kunlun/xccl_communicator.h
+++ b/include/kunlun/xccl_communicator.h
@ -0,0 +1,60 @@
+#pragma once
+#include "core/communicator.h"
+#include "xpu/bkcl.h"
+#include <chrono>
+#include <filesystem>
+#include <fstream>
+#include <thread>
+
+#define checkXcclError(call)                                                   \
+    {                                                                          \
+        auto err = call;                                                       \
+        if (BKCL_SUCCESS != err) {                                             \
+            fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__);     \
+            exit(EXIT_FAILURE);                                                \
+        }                                                                      \
+    }
+
+namespace infini {
+
+/**
+ * @brief Communicator backed by a BKCL (XCCL) context.
+ *
+ * The BKCL unique id is exchanged between ranks through a file named
+ * "./<name>_xccl_id.bin": rank 0 creates the id and publishes it, every other
+ * rank waits (up to 100 s) for the file to appear and reads the id from it.
+ * All ranks then call bkcl_init_rank; afterwards rank 0 removes the file.
+ */
+class XcclCommunicatorObj final : public CommunicatorObj {
+  private:
+    BKCLContext_t comm;
+
+  public:
+    /**
+     * @param name Prefix of the rendezvous file name.
+     * @param worldSize Number of participating ranks.
+     * @param rank Rank of this process.
+     */
+    XcclCommunicatorObj(const string &name, int worldSize, int rank)
+        : CommunicatorObj(worldSize, rank) {
+        const std::string filePath("./" + name + "_xccl_id.bin");
+        BKCLUniqueId commId;
+        if (rank == 0) {
+            checkXcclError(bkcl_get_unique_id(&commId));
+            // Publish the id via write-then-rename: the other ranks poll for
+            // filePath to exist, so it must never be visible while only
+            // partially written.
+            const std::string tmpPath(filePath + ".tmp");
+            {
+                std::ofstream ofs(tmpPath, std::ios::binary);
+                ofs.write((char *)&commId, sizeof(BKCLUniqueId));
+                ofs.close();
+                _IT_ASSERT_2(!ofs.fail(), "failed to write XCCL unique id.");
+            }
+            std::filesystem::rename(tmpPath, filePath);
+        } else {
+            auto begin = std::chrono::steady_clock::now();
+            while (!std::filesystem::exists(filePath)) {
+                auto now = std::chrono::steady_clock::now();
+                _IT_ASSERT_2(now < begin + std::chrono::seconds(100),
+                             "time limit (100s) exceeded.");
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            }
+            std::ifstream ifs(filePath, std::ios::binary);
+            ifs.read((char *)&commId, sizeof(BKCLUniqueId));
+            // read() sets failbit when fewer bytes than requested arrive.
+            _IT_ASSERT_2(!ifs.fail(), "failed to read XCCL unique id.");
+        }
+        checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
+        if (rank == 0) {
+            std::filesystem::remove(filePath);
+        }
+    }
+
+    BKCLContext_t getXcclComm() { return comm; }
+
+    ~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
+    virtual string toString() const final {
+        std::ostringstream oss;
+        oss << "XCCL communicator";
+        return oss.str();
+    }
+};
+
+} // namespace infini
--- a/include/nnet/test.h
+++ b/include/nnet/test.h
@ -24,7 +24,7 @@
 // clang-format on

 namespace nnet {
-int matchExprResult(Derivator &derivator, string fn);
-bool checkExprLogSame(string fnPrefix, int start, int end);
+int matchExprResult(Derivator &derivator, string pathRelativeToProjectHome);
+bool checkExprLogSame(string pathRelativeToProjectHome, int start, int end);
 bool checkExprsEquvivalence(VecExpr exprs);
 } // namespace nnet
--- a/include/operators/G2BMM.h
+++ b/include/operators/G2BMM.h
@ -35,7 +35,7 @@ class G2BMMObj : public OperatorObj {
    OP_CLONE(G2BMMObj);

    std::string toString() const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
--- a/include/operators/GBMM.h
+++ b/include/operators/GBMM.h
@ -33,7 +33,7 @@ class GBMMObj : public OperatorObj {
    OP_CLONE(GBMMObj);

    std::string toString() const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
--- a/include/operators/activation_backward.h
+++ b/include/operators/activation_backward.h
@ -7,7 +7,7 @@ class ActivationBackwardObj : public OperatorObj {
    ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
                          Tensor x, Tensor diff_x);
    OP_CLONE(ActivationBackwardObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 3; }
--- a/include/operators/all_gather.h
+++ b/include/operators/all_gather.h
@ -27,7 +27,7 @@ class AllGatherObj : public OperatorObj {

    int numInputs() const override { return 1; }
    int numOutputs() const override { return world_size; }
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;

--- a/include/operators/all_reduce.h
+++ b/include/operators/all_reduce.h
@ -33,7 +33,7 @@ class AllReduceBaseObj : public OperatorObj {
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
        return {{inputs[0]->getDims()}};
    };

--- a/include/operators/attention_kvcache.h
+++ b/include/operators/attention_kvcache.h
@ -0,0 +1,43 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Fused Attention with KVCache input operator. All the input and output
+ * tensors should have the same rank except for the position_id.
+ *
+ */
+class AttentionKVCacheObj : public OperatorObj {
+    // Exposed through getDim().
+    // NOTE(review): what this dimension refers to is not visible from this
+    // header -- confirm against the constructor implementation.
+    int dim;
+
+  public:
+    /**
+     * @brief Construct a new AttentionKVCache object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param input_k_cache The k_cache input tensor.
+     * @param input_v_cache The v_cache input tensor.
+     * @param input_q The query input tensor.
+     * @param input_k The key input tensor.
+     * @param input_v The value input tensor.
+     * @param position_id The position id of the query.
+     * @param output_matmul The query output tensor.
+     */
+    AttentionKVCacheObj(GraphObj *graph, Tensor input_k_cache,
+                        Tensor input_v_cache, Tensor input_q, Tensor input_k,
+                        Tensor input_v, Tensor position_id,
+                        Tensor output_matmul);
+    OP_CLONE(AttentionKVCacheObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    // Six inputs (k_cache, v_cache, q, k, v, position_id), one output.
+    int numInputs() const override { return 6; }
+    int numOutputs() const override { return 1; }
+    int getDim() const { return dim; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
--- a/include/operators/batch_norm.h
+++ b/include/operators/batch_norm.h
@ -34,7 +34,7 @@ class BatchNormObj : public OperatorObj {
                 Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
                 float eps = 1e-5, bool trainingMode = false);
    OP_CLONE(BatchNormObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;

    // output size will be 3 when training
--- a/include/operators/broadcast.h
+++ b/include/operators/broadcast.h
@ -26,7 +26,7 @@ class BroadcastObj : public OperatorObj {
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
        return {{inputs[0]->getDims()}};
    };

--- a/include/operators/concat.h
+++ b/include/operators/concat.h
@ -22,7 +22,7 @@ class ConcatObj : public OperatorObj {
    ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
    OP_CLONE(ConcatObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return inputs.size(); }
--- a/include/operators/conv.h
+++ b/include/operators/conv.h
@ -142,7 +142,7 @@ class ConvObj : public ConvBaseObj {
            ActType act = ActType::None);
    OP_CLONE(ConvObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int getNumGroups() const override { return c / getChannelPerGroup(); }

  private:
@ -164,7 +164,7 @@ class ConvBackwardFilterObj : public ConvBaseObj {
                          int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                          Tensor bias = nullptr, ActType act = ActType::None);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    ActType getAct() const { return act; }
    int getNumGroups() const override { return c / getChannelPerGroup(); }

@ -191,7 +191,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
                        Tensor bias = nullptr, ActType act = ActType::None);
    OP_CLONE(ConvTransposed2dObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int getNumGroups() const override { return group; }
    std::pair<int, int> getOutputPadding() const { return {oph, opw}; }

@ -218,7 +218,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
                            Tensor bias = nullptr, ActType act = ActType::None);
    OP_CLONE(ConvTransposed2dNHWCObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int getNumGroups() const override { return group; }

  private:
--- a/include/operators/det.h
+++ b/include/operators/det.h
@ -7,7 +7,7 @@ class DetObj : public OperatorObj {
    enum Mode { NormalDet = 0, LogDet };
    DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
    OP_CLONE(DetObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/dropout.h
+++ b/include/operators/dropout.h
@ -37,7 +37,7 @@ class DropoutObj : public OperatorObj {
    DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
               float ratio, bool training_mode);
    OP_CLONE(DropoutObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/element_wise.h
+++ b/include/operators/element_wise.h
@ -21,7 +21,7 @@ class ElementWiseObj : public OperatorObj {
     */
    ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
                   Tensor output);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 2; }
@ -38,7 +38,7 @@ class MSELossObj : public OperatorObj {
    MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
               Reduction reduction, Tensor output);
    OP_CLONE(MSELossObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    Reduction getReduction() const { return reductionMode; }
    std::string toString() const override;
--- a/include/operators/expand.h
+++ b/include/operators/expand.h
@ -21,7 +21,7 @@ class ExpandObj : public OperatorObj {
     */
    ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
    OP_CLONE(ExpandObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/extend.h
+++ b/include/operators/extend.h
@ -23,7 +23,7 @@ class ExtendObj : public OperatorObj {
    ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
              int num = 1);
    OP_CLONE(ExtendObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/gather.h
+++ b/include/operators/gather.h
@ -3,14 +3,28 @@
 #include "core/operator.h"

 namespace infini {
+
+/**
+ * @brief Shared base for gather-style operators: stores the axis and fixes
+ * the operator at two inputs and one output.
+ */
+class GatherBaseObj : public OperatorObj {
+  public:
+    /**
+     * @param opType Operator type forwarded to OperatorObj.
+     * @param inputs Input tensors forwarded to OperatorObj.
+     * @param outputs Output tensors forwarded to OperatorObj.
+     * @param axis Axis value stored and later returned by getAxis().
+     */
+    GatherBaseObj(OpType opType, TensorVec inputs, TensorVec outputs, int axis)
+        : OperatorObj(opType, inputs, outputs), axis(axis) {}
+    virtual ~GatherBaseObj() = default;
+
+    int getAxis() const { return axis; }
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+
+  protected:
+    int axis; // visible to derived operators
+};
+
 /**
 * @brief Gather and concatenate given positions on a certain dimension of the
 * input tensor using an index tensor.
 *
 */
-class GatherObj : public OperatorObj {
-    int axis;
-
+class GatherObj : public GatherBaseObj {
  public:
    /**
     * @brief Construct a new Gather object.
@ -25,10 +39,7 @@ class GatherObj : public OperatorObj {
              int axis);
    OP_CLONE(GatherObj);
    std::string toString() const override;
-    int numInputs() const override { return 2; }
-    int numOutputs() const override { return 1; }
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
-    int getAxis() const { return axis; }
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override;

  private:
@ -36,4 +47,33 @@ class GatherObj : public OperatorObj {
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
+
+/**
+ * @brief GatherElements takes two inputs data and indices of the
+ * same rank r >= 1 and an optional attribute axis that identifies
+ * an axis of data.
+ *
+ * Derives from GatherBaseObj, which stores the axis and reports two inputs
+ * and one output. All member functions declared here are defined outside
+ * this header.
+ */
+class GatherElementsObj : public GatherBaseObj {
+  public:
+    /**
+     * @brief Construct a new GatherElements object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param input The input tensor.
+     * @param indices The index tensor.
+     * @param output The output tensor. Same shape as indices.
+     * @param axis The axis to gather on.
+     */
+    GatherElementsObj(GraphObj *graph, Tensor input, Tensor indices,
+                      Tensor output, int axis);
+    OP_CLONE(GatherElementsObj);
+    std::string toString() const override;
+    // Non-const override, unlike inferDataType below.
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
 } // namespace infini
--- a/include/operators/layer_norm.h
+++ b/include/operators/layer_norm.h
@ -0,0 +1,30 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Layer normalization operator.
+ *
+ * Stores eps, axis and stash_type as attributes. The bias tensor is
+ * optional, so the input count is read from the inputs vector rather than
+ * being fixed.
+ * NOTE(review): the attribute names match ONNX LayerNormalization
+ * (epsilon / axis / stash_type); presumably the semantics do too -- confirm
+ * against the implementation.
+ */
+class LayerNormObj : public OperatorObj {
+    float eps;
+    int axis, stash_type;
+
+  public:
+    /**
+     * @brief Construct a new LayerNorm object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param input The input tensor.
+     * @param scale The scale tensor.
+     * @param output The output tensor.
+     * @param bias Optional bias tensor; defaults to nullptr.
+     * @param eps Defaults to 1e-5.
+     * @param axis Defaults to -1.
+     * @param stash_type Defaults to 1.
+     */
+    LayerNormObj(GraphObj *graph, Tensor input, Tensor scale, Tensor output,
+                 Tensor bias = nullptr, float eps = 1e-5, int axis = -1,
+                 int stash_type = 1);
+    OP_CLONE(LayerNormObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+    std::string toString() const override;
+
+    // Returns the third input when more than two inputs are present,
+    // otherwise nullptr.
+    Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
+    int numInputs() const override { return inputs.size(); }
+    int numOutputs() const override { return outputs.size(); }
+    float getEps() const { return eps; }
+    int getAxis() const { return axis; }
+    int getStashType() const { return stash_type; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+};
+} // namespace infini
--- a/include/operators/lrn.h
+++ b/include/operators/lrn.h
@ -0,0 +1,29 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief LRN operator carrying alpha, beta, bias and size attributes.
+ * NOTE(review): presumably Local Response Normalization as in ONNX LRN --
+ * confirm against the implementation.
+ */
+class LRNObj : public OperatorObj {
+
+  public:
+    /**
+     * @brief Construct a new LRN object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param inputX First tensor argument.
+     * @param inputY Second tensor argument. NOTE(review): numOutputs() is 1
+     * and no separate output parameter exists, so this is presumably the
+     * output tensor despite its name -- confirm in the constructor body.
+     * @param alpha Value returned first by getAlphaBetaBias().
+     * @param beta Value returned second by getAlphaBetaBias().
+     * @param bias Value returned third by getAlphaBetaBias().
+     * @param size Value returned by getSize().
+     */
+    LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
+           float beta, float bias, int size);
+    OP_CLONE(LRNObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return inputs.size(); }
+    int numOutputs() const override { return 1; }
+    // Returns the three float attributes as a tuple in the order
+    // (alpha, beta, bias).
+    auto getAlphaBetaBias() const {
+        return tuple(alpha_value, beta_value, bias_value);
+    }
+    auto getSize() const { return size_value; }
+
+  private:
+    float alpha_value, beta_value, bias_value;
+    int size_value;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+} // namespace infini
--- a/include/operators/matmul.h
+++ b/include/operators/matmul.h
@ -17,6 +17,9 @@ class MatmulObj : public OperatorObj {
    // Auxiliary attributes which are not a part of operator attributes.
    int b, m, n, k;

+    // Specifies the data precision for the matrix multiply.
+    std::string computeType = "default";
+
  public:
    /**
     * @brief Matmul operator with batch broadcast and tensor transpose
@ -38,14 +41,15 @@ class MatmulObj : public OperatorObj {
     * @param transB If matrix B should be transposed when computing.
     * @param bias The bias tensor.
     * @param act The activation function.
+     * @param computeType Specifies the data precision for the matrix multiply.
     */
    MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
              bool transA = false, bool transB = false, Tensor bias = nullptr,
-              ActType act = ActType::None);
+              ActType act = ActType::None, std::string computeType = "default");
    OP_CLONE(MatmulObj);

    std::string toString() const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return 1; }
@ -60,6 +64,7 @@ class MatmulObj : public OperatorObj {
    int getN() const { return n; }
    int getK() const { return k; }
    auto getBMNK() const { return tuple{b, m, n, k}; }
+    std::string getComputeType() const { return computeType; }

  private:
    vector<int> getWorkloadVector() const override;
--- a/include/operators/membound.h
+++ b/include/operators/membound.h
@ -21,7 +21,7 @@ class MemBoundObj : public OperatorObj {
    OP_CLONE(MemBoundObj);

    std::string toString() const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return outputs.size(); }
--- a/include/operators/pad.h
+++ b/include/operators/pad.h
@ -27,7 +27,7 @@ class PadObj : public OperatorObj {
           const vector<int> &pads, const optional<vector<int>> &axes);
    OP_CLONE(PadObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
--- a/include/operators/pooling.h
+++ b/include/operators/pooling.h
@ -41,7 +41,7 @@ class PoolingObj : public OperatorObj {
               int ceilMode);
    OP_CLONE(PoolingObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
--- a/include/operators/recv.h
+++ b/include/operators/recv.h
@ -0,0 +1,46 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Recv operator, parameterized by a source (sending) rank and a
+ * destination (receiving) rank.
+ *
+ * Reference (NCCL user guide):
+ * https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
+ */
+class RecvObj : public OperatorObj {
+
+  public:
+    /**
+     * @brief Construct a new Recv object
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param output recv output
+     * @param source the send rank
+     * @param destination the recv rank
+     * @param dims The shape of the output tensor.
+     * @param outputType Integer code stored in the outputType member.
+     * NOTE(review): presumably the data type of the output that getDType()
+     * reports -- confirm against the implementation.
+     * @param input default nullptr, because recv does not have input.
+     */
+    RecvObj(GraphObj *graph, Tensor output, int source, int destination,
+            Shape dims, int outputType, Tensor input = nullptr);
+    OP_CLONE(RecvObj);
+
+    // The input count follows the inputs vector because the input tensor is
+    // optional.
+    int numInputs() const override { return inputs.size(); }
+    int numOutputs() const override { return 1; }
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+    std::string toString() const override;
+    DataType getDType() const;
+    int getSourceRank() const { return source; }
+    int getDestinationRank() const { return destination; }
+    // Returns the dims passed at construction, by value.
+    inline Shape getShape() const { return dims; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+
+  protected:
+    int source;
+    int destination;
+    Shape dims;
+    int outputType;
+};
+} // namespace infini
--- a/include/operators/reduce_mean.h
+++ b/include/operators/reduce_mean.h
@ -3,27 +3,30 @@

 namespace infini {
 /**
- * @brief Compute the mean of input tensor's elements along certain axes.
+ * @brief Compute the reduction of input tensor's elements along certain axes.
 *
 */
-class ReduceMeanObj : public OperatorObj {
+class ReduceBaseObj : public OperatorObj {
+  protected:
    set<int> axes; // axis to reduce
    bool keepDims;

  public:
    /**
-     * @brief Construct a new ReduceMean object.
+     * @brief Construct a new Reduce object.
     *
     * @param graph The computation graph that this operator belongs to.
+     * @param opType The operation type. Should be a Reduce operation.
     * @param input The input tensor.
     * @param output The output tensor.
     * @param axes Axes to reduce.
     * @param keepDims Keep the reduced dimensions or not.
     */
-    ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
-                  const optional<vector<int>> &axes, bool keepDims = true);
-    OP_CLONE(ReduceMeanObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    ReduceBaseObj(GraphObj *graph, OpType opType, Tensor input, Tensor output,
+                  const optional<vector<int>> &axes, bool keepDims);
+    virtual ~ReduceBaseObj() {}
+    OP_CLONE(ReduceBaseObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
@ -38,4 +41,15 @@ class ReduceMeanObj : public OperatorObj {
    vector<int> getOpAttrVector() const override;
 };

+/**
+ * @brief Reduce operator built on ReduceBaseObj. Its constructor takes no
+ * opType argument, unlike the ReduceBaseObj constructor.
+ * NOTE(review): presumably supplies the ReduceMean op type to the base --
+ * confirm in the constructor definition, which is outside this header.
+ */
+class ReduceMeanObj : public ReduceBaseObj {
+  public:
+    // keepDims defaults to true.
+    ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
+                  const optional<vector<int>> &axes, bool keepDims = true);
+};
+
+/**
+ * @brief Reduce operator built on ReduceBaseObj. Its constructor takes no
+ * opType argument, unlike the ReduceBaseObj constructor.
+ * NOTE(review): presumably supplies the ReduceSum op type to the base --
+ * confirm in the constructor definition, which is outside this header.
+ */
+class ReduceSumObj : public ReduceBaseObj {
+  public:
+    // keepDims defaults to true.
+    ReduceSumObj(GraphObj *graph, Tensor input, Tensor output,
+                 const optional<vector<int>> &axes, bool keepDims = true);
+};
 } // namespace infini
--- a/include/operators/reshape.h
+++ b/include/operators/reshape.h
@ -9,6 +9,7 @@ namespace infini {
 */
 class ReshapeObj : public OperatorObj {
    Shape dims;
+    Shape outputShape;

  public:
    /**
@ -17,18 +18,20 @@ class ReshapeObj : public OperatorObj {
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor.
     * @param output The output tensor.
-     * @param dims The shape of the output tensor.
+     * @param dims The shape to infer the output shape.
+     * @note outputShape is a member, not a constructor argument: it holds
+     * the real shape of the output tensor.
     */
    ReshapeObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
    OP_CLONE(ReshapeObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }

-    inline Shape getShape() const { return dims; }
+    inline Shape getShape() const { return outputShape; }
+    inline Shape getDims() const { return dims; }

  private:
    vector<int> getWorkloadVector() const override;
@ -55,7 +58,7 @@ class FlattenObj : public OperatorObj {
    FlattenObj(GraphObj *graph, Tensor input, Tensor output, int axis);
    OP_CLONE(FlattenObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
@ -85,7 +88,7 @@ class IdentityObj : public OperatorObj {
    IdentityObj(GraphObj *graph, Tensor input, Tensor output);
    OP_CLONE(IdentityObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/resize.h
+++ b/include/operators/resize.h
@ -27,6 +27,60 @@ class ResizeObj : public OperatorObj {
    enum class EKeepAspectRatioPolicy { stretch, notLarger, notSmaller, none };
    enum class ECoeffMode { nearest, linear, cubic };

+    // Maps a coordinate-transformation-mode name to its ECoordinateTransMode
+    // enumerator. A name that matches none of the five known strings
+    // reaches IT_TODO_HALT().
+    static ECoordinateTransMode fromECoordinateTransModeStr(string mode) {
+        if (mode == "half_pixel") {
+            return ECoordinateTransMode::halfPixel;
+        }
+        if (mode == "asymmetric") {
+            return ECoordinateTransMode::asymmetric;
+        }
+        if (mode == "align_corners") {
+            return ECoordinateTransMode::alignCorners;
+        }
+        if (mode == "pytorch_half_pixel") {
+            return ECoordinateTransMode::pytorchHalfPixel;
+        }
+        if (mode == "tf_crop_and_resize") {
+            return ECoordinateTransMode::tfCropAndResize;
+        }
+        IT_TODO_HALT();
+    }
+
+    // Maps a nearest-mode name to its ENearestMode enumerator. Any
+    // unrecognized name yields ENearestMode::none.
+    static ENearestMode fromENearestModeStr(string mode) {
+        return mode == "round_prefer_floor" ? ENearestMode::roundPreferFloor
+               : mode == "round_prefer_ceil"
+                   ? ENearestMode::roundPreferCeil
+               : mode == "floor" ? ENearestMode::floor
+               : mode == "ceil"  ? ENearestMode::ceil
+                                 : ENearestMode::none;
+    }
+
+    // Maps a keep-aspect-ratio policy name to its EKeepAspectRatioPolicy
+    // enumerator. Any unrecognized name yields EKeepAspectRatioPolicy::none.
+    static EKeepAspectRatioPolicy fromRatioPolicyStr(string ratioPolicyStr) {
+        return ratioPolicyStr == "stretch" ? EKeepAspectRatioPolicy::stretch
+               : ratioPolicyStr == "not_larger"
+                   ? EKeepAspectRatioPolicy::notLarger
+               : ratioPolicyStr == "not_smaller"
+                   ? EKeepAspectRatioPolicy::notSmaller
+                   : EKeepAspectRatioPolicy::none;
+    }
+
+    // Maps an interpolation-mode name to its ECoeffMode enumerator. A name
+    // other than "nearest", "linear" or "cubic" reaches IT_TODO_HALT().
+    static ECoeffMode fromECoeffModeStr(string mode) {
+        if (mode == "nearest") {
+            return ECoeffMode::nearest;
+        }
+        if (mode == "linear") {
+            return ECoeffMode::linear;
+        }
+        if (mode == "cubic") {
+            return ECoeffMode::cubic;
+        }
+        IT_TODO_HALT();
+    }
+
  private:
    vector<int> axes;
    vector<float> scales;
@ -60,7 +114,7 @@ class ResizeObj : public OperatorObj {

    // Operator clone(TensorVec inputs, TensorVec outputs) override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return 1; }
--- a/include/operators/rms_norm.h
+++ b/include/operators/rms_norm.h
@ -0,0 +1,34 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Fused RMSNorm Operator
+ *
+ * Takes two inputs (input and weight) and produces one output.
+ */
+class RMSNormObj : public OperatorObj {
+    // Returned by getDim(). It is not a constructor argument, so it is
+    // assigned somewhere outside this header.
+    int dim;
+
+  public:
+    /**
+     * @brief Construct a new RMSNorm object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param input The input tensor.
+     * @param weight The weight tensor.
+     * @param output The output tensor.
+     */
+    RMSNormObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output);
+    OP_CLONE(RMSNormObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+    int getDim() const { return dim; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+} // namespace infini
--- a/include/operators/rope.h
+++ b/include/operators/rope.h
@ -0,0 +1,29 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Rotary embedding (RoPE) operator with two inputs (position ids and
+ * the input tensor) and one output.
+ */
+class RoPEObj : public OperatorObj {
+  public:
+    /**
+     * @brief Construct a new RotaryEmbedding object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param pos The position id of the query.
+     * @param input The input tensor.
+     * @param output The output tensor.
+     */
+    RoPEObj(GraphObj *graph, Tensor pos, Tensor input, Tensor output);
+    OP_CLONE(RoPEObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+    // Reports the data type of the input at index 1.
+    // NOTE(review): presumably index 1 is `input` (index 0 being `pos`),
+    // following the constructor argument order -- confirm in the
+    // constructor definition.
+    DataType getDType() const { return getInputs(1)->getDType(); }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
+} // namespace infini
--- a/include/operators/send.h
+++ b/include/operators/send.h
@ -0,0 +1,42 @@
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Send operator, parameterized by a source (sending) rank and a
+ * destination (receiving) rank.
+ *
+ * Reference (NCCL user guide):
+ * https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2193/user-guide/docs/index.html
+ */
+class SendObj : public OperatorObj {
+
+  public:
+    /**
+     * @brief Construct a new Send object
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param input send input
+     * @param source the send rank
+     * @param destination the recv rank
+     * @param output Optional output tensor; defaults to nullptr.
+     */
+    SendObj(GraphObj *graph, Tensor input, int source, int destination,
+            Tensor output = nullptr);
+    OP_CLONE(SendObj);
+
+    int numInputs() const override { return 1; }
+    // The output count follows the outputs vector because the output tensor
+    // is optional.
+    int numOutputs() const override { return outputs.size(); }
+    std::string toString() const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    int getSourceRank() const { return source; }
+    int getDestinationRank() const { return destination; }
+
+  private:
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+
+  protected:
+    int source;
+    int destination;
+};
+} // namespace infini
--- a/include/operators/slice.h
+++ b/include/operators/slice.h
@ -32,7 +32,7 @@ class SliceObj : public OperatorObj {
             const optional<vector<int>> &steps);
    OP_CLONE(SliceObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    inline int numInputs() const override { return 1; }
    inline int numOutputs() const override { return 1; }
--- a/include/operators/softmax.h
+++ b/include/operators/softmax.h
@ -10,7 +10,7 @@ class SoftmaxObj : public OperatorObj {

    OP_CLONE(SoftmaxObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
        return {{inputs[0]->getDims()}};
    };

--- a/include/operators/split.h
+++ b/include/operators/split.h
@ -37,7 +37,7 @@ class SplitObj : public OperatorObj {
             int dim, const vector<int> &ratio);
    OP_CLONE(SplitObj);

-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;

    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65`