add kunlun squeeze kernel (#229 )

Co-authored-by: Haojie Wang <haojie0429@gmail.com>
添加 MLU 平台分布式验收脚本 (#223 )
2024-04-28 11:28:28 +08:00 · 2024-04-28 11:24:09 +08:00 · 2024-04-23 15:46:25 +08:00 · 2024-04-07 16:57:07 +08:00 · 2024-04-03 09:56:52 +08:00 · 2024-04-01 14:04:28 +08:00
452 changed files with 23056 additions and 3829 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -1,12 +1,11 @@
 name: Build and test cpu
 on:
  push:
    branch: 'master'
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
-    paths-ignore:
+    paths:
      - '**.md'
      - 'LICENSE'
@ -15,10 +14,10 @@ env:
  protobuf-version: "3.21.12"
  python-version: "3.10"
-  resnet-download: https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx
+  resnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/resnet18-v2-7.onnx
-  inception-download: https://media.githubusercontent.com/media/onnx/models/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx
+  inception-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/inception-v2-9.onnx
-  densenet-download: https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-12.onnx
+  densenet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/densenet-12.onnx
-  efficientnet-download: https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx
+  efficientnet-download: https://github.com/InfiniTensor/InfiniTensor/releases/download/test-models/efficientnet-lite4-11.onnx
 jobs:
  build:
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@ -1,12 +1,11 @@
 name: clang-format Check
 on:
  push:
    branch: 'master'
    paths-ignore:
      - '**.md'
      - 'LICENSE'
  pull_request:
-    paths-ignore:
+    paths:
      - '**.md'
      - 'LICENSE'
--- a/.gitignore
+++ b/.gitignore
@ -42,3 +42,5 @@ build_debug/
 # onnx model
 *.onnx
 *.pb
 *.npy
--- a/.gitmodules
+++ b/.gitmodules
@ -11,5 +11,8 @@
 	path = 3rd-party/backward-cpp
 	url = git@github.com:bombela/backward-cpp.git
 [submodule "example"]
-	path = example
+	path = examples/NNmodel
 	url = git@github.com:wanghailu0717/NNmodel.git
 [submodule "examples/distributed/onnxsim_large_model"]
 	path = examples/distributed/onnxsim_large_model
 	url = git@github.com:luchangli03/onnxsim_large_model.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,20 +1,45 @@
 cmake_minimum_required(VERSION 3.17) # FindCUDAToolkit
 include(CMakeDependentOption)
 project(InfiniTensor C CXX)
 # Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
 option(USE_CUDA "Support CUDA GPU" OFF)
 option(USE_BANG "Support BANG MLU" OFF)
 option(USE_KUNLUN "Support KUNLUN XPU" OFF)
 option(USE_INTELCPU "Support INTELCPU" OFF)
 option(USE_BACKTRACE "Print backtrace on exception and segmentation fault" ON)
 option(USE_PROTOBUF "Serialize and deserialize tensors" OFF)
 option(BUILD_NNET "Build nnet" OFF)
 option(BUILD_DIST "Build project for distributed running" OFF)
 option(BUILD_TEST "Build tests" OFF)
 if(USE_CUDA)
    message("CMake 3.18 or higher is required for setting CUDAToolkit")
    cmake_minimum_required(VERSION 3.18) # FindCUDAToolkit
 else()
    cmake_minimum_required(VERSION 3.17)
 endif()
 include(CMakeDependentOption)
 project(InfiniTensor C CXX)
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
 set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
 # Build Type
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    message("Configuring for Debug build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
    add_compile_definitions(DEBUG_MODE)
 elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
    message("Configuring for Release build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
    add_compile_definitions(NDEBUG)
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
    message("Configuring for RelWithDebInfo build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
 else()
    message("Build type not specified. Configuring for RelWithDebInfo build.")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
 endif()
 if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
  message(STATUS "Using config.cmake in CMAKE_CURRENT_BINARY_DIR directory")
@ -28,11 +53,13 @@ endif()
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
 add_compile_options(-Wno-error=unused-variable)
 find_package(
  Python
  COMPONENTS Interpreter Development
  REQUIRED)
 # OpenMP
 find_package(OpenMP)
 if(OpenMP_C_FOUND)
@ -69,16 +96,17 @@ add_subdirectory(3rd-party/nlohmann_json_cmake_fetchcontent)
 include_directories(3rd-party/nlohmann_json_cmake_fetchcontent/single_include)
 # TVM backend
-if(BUILD_TEST_EINNET)
+if(BUILD_NNET AND BUILD_TEST)
  if (NOT TVM_INCLUDE_DIR OR NOT DMLC_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR OR NOT DLPACK_INCLUDE_DIR)
    message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_TEST_EINNET is ON")
  endif()
  # TVM and DMLC for invoking TVM packed functions
  include_directories(${TVM_INCLUDE_DIR})
  include_directories(${DMLC_INCLUDE_DIR})
  include_directories(${DLPACK_INCLUDE_DIR})
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
+  if (TVM_INCLUDE_DIR AND DMLC_INCLUDE_DIR AND DLPACK_INCLUDE_DIR AND DLPACK_INCLUDE_DIR)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_LOGGING_LIBRARY=\\\<${TVM_INCLUDE_DIR}/tvm/runtime/logging.h\\\> ")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINFINI_USE_TVM=1") # Enable TVM codegen kernels
  else()
    # message(FATAL_ERROR "TVM_INCLUDE_DIR, DMLC_INCLUDE_DIR, and DLPACK_INCLUDE_DIR must be set when BUILD_NNET AND BUILD_TEST is ON")
  endif()
 endif()
 if(BUILD_TEST)
@ -92,13 +120,21 @@ if(BUILD_TEST)
  include_directories(3rd-party/googletest/googletest/include)
 endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
 # Source files
-file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
+file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
 if(BUILD_NNET)
  add_compile_definitions(BUILD_NNET=1)
  file(GLOB_RECURSE SRC_NNET src/nnet/*.cc)
  list (APPEND SRC ${SRC_NNET})
  # For locating resource files
  set_source_files_properties(src/nnet/test.cc PROPERTIES COMPILE_OPTIONS "-DINFINI_PROJECT_HOME=${CMAKE_CURRENT_SOURCE_DIR}")
 endif()
 if(USE_CUDA)
  file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
@ -110,6 +146,11 @@ if(USE_BANG)
  list (APPEND SRC ${SRC_BANG})
 endif()
 if(USE_KUNLUN)
  file(GLOB_RECURSE SRC_KUNLUN src/kunlun/*.cc src/kernels/kunlun/*.cc )
  list (APPEND SRC ${SRC_KUNLUN})
 endif()
 if(USE_INTELCPU)
  file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
  list (APPEND SRC ${SRC_INTELCPU})
@ -124,7 +165,7 @@ endif()
 target_link_libraries(InfiniTensor pybind11::embed)
 # TVM backend
-if(BUILD_TEST_EINNET)
+if(BUILD_NNET AND BUILD_TEST AND TVM_LIB_DIR)
  target_link_libraries(InfiniTensor ${TVM_LIB_DIR}/libtvm.so)
 endif()
@ -177,6 +218,13 @@ if(USE_CUDA)
  enable_language(CUDA)
  find_package(CUDAToolkit) # For nvrtc and cuda driver
  target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
  if (BUILD_DIST)
    message(STATUS "Add BUILD_DIST, use NCCL with CUDA")
    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
    find_package(NCCL REQUIRED)
    add_compile_definitions(INFINI_USE_NCCL=1)
    target_link_libraries(InfiniTensor nccl)
  endif()
 endif()
 if(USE_BANG)
@ -215,7 +263,50 @@ if(USE_BANG)
  # BangC Kernels
  ################################################################################
-  target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
+  if (BUILD_DIST)
    find_library(CAMBRICON_CNCL libcncl.so "${NEUWARE_HOME}/lib64")
    target_link_libraries(InfiniTensor ${CAMBRICON_CNCL} ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
    message(STATUS "Add BUILD_DIST, use CNCL with BANG")
    add_compile_definitions(INFINI_USE_CNCL=1)
  else()
    target_link_libraries(InfiniTensor ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV} stdc++)
  endif()
 endif()
 if(USE_KUNLUN)
  add_compile_definitions(USE_KUNLUN=1)
  if ((NOT DEFINED KUNLUN_HOME) AND (NOT DEFINED ENV{KUNLUN_HOME}))
    message(FATAL_ERROR "KUNLUN_HOME is not defined from cmake or env")
  elseif (DEFINED KUNLUN_HOME)
          set(KUNLUN_HOME ${KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
  else()
          set(KUNLUN_HOME $ENV{KUNLUN_HOME} CACHE STRING "KUNLUN_HOME directory for Kunlun development")
  endif()
  message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
  include_directories("${KUNLUN_HOME}/include/")
  find_library(KUNLUN_RT libxpurt.so "${KUNLUN_HOME}/lib64/")
  find_library(KUNLUN_DNN libxpuapi.so "${KUNLUN_HOME}/lib64/")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall -Werror")
  if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
    execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
    set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
  elseif(DEFINED TARGET_CPU_ARCH)
    set(TARGET_CPU_ARCH ${TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
  else()
    set(TARGET_CPU_ARCH $ENV{TARGET_CPU_ARCH} CACHE STRING "Target CPU ARCH")
  endif()
  message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
  if (BUILD_DIST)
    message(STATUS "Add BUILD_DIST, use XCCL with KUNLUN XPU")
    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
    find_package(XCCL REQUIRED)
    add_compile_definitions(INFINI_USE_XCCL=1)
    target_link_libraries(InfiniTensor ${XCCL_LIBRARIES})
  endif()
  target_link_libraries(InfiniTensor ${KUNLUN_RT} ${KUNLUN_DNN} stdc++)
 endif()
 # # Python bindings
@ -234,6 +325,7 @@ function(build_test files)
 endfunction()
 if(BUILD_TEST)
  add_compile_definitions(BUILD_TEST=1)
  enable_testing()
  if(USE_TRACE)
    build_test(test/trace/*.cc)
@ -241,11 +333,18 @@ if(BUILD_TEST)
  if(BUILD_TEST_CORE)
    build_test(test/core/*.cc)
    build_test(test/operators/*.cc)
    build_test(test/kernels/nativecpu/*.cc)
    if (USE_CUDA)
      build_test(test/kernels/cuda/*.cc)
      build_test(test/cuda/*.cc)
    endif()
    if (USE_BANG)
      build_test(test/kernels/bang/*.cc)
      build_test(test/bang/*.cc)
    endif()
    if (USE_KUNLUN)
      build_test(test/kernels/kunlun/*.cc)
      build_test(test/kunlun/*.cc)
    endif()
    if (USE_INTELCPU)
      build_test(test/kernels/intelcpu/*.cc)
@ -254,7 +353,7 @@ if(BUILD_TEST)
  if(BUILD_TEST_PET)
    build_test(test/pet/*.cc)
  endif()
-  if(BUILD_TEST_EINNET)
+  if(BUILD_NNET AND BUILD_TEST)
    build_test(test/nnet/test_*.cc)
    # Build expression reader
--- a/46
+++ b/46
@ -1,17 +1,38 @@
-.PHONY : build clean install-python test-cpp test-onnx
+.PHONY : build clean format install-python test-cpp test-onnx
-TYPE ?= release
+TYPE ?= Release
 CUDA ?= OFF
 BANG ?= OFF
 KUNLUN ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
 DIST ?= OFF
 NNET ?= OFF
 DIST ?= OFF
 FORMAT_ORIGIN ?=
 # Docker build options
 DOCKER_NAME ?= infinitensor
 DOCKER_IMAGE_NAME ?= infinitensor
 DOCKER_FILE ?= infinitensor_ubuntu_22.04.dockerfile
 DOCKER_RUN_OPTION ?=
 # CUDA option.
 ifeq ($(CUDA), ON)
 	DOCKER_IMAGE_NAME = infinitensor_cuda
 	DOCKER_NAME = infinitensor_cuda
 	DOCKER_FILE = infinitensor_ubuntu_22.04_CUDA.dockerfile
 	DOCKER_RUN_OPTION += --gpus all -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v `pwd`:`pwd` -w `pwd`
 endif
 CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
 CMAKE_OPT += -DUSE_CUDA=$(CUDA)
 CMAKE_OPT += -DUSE_BANG=$(BANG)
 CMAKE_OPT += -DUSE_KUNLUN=$(KUNLUN)
 CMAKE_OPT += -DUSE_BACKTRACE=$(BACKTRACE)
 CMAKE_OPT += -DBUILD_TEST=$(TEST)
 CMAKE_OPT += -DBUILD_DIST=$(DIST)
 CMAKE_OPT += -DBUILD_NNET=$(NNET)
 ifeq ($(INTELCPU), ON)
 	CMAKE_OPT += -DUSE_INTELCPU=ON -DCMAKE_CXX_COMPILER=dpcpp
@ -24,9 +45,12 @@ build:
 clean:
 	rm -rf build
 format:
 	@python3 scripts/format.py $(FORMAT_ORIGIN)
 install-python: build
 	cp build/$(TYPE)/backend*.so pyinfinitensor/src/pyinfinitensor
-	pip install pyinfinitensor/
+	pip install -e pyinfinitensor/
 test-cpp:
 	@echo
@ -35,3 +59,19 @@ test-cpp:
 test-onnx:
 	@echo
 	python3 pyinfinitensor/tests/test_onnx.py
 test-api:
 	@echo
 	python3 pyinfinitensor/tests/test_api.py
 docker-build:
 	docker build -f scripts/dockerfile/$(DOCKER_FILE) -t $(DOCKER_NAME) .
 docker-run:
 	docker run -t --name $(DOCKER_IMAGE_NAME) -d $(DOCKER_NAME) $(DOCKER_RUN_OPTION)
 docker-start:
 	docker start $(DOCKER_IMAGE_NAME)
 docker-exec:
 	docker exec -it $(DOCKER_IMAGE_NAME) bash
--- a/README.md
+++ b/README.md
@ -33,13 +33,14 @@ There are several configurable CMake options, see the [CMakeLists.txt](/CMakeLis
 ## Roadmap
 - [RefactorGraph](https://github.com/InfiniTensor/RefactorGraph) is a newly designed AI framework that is set to replace the current main branch.
 - [EinNet](https://github.com/InfiniTensor/InfiniTensor/tree/NNET_e2e) is going to be merged into the main branch.
 - Integration of [PET](https://github.com/thu-pacman/PET), a tensor program optimizer supporting partially equivalent transformations.
 - Supported hardware
  - ✔ NVIDIA GPU
  - ✔ Cambricon MLU
  - ✔ Kunlunxin XPU
  - ⬜ Ascend NPU
  - ⬜ Kunlunxin XPU
 ## Contributor Guide
--- a/cmake/FindCNCL.cmake
+++ b/cmake/FindCNCL.cmake
@ -0,0 +1,76 @@
 SET(CNCL_LIB_SEARCH_PATHS $ENV{NEUWARE_HOME}/lib64)
 SET(CNCL_INCLUDE_SEARCH_PATHS $ENV{NEUWARE_HOME}/include)
 set(CNCL_INCLUDE_DIR $ENV{NEUWARE_HOME}/include)
 set(CNCL_LIB_DIR $ENV{NEUWARE_HOME}/lib64)
 set(CNCL_VERSION $ENV{CNCL_VERSION} CACHE STRING "Version of CNCL to build with")
 if ($ENV{CNCL_ROOT_DIR})
  message(WARNING "CNCL_ROOT_DIR is deprecated. Please set CNCL_ROOT instead.")
 endif()
 list(APPEND CNCL_ROOT $ENV{CNCL_ROOT_DIR} ${MLU_TOOLKIT_ROOT_DIR})
 # Compatible layer for CMake <3.12. CNCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
 list(APPEND CMAKE_PREFIX_PATH ${CNCL_ROOT})
 find_path(CNCL_INCLUDE_DIRS
  NAMES cncl.h
  HINTS ${CNCL_INCLUDE_DIR})
 if (USE_STATIC_CNCL)
  MESSAGE(STATUS "USE_STATIC_CNCL is set. Linking with static CNCL library.")
  SET(CNCL_LIBNAME "CNCL_static")
  if (CNCL_VERSION)  # Prefer the versioned library if a specific CNCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
 else()
  SET(CNCL_LIBNAME "cncl")
  if (CNCL_VERSION)  # Prefer the versioned library if a specific CNCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${CNCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
 endif()
 find_library(CNCL_LIBRARIES
  NAMES ${CNCL_LIBNAME}
  HINTS ${CNCL_LIB_DIR})
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(CNCL DEFAULT_MSG CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
 if(CNCL_FOUND)  # obtaining CNCL version and some sanity checks
  set (CNCL_HEADER_FILE "${CNCL_INCLUDE_DIRS}/cncl.h")
  message (STATUS "Determining CNCL version from ${CNCL_HEADER_FILE}...")
  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
  list (APPEND CMAKE_REQUIRED_INCLUDES ${CNCL_INCLUDE_DIRS})
  include(CheckCXXSymbolExists)
  check_cxx_symbol_exists(CNCL_VERSION_CODE CNCL.h CNCL_VERSION_DEFINED)
  if (CNCL_VERSION_DEFINED)
    set(file "${PROJECT_BINARY_DIR}/detect_cncl_version.cc")
    file(WRITE ${file} "
      #include <iostream>
      #include <cncl.h>
      int main()
      {
        std::cout << CNCL_MAJOR << '.' << CNCL_MINOR << '.' << CNCL_PATCH << std::endl;
        int x;
        CNCLGetVersion(&x);
        return x == CNCL_VERSION_CODE;
      }
 ")
    try_run(CNCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
          RUN_OUTPUT_VARIABLE CNCL_VERSION_FROM_HEADER
          CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${CNCL_INCLUDE_DIRS}"
          LINK_LIBRARIES ${CNCL_LIBRARIES})
    if (NOT CNCL_VERSION_MATCHED)
      message(FATAL_ERROR "Found CNCL header version and library version do not match! \
 (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES}) Please set CNCL_INCLUDE_DIR and CNCL_LIB_DIR manually.")
    endif()
    message(STATUS "CNCL version: ${CNCL_VERSION_FROM_HEADER}")
  else()
    # message(STATUS "CNCL version < 2.3.5-5")
  endif ()
  set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
  message(STATUS "Found CNCL (include: ${CNCL_INCLUDE_DIRS}, library: ${CNCL_LIBRARIES})")
  mark_as_advanced(CNCL_ROOT_DIR CNCL_INCLUDE_DIRS CNCL_LIBRARIES)
 endif()
--- a/cmake/FindNCCL.cmake
+++ b/cmake/FindNCCL.cmake
@ -0,0 +1,165 @@
 # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
 # 
 # From PyTorch:
 # 
 # Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
 # Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
 # Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
 # Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
 # Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
 # Copyright (c) 2011-2013 NYU                      (Clement Farabet)
 # Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 # Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
 # Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 # 
 # From Caffe2:
 # 
 # Copyright (c) 2016-present, Facebook Inc. All rights reserved.
 # 
 # All contributions by Facebook:
 # Copyright (c) 2016 Facebook Inc.
 # 
 # All contributions by Google:
 # Copyright (c) 2015 Google Inc.
 # All rights reserved.
 # 
 # All contributions by Yangqing Jia:
 # Copyright (c) 2015 Yangqing Jia
 # All rights reserved.
 # 
 # All contributions by Kakao Brain:
 # Copyright 2019-2020 Kakao Brain
 # 
 # All contributions from Caffe:
 # Copyright(c) 2013, 2014, 2015, the respective contributors
 # All rights reserved.
 # 
 # All other contributions:
 # Copyright(c) 2015, 2016 the respective contributors
 # All rights reserved.
 # 
 # Caffe2 uses a copyright model similar to Caffe: each contributor holds
 # copyright over their contributions to Caffe2. The project versioning records
 # all such contribution and copyright details. If a contributor wants to further
 # mark their specific copyright on a particular contribution, they should
 # indicate their copyright solely in the commit message of the change when it is
 # committed.
 # 
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 # 
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 # 
 # 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
 #    and IDIAP Research Institute nor the names of its contributors may be
 #    used to endorse or promote products derived from this software without
 #    specific prior written permission.
 # 
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 # 
 # Find the nccl libraries
 #
 # The following variables are optionally searched for defaults
 #  NCCL_ROOT: Base directory where all NCCL components are foundHong Xu, 1 year ago: • Let CMake handle NCCL detection instead of ou…
 #  NCCL_INCLUDE_DIR: Directory where NCCL header is foundPieter Noordhuis, 3 years ago: • Bump gloo
 #  NCCL_LIB_DIR: Directory where NCCL library is found
 #
 # The following are set after configuration is done:
 #  NCCL_FOUND
 #  NCCL_INCLUDE_DIRS
 #  NCCL_LIBRARIES
 #
 # The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
 # install NCCL in the same location as the CUDA toolkit.
 # See https://github.com/caffe2/caffe2/issues/1601
 set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
 set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
 set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
 if ($ENV{NCCL_ROOT_DIR})
  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
 endif()
 list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
 # Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
 list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
 find_path(NCCL_INCLUDE_DIRS
  NAMES nccl.h
  HINTS ${NCCL_INCLUDE_DIR})
 if (USE_STATIC_NCCL)
  MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
  SET(NCCL_LIBNAME "nccl_static")
  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
 else()
  SET(NCCL_LIBNAME "nccl")
  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()
 endif()
 find_library(NCCL_LIBRARIES
  NAMES ${NCCL_LIBNAME}
  HINTS ${NCCL_LIB_DIR})
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
 if(NCCL_FOUND)  # obtaining NCCL version and some sanity checks
  set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
  message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
  list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
  include(CheckCXXSymbolExists)
  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
  if (NCCL_VERSION_DEFINED)
    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
    file(WRITE ${file} "
      #include <iostream>
      #include <nccl.h>
      int main()
      {
        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
        int x;
        ncclGetVersion(&x);
        return x == NCCL_VERSION_CODE;
      }
 ")
    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
          RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
          CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
          LINK_LIBRARIES ${NCCL_LIBRARIES})
    if (NOT NCCL_VERSION_MATCHED)
      message(FATAL_ERROR "Found NCCL header version and library version do not match! \
 (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
    endif()
    message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
  else()
    # message(STATUS "NCCL version < 2.3.5-5")
  endif ()
  set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
 endif()
--- a/cmake/FindXCCL.cmake
+++ b/cmake/FindXCCL.cmake
@ -0,0 +1,27 @@
 # Find the xccl libraries
 set(XCCL_INCLUDE_DIR $ENV{KUNLUN_HOME}/include CACHE PATH "Folder contains KUNLUN XCCL headers")
 set(XCCL_LIB_DIR $ENV{KUNLUN_HOME}  CACHE PATH "Folder contains KUNLUN XCCL libraries")
 list(APPEND CMAKE_PREFIX_PATH $ENV{KUNLUN_HOME})
 find_path(XCCL_INCLUDE_DIRS # ${XCCL_INCLUDE_DIR}
  NAMES xpu/bkcl.h
  HINTS XCCL_INCLUDE_DIR)
 find_library(XCCL_LIBRARIES # ${XCCL_LIB_DIR}
  NAMES lib64/libbkcl.so
  HINTS XCCL_LIB_DIR)
 message(STATUS "XCCL_INCLUDE_DIRS: ${XCCL_INCLUDE_DIRS}")
 message(STATUS "XCCL_LIBRARIES: ${XCCL_LIBRARIES}")
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(XCCL DEFAULT_MSG XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
 if (XCCL_FOUND)
  set (XCCL_HEADER_FILE "${XCCL_INCLUDE_DIRS}/xpu/bkcl.h")
  message (STATUS "Determing XCCL version from ${XCCL_HEADER_FILE}...")
  list (APPEND CMAKE_REQUIRED_INCLUDES ${XCCL_INCLUDE_DIRS})
  message(STATUS "Found XCCL (include: ${XCCL_INCLUDE_DIRS}, library: ${XCCL_LIBRARIES})")
  mark_as_advanced(XCCL_INCLUDE_DIRS XCCL_LIBRARIES)
 endif()
--- a/docs/INSTALL_GUIDE_CN.md
+++ b/docs/INSTALL_GUIDE_CN.md
@ -133,10 +133,40 @@
   make install-python BANG=ON
   ```
   编译 CPU 部分，同时编译昆仑 XPU 部分：
   ```bash
   export KUNLUN_HOME=/path/to/your/kunlun_home
   make install-python KUNLUN=ON
   ```
 3. 使用方法
   安装成功后，您就可以使用本项目的 Python 接口进行编码并运行。具体使用方式可以参考项目样例代码 example/Resnet/resnet.py 以及用户使用手册
 ## Docker
 本项目也提供了 Docker 的环境，您可以使用 `make docker-build` 或 `make docker-build CUDA=ON` 命令启动并编译 Dockerfile，您可以通过添加编译选项或者修改 Makefile 变量修改 docker image 名称或者所选的 Dockerfile 文件。
 由于在拉取 github repo 时需要将 ssh key 加入到 github profile 中，因此暂时注释掉拉取 repo 并编译项目的过程，由用户在进入 docker 后自己维护 ssh key（将 host 中的 ssh key 复制到 docker 中可能会遇到环境不一致的问题）。
 ```shell
 # Build docker container.
 make docker-build
 # Run docker image.
 make docker-run
 # Execute docker image.
 make docker-exec
 ```
 如果需要编译 CUDA 版，请使用如下命令：
 ```shell
 # Build docker container.
 make docker-build CUDA=ON
 # Run docker image.
 make docker-run CUDA=ON
 ```
 ## 技术支持
 如遇到问题，请联系我们技术支持团队
--- a/docs/SUPPORT_MATRIX_CN.md
+++ b/docs/SUPPORT_MATRIX_CN.md
@ -2,6 +2,7 @@
 ## 目录
 - [环境支持](#环境支持)
 - [神经网络支持](#神经网络支持)
 - [技术支持](#技术支持)
@ -19,10 +20,10 @@
 目前已经验证过的神经网络模型有
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
 ## 技术支持
--- a/docs/USER_GUIDE_CN.md
+++ b/docs/USER_GUIDE_CN.md
@ -3,9 +3,10 @@
 ## 目录
 - [使用方法](#使用方法)
- [python-前端应用指南](#python-前端应用指南)
+- [python 前端应用指南](#python-前端应用指南)
-  - [导入-onnx-模型](#导入-onnx-模型)
+  - [导入 onnx 模型](#导入-onnx-模型)
-  - [导出-onnx-模型](#导出-onnx-模型)
+  - [优化](#优化)
  - [导出 onnx 模型](#导出-onnx-模型)
  - [执行推理](#执行推理)
  - [样例代码](#样例代码)
 - [技术支持](#技术支持)
@ -13,7 +14,7 @@
 ## 使用方法
-项目管理功能已写到 [Makefile](Makefile)，支持下列功能：
+项目管理功能已写到 [Makefile](../Makefile)，支持下列功能：
 - 编译项目：`make`/`make build`
 - 清理生成文件：`make clean`
@ -26,6 +27,7 @@
 - `TYPE`：编译模式（`debug`/`release`），默认值为 `release`
 - `CUDA`：是否编译 CUDA 后端，默认为 `OFF`，`ON` 打开
 - `BANG`：是否编译寒武纪后端，默认为 `OFF`，`ON` 打开
 - `KUNLUN`：是否编译昆仑后端，默认为 `OFF`，`ON` 打开
 - `BACKTRACE`：是否启用栈回溯，默认为 `ON`，`OFF` 关闭，建议调试时打开
 - `TEST`：是否编译 `googletest`，默认为 `ON`，`OFF` 关闭，只有 `test-cpp` 时必要
@ -37,10 +39,10 @@
 支持的模型：
- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)
+- [x] [ResNet18-v2](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)
- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/vision/classification/densenet-121/model/densenet-12.onnx)
+- [x] [DenseNet-121-12](https://github.com/onnx/models/blob/main/validated/vision/classification/densenet-121/model/densenet-12.onnx)
- [x] [Inception-2](https://github.com/onnx/models/blob/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
+- [x] [Inception-2](https://github.com/onnx/models/blob/main/validated/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx)
- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
+- [x] [EfficientNet-Lite4](https://github.com/onnx/models/blob/main/validated/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx)
 ```python
 import onnx
@ -95,7 +97,7 @@ for name, tensor in stub.inputs.items():
    print(name, tensor.shape(), tensor)
 ```
-对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：
+对于 [resnet18-v2-7.onnx](https://github.com/onnx/models/blob/main/validated/vision/classification/resnet/model/resnet18-v2-7.onnx)，会打印出：
 ```plaintext
 data [1, 3, 224, 224] <backend.Tensor object at 0x7efeb828e3b0>
@ -136,7 +138,7 @@ for name, tensor in stub.outputs.items():
 ### 样例代码
-您可以参照[./example/Resnet/resnet.py](./example/ResNet/resnet.py)的样例代码进行了解，并尝试运行。在这个文件中，我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式：
+您可以参照[resnet.py](https://github.com/wanghailu0717/NNmodel/blob/main/ResNet/resnet.py)的样例代码进行了解，并尝试运行。在这个文件中，我们使用了 Pytorch 构建了 resnet 网络。您可以查阅该脚本使用方式：
 ```python
 python resnet.py -h
--- a/env.sh
+++ b/env.sh
@ -35,4 +35,4 @@ export LD_LIBRARY_PATH="${NEUWARE_HOME}/lib64:${LD_LIBRARY_PATH}"
 # ├── tools
 # ├── version
 # └── XTDK
-export XPU_HOME=/usr/local/xpu
+export KUNLUN_HOME=/usr/local/xpu
--- a/1
+++ b/1
@ -1 +0,0 @@
 Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9
--- a/examples/NNmodel
+++ b/examples/NNmodel
@ -0,0 +1 @@
 Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77
--- a/examples/distributed/README.md
+++ b/examples/distributed/README.md
@ -0,0 +1,39 @@
 # 分布式脚本
 ## 英伟达平台运行方式
 #### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
 使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
 ```bash
 python run_pytorch.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
 ```
 会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
 #### 2. 运行InfiniTensor分布式脚本
 ```bash
 python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
 ```
 ## 寒武纪平台运行方式
 **将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配，具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
 #### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
 使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
 ```bash
 python run_pytorch_mlu.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
 ```
 会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
 #### 2. 运行InfiniTensor分布式脚本
 ```bash
 python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
 ```
--- a/examples/distributed/init.py
+++ b/examples/distributed/init.py
--- a/examples/distributed/bang/bang_launch.py
+++ b/examples/distributed/bang/bang_launch.py
@ -0,0 +1,187 @@
 import sys
 sys.path.append('../')
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 from onnx.external_data_helper import convert_model_to_external_data
 from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model
 def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    parser.add_argument(
        "--nproc_per_node", type=int, default=1, help="number of processes per node"
    )
    parser.add_argument(
        "--name", type=str, default="test", help="name of this instance."
    )
    parser.add_argument(
        "--model", type=str, required=True, help="path to the ONNX model file."
    )
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--gen_std",
        action="store_true",
        help="whether to generate the standard results.",
    )
    parser.add_argument(
        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
        args.num_nodes,
        args.nproc_per_node,
        args.name,
        args.model,
        args.batch_size,
        args.length,
        args.gen_std,
        args.type,
    )
 def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
    load_inputs(stub, world_size, rank)
    # stub.tune()
    stub.run()
    # get outputs
    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
    # bench
    for _ in range(n):
        stub.run()
    begin = time.time()
    for _ in range(n * 2):
        stub.run()
    end = time.time()
    avg_time = (end - begin) / (n * 2)
    print(f"average time: {avg_time}")
    return outputs
 def load_inputs(stub, world_size=1, rank=0):
    for i, (name, tensor) in enumerate(stub.inputs.items()):
        input = np.load(f"./data/input_{i}.npy")
        if all(x == y for x,y in zip(input.shape,tensor.shape())):
            tensor.copyin_numpy(input)
        else:
            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
 def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
    results = np.load(f"./data/output.npy")
    outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
    print("outputs abs mean:", abs(outputs).mean())
    print("max abs diff:", abs(outputs - results).max())
 def start_worker(
    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
 ):
    dist_name = name + "_dist"
    model = parallel_model(model, world_size, rank)
    extern_path = f"./{dist_name}_rank{rank}.pb"
    if os.path.exists(extern_path):
        os.remove(extern_path)
    onnx.save_model(
        model,
        f"./{dist_name}_rank{rank}.onnx",
        save_as_external_data=True,
        location=extern_path,
    )
    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
    runtime = backend.BangRuntime(local_rank)
    # print("init comm")
    runtime.init_comm(
        dist_name,
        world_size,
        rank,
    )
    run_and_compare(name, model, runtime, world_size, rank, data_type)
 def start_single(name, model, data_type):
    runtime = backend.BangRuntime(0)
    run_and_compare(name, model, runtime, data_type=data_type)
 def generate_input_output(model):
    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
    runtime = backend.BangRuntime(0)
    stub = OnnxStub(model, runtime)
    position_id = 0
    for i, (name, tensor) in enumerate(stub.inputs.items()):
        input = tensor.copyout_numpy()
        if np.issubdtype(input.dtype, np.integer):
            if input.size == 1:
                # input = np.array([position_id])
                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
            else:
                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
        elif input.dtype == np.bool_:
            input = np.random.randint(0,2,size=input.shape) > 0
        else:
            if i == 0:
                input = np.ones(input.shape).astype(input.dtype)
                position_id = input.shape[-1] - 1
            else:
                input = np.random.rand(*input.shape).astype(input.dtype)
        tensor.copyin_numpy(input)
        np.save(f"./data/input_{i}", input)
    stub.run()
    time.sleep(0.01)
    output = next(stub.outputs.values().__iter__()).copyout_numpy()
    if np.isnan(output).any():
        print("Nan in output")
    np.save(f"./data/output", output)
 def main():
    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
    data_type = "default" if data_type == "fp32" else data_type
    model = onnx.load(model_path)
    # generate standart output
    if gen_std:
        print(f"generate standard data for {name}.")
        # a small vocabulary size to fit all LLM.
        generate_input_output(model)
        return
    if nproc_per_node == 1:
        # run single process.
        # use standalone process to isolate bang.
        print("run model by single MLU.")
        # p = mp.Process(target=start_single, args=(name, model, data_type))
        # p.start()
        # p.join()
        start_single(name, model, data_type)
        return
    # run distributed parallel.
    world_size = nnodes * nproc_per_node
    print(f"run model by {world_size} MLU in parallel.")
    workers = [
        mp.Process(
            target=start_worker,
            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
        )
        for rank in range(world_size)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
 if __name__ == "__main__":
    main()
--- a/examples/distributed/bang/run_pytorch_mlu.py
+++ b/examples/distributed/bang/run_pytorch_mlu.py
@ -0,0 +1,249 @@
 import argparse
 import torch
 import torch_mlu
 from transformers import BertModel, BertConfig
 from transformers import GPT2Model, GPT2Config
 from transformers import OPTModel, OPTConfig
 from transformers import AlbertModel, AlbertConfig
 from transformers import LlamaModel, LlamaConfig
 import time
 import numpy as np
 import onnx
 import sys
 import os
 from onnx.external_data_helper import convert_model_to_external_data
 from onnxsim import simplify
 def parse_args():
    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
    parser.add_argument(
        "--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
    )
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--export_onnx",
        type=str,
        nargs="?",
        default=None,
        const="./",
        help="whether and where to export onnx file",
    )
    parser.add_argument(
        "--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
        args.model,
        args.batch_size,
        args.length,
        args.export_onnx,
        args.type
    )
 def get_model(modelname):
    match modelname:
        case "albert":
            model = AlbertModel.from_pretrained("albert/albert-base-v2")
            voc_size = AlbertConfig().vocab_size
        case "bert":
            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
            voc_size = BertConfig().vocab_size
        case "gpt2":
            model = GPT2Model.from_pretrained("GPT2")
            voc_size = GPT2Config().vocab_size
        case "opt":
            model = OPTModel.from_pretrained("facebook/opt-125m")
            voc_size = OPTConfig().vocab_size
        case "llama":
            model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
            voc_size = LlamaConfig().vocab_size
        case _:
            raise KeyError(modelname)
    model = model.eval()
    return model, voc_size
 def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
    np.save("./data/input_0", data)
    inputs = torch.from_numpy(data).to("mlu")
    torch_model = torch_model.to("mlu")
    if dtype == "fp16":
        torch_model = torch_model.half()
    n_iter = 20
    with torch.no_grad():
        for _ in range(10):
            outputs = torch_model(inputs)
    torch.mlu.synchronize()
    begin = time.time()
    with torch.no_grad():
        for _ in range(n_iter):
            torch.mlu.synchronize()
            outputs = torch_model(inputs)
            torch.mlu.synchronize()
    torch.mlu.synchronize()
    end = time.time()
    avg_time = (end - begin) / n_iter
    outputs = outputs.last_hidden_state.to("cpu")
    print("outputs abs mean:", abs(np.array(outputs)).mean())
    print(f"average time: {avg_time}")
    # torch.mlu.memory.empty_cache()
    np.save("./data/output", np.array(outputs))
    print("Save input & output into ./data.")
 def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
    data = data.to("mlu")
    model = model.to("mlu")
    if dtype == "fp16":
        model = model.half()
    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
    if modelname != "llama":
        # use onnxsim to simplify
        onnx_model = onnx.load(path)
        onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
        # onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
        assert check
        add_value_info_for_constants(onnx_model)
        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
        if extern:
            extern_path = path.replace('.onnx', '.pb')
            if os.path.exists(extern_path):
                os.remove(extern_path)
            extern_path = extern_path.split("/")[-1]
            convert_model_to_external_data(
                onnx_model,
                all_tensors_to_one_file=True,
                location=extern_path,
                size_threshold=1024,
                convert_attribute=False,
            )
        onnx.save(onnx_model, path)
    else:
        # use third party tool to simplify llama
        # reference: https://github.com/luchangli03/onnxsim_large_model/
        sys.path.append("onnxsim_large_model")
        from onnx_utils import set_onnx_input_shape
        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
        in_model_path = path
        out_model_path = path
        if not out_model_path:
            out_model_path = in_model_path[:-5] + ".sim.onnx"
        if os.path.isdir(out_model_path):
            out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
        onnx_model = onnx.load(in_model_path)
        print(f"load model from {in_model_path} success")
        size_th_bytes = 1024 * 1024
        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
        print(f"compress model success")
        onnx_model = set_onnx_input_shape(onnx_model, "")
        tensor_size_threshold = f"1024KB"
        skipped_optimizers = []
        skipped_optimizers.append("eliminate_duplicate_initializer")
        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
                                    tensor_size_threshold=tensor_size_threshold)
        if not check:
            raise ValueError(f"simplify compressed model {in_model_path} failed")
        print(f"simplify model success")
        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
        print(f"uncompress model success")
        add_value_info_for_constants(onnx_model)
        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
 def add_value_info_for_constants(model : onnx.ModelProto):
    """
    Currently onnx.shape_inference doesn't use the shape of initializers, so add
    that info explicitly as ValueInfoProtos.
    Mutates the model.
    Args:
        model: The ModelProto to update.
    """
    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
    if model.ir_version < 4:
        return
    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
        inputs = {i.name for i in graph.input}
        existing_info = {vi.name: vi for vi in graph.value_info}
        for init in graph.initializer:
            # Check it really is a constant, not an input
            if init.name in inputs:
                continue
            # The details we want to add
            elem_type = init.data_type
            shape = init.dims
            # Get existing or create new value info for this constant
            vi = existing_info.get(init.name)
            if vi is None:
                vi = graph.value_info.add()
                vi.name = init.name
            # Even though it would be weird, we will not overwrite info even if it doesn't match
            tt = vi.type.tensor_type
            if tt.elem_type == onnx.TensorProto.UNDEFINED:
                tt.elem_type = elem_type
            if not tt.HasField("shape"):
                # Ensure we set an empty list if the const is scalar (zero dims)
                tt.shape.dim.extend([])
                for dim in shape:
                    tt.shape.dim.add().dim_value = dim
        # Handle subgraphs
        for node in graph.node:
            for attr in node.attribute:
                # Ref attrs refer to other attrs, so we don't need to do anything
                if attr.ref_attr_name != "":
                    continue
                if attr.type == onnx.AttributeProto.GRAPH:
                    add_const_value_infos_to_graph(attr.g)
                if attr.type == onnx.AttributeProto.GRAPHS:
                    for g in attr.graphs:
                        add_const_value_infos_to_graph(g)
    return add_const_value_infos_to_graph(model.graph)
 def main():
    torch.backends.mlu.matmul.allow_tf32 = False
    torch.backends.cnnl.allow_tf32 = False
    modelname, batchsize, seqlen, export_path, dtype = parse_args()
    if dtype == "tf32":
        torch.backends.mlu.matmul.allow_tf32 = True
    else:
        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
    model, voc_size = get_model(modelname)
    if export_path is not None:
        filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
        path = os.path.join(export_path, filename)
        if not os.path.exists(path):
            param = torch.zeros((batchsize, seqlen), dtype=torch.int)
            export_onnx(modelname, model, param, path, True, dtype)
        else:
            print("Onnx path exists, skipping export.")
    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
 if __name__ == "__main__":
    main()
--- a/examples/distributed/cuda/cuda_launch.py
+++ b/examples/distributed/cuda/cuda_launch.py
@ -0,0 +1,161 @@
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 from onnx.external_data_helper import convert_model_to_external_data
 from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model
 def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    parser.add_argument(
        "--nproc_per_node", type=int, default=1, help="number of processes per node"
    )
    parser.add_argument(
        "--name", type=str, default="test", help="name of this instance."
    )
    parser.add_argument(
        "--model", type=str, required=True, help="path to the ONNX model file."
    )
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--gen_std",
        action="store_true",
        help="whether to generate the standard results.",
    )
    parser.add_argument(
        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
        args.num_nodes,
        args.nproc_per_node,
        args.name,
        args.model,
        args.batch_size,
        args.length,
        args.gen_std,
        args.type,
    )
 def run_model(model, runtime, inputs, n=10, data_type = "default"):
    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
    for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
        tensor.copyin_numpy(input)
    # stub.tune()
    stub.run()
    # get outputs
    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
    # bench
    for tensor, input in zip(stub.inputs.values(), inputs, strict=False):
        tensor.copyin_numpy(input)
    begin = time.time()
    for _ in range(n):
        stub.run()
    end = time.time()
    avg_time = (end - begin) / n
    print(f"average time: {avg_time}")
    return outputs
 def run_and_compare(name, model, runtime, data_type):
    input_ids = np.load(f"{name}_inputs.npy")
    position_ids = np.arange(input_ids.shape[-1])
    results = np.load(f"{name}_results.npy")
    outputs = run_model(model, runtime, (input_ids, position_ids), data_type=data_type)
    print("outputs abs mean:", abs(outputs).mean())
    print("max abs diff:", abs(outputs - results).max())
 def start_worker(
    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
 ):
    dist_name = name + "_dist"
    model = parallel_model(model, world_size, rank)
    extern_path = f"./{dist_name}_rank{rank}.pb"
    if os.path.exists(extern_path):
        os.remove(extern_path)
    onnx.save_model(
        model,
        f"./{dist_name}_rank{rank}.onnx",
        save_as_external_data=True,
        location=extern_path,
    )
    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
    runtime = backend.CudaRuntime(local_rank)
    # print("init comm")
    runtime.init_comm(
        dist_name,
        world_size,
        rank,
    )
    run_and_compare(name, model, runtime, data_type)
 def start_single(name, model, data_type):
    runtime = backend.CudaRuntime(0)
    run_and_compare(name, model, runtime, data_type)
 def gen_standard(name, model, voc_size, bs, len):
    # generate standard results
    input_ids = np.random.randint(0, voc_size, (bs, len))
    position_ids = np.arange(len)
    np.save(f"{name}_inputs", input_ids)
    runtime = backend.CudaRuntime(0)
    outputs = run_model(model, runtime, (input_ids, position_ids), 1)
    print("outputs abs mean:", abs(outputs).mean())
    np.save(f"{name}_results", outputs)
 def main():
    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
    data_type = "default" if data_type == "fp32" else data_type
    if data_type != "tf32":
        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
    model = onnx.load(model_path)
    # generate standart output
    if gen_std:
        print(f"generate standard data for {name}.")
        # a small vocabulary size to fit all LLM.
        voc_size = 1000
        gen_standard(name, model, voc_size, bs, length)
        return
    # run single process.
    # use standalone process to isolate cuda.
    print("run model by single GPU.")
    p = mp.Process(target=start_single, args=(name, model, data_type))
    p.start()
    p.join()
    # run distributed parallel.
    world_size = nnodes * nproc_per_node
    print(f"run model by {world_size} GPU in parallel.")
    workers = [
        mp.Process(
            target=start_worker,
            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
        )
        for rank in range(world_size)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
 if __name__ == "__main__":
    main()
--- a/examples/distributed/cuda/launch_kvcache.py
+++ b/examples/distributed/cuda/launch_kvcache.py
@ -0,0 +1,245 @@
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 from onnx.external_data_helper import convert_model_to_external_data
 import numpy as np
 from parallel_opt import parallel_model
 os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
 def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    parser.add_argument(
        "--nproc_per_node", type=int, default=1, help="number of processes per node"
    )
    parser.add_argument(
        "--name", type=str, default="test", help="name of this instance."
    )
    parser.add_argument(
        "--model1", type=str, required=True, help="path to the ONNX model file."
    )
    parser.add_argument(
        "--model2", type=str, required=True, help="path to the ONNX model file."
    )    
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--gen_std",
        action="store_true",
        help="whether to generate the standard results.",
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
        args.num_nodes,
        args.nproc_per_node,
        args.name,
        args.model1,
        args.model2,
        args.batch_size,
        args.length,
        args.gen_std,
    )
 def run_model(model1, model2, runtime1, runtime2, inputs1: np.array, inputs2: np.array, n=20):
    ####################################
    # run the first graph without kvcache
    ####################################
    stub1 = OnnxStub(model1, runtime1)
    stub1.inputs['onnx::Reshape_0'].copyin_int32(inputs1.reshape(-1).tolist())
    stub1.tune()
    stub1.run()
    kvcache_it1 = []
    count = 0
    for output in stub1.outputs.items().__iter__():
        if count == 0:
            logits_it1 = np.array(output[1].copyout_float(), dtype=np.float32)
        else:
            kvcache_it1.append(np.array(output[1].copyout_float(), dtype=np.float32))
        count = count + 1
    # bench for stub1
    next(stub1.inputs.items().__iter__())[1].copyin_int32(inputs1.reshape(-1).tolist())
    begin = time.time()
    for _ in range(n):
        stub1.run()
    end = time.time()
    avg_time = (end - begin) / n
    print(f"stub1 average time: {avg_time}")        
    ####################################
    # run the second graph with kvcache
    ####################################
    i = 0
    batchsize = 1
    stub2 = OnnxStub(model2, runtime2)
    past_kvcache_length = (i+2)*np.ones((batchsize, 1), dtype=np.int32)
    # copyin input
    stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
    stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
    count = -1
    for input in stub2.inputs.items().__iter__():
        if count in range(24):
            # print(count, input[0])
            # print(np.dtype(kvcache_it1[count][0]), kvcache_it1[count].shape)
            input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
        count = count + 1
    stub2.tune()
    stub2.run()
    # copyout output
    count = 0
    kvcache_it2 = []
    for output in stub2.outputs.items().__iter__():
        if count == 0:
            logits_it2 = np.array(output[1].copyout_float(), dtype=np.float32)
        else:
            kvcache_it2.append(np.array(output[1].copyout_float(), dtype=np.float32))
        count = count + 1     
    # bench for stub2
    # copyin input
    stub2.inputs['onnx::Reshape_0'].copyin_int32(inputs2.reshape(-1).tolist())
    stub2.inputs['input.3'].copyin_int32(past_kvcache_length.reshape(-1).tolist())
    count = -1
    for input in stub2.inputs.items().__iter__():
        if count in range(24):
            input[1].copyin_float(kvcache_it1[count].reshape(-1).tolist())
        count = count + 1
    begin = time.time()
    for _ in range(n):
        stub2.run()
    end = time.time()
    avg_time = (end - begin) / n
    print(f"stub2 average time: {avg_time}")
    return logits_it2
 def run_and_compare(name, model1, model2, runtime1, runtime2):
    data1 = np.load(f"{name}_inputs1.npy")
    data2 = np.load(f"{name}_inputs2.npy")
    results = np.load(f"{name}_results.npy")
    outputs = run_model(model1, model2, runtime1, runtime2, data1, data2)
    print("outputs sum:", outputs.sum())
    print("max abs diff:", abs(outputs - results).max())
    print("max rel diff:", abs((outputs - results) / results).max())
    # assert np.allclose(outputs, results, rtol=1e-3, atol=1e-6)
 def start_worker(
    name: str, world_size: int, rank: int, local_rank: int, model1: onnx.ModelProto, model2: onnx.ModelProto
 ):
    dist_name = name + "_dist"
    ####################################
    # shard the first graph
    ####################################
    model1 = parallel_model(model1, world_size, rank)
    extern_path = f"./{dist_name}_stub1_rank{rank}.pb"
    if os.path.exists(extern_path):
        os.remove(extern_path)
    convert_model_to_external_data(
        model1,
        all_tensors_to_one_file=True,
        location=extern_path,
        size_threshold=1024,
        convert_attribute=False,
    )
    onnx.save(model1, f"./{dist_name}_stub1_rank{rank}.onnx")
    runtime1 = backend.CudaRuntime(local_rank)
    runtime1.init_comm(
        dist_name,
        world_size,
        rank,
    )
    ####################################
    # shard the second graph
    ####################################    
    model2 = parallel_model(model2, world_size, rank)
    extern_path = f"./{dist_name}_stub2_rank{rank}.pb"
    if os.path.exists(extern_path):
        os.remove(extern_path)
    convert_model_to_external_data(
        model2,
        all_tensors_to_one_file=True,
        location=extern_path,
        size_threshold=1024,
        convert_attribute=False,
    )    
    onnx.save(model2, f"./{dist_name}_stub2_rank{rank}.onnx")
    runtime2 = backend.CudaRuntime(local_rank)
    # print("init comm")
    runtime2.init_comm(
        dist_name,
        world_size,
        rank,
    )    
    # run the two graphs
    run_and_compare(name, model1, model2, runtime1, runtime2)
 def start_single(name, model1, model2):
    runtime1 = backend.CudaRuntime(0)
    runtime2 = backend.CudaRuntime(0)
    run_and_compare(name, model1, model2, runtime1, runtime2)
 def gen_standard(name, model1, model2, voc_size, bs, len):
    # generate standard results
    data1 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
    data2 = np.random.randint(0, voc_size, (bs, len), dtype=np.int32)
    np.save(f"{name}_inputs1", data1)
    np.save(f"{name}_inputs2", data2)
    runtime1 = backend.CudaRuntime(0)
    runtime2 = backend.CudaRuntime(0)
    outputs = run_model(model1, model2, runtime1, runtime2, data1, data2, 1)
    np.save(f"{name}_results", outputs)
 def main():
    nnodes, nproc_per_node, name, model1_path, model2_path, bs, length, gen_std = parse_args()
    model1 = onnx.load(model1_path)
    model2 = onnx.load(model2_path)
    # generate standart output
    if gen_std:
        print(f"generate standard data for {name}.")
        # a small vocabulary size to fit all LLM.
        voc_size = 1000
        gen_standard(name, model1, model2, voc_size, bs, length)
        return
    # run single process.
    # use standalone process to isolate cuda.
    p = mp.Process(target=start_single, args=(name, model1, model2))
    p.start()
    p.join()
    # run distributed parallel.
    world_size = nnodes * nproc_per_node
    workers = [
        mp.Process(
            target=start_worker,
            args=(name, world_size, rank, rank % nproc_per_node, model1, model2),
        )
        for rank in range(world_size)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
 if __name__ == "__main__":
    main()
--- a/examples/distributed/cuda/run_pytorch.py
+++ b/examples/distributed/cuda/run_pytorch.py
@ -0,0 +1,188 @@
 import argparse
 import torch
 from transformers import BertModel, BertConfig
 from transformers import GPT2Model, GPT2Config
 from transformers import OPTModel, OPTConfig
 import time
 import numpy as np
 import onnx
 import os
 from onnx.external_data_helper import convert_model_to_external_data
 from onnxsim import simplify
 def parse_args():
    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
    parser.add_argument(
        "--model", type=str, choices=["gpt2", "bert", "opt"], required=True, help="model type"
    )
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--export_onnx",
        type=str,
        nargs="?",
        default=None,
        const="./",
        help="whether and where to export onnx file",
    )
    parser.add_argument(
        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
        args.model,
        args.batch_size,
        args.length,
        args.export_onnx,
        args.type,
    )
 def get_model(modelname):
    match modelname:
        case "bert":
            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
            voc_size = BertConfig().vocab_size
        case "gpt2":
            model = GPT2Model.from_pretrained("gpt2")
            voc_size = GPT2Config().vocab_size
        case "opt":
            model = model = OPTModel.from_pretrained("./opt-125m")
            voc_size = OPTConfig().vocab_size
        case _:
            raise KeyError(modelname)
    model = model.eval()
    return model, voc_size
 def run_pytorch(torch_model, voc_size, batchsize, len):
    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
    np.save("test_inputs", data)
    inputs = torch.from_numpy(data).to("cuda")
    torch_model = torch_model.to("cuda")
    n_iter = 20
    with torch.no_grad():
        for _ in range(10):
            outputs = torch_model(inputs)
    torch.cuda.synchronize()
    begin = time.time()
    with torch.no_grad():
        for _ in range(n_iter):
            torch.cuda.synchronize()
            outputs = torch_model(inputs)
            # 
            torch.cuda.synchronize()
    torch.cuda.synchronize()
    end = time.time()
    avg_time = (end - begin) / n_iter
    outputs = outputs.last_hidden_state.to("cpu")
    print("outputs abs mean:", abs(np.array(outputs)).mean())
    print(f"average time: {avg_time}")
    torch.cuda.memory.empty_cache()
    np.save("test_results", np.array(outputs, dtype=np.float32))
    print("Save input & output as test_inputs.npy and test_results.npy")
 def export_onnx(model, data, path, extern=False):
    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
    onnx_model = onnx.load(path)
    onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
    #onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
    assert check
    add_value_info_for_constants(onnx_model)
    onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
    if extern:
        extern_path = path.replace('.onnx', '.pb')
        if os.path.exists(extern_path):
            os.remove(extern_path)
        convert_model_to_external_data(
            onnx_model,
            all_tensors_to_one_file=True,
            location=extern_path,
            size_threshold=1024,
            convert_attribute=False,
        )
    onnx.save(onnx_model, path)
 def add_value_info_for_constants(model : onnx.ModelProto):
    """
    Currently onnx.shape_inference doesn't use the shape of initializers, so add
    that info explicitly as ValueInfoProtos.
    Mutates the model.
    Args:
        model: The ModelProto to update.
    """
    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
    if model.ir_version < 4:
        return
    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
        inputs = {i.name for i in graph.input}
        existing_info = {vi.name: vi for vi in graph.value_info}
        for init in graph.initializer:
            # Check it really is a constant, not an input
            if init.name in inputs:
                continue
            # The details we want to add
            elem_type = init.data_type
            shape = init.dims
            # Get existing or create new value info for this constant
            vi = existing_info.get(init.name)
            if vi is None:
                vi = graph.value_info.add()
                vi.name = init.name
            # Even though it would be weird, we will not overwrite info even if it doesn't match
            tt = vi.type.tensor_type
            if tt.elem_type == onnx.TensorProto.UNDEFINED:
                tt.elem_type = elem_type
            if not tt.HasField("shape"):
                # Ensure we set an empty list if the const is scalar (zero dims)
                tt.shape.dim.extend([])
                for dim in shape:
                    tt.shape.dim.add().dim_value = dim
        # Handle subgraphs
        for node in graph.node:
            for attr in node.attribute:
                # Ref attrs refer to other attrs, so we don't need to do anything
                if attr.ref_attr_name != "":
                    continue
                if attr.type == onnx.AttributeProto.GRAPH:
                    add_const_value_infos_to_graph(attr.g)
                if attr.type == onnx.AttributeProto.GRAPHS:
                    for g in attr.graphs:
                        add_const_value_infos_to_graph(g)
    return add_const_value_infos_to_graph(model.graph)
 def main():
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
    modelname, batchsize, seqlen, export_path, data_type = parse_args()
    if data_type == "tf32":
        torch.backends.cuda.matmul.allow_tf32 = True
    else:
        os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
    model, voc_size = get_model(modelname)
    if export_path is not None:
        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
        path = os.path.join(export_path, filename)
        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
        export_onnx(model, param, path, True)
    if data_type == "fp16":
        model = model.half()
    run_pytorch(model, voc_size, batchsize, seqlen)
 if __name__ == "__main__":
    main()
--- a/examples/distributed/kunlun/export_onnx.sh
+++ b/examples/distributed/kunlun/export_onnx.sh
@ -0,0 +1,14 @@
 export HF_ENDPOINT=https://hf-mirror.com
 models=("bert" "gpt2" "llama")
 batch_size=(1 32)
 seq_len=(100 500)
 nproc=(1 2 4)
 for model in "${models[@]}"; do
    for bs in "${batch_size[@]}"; do
        for len in "${seq_len[@]}"; do
            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only 
        done
    done
 done 
--- a/examples/distributed/kunlun/kunlun_launch.py
+++ b/examples/distributed/kunlun/kunlun_launch.py
@ -0,0 +1,280 @@
 import sys
 sys.path.append('../')
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 from onnx.external_data_helper import convert_model_to_external_data
 from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model
 from functools import wraps
 def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    parser.add_argument(
        "--nproc_per_node", type=int, default=2, help="number of processes per node"
    )
    parser.add_argument(
        "--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
    )
    parser.add_argument(
        "--model", type=str, default="", help="path to the ONNX model file."
    )
    parser.add_argument(
        "--gen_std",
        default=False,
        action="store_true",
        help="whether to generate the standard results.",
    )
    parser.add_argument(
        "--run_single",
        default=False,
        action="store_true",
        help="whether run model with single process with standard inputs"
    )
    parser.add_argument(
        "--input_dir",
        default="./",
        help="path to save model input data"
    )
    parser.add_argument(
        "--result_dir",
        default="./",
        help="path to save model standard output"
    )
    parser.add_argument(
        "--internal_model_dir",
        default="./",
        help="path to save internal onnx model for parallel run"
    )
    args = parser.parse_args()
    # check path, mkdir if not exist
    check_exists(args.input_dir)
    check_exists(args.result_dir)
    check_exists(args.internal_model_dir)
    print("arg setting: ", args)
    return (
        args.num_nodes,
        args.nproc_per_node,
        args.name,
        args.model,
        args.gen_std,
        args.run_single,
        args.input_dir,
        args.result_dir,
        args.internal_model_dir
    )
 """
 utils function for this scripts
 """
 def check_exists(path: str):
    if not os.path.exists(path):
        os.makedirs(path)
 def np_assert(base, test, rtol=1e-2, atol=1e-1):
    # np.testing.assert_allclose(test, base, rtol, atol)
    print("max abs diff:", abs(base - test).max())
 """
 Perf wrapper, run function n times
 then average
 """
 def perf_it(n):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # warmup
            for _ in range(n):
                func(*args, **kwargs)
            t_total = 0
            for _ in range(n):
                t0 = time.time()
                func(*args, **kwargs)
                t1 = time.time()
                t_total += t1 - t0
            avg_time = (t_total) / n
            print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
            return avg_time
        return wrapper
    return decorator
 """
 Run InfiniTensor model with Standard input
 check=True: check with standard output gen by pytorch
 perf=True: run n times to get avg time
 """
 def run_model(task_name,
              model,
              runtime,
              world_size=1,
              rank=0,
              n=10,
              check=True,
              perf=True):
    stub = OnnxStub(model, runtime,
                    use_naive_allocator=True \
                    if task_name == "llama" else False)
    # load in Onnx model inputs
    def load_inputs(stub: OnnxStub):
        # check exists
        inputs = []
        for i, (name, tensor) in enumerate(stub.inputs.items()):
            input_path = os.path.join(input_dir, \
                                f"{task_name}_input_{i}.npy")
            print(input_path)
            if os.path.exists(input_path):
                input = np.load(input_path)
            else :
                raise KeyError(f"{i} th input of model not exists")
            # check shape
            if all(x == y for x,y in zip(input.shape, tensor.shape())):
                tensor.copyin_numpy(input)
            else:
                tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
    load_inputs(stub)
    # stub.tune()
    stub.run()
    time.sleep(0.01)
    output = next(stub.outputs.values().__iter__()).copyout_numpy()
    # check output results with standard output
    if check:
        st_output_path = os.path.join(result_dir, \
                                f"{task_name}_output.npy")
        assert os.path.exists(st_output_path) , \
                    "standard output not exists"
        st_output = np.load(st_output_path)
        if np.isnan(output).any():
            print("Nan in output")
            exit()
        np_assert(st_output, output)
    # perf
    if perf:
        @perf_it(n)
        def perf_infinitensor(stub: OnnxStub):
            stub.run()
        perf_infinitensor(stub)
    return output
 """
 Start a worker in Parallel
 """
 def start_worker(name: str,
           world_size: int,
           rank: int,
           local_rank: int,
           model: onnx.ModelProto):
    dist_name = name + "_dist"
    # partial a onnx model to world_size part
    model = parallel_model(model, world_size, rank)
    onnx.save(model, os.path.join(internal_model_dir, \
                                    f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
    runtime = backend.KUNLUNRuntime(local_rank)
    # print("init comm")
    runtime.init_comm(
        dist_name,
        world_size,
        rank,
    )
    run_model(name, model, runtime, world_size, rank)
 """
 generate standard input/output with
 sigle card run
 """
 def gen_standard(task_name: str, model: onnx.ModelProto):
    runtime = backend.KUNLUNRuntime(0)
    stub = OnnxStub(model, runtime)
    position_id = 0
    # generate random input for model
    for i, (name, tensor) in enumerate(stub.inputs.items()):
        input = tensor.copyout_numpy()
        if np.issubdtype(input.dtype, np.integer):
            if input.size == 1:
                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
            else:
                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
        elif input.dtype == np.bool_:
            input = np.random.randint(0,2,size=input.shape) > 0
        else:
            if i == 0:
                input = np.ones(input.shape).astype(input.dtype)
                position_id = input.shape[-1] - 1
            else:
                input = np.random.rand(*input.shape).astype(input.dtype)
        tensor.copyin_numpy(input)
        np.save(os.path.join(input_dir, \
                    f"{task_name}_input_{i}.npy"), input)
    stub.run()
    # print(stub.outputs)
    output = next(stub.outputs.values().__iter__()).copyout_numpy()
    if np.isnan(output).any():
        print("Nan in output")
        exit()
    np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
 def main():
    global input_dir, result_dir, internal_model_dir
    nnodes, nproc_per_node, task_name, \
        model_path, gen_std, run_single, \
            input_dir, result_dir, internal_model_dir = parse_args()
    # load input onnx model
    model = onnx.load(model_path)
    # generate standart output
    if gen_std:
        print("Generate inputs and outputs.")
        gen_standard(task_name, model)
        return
    if run_single:
        print("Run model by one GPU card.")
        runtime = backend.KUNLUNRuntime(0)
        run_model(task_name, model, runtime)
        return
    # run distributed parallel.
    world_size = nnodes * nproc_per_node
    print(f"Run model by {world_size} GPU in parallel.")
    workers = [
        mp.Process(
            target=start_worker,
            args=(task_name, world_size, rank, rank % nproc_per_node, model),
        )
        for rank in range(world_size)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
 if __name__ == "__main__":
    main()
--- a/examples/distributed/kunlun/launch.sh
+++ b/examples/distributed/kunlun/launch.sh
@ -0,0 +1,36 @@
 export HF_ENDPOINT=https://hf-mirror.com
 # models=("bert" "gpt2" "llama")
 models=("bert" "gpt2")
 batch_size=(1 32)
 seq_len=(100 500)
 nproc=(1 2 4)
 results_dir="results"
 if [ -d "$results_dir" ]; then
    echo "directory ./$results_dir exists"
 else
    mkdir -p "$results_dir"
    echo "mkdir $results_dir, logs saved there"
 fi
 for model in "${models[@]}"; do
    for bs in "${batch_size[@]}"; do
        for len in "${seq_len[@]}"; do
            # run pytorch model
            echo "Run pytorch $model with batch_size=$bs length=$len ."
            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
            for n in "${nproc[@]}"; do
                # run infinitensor 
                echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
                python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini 
                # delete internal files
                find ./ -type f -name "*.onnx" -delete
                find ./ -type f -name "*.pb" -delete
            done
            find ./ -type f -name "*.npy" -delete
        done
    done
 done
--- a/examples/distributed/kunlun/llama_launch.sh
+++ b/examples/distributed/kunlun/llama_launch.sh
@ -0,0 +1,35 @@
 export HF_ENDPOINT=https://hf-mirror.com
 # models=("bert" "gpt2" "llama")
 models=("llama")
 batch_size=(1 )
 seq_len=(100 500)
 nproc=(1 2 4)
 results_dir="results"
 if [ -d "$results_dir" ]; then
    echo "directory ./$results_dir exists"
 else
    mkdir -p "$results_dir"
    echo "mkdir $results_dir, logs saved there"
 fi
 for model in "${models[@]}"; do
    for bs in "${batch_size[@]}"; do
        for len in "${seq_len[@]}"; do
            echo "Run pytorch llama with batch_size="$bs" and length="$len""
            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
            for n in "${nproc[@]}"; do
                    # run pytorch model
                    echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
                    python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
                    # delete internal files
                    find ./ -type f -name "*.onnx" -delete
                    find ./ -type f -name "*0c" -delete
            done
            find ./ -type f -name "*.npy" -delete
        done
    done
 done
--- a/examples/distributed/kunlun/run_pytorch.py
+++ b/examples/distributed/kunlun/run_pytorch.py
@ -0,0 +1,245 @@
 import argparse
 import torch
 from transformers import BertModel, BertConfig
 from transformers import GPT2Model, GPT2Config
 from transformers import OPTModel, OPTConfig
 from transformers import LlamaModel, LlamaConfig
 import time
 import numpy as np
 import onnx
 import os
 import sys
 from onnx.external_data_helper import convert_model_to_external_data
 from onnxsim import simplify
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 def parse_args():
    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
    parser.add_argument(
        "--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
    )
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--export_onnx",
        type=str,
        nargs="?",
        default=None,
        const="./",
        help="whether and where to export onnx file",
    )
    parser.add_argument(
        "--input_dir",
        type=str,
        default="./",
        help="path to save pytorch model input data"
    )
    parser.add_argument(
        "--result_dir",
        type=str,
        default="./",
        help="path to save pytorch model output data"
    )
    parser.add_argument(
        "--export_only",
        action="store_true"
    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
        args.model,
        args.batch_size,
        args.length,
        args.export_onnx,
        args.input_dir,
        args.result_dir,
        args.export_only
    )
 def get_model(modelname):
    if modelname == "bert":
        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
        voc_size = BertConfig().vocab_size
    elif modelname == "gpt2":
        model = GPT2Model.from_pretrained("gpt2")
        voc_size = GPT2Config().vocab_size
    elif modelname == "opt":
        model = OPTModel.from_pretrained("./opt-125m")
        voc_size = OPTConfig().vocab_size
    elif modelname == "llama":
        model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
        voc_size = LlamaConfig().vocab_size
    else :
        raise KeyError(modelname)
    model = model.eval()
    return model, voc_size
 def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
    np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
    inputs = torch.from_numpy(data).to("cuda")
    torch_model = torch_model.to("cuda")
    n_iter = 10
    with torch.no_grad():
        for _ in range(10):
            outputs = torch_model(inputs)
    torch.cuda.synchronize()
    begin = time.time()
    with torch.no_grad():
        for _ in range(n_iter):
            torch.cuda.synchronize()
            outputs = torch_model(inputs)
            #
            torch.cuda.synchronize()
    torch.cuda.synchronize()
    end = time.time()
    avg_time = (end - begin) / n_iter
    outputs = outputs.last_hidden_state.to("cpu")
    print("outputs abs mean:", abs(np.array(outputs)).mean())
    print(f"average time: {avg_time}")
    torch.cuda.memory.empty_cache()
    np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
                                        np.array(outputs))
    print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
 def export_onnx(model_name, model, data, path, extern=False):
    # torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
    if model_name != "llama":
        onnx_model = onnx.load(path)
        onnx_model, check = simplify(onnx_model,
                                 skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
                                 # skipped_optimizers=['fuse_qkv'])
        assert check
        add_value_info_for_constants(onnx_model)
        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
        if extern:
            extern_path = path.replace('.onnx', '.pb')
            if os.path.exists(extern_path):
                os.remove(extern_path)
            convert_model_to_external_data(
                onnx_model,
                all_tensors_to_one_file=True,
                location=extern_path.split("/")[-1],
                size_threshold=1024,
                convert_attribute=False,
            )
        onnx.save(onnx_model, path)
    else:
        sys.path.append("onnxsim_large_model")
        from onnx_utils import set_onnx_input_shape
        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
        in_model_path = path
        out_model_path = in_model_path[:-5] + ".sim.onnx"
        onnx_model = onnx.load(in_model_path)
        print(f"load model from {in_model_path} success")
        size_th_bytes = 1024 * 1024
        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
        print("compress model success")
        onnx_model = set_onnx_input_shape(onnx_model, "")
        tensor_size_threshold = f"1024KB"
        skipped_optimizers = []
        skipped_optimizers.append("eliminate_duplicate_initializer")
        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
                                    tensor_size_threshold=tensor_size_threshold)
        if not check:
            raise ValueError(f"simplify compressed model {in_model_path} failed")
        print(f"simplify model success")
        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
        print(f"uncompress model success")
        add_value_info_for_constants(onnx_model)
        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
 def add_value_info_for_constants(model : onnx.ModelProto):
    """
    Currently onnx.shape_inference doesn't use the shape of initializers, so add
    that info explicitly as ValueInfoProtos.
    Mutates the model.
    Args:
        model: The ModelProto to update.
    """
    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
    if model.ir_version < 4:
        return
    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
        inputs = {i.name for i in graph.input}
        existing_info = {vi.name: vi for vi in graph.value_info}
        for init in graph.initializer:
            # Check it really is a constant, not an input
            if init.name in inputs:
                continue
            # The details we want to add
            elem_type = init.data_type
            shape = init.dims
            # Get existing or create new value info for this constant
            vi = existing_info.get(init.name)
            if vi is None:
                vi = graph.value_info.add()
                vi.name = init.name
            # Even though it would be weird, we will not overwrite info even if it doesn't match
            tt = vi.type.tensor_type
            if tt.elem_type == onnx.TensorProto.UNDEFINED:
                tt.elem_type = elem_type
            if not tt.HasField("shape"):
                # Ensure we set an empty list if the const is scalar (zero dims)
                tt.shape.dim.extend([])
                for dim in shape:
                    tt.shape.dim.add().dim_value = dim
        # Handle subgraphs
        for node in graph.node:
            for attr in node.attribute:
                # Ref attrs refer to other attrs, so we don't need to do anything
                if attr.ref_attr_name != "":
                    continue
                if attr.type == onnx.AttributeProto.GRAPH:
                    add_const_value_infos_to_graph(attr.g)
                if attr.type == onnx.AttributeProto.GRAPHS:
                    for g in attr.graphs:
                        add_const_value_infos_to_graph(g)
    return add_const_value_infos_to_graph(model.graph)
 def main():
    global input_dir, result_dir
    modelname, batchsize, seqlen, \
        export_path, input_dir, result_dir, export_only = parse_args()
    model, voc_size = get_model(modelname) # pytorch model
    if export_path is not None:
        os.makedirs(export_path, exist_ok=True)
        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
        path = os.path.join(export_path, filename)
        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
        export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
        if export_only:
            return
    run_pytorch(model, voc_size, batchsize, seqlen, modelname)
 if __name__ == "__main__":
    main()
--- a/examples/distributed/onnxsim_large_model
+++ b/examples/distributed/onnxsim_large_model
@ -0,0 +1 @@
 Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
--- a/examples/distributed/parallel.py
+++ b/examples/distributed/parallel.py
@ -0,0 +1,103 @@
 import onnx
 from onnx import (
    ModelProto,
    TensorProto,
    NodeProto,
    AttributeProto,
 )
 from onnx import helper, numpy_helper
 from typing import Dict, Any
 def parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
    for attr in node.attribute:
        if attr.name in attrs:
            if attr.type == AttributeProto.INT:
                attrs[attr.name] = attr.i
            elif attr.type == AttributeProto.INTS:
                attrs[attr.name] = attr.ints
            elif attr.type == AttributeProto.FLOAT:
                attrs[attr.name] = attr.f
            elif attr.type == AttributeProto.STRING:
                attrs[attr.name] = attr.s
            elif attr.type == AttributeProto.TENSOR:
                attrs[attr.name] = attr.t
            else:
                assert False, "Unsupported Attribute Type: {}".format(attr.type)
    return attrs
 def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
    data = {init.name: init for init in model.graph.initializer}
    nodes = list(model.graph.node)
    def shard_tensor(tensor: TensorProto, dim: int):
        array = numpy_helper.to_array(tensor)
        if dim >= array.ndim:
            dim = array.ndim - 1
        assert array.shape[dim] % tp_world_size == 0
        seg = array.shape[dim] // tp_world_size
        array = array[tp_rank * seg : (tp_rank + 1) * seg]
        return numpy_helper.from_array(array, name=tensor.name + f":sharded({dim})")
    def shard_gemm(node: NodeProto):
        attrs = parse_attribute(
            node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
        )
        trans = [attrs["transA"], attrs["transB"]]
        dim = 0
        for i, (input, t) in enumerate(zip(node.input, trans)):
            if input in data:
                dim = i
                sharded = shard_tensor(data[input], dim ^ t)
                node.input[i] = sharded.name
                data[input] = sharded
        if len(node.input) > 2:
            input = node.input[2]
            sharded = shard_tensor(data[input], dim)
            node.input[2] = sharded.name
            data[input] = sharded
        node.output[0] += f":sharded({dim})"
        return dim
    for i, node in enumerate(nodes):
        if node.op_type == "Gemm":
            output = node.output[0]
            dim = shard_gemm(node)
            gathered = [node.output[0] + f".{i}" for i in range(tp_world_size)]
            # all_gather
            nodes.insert(
                i + 1,
                helper.make_node(
                    op_type="AllGather",
                    inputs=[node.output[0]],
                    outputs=gathered,
                    name=node.name + "/allgather",
                    # domain="infini", # shape inference fails for custom domain
                ),
            )
            # concat
            nodes.insert(
                i + 2,
                helper.make_node(
                    op_type="Concat",
                    inputs=gathered,
                    outputs=[output],
                    name=node.name + "/concat",
                    axis=dim,
                ),
            )
    graph = helper.make_graph(
        nodes,
        model.graph.name + f"_{tp_rank}",
        model.graph.input,
        model.graph.output,
        data.values(),
        doc_string=model.graph.doc_string,
        value_info=model.graph.value_info,
    )
    model = helper.make_model(graph)
    onnx.shape_inference.infer_shapes(model)
    return model
--- a/examples/distributed/parallel_opt.py
+++ b/examples/distributed/parallel_opt.py
@ -0,0 +1,247 @@
 import onnx
 from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto
 from onnx import helper, numpy_helper
 from typing import Dict, List
 from placement import Placement, Replicate, Shard, _Partial
 import numpy as np
 def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
    data = {init.name: init for init in model.graph.initializer}
    vinfo = {info.name: info for info in model.graph.value_info}
    vinfo.update({info.name: info for info in model.graph.input})
    vinfo.update({info.name: info for info in model.graph.output})
    output = {info.name: info for info in model.graph.output}
    place: Dict[str, Placement] = {}
    nodes: List[NodeProto] = []
    def is_sharded(name: str):
        return place[name].is_shard()
    def shard_tensor(tensor: TensorProto, plc: Shard, groups: int = 1):
        # print(f"shard {tensor.name} at dim {dim}")
        assert plc.is_shard(), plc
        ndim = len(tensor.dims)
        if plc.dim < 0:
            plc.dim += ndim
        if tensor.dims[plc.dim] == 1:  # broadcast dim, no need to shard.
            return tensor
        array = numpy_helper.to_array(tensor)
        assert array.shape[plc.dim] % tp_world_size == 0, array.shape[plc.dim]
        dims = list(tensor.dims)
        dims.insert(plc.dim, groups)
        dims[plc.dim + 1] //= groups
        array = array.reshape(dims)
        seg = array.shape[plc.dim + 1] // tp_world_size
        array = array.take(
            indices=range(tp_rank * seg, (tp_rank + 1) * seg), axis=plc.dim + 1
        )
        dims = list(tensor.dims)
        dims[plc.dim] //= tp_world_size
        array = array.reshape(dims)
        tensor = numpy_helper.from_array(array, name=tensor.name)
        place[tensor.name] = plc
        return tensor
    def shard_gemm(node: NodeProto, groups: int = 1):
        # print("gemm", node.name)
        in_plc = place[node.input[0]]
        w_plc = Shard(-1) if in_plc.is_replicate() else Shard(0)
        transB = next((attr.i for attr in node.attribute if attr.name == "transB"), 0)
        if transB:
            w_plc.dim = ~w_plc.dim
        input = node.input[1]
        data[input] = shard_tensor(data[input], w_plc, groups)
        output = node.output[0]
        ndim = len(vinfo[output].type.tensor_type.shape.dim)
        out_plc = Shard(ndim - 1) if in_plc.is_replicate() else _Partial()
        place[node.output[0]] = out_plc
    def shard_concat(node: NodeProto):
        # hack for kvcache
        in_plc = place[node.input[1]]
        if in_plc.is_shard():
            seq_len_dim = vinfo[node.input[0]].type.tensor_type.shape.dim.pop(1)
            seq_len_dim.dim_value //= tp_world_size
            vinfo[node.input[0]].type.tensor_type.shape.dim.insert(1, seq_len_dim)
            place[node.input[0]] = in_plc
            place[node.output[0]] = in_plc
    def shard_binary(node: NodeProto, groups: int = 1):
        # print("binary", node.name, node.input[0], place[node.input[0]])
        a = node.input[0]
        b = node.input[1]
        if a in data:
            a, b = b, a
        place[node.output[0]] = place[a]
        if is_sharded(a) and b in data and len(data[b].dims) == 1:  # broadcast
            data[b] = shard_tensor(data[b], Shard(0), groups)
    def shard_reshape(node: NodeProto):
        # print("reshape", node.name, node.input[0], place[node.input[0]])
        if not is_sharded(node.input[0]):
            return
        in_plc = place[node.input[0]]
        s_dim = -1
        in_dims = [d.dim_value for d in vinfo[node.input[0]].type.tensor_type.shape.dim]
        tensor = data[node.input[1]]
        out_dims = numpy_helper.to_array(tensor).copy()
        if len(in_dims) == 3 and len(out_dims) == 4:
            if in_plc.dim == 0:
                s_dim = 1
            elif in_plc.dim == 2:
                s_dim = 2
        if len(in_dims) == 4 and len(out_dims) == 3:
            if in_plc.dim == 1:
                s_dim = 0
            elif in_plc.dim == 2:
                s_dim = 2
        if len(in_dims) == 2 and len(out_dims) == 3:
            if in_plc.dim == 1:
                s_dim = 2
        if len(in_dims) == 4 and len(out_dims) == 2:
            if in_plc.dim == 1:
                s_dim = 0
            elif in_plc.dim == 2:
                s_dim = 1
        if len(in_dims) == 3 and len(out_dims) == 2:
            if in_plc.dim == 1:
                s_dim = 0
            elif in_plc.dim == 2:
                s_dim = 1
        assert s_dim != -1
        assert out_dims[s_dim] % tp_world_size == 0, out_dims
        out_dims[s_dim] //= tp_world_size
        # if ONNX uses the same tensor for multiple Reshape Nodes, then rename it to distingush from others.
        node.input[1] = node.output[0] + "_shape"
        data[node.input[1]] = numpy_helper.from_array(out_dims, name=node.input[1])
        place[node.output[0]] = Shard(s_dim)
    def shard_split(node: NodeProto):
        if not is_sharded(node.input[0]):
            return
        in_plc = place[node.input[0]]
        split_tensor = data[node.input[1]]
        split = numpy_helper.to_array(split_tensor).copy()
        split //= tp_world_size
        data[node.input[1]] = numpy_helper.from_array(split, name=node.input[1])
        for output in node.output:
            place[output] = in_plc
    def shard_transpose(node: NodeProto):
        plc = place[node.input[0]]
        if plc.is_shard():
            perm = next(attr.ints for attr in node.attribute if attr.name == "perm")
            place[node.output[0]] = Shard(list(perm).index(plc.dim))
    def shard_node(node: NodeProto):
        if node.op_type in ["Relu", "Tanh", "Softmax", "Cast"]:
            place[node.output[0]] = place[node.input[0]]
        elif node.op_type in ["Where"]:
            place[node.output[0]] = place[node.input[1]]
        if node.op_type in {"Add", "Mul", "Div", "Max"}:
            shard_binary(node)
        elif node.op_type == "Reshape":
            shard_reshape(node)
        elif node.op_type == "Transpose":
            shard_transpose(node)
        elif node.op_type == "Split":
            shard_split(node)
        elif node.op_type == "MatMul":
            assert (
                place[node.input[0]] == place[node.input[1]]
            ), f"{place[node.input[0]]} != {place[node.input[1]]}"
            place[node.output[0]] = place[node.input[0]]
        elif node.op_type == "Concat":
            shard_concat(node)
    def find_successor(op_type: str, idx: int, search_limit: int = 1):
        for node in model.graph.node[idx + 1 : idx + 1 + search_limit]:
            if node.op_type == op_type:
                return node
        return None
    # all tensors are initially replicated.
    for v in vinfo:
        place[v] = Replicate()
    for t in data:
        place[t] = Replicate()
    for index, node in enumerate(model.graph.node):
        nodes.append(node)
        # linear
        if (node.op_type == "MatMul" or node.op_type == "Gemm") and any(
            input in data for input in node.input
        ):
            # FIXME(constroy): the last MatMul should not be sharded as TP.
            if (
                node.output[0] in output
                or (
                    index + 1 < len(model.graph.node)
                    and model.graph.node[index + 1].output[0]
                )
                in output
            ):
                continue
            groups = 1
            # If the Gemm or Matmul is followed by a split, then the inputs are concatinated by groups
            split_node = find_successor("Split", index, search_limit=2)
            if split_node is not None:
                groups = len(split_node.output)
            shard_gemm(node, groups)
            plc = place[node.output[0]]
            if plc.is_partial():
                new_name = node.output[0] + f":{plc}"
                place[new_name] = place[node.output[0]]
                # insert all_reduce
                nodes.append(
                    helper.make_node(
                        op_type="ReduceSum",
                        inputs=[new_name],
                        outputs=[node.output[0]],
                        name=node.name + "/all_reduce",
                        noop_with_empty_axes=1,
                        communicator=0,  # hack to treat ReduceSum as AllReduceSum
                    )
                )
                place[node.output[0]] = Replicate()
                node.output[0] = new_name
            if len(node.input) > 2:  # split bias to add
                prev = nodes[-1]
                new_name = prev.output[0] + "_no_bias"
                place[new_name] = place[node.output[0]]
                bias = helper.make_node(
                    op_type="Add",
                    inputs=[new_name, node.input[2]],
                    outputs=[prev.output[0]],
                    name=node.name + "/bias",
                )
                node.input.pop()
                prev.output[0] = new_name
                shard_binary(bias, groups)
                nodes.append(bias)
            continue
        shard_node(node)
    new_input = []
    for info in model.graph.input:
        new_input.append(vinfo[info.name])
    graph = helper.make_graph(
        nodes,
        model.graph.name + f"_{tp_rank}",
        new_input,
        model.graph.output,
        data.values(),
        doc_string=model.graph.doc_string,
        # value_info=vinfo.values(),
    )
    for output in graph.output:
        tt = output.type.tensor_type
        if tt.HasField("shape"):
            tt.ClearField("shape")
    model = helper.make_model(graph)
    #model = onnx.shape_inference.infer_shapes(model)
    return model
--- a/examples/distributed/placement.py
+++ b/examples/distributed/placement.py
@ -0,0 +1,64 @@
 from typing import Optional
 class Placement:
    # base class Placement type
    # convenient utils to check for placement types
    def is_shard(self, dim: Optional[int] = None) -> bool:
        if dim is not None and isinstance(self, Shard):
            return self.dim == dim
        else:
            return isinstance(self, Shard)
    def is_replicate(self) -> bool:
        return isinstance(self, Replicate)
    def is_partial(self) -> bool:
        return isinstance(self, _Partial)
 class Replicate(Placement):
    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Replicate):
            return False
        return True
    def __repr__(self) -> str:
        """
        machine readable representation of the Replicate placement
        """
        return "Replicate()"
 class Shard(Placement):
    # shard placement, shard on a dim
    def __init__(self, dim):
        self.dim = dim
    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Shard):
            return False
        return self.dim == other.dim
    def __repr__(self) -> str:
        """
        machine readable representation of the Shard placement
        """
        return f"Shard(dim={self.dim})"
 class _Partial(Placement):
    def __init__(self, reduce_op: str = "sum"):
        self.reduce_op: str = reduce_op
    def __eq__(self, other: object) -> bool:
        if not isinstance(other, _Partial):
            return False
        return self.reduce_op == other.reduce_op
    def __repr__(self) -> str:
        """
        machine readable representation of the Partial placement
        """
        return f"_Partial(reduce_op={self.reduce_op})"
--- a/examples/python/llama_kvcache_inference.py
+++ b/examples/python/llama_kvcache_inference.py
@ -0,0 +1,145 @@
 import os
 from pyinfinitensor.onnx import OnnxStub, backend
 import numpy as np
 import onnx
 import torch
 from transformers import LlamaModel, LlamaForCausalLM
 from tqdm import tqdm
 import onnx_graphsurgeon as gs
 from onnxsim import simplify
 import argparse
 parser = argparse.ArgumentParser(description='')
 parser.add_argument('--batchsize', dest='batchsize', type=int, default=1)
 parser.add_argument('--layer', dest='n_layers', type=int, default=2)
 parser.add_argument('--iter', dest='n_iter', type=int, default=1)
 parser.add_argument('--n_max_length', dest='n_max_length', type=int, default=1024)
 parser.add_argument('--pretrained_llama_path', dest='pretrained_llama_path', type=str, 
                    default="/data0/shared/data/public/opensource_models/meta-llama/Llama-2-7b-hf/")
 parser.add_argument('--onnx_model_path', dest='onnx_model_path', type=str, 
                    default="/data1/shared/llama")
 args = parser.parse_args()
 ONNX_MODEL_PATH = "{}/llama_bs{}_layer{}.onnx".format(args.onnx_model_path, args.batchsize, args.n_layers)
 ONNX_WEIGHT_PATH = "./llama_bs{}_layer{}.pb".format(args.batchsize, args.n_layers)
 def export_onnx(model: LlamaModel, ONNX_MODEL_PATH):
    param = torch.zeros(
        (args.batchsize, 1024), dtype=torch.long)
    logits = model(param, past_key_values=None)
    param_kvcache = torch.zeros((args.batchsize, 1), dtype=torch.long)
    torch.onnx.export(model, (param_kvcache, {"past_key_values": logits.past_key_values,
                                              "position_ids": param_kvcache}), ONNX_MODEL_PATH, verbose=False,
                      do_constant_folding=True,)
    onnx_model = onnx.load(ONNX_MODEL_PATH)
    print("simplifing onnx model")
    onnx_model, check = simplify(onnx_model, skipped_optimizers=[
                                 'eliminate_duplicate_initializer'])
    assert check
    onnx.save(onnx_model, ONNX_MODEL_PATH, save_as_external_data=True, location=ONNX_WEIGHT_PATH)
    print("simlifing finished.")
@gs.Graph.register()
 def replace_with_attention(self, inputs, outputs, inputs_added, outputs_removed):
    for inp in inputs:
        inp.outputs.clear()   
    for out in outputs:
        out.inputs.clear()
    for inp in inputs_added:
        inputs.append(inp)
    for out in outputs_removed:
        out.inputs.clear()
    return self.layer(op="AttentionKVCache", inputs=inputs, outputs=outputs)
 def replace_onnx_with_attention_op():
    graph = gs.import_onnx(
        onnx.load(ONNX_MODEL_PATH))
    tmap = graph.tensors()
    for i in range(args.n_layers):
        inputs = [
            tmap["onnx::Concat_" + str((i+1)*2)],
            tmap["onnx::Concat_" + str((i+1)*2+1)],
            tmap["/model/layers." + str(i) + "/self_attn/Add_output_0"],
            tmap["/model/layers." + str(i) + "/self_attn/Add_1_output_0"],
            tmap["/model/layers." + str(i) + "/self_attn/Transpose_2_output_0"]]
        outputs = [
            tmap["/model/layers." + str(i) + "/self_attn/MatMul_1_output_0"]]
        inputs_added = [graph.inputs[1]]
        outputs_removed = []
        graph.replace_with_attention(
            inputs, outputs, inputs_added, outputs_removed)
    graph.outputs = [tmap[graph.outputs[0].name]]
    graph.cleanup(True).toposort()
    onnx.save(gs.export_onnx(graph), ONNX_MODEL_PATH, save_as_external_data=True)
 if __name__ == "__main__":
    kvcache_torch = None
    torch_model = LlamaForCausalLM.from_pretrained(
        args.pretrained_llama_path, num_hidden_layers=int(args.n_layers)).eval()
    n_heads = torch_model.config.num_attention_heads
    n_dims = torch_model.config.hidden_size // n_heads
    if not os.path.exists(ONNX_MODEL_PATH):
        print("exporting onnx graph")
        export_onnx(torch_model, ONNX_MODEL_PATH)
        replace_onnx_with_attention_op()
    else:
        print("will use exsiting onnx graph")
    onnx_model = onnx.load(ONNX_MODEL_PATH)
    stub = OnnxStub(onnx_model, backend.cuda_runtime())
    count_wrong = 0
    for i in tqdm(range(0, args.n_max_length)):
        query = np.random.randint(
            torch_model.config.vocab_size, size=(args.batchsize, 1), dtype=np.int32)
        position_id = i*np.ones((args.batchsize, 1), dtype=np.int32)
        ####################################
        # pytorch
        ####################################
        outputs_torch = torch_model(
            torch.tensor(query), past_key_values=kvcache_torch)
        logit_torch = outputs_torch['logits']
        kvcache_torch = outputs_torch['past_key_values']
        ####################################
        # infinitensor
        ####################################
        # copyin input
        (list(stub.inputs.items()))[0][1].copyin_int64(
            query.reshape(-1).tolist())
        (list(stub.inputs.items()))[1][1].copyin_int64(
            position_id.reshape(-1).tolist())
        stub.run()
        ####################################
        # validation
        ####################################
        # copyout output
        logits_it = np.array((list(stub.outputs.items()))
                                [0][1].copyout_float())
        try:
            np.testing.assert_allclose(
                logit_torch[:, -1, :].detach().cpu().numpy().flatten(), logits_it, rtol=1e-3, atol=1e-3)
        except Exception as e: 
            try:
                np.testing.assert_allclose(
                    np.argmax(logit_torch[:, -1, :].detach().cpu().numpy().flatten()), np.argmax(logits_it), rtol=1e-3, atol=1e-3)
            except:
                count_wrong = count_wrong + 1
    result = "{}/{} failed.".format(count_wrong, args.n_max_length)
    print(result)
    del stub
--- a/examples/python/onnx_inference.py
+++ b/examples/python/onnx_inference.py
@ -0,0 +1,29 @@
 import sys
 import onnx
 import torch
 import numpy as np
 from pyinfinitensor.onnx import OnnxStub, backend
 if __name__ == '__main__':
    args = sys.argv
    if len(sys.argv) != 2:
        print("Usage: python onnx_inference.py model_name.onnx")
        exit()
    model_path = sys.argv[1]
    # print(model_path)
    onnx_model = onnx.load(model_path)
    onnx_input = onnx_model.graph.input[0]
    input_shape = [[d.dim_value for d in _input.type.tensor_type.shape.dim]
                   for _input in onnx_model.graph.input]
    # Assume that there is only one input tensor
    input_shape = input_shape[0]
    # print(input_shape)
    input_data = np.random.random(input_shape).astype(np.float32)
    model = OnnxStub(onnx_model, backend.cuda_runtime())
    next(iter(model.inputs.values())).copyin_numpy(input_data)
    model.run()
    outputs = next(iter(model.outputs.values())).copyout_numpy()
    outputs = torch.tensor(outputs)
    print(outputs.shape)
--- a/examples/python/paddle_densenet.py
+++ b/examples/python/paddle_densenet.py
@ -0,0 +1,80 @@
 import paddle
 import paddle.vision.transforms as T
 from paddle.vision.datasets import Cifar10
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 import itertools
 def run_cifar_train_and_infer():
    paddle.device.set_device("gpu")
    transform = T.Compose(
        [
            T.Resize(224),
            T.ToTensor(),
            T.Normalize(
                mean=[0.5, 0.5, 0.5],
                std=[0.5, 0.5, 0.5],
                to_rgb=True,
            ),
        ]
    )
    # 下载数据集并初始化 DataSet
    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
    # 模型组网并初始化网络
    densenet = paddle.vision.models.DenseNet(num_classes=10)
    model = paddle.Model(densenet)
    # 模型训练的配置准备，准备损失函数，优化器和评价指标
    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()), 
                paddle.nn.CrossEntropyLoss(),
                paddle.metric.Accuracy())
    # 模型训练
    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
    # 模型评估
    model.evaluate(test_dataset, batch_size=64, verbose=1)
    # export to ONNX
    save_path = 'onnx.save/densenet' # 需要保存的路径
    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # 为模型指定输入的形状和数据类型，支持持 Tensor 或 InputSpec ，InputSpec 支持动态的 shape。
    paddle.onnx.export(densenet, save_path, input_spec=[x_spec], opset_version=11)
    # 加载onnx模型并放到Infinitensor中
    model_path = save_path + ".onnx"
    onnx_model = onnx.load(model_path)
    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
    model = gofusion_model
    model.init()
    # 启动推理
    cifar10_test = Cifar10(
        mode="test",
        transform=transform,  # apply transform to every image
        backend="cv2",  # use OpenCV as image transform backend
    )
    batch_size = 1
    total_size = 0
    total_acc = 0.0
    for data in itertools.islice(iter(cifar10_test), 10000):
        images, labels = data
        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
        model.run()
        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
        outputs = paddle.to_tensor(outputs)
        outputs = paddle.reshape(outputs, (1, 10))
        labels = paddle.to_tensor(labels)
        labels = paddle.reshape(labels, (1,1))
        acc = paddle.metric.accuracy(outputs, labels)
        total_acc += acc
        total_size += batch_size
    print("test acc: {}".format(total_acc.numpy() / total_size))
 if __name__ == "__main__":
    run_cifar_train_and_infer()
--- a/examples/python/paddle_inception.py
+++ b/examples/python/paddle_inception.py
@ -0,0 +1,80 @@
 import paddle
 import paddle.vision.transforms as T
 from paddle.vision.datasets import Cifar10
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 import itertools
 def run_cifar_train_and_infer():
    paddle.device.set_device("gpu")
    transform = T.Compose(
        [
            T.Resize(224),
            T.ToTensor(),
            T.Normalize(
                mean=[0.5, 0.5, 0.5],
                std=[0.5, 0.5, 0.5],
                to_rgb=True,
            ),
        ]
    )
    # 下载数据集并初始化 DataSet
    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
    # 模型组网并初始化网络
    inception = paddle.vision.models.InceptionV3(num_classes=10)
    model = paddle.Model(inception)
    # 模型训练的配置准备，准备损失函数，优化器和评价指标
    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()), 
                paddle.nn.CrossEntropyLoss(),
                paddle.metric.Accuracy())
    # 模型训练
    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
    # 模型评估
    model.evaluate(test_dataset, batch_size=64, verbose=1)
    # export to ONNX
    save_path = 'onnx.save/inception' # 需要保存的路径
    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # 为模型指定输入的形状和数据类型，支持持 Tensor 或 InputSpec ，InputSpec 支持动态的 shape。
    paddle.onnx.export(inception, save_path, input_spec=[x_spec], opset_version=11)
    # 加载onnx模型并放到Infinitensor中
    model_path = save_path + ".onnx"
    onnx_model = onnx.load(model_path)
    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
    model = gofusion_model
    model.init()
    # 启动推理
    cifar10_test = Cifar10(
        mode="test",
        transform=transform,  # apply transform to every image
        backend="cv2",  # use OpenCV as image transform backend
    )
    batch_size = 1
    total_size = 0
    total_acc = 0.0
    for data in itertools.islice(iter(cifar10_test), 10000):
        images, labels = data
        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
        model.run()
        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
        outputs = paddle.to_tensor(outputs)
        outputs = paddle.reshape(outputs, (1, 10))
        labels = paddle.to_tensor(labels)
        labels = paddle.reshape(labels, (1,1))
        acc = paddle.metric.accuracy(outputs, labels)
        total_acc += acc
        total_size += batch_size
    print("test acc: {}".format(total_acc.numpy() / total_size))
 if __name__ == "__main__":
    run_cifar_train_and_infer() 
--- a/examples/python/paddle_model_dev.md
+++ b/examples/python/paddle_model_dev.md
@ -0,0 +1,31 @@
 ## Description
 This is a doc to tell you how to run paddle*.py in your machine. If your model run on other machines except Nvidia, you may need to make some change.
 ## What do we do in paddle*.py files?
 1. Train model and evalute model with Cifar10 dataset
 2. Export paddle model to onnx model
 3. Load onnx model, infer with InfiniTensor and calculate the inference accuracy
 ## Command
 1. Go to `/examples/python` folder 
 2. Run the following command
   1. ```
      python paddle_resnet.py
      python paddle_densenet.py
      python paddle_inception.py
      ```
 ## What should I do if I use other device(MLU, XPU, NPU)?
 You need to change this code:
 ```
 paddle.device.set_device("gpu") # Change gpu to mlu, xpu or npu
 ```
--- a/examples/python/paddle_resnet.py
+++ b/examples/python/paddle_resnet.py
@ -0,0 +1,81 @@
 import paddle
 import paddle.vision.transforms as T
 from paddle.vision.datasets import Cifar10
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
 import itertools
 from paddle.vision.models.resnet import BasicBlock
 def run_cifar_train_and_infer():
    paddle.device.set_device("gpu")
    transform = T.Compose(
        [
            T.Resize(224),
            T.ToTensor(),
            T.Normalize(
                mean=[0.5, 0.5, 0.5],
                std=[0.5, 0.5, 0.5],
                to_rgb=True,
            ),
        ]
    )
    # 下载数据集并初始化 DataSet
    train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform)
    test_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform)
    # 模型组网并初始化网络
    resnet = paddle.vision.models.ResNet(BasicBlock, depth=18, num_classes=10)
    model = paddle.Model(resnet)
    # 模型训练的配置准备，准备损失函数，优化器和评价指标
    model.prepare(paddle.optimizer.Adam(parameters=model.parameters()), 
                paddle.nn.CrossEntropyLoss(),
                paddle.metric.Accuracy())
    # 模型训练
    model.fit(train_dataset, epochs=5, batch_size=64, verbose=1)
    # 模型评估
    model.evaluate(test_dataset, batch_size=64, verbose=1)
    # export to ONNX
    save_path = 'onnx.save/resnet' # 需要保存的路径
    x_spec = paddle.static.InputSpec([1, 3, 224, 224], 'float32', 'x') # 为模型指定输入的形状和数据类型，支持持 Tensor 或 InputSpec ，InputSpec 支持动态的 shape。
    paddle.onnx.export(resnet, save_path, input_spec=[x_spec], opset_version=11)
    # 加载onnx模型并放到Infinitensor中
    model_path = save_path + ".onnx"
    onnx_model = onnx.load(model_path)
    gofusion_model = OnnxStub(onnx_model, backend.cuda_runtime())
    model = gofusion_model
    model.init()
    # 启动推理
    cifar10_test = Cifar10(
        mode="test",
        transform=transform,  # apply transform to every image
        backend="cv2",  # use OpenCV as image transform backend
    )
    batch_size = 1
    total_size = 0
    total_acc = 0.0
    for data in itertools.islice(iter(cifar10_test), 10000):
        images, labels = data
        next(model.inputs.items().__iter__())[1].copyin_float(images.reshape([3*224*224]).tolist())
        model.run()
        outputs = next(model.outputs.items().__iter__())[1].copyout_float()
        outputs = paddle.to_tensor(outputs)
        outputs = paddle.reshape(outputs, (1, 10))
        labels = paddle.to_tensor(labels)
        labels = paddle.reshape(labels, (1,1))
        acc = paddle.metric.accuracy(outputs, labels)
        total_acc += acc
        total_size += batch_size
    print("test acc: {}".format(total_acc.numpy() / total_size))
 if __name__ == "__main__":
    run_cifar_train_and_infer()
--- a/examples/python/resnet_inference.py
+++ b/examples/python/resnet_inference.py
@ -0,0 +1,24 @@
 import sys
 import onnx
 import torch
 import numpy as np
 from pyinfinitensor.onnx import OnnxStub, backend
 import torchvision.models as models
 if __name__ == '__main__':
    model_path = './resnet18.onnx'
    tv_model = models.resnet50(weights=None)
    input_shape = (1, 3, 224, 224)
    param = torch.rand(input_shape)
    torch.onnx.export(tv_model, param, model_path, verbose=False)
    onnx_model = onnx.load(model_path)
    model = OnnxStub(onnx_model, backend.cuda_runtime())
    images = np.random.random(input_shape).astype(np.float32)
    next(iter(model.inputs.values())).copyin_numpy(images)
    model.run()
    outputs = next(iter(model.outputs.values())).copyout_numpy()
    outputs = torch.tensor(outputs)
    outputs = torch.reshape(outputs, (1, 1000))
    _, predicted = torch.max(outputs, 1)
    print(predicted)
--- a/include/bang/bang_common.h
+++ b/include/bang/bang_common.h
@ -2,6 +2,10 @@
 #include "cnnl.h"
 #include "cnrt.h"
 #include "core/common.h"
 #include "core/data_type.h"
 #ifdef INFINI_USE_CNCL
 #include "cncl.h"
 #endif
 #define checkBangError(call)                                                   \
    {                                                                          \
@ -27,4 +31,70 @@ namespace infini {
 using BangPtr = void *;
 inline cnnlDataType_t cnnlDataTypeConvert(DataType dataType) {
    if (dataType == DataType::Float32) {
        return CNNL_DTYPE_FLOAT;
    }
    if (dataType == DataType::Float16) {
        return CNNL_DTYPE_HALF;
    }
    if (dataType == DataType::Double) {
        return CNNL_DTYPE_DOUBLE;
    }
    if (dataType == DataType::Int8) {
        return CNNL_DTYPE_INT8;
    }
    if (dataType == DataType::Int32) {
        return CNNL_DTYPE_INT32;
    }
    if (dataType == DataType::UInt8) {
        return CNNL_DTYPE_UINT8;
    }
    if (dataType == DataType::BFloat16) {
        return CNNL_DTYPE_BFLOAT16;
    }
    if (dataType == DataType::Int64) {
        return CNNL_DTYPE_INT64;
    }
    if (dataType == DataType::Bool) {
        return CNNL_DTYPE_BOOL;
    }
    IT_TODO_HALT_MSG("Data type " + dataType.toString() +
                     " not supported in CNNL.");
 }
 #ifdef INFINI_USE_CNCL
 inline cnclDataType_t cnclDataTypeConvert(DataType dataType) {
    if (dataType == DataType::Float32) {
        return cnclFloat32;
    }
    if (dataType == DataType::Float16) {
        return cnclHalf;
    }
    if (dataType == DataType::Int8) {
        return cnclInt8;
    }
    if (dataType == DataType::Int16) {
        return cnclInt16;
    }
    if (dataType == DataType::Int32) {
        return cnclInt32;
    }
    if (dataType == DataType::UInt8) {
        return cnclUint8;
    }
    if (dataType == DataType::UInt16) {
        return cnclUint16;
    }
    if (dataType == DataType::UInt32) {
        return cnclUint32;
    }
    if (dataType == DataType::BFloat16) {
        return cnclBfloat16;
    }
    IT_TODO_HALT_MSG("Data type " + dataType.toString() +
                     " not supported in CNCL.");
 }
 #endif
 } // namespace infini
--- a/include/bang/bang_runtime.h
+++ b/include/bang/bang_runtime.h
@ -7,16 +7,19 @@ namespace infini {
 class BangRuntimeObj : public RuntimeObj {
  private:
    cnnlHandle_t cnnl;
    cnrtQueue_t queue;
    std::unique_ptr<CommunicatorObj> comm;
    BangPtr workspace;
    size_t workspaceSize;
    mutable size_t cursor;
  public:
-    BangRuntimeObj() : RuntimeObj(Device::BANG) {
+    explicit BangRuntimeObj(int deviceId = 0)
        : RuntimeObj(Device::BANG, deviceId) {
        cnInit(0);
        CNdev dev;
-        cnDeviceGet(&dev, 0);
+        cnDeviceGet(&dev, deviceId);
        checkBangError(cnrtSetDevice(dev));
        cnrtQueue_t queue;
        checkBangError(cnrtQueueCreate(&queue));
        checkCnnlError(cnnlCreate(&cnnl));
@ -24,10 +27,12 @@ class BangRuntimeObj : public RuntimeObj {
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
        cursor = 0;
        workspace = alloc(workspaceSize);
    }
    virtual ~BangRuntimeObj() {
        dealloc(workspace);
        checkBangError(cnrtQueueDestroy(queue));
        checkCnnlError(cnnlDestroy(cnnl));
    }
    string toString() const override;
@ -45,10 +50,15 @@ class BangRuntimeObj : public RuntimeObj {
    void dealloc(void *ptr) override { checkBangError(cnrtFree(ptr)); }
    cnnlHandle_t cnnlHandle() const { return cnnl; }
    BangPtr getWorkspace(size_t size) const {
-        IT_ASSERT(size <= workspaceSize);
+        IT_ASSERT((cursor + size) <= workspaceSize);
-        return workspace;
+        cursor += size;
        void *temp = workspace;
        temp += (cursor - size);
        return temp;
    }
    void resetWorkspace() const { cursor = 0; }
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
@ -66,6 +76,9 @@ class BangRuntimeObj : public RuntimeObj {
        checkBangError(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
                                  CNRT_MEM_TRANS_DIR_PEER2PEER));
    }
    void initComm(const string &name, int worldSize, int rank) final;
    CommunicatorObj &getCommunicator() const override { return *comm; }
    cnrtQueue_t getBangQueue() const { return queue; }
  private:
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
--- a/include/bang/cncl_communicator.h
+++ b/include/bang/cncl_communicator.h
@ -0,0 +1,79 @@
 #pragma once
 #include "bang_common.h"
 #include "core/communicator.h"
 #include <chrono>
 #include <cncl.h>
 #include <cnrt.h>
 #include <cstdlib>
 #include <filesystem>
 #include <fstream>
 #include <mutex>
 #include <thread>
 namespace infini {
 class CnclCommunicatorObj final : public CommunicatorObj {
  private:
    cnclComm_t *comms;
  public:
    CnclCommunicatorObj(const string &name, int worldSize, int rank)
        : CommunicatorObj(worldSize, rank) {
        const std::string filePath("./" + name + "_cncl_id.bin");
        cnclCliqueId clique_id;
        if (rank == 0) {
            CNCL_CHECK(cnclGetCliqueId(&clique_id));
            std::ofstream ofs(filePath, std::ios::binary);
            ofs.write((char *)&clique_id, sizeof(cnclCliqueId));
        } else {
            auto begin = std::chrono::steady_clock::now();
            while (!std::filesystem::exists(filePath)) {
                auto now = std::chrono::steady_clock::now();
                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
                             "time limit (10s) exceeded.");
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
            std::ifstream ifs(filePath, std::ios::binary);
            ifs.read((char *)&clique_id, sizeof(cnclCliqueId));
        }
        int num_comms = 1;
        int *dev_list = new int[num_comms];
        int *rank_list = new int[num_comms];
        comms = new cnclComm_t[num_comms];
        uint32_t num_dev = 0;
        checkBangError(cnrtGetDeviceCount(&num_dev));
        for (int i = 0; i < num_comms; i++) {
            rank_list[i] = rank;
            dev_list[i] = rank_list[i] % num_dev;
        }
        CNCL_CHECK(cnclInitComms(comms, num_comms, dev_list, rank_list,
                                 worldSize, &clique_id));
        if (rank == 0) {
            std::filesystem::remove(filePath);
        }
        delete[] dev_list;
        delete[] rank_list;
    }
    ~CnclCommunicatorObj() {
        CNCL_CHECK(cnclDestroyComms(comms, 1));
        delete[] comms;
    }
    // Get the actual cnclComm_t
    cnclComm_t getCnclComm() { return comms[0]; }
    virtual string toString() const final {
        std::ostringstream oss;
        oss << "CNCL communicator";
        return oss.str();
    }
 };
 } // namespace infini
--- a/include/core/common.h
+++ b/include/core/common.h
@ -40,12 +40,12 @@ using HashType = uint64_t; // compatible with std::hash
 // Assert: conditions should have no side effect
 #define _IT_ASSERT_2(condition, info)                                          \
-    (static_cast<bool>(condition)                                              \
+    static_cast<bool>(condition)                                               \
-         ? void(0)                                                             \
+        ? void(0)                                                              \
-         : throw ::infini::Exception(                                          \
+        : throw ::infini::Exception(                                           \
-               std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +  \
+              std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +   \
-               "] Assertion failed (" + #condition + "): " + info))
+              "] Assertion failed (" + #condition + "): " + info)
-#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "");
+#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "")
 #define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
 #define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
@ -61,21 +61,35 @@ template <typename T> auto enum_to_underlying(T e) {
 }
 template <typename T> std::string vecToString(const std::vector<T> &vec) {
-    std::string ret;
+    std::stringstream ss;
-    ret.append("[");
+    ss << "[";
-    for (auto d : vec) {
+    for (size_t i = 0; i < vec.size(); ++i) {
-        ret.append(std::to_string(d));
+        ss << vec.at(i);
-        ret.append(",");
+        if (i < vec.size() - 1) {
            ss << ",";
        }
    }
-    if (!vec.empty())
+    ss << "]";
-        ret.pop_back();
+    return ss.str();
-    ret.append("]");
+}
-    return ret;
+
 template <typename T> std::string vecToString(const T *st, size_t length) {
    std::stringstream ss;
    ss << "[";
    size_t i = 0;
    for (i = 0; i < length; i++) {
        ss << *(st + i);
        if (i < length - 1) {
            ss << ",";
        }
    }
    ss << "]";
    return ss.str();
 }
 double timeit(
    const std::function<void()> &func,
-    const std::function<void(void)> &sync = []() {}, int warmupRounds = 200,
+    const std::function<void(void)> &sync = []() {}, int warmupRounds = 10,
-    int timingRounds = 200);
+    int timingRounds = 10);
 } // namespace infini
--- a/include/core/communicator.h
+++ b/include/core/communicator.h
@ -0,0 +1,22 @@
 #pragma once
 #include "object.h"
 #include "ref.h"
 namespace infini {
 // base class
 class CommunicatorObj : public Object {
  protected:
    int worldSize;
    int rank;
  public:
    CommunicatorObj(int worldSize, int rank)
        : worldSize(worldSize), rank(rank) {}
    virtual ~CommunicatorObj() = default;
    virtual int getWorldSize() const { return worldSize; }
    virtual int getRank() const { return rank; }
 };
 } // namespace infini
--- a/include/core/data_type.h
+++ b/include/core/data_type.h
@ -1,22 +1,54 @@
 #pragma once
 #include "core/common.h"
 namespace infini {
 class DataType {
  public:
    // legacy
    static const DataType Float32;
    static const DataType UInt32;
    // These are just aligned with the type and index of onnx:
    // <https://onnx.ai/onnx/intro/concepts.html#element-type>
-    static const DataType UInt8, Int8, UInt16, Int16, Int32, Int64;
+    static const DataType Undefine;
-    static constexpr size_t sizePerElement[]{
+    static const DataType Float32;
-        sizeof(float),    sizeof(uint32_t), sizeof(uint8_t), sizeof(int8_t),
+    static const DataType UInt8;
-        sizeof(uint16_t), sizeof(int16_t),  sizeof(int32_t), sizeof(int64_t)};
+    static const DataType Int8;
    static const DataType UInt16;
    static const DataType Int16;
    static const DataType Int32;
    static const DataType Int64;
    static const DataType String;
    static const DataType Bool;
    static const DataType Float16;
    static const DataType Double;
    static const DataType UInt32;
    static const DataType UInt64;
    static const DataType BFloat16;
    // "sizePerElement" show the DType to cpu_type
    // DataType::Bool -> int8_t   DataType::Float16 -> uint16_t
    static constexpr size_t sizePerElement[]{0,
                                             sizeof(float),
                                             sizeof(uint8_t),
                                             sizeof(int8_t),
                                             sizeof(uint16_t),
                                             sizeof(int16_t),
                                             sizeof(int32_t),
                                             sizeof(int64_t),
                                             sizeof(std::string),
                                             sizeof(int8_t),
                                             sizeof(uint16_t),
                                             sizeof(double),
                                             sizeof(uint32_t),
                                             sizeof(uint64_t),
                                             0,
                                             0,
                                             sizeof(uint16_t)};
-    static constexpr std::string_view names[]{"Float32", "UInt32", "UInt8",
+    static constexpr std::string_view names[]{
-                                              "Int8",    "UInt16", "Int16",
+        "Undefine",    "Float32", "UInt8",  "Int8",   "UInt16",
-                                              "Int32",   "Int64"};
+        "Int16",       "Int32",   "Int64",  "String", "Bool",
        "Float16",     "Double",  "UInt32", "UInt64", "PlaceHolder",
        "PlaceHolder", "BFloat16"};
    static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5,  6,  7, -1,
                                   3,  4, 9, 1, 8, -1, -1, 4};
  private:
    int index;
@ -29,37 +61,43 @@ class DataType {
    bool operator==(const DataType &rhs) const { return index == rhs.index; }
    bool operator<(const DataType &rhs) const { return index < rhs.index; }
-    template <typename T> static DataType get() {
+    template <typename T> static int get() {
        IT_TODO_HALT_MSG("Unsupported data type");
    }
    size_t getSize() const { return sizePerElement[index]; }
    string toString() const { return string(names[index]); }
    int cpuTypeInt() const { return cpuType[index]; }
    int getIndex() const { return index; }
 };
 inline const DataType DataType::Float32(0);
 inline const DataType DataType::UInt32(1);
 inline const DataType DataType::UInt8(2), DataType::Int8(3),
    DataType::UInt16(4), DataType::Int16(5), DataType::Int32(6),
    DataType::Int64(7);
 // Method definitions are out of the declaration due to GCC bug:
 // https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
-template <> inline DataType DataType::get<float>() { return Float32; }
+template <> inline int DataType::get<float>() { return 0; }
-template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
+template <> inline int DataType::get<uint32_t>() { return 1; }
-template <> inline DataType DataType::get<uint8_t>() { return UInt8; }
+template <> inline int DataType::get<uint8_t>() { return 2; }
-template <> inline DataType DataType::get<int8_t>() { return Int8; }
+template <> inline int DataType::get<int8_t>() { return 3; }
-template <> inline DataType DataType::get<uint16_t>() { return UInt16; }
+template <> inline int DataType::get<uint16_t>() { return 4; }
-template <> inline DataType DataType::get<int16_t>() { return Int16; }
+template <> inline int DataType::get<int16_t>() { return 5; }
-template <> inline DataType DataType::get<int32_t>() { return Int32; }
+template <> inline int DataType::get<int32_t>() { return 6; }
-template <> inline DataType DataType::get<int64_t>() { return Int64; }
+template <> inline int DataType::get<int64_t>() { return 7; }
 template <> inline int DataType::get<uint64_t>() { return 8; }
 template <> inline int DataType::get<double>() { return 9; }
 template <int index> struct DT {};
-template <> struct DT<0> { using t = float; };
+template <> struct DT<0> { using t = bool; };
-template <> struct DT<1> { using t = uint32_t; };
+template <> struct DT<1> { using t = float; };
 template <> struct DT<2> { using t = uint8_t; };
 template <> struct DT<3> { using t = int8_t; };
 template <> struct DT<4> { using t = uint16_t; };
 template <> struct DT<5> { using t = int16_t; };
 template <> struct DT<6> { using t = int32_t; };
 template <> struct DT<7> { using t = int64_t; };
 template <> struct DT<8> { using t = char; };
 template <> struct DT<9> { using t = int8_t; };
 template <> struct DT<10> { using t = uint16_t; };
 template <> struct DT<11> { using t = double; };
 template <> struct DT<12> { using t = uint32_t; };
 template <> struct DT<13> { using t = uint64_t; };
 template <> struct DT<16> { using t = uint16_t; };
 } // namespace infini
--- a/include/core/graph.h
+++ b/include/core/graph.h
@ -1,4 +1,5 @@
 #pragma once
 #include "core/lazy_allocator.h"
 #include "core/operator.h"
 #include "core/tensor.h"
@ -9,9 +10,11 @@ class GraphObj : public Object {
    Runtime runtime;
    TensorVec tensors;
    OpVec ops;
    LazyAllocator allocator;
  public:
-    explicit GraphObj(Runtime runtime) : runtime(runtime), sorted(false){};
+    explicit GraphObj(Runtime runtime)
        : runtime(runtime), allocator(runtime), sorted(false){};
    GraphObj(Runtime runtime, OpVec ops_in);
    string toString() const override;
    Runtime getRuntime() const { return runtime; }
@ -50,6 +53,7 @@ class GraphObj : public Object {
    const TensorVec &getTensors() const { return tensors; }
    const OpVec &getOperators() const { return ops; }
    OpVec getComputeOps() const;
    Tensor getTensor(int) const;
    /**
     * Sort the nodes in topological order.
@ -61,7 +65,13 @@ class GraphObj : public Object {
    void optimize();
-    void dataMalloc();
+    void shape_infer();
    void dataMalloc(bool useNaiveAllocator = false, size_t memPoolSize = 0);
    Tensor cloneKV(Tensor &tensor);
    void freeHeap();
    /**
     * @brief Add an operator and create its outputs. Output tensor arguments
@ -117,6 +127,11 @@ class GraphObj : public Object {
     * @brief If the nodes is sorted in topological order.
     */
    bool sorted;
    /**
     * @brief If the weight tensors are allocated.
     */
    bool weightAllocated = false;
 };
 } // namespace infini
--- a/include/core/graph_handler.h
+++ b/include/core/graph_handler.h
@ -5,31 +5,11 @@
 #include <cstdint>
 #include <iostream>
-namespace infini {
+#ifdef USE_CUDA
 #include "cuda/cuda_runtime.h"
 #endif
-// Use the indices from onnx to reduce delivery overhead,
+namespace infini {
 // which comes from onnx but may be not only used for onnx.
 //
 // see https://onnx.ai/onnx/intro/concepts.html#element-type
 enum OnnxDType : int {
    UNDEFINED = 0,
    FLOAT,
    UINT8,
    INT8,
    UINT16,
    INT16,
    INT32,
    INT64,
    STRING,
    BOOL,
    FLOAT16,
    DOUBLE,
    UINT32,
    UINT64,
    COMPLEX64,
    COMPLEX128,
    BFLOAT16,
 };
 class GraphHandlerObj {
    Graph g;
@ -50,27 +30,40 @@ class GraphHandlerObj {
                            int pw, int sh, int sw, int dh, int dw, int oph,
                            int opw);
    Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
-                  Tensor bias, ActType act);
+                  Tensor bias, ActType act,
-    Tensor batchNorm(Tensor input, Tensor output, Tensor mean, Tensor var,
+                  std::string matmul_compute_type = "default");
-                     Tensor scale, Tensor bias, float momentum, float eps,
+    Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
-                     bool training);
+                              Tensor var, Tensor scale, Tensor bias,
                              float momentum, float eps, bool training);
    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
                              Tensor bias, float eps, int axis, int stash_type);
    Tensor rmsNorm(Tensor input, Tensor weight, Tensor output);
    Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-                   int ph, int pw, int sh, int sw);
+                   int ph, int pw, int sh, int sw, int ceilMode);
    Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-                   int ph, int pw, int sh, int sw);
+                   int ph, int pw, int sh, int sw, int ceilMode);
    Tensor add(Tensor a, Tensor b, Tensor c);
    Tensor sub(Tensor a, Tensor b, Tensor c);
    Tensor mul(Tensor a, Tensor b, Tensor c);
    Tensor div(Tensor a, Tensor b, Tensor c);
    Tensor pow(Tensor a, Tensor b, Tensor c);
    Tensor min(Tensor a, Tensor b, Tensor c);
    Tensor max(Tensor a, Tensor b, Tensor c);
    Tensor relu(Tensor x, Tensor y);
    Tensor silu(Tensor x, Tensor y);
    Tensor gelu(Tensor x, Tensor y);
    Tensor sigmoid(Tensor x, Tensor y);
    Tensor hardSigmoid(Tensor x, Tensor y);
    Tensor hardSwish(Tensor x, Tensor y);
    Tensor tanh(Tensor x, Tensor y);
    Tensor erf(Tensor x, Tensor y);
    Tensor softmax(Tensor x, Tensor y, int axis);
    Tensor abs(Tensor x, Tensor y);
    Tensor sqrt(Tensor x, Tensor y);
    Tensor neg(Tensor x, Tensor y);
    Tensor shape(Tensor x, Tensor y);
    Tensor identity(Tensor x, Tensor y);
    Tensor flatten(Tensor s, Tensor y, int axis);
@ -79,17 +72,51 @@ class GraphHandlerObj {
                std::optional<float> max);
    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
    Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
    Tensor resize(Tensor input, Tensor output,
                  const std::optional<vector<int>> &axes, Tensor sizes,
                  Tensor scales, Tensor roi, vector<uint32_t> sizes_,
                  vector<float> scales_, vector<float> roi_, string mode,
                  string ratioPolicy, string nearestMode,
                  string coordTransMode);
    Tensor squeeze(Tensor input, Tensor output, Shape axes);
    Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
    Tensor concat(TensorVec inputs, Tensor output, int dim);
    Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
                            Tensor input_q, Tensor input_k, Tensor input_v,
                            Tensor position_id, Tensor output_matmul);
    Tensor RoPE(Tensor pos, Tensor input, Tensor output);
    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
-                    int num_outputs);
+                    std::variant<int, vector<int>> numOrRatio);
    Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
    Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
    Tensor reduceMean(Tensor data, Tensor reduced,
                      const optional<vector<int>> &axes, bool keepdims);
    Tensor reduceSum(Tensor data, Tensor reduced,
                     const optional<vector<int>> &axes, bool keepdims);
    Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
                 const vector<int> &ends, const optional<vector<int>> &axes,
                 const optional<vector<int>> &steps);
    Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
               const optional<vector<int>> &axes);
    Tensor cast(Tensor input, Tensor output, int to);
    Tensor expand(Tensor input, Tensor output, Shape dims);
    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
    std::vector<int> getDims(Tensor x) { return x->getDims(); }
    Tensor allReduceSum(Tensor input, Tensor output);
    Tensor allReduceProd(Tensor input, Tensor output);
    Tensor allReduceMin(Tensor input, Tensor output);
    Tensor allReduceMax(Tensor input, Tensor output);
    Tensor allReduceAvg(Tensor input, Tensor output);
    TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
    Tensor broadcast(Tensor input, Tensor output, int root);
    Tensor send(Tensor input, int source, int destination, Tensor output);
    Tensor recv(Tensor output, int source, int destination, Shape dims,
                int outputType, Tensor input);
    Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
                        std::string mode);
    Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
               int size);
    //------ modifiers
@ -97,15 +124,31 @@ class GraphHandlerObj {
    inline void optimize() { g->optimize(); }
    inline void shape_infer() { g->shape_infer(); }
    void change_shape(const vector<int> &shape, int tensorId);
    //------ runtime
-    inline void data_malloc() { g->dataMalloc(); }
+    inline void data_malloc(bool useNaiveAllocator = false,
                            size_t memPoolSize = 0) {
        g->dataMalloc(useNaiveAllocator, memPoolSize);
    }
    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
    inline void free_heap() { g->freeHeap(); }
    inline void tune() { g->getRuntime()->run(g, true); }
    inline void run() { g->getRuntime()->run(g); }
    inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
 #ifdef USE_CUDA
    inline void run_with_cudagraph() {
        (as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
    }
 #endif
 };
 } // namespace infini
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@ -2,10 +2,11 @@
 #include "core/common.h"
 #include "core/operator.h"
 #include "core/tensor.h"
 #include "utils/operator_utils.h"
 #include <functional>
 #include <nlohmann/json.hpp>
 using json = nlohmann::json;
 namespace infini {
 using json = nlohmann::json;
 class RuntimeObj; // Forward declaration for Kernel::compute
@ -29,7 +30,6 @@ class Kernel {
  public:
    Kernel() {}
    virtual ~Kernel() {}
    /**
     * @param op The operator to be executed.
     * @param record The parameters for kernel execution. If extra parameters
@ -102,11 +102,9 @@ class KernelRegistry {
    }
    Kernel *getKernel(const KernelAttrs &kernelAttrs) const {
        auto it = kernels.find(kernelAttrs);
-        IT_ASSERT(it != kernels.end(),
+        IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
-                  "Kernel not found for key {" +
+                                           get_kernel_attrs_str(kernelAttrs) +
-                      to_string(enum_to_underlying(std::get<0>(kernelAttrs))) +
+                                           "}");
                      ", " + OpRegistry::getOpName(std::get<1>(kernelAttrs)) +
                      ", " + std::get<2>(kernelAttrs).toString() + "}");
        return std::get<0>(it->second);
    }
    const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const {
@ -131,15 +129,16 @@ class CpuKernelWithoutConfig : public Kernel {
 } // namespace infini
-#define _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, cnt)        \
+#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt)                  \
    namespace infini {                                                         \
    static const bool _CAT(_register_kernel_, cnt) =                           \
-        KernelRegistry::getInstance().registerKernel(                          \
+        KernelRegistry::getInstance().registerKernel(KernelAttrs{device,       \
-            KernelAttrs{device, opType, dataType}, new kernel(), name);        \
+                                                                 opType},      \
                                                     new kernel(), name);      \
    }
-#define REGISTER_KERNEL(device, opType, dataType, kernel, name)                \
+#define REGISTER_KERNEL(device, opType, kernel, name)                          \
-    _REGISTER_KERNEL_1(device, opType, dataType, kernel, name, __COUNTER__)
+    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
 #define _REGISTER_CONSTRUCTOR_1(type, constructor, cnt)                        \
    namespace infini {                                                         \
--- a/include/core/lazy_allocator.h
+++ b/include/core/lazy_allocator.h
@ -0,0 +1,122 @@
 #pragma once
 #include "core/runtime.h"
 #include "core/tensor.h"
 #ifdef BUILD_TEST
 #include "gtest/gtest.h"
 #endif
 #include <cstddef>
 #include <map>
 #include <unordered_set>
 namespace infini {
 class LazyAllocator {
  private:
 #ifdef BUILD_TEST
    FRIEND_TEST(LazyAllocator, testMergeFreeBlocks);
    FRIEND_TEST(LazyAllocator, testAllocWithEndFreeBlock);
 #endif
    Runtime runtime;
    size_t used = 0;
    size_t peak = 0;
    size_t weightPeak = 0;
    size_t heapPeak = 0;
    size_t alignment;
    bool hasMemPool = false;
    size_t memPoolSize = 0;
    // pointer to the memory actually allocated
    void *ptr = nullptr;
    // pointer to the weight memory space
    void *weightPtr = nullptr;
    // memory pool ptr
    void *memPoolPtr = nullptr;
    // // a cache designed for a batch size that has already occurred
    // std::unordered_map<size_t, std::unordered_map<TensorObj *, size_t>>
    // batchsizeToTensorOffset;
    struct freeBlockInfo {
        size_t addr;
        size_t blockSize;
    };
    struct cmpFreeBlockInfo {
        bool operator()(const freeBlockInfo &a, const freeBlockInfo &b) const {
            return (a.blockSize != b.blockSize) ? (a.blockSize < b.blockSize)
                                                : (a.addr < b.addr);
        }
    };
    // free balanced tree, maintains all free memory blocks
    std::set<freeBlockInfo, cmpFreeBlockInfo> freeBlocks;
    // key: head address offset of the free memory block
    // value: blockSize of the block
    std::unordered_map<size_t, size_t> headAddrToBlockSize;
    // key: tail address offset of the free memory block
    // value: blockSize of the block
    std::unordered_map<size_t, size_t> tailAddrToBlockSize;
  public:
    LazyAllocator(Runtime runtime);
    virtual ~LazyAllocator();
    void init();
    void setMemPool(size_t memPoolSize);
    bool getMemPoolStatus();
    // function: simulate memory allocation
    // arguments：
    //     size: size of memory block to be allocated
    // return: head address offset of the allocated memory block
    size_t alloc(size_t size);
    size_t allocWeight(size_t size);
    size_t heapAlloc(size_t size);
    void freeHeap();
    // function: simulate memory free
    // arguments:
    //     addr: head address offset of memory block to be free
    //     size: size of memory block to be freed
    void free(size_t addr, size_t size);
    // function: perform actual memory allocation
    // return: pointer to the head address of the allocated memory
    void *getPtr();
    // void addCache(size_t batchsize, std::unordered_map<TensorObj *, size_t>);
    // std::unordered_map<TensorObj *, size_t> getCache(size_t batchsize);
    void *getWeightPtr();
    void *getHeapPtr();
    void info();
  private:
    // function: memory alignment, rouned up
    // return: size of the aligned memory block
    size_t getAlignedSize(size_t size);
 };
 } // namespace infini
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@ -0,0 +1,269 @@
 #pragma once
 #ifndef OP_TYPE_H
 #define OP_TYPE_H
 #include <string>
 #include <unordered_set>
 namespace infini {
 struct OpType {
    using underlying_t = uint16_t;
    // Clang-format is ambiguous in formating of comment alignment.
    // In order to disambiguate, it is necessary to comment all enum
    // elements.
    enum : underlying_t {
        Unknown,
        Abs,                // Unary
        Acos,               // Unary
        Acosh,              // Unary
        Add,                // Binary
        And,                // Binary
        ArgMax,             //
        Asin,               // Unary
        Asinh,              // Unary
        Atan,               // Unary
        Atanh,              // Unary
        AttentionKVCache,   // Fusion
        AveragePool,        // Pool
        BatchNormalization, //
        Bernoulli,          //
        BitShift,           // Binary
        BitwiseAnd,         // Binary
        BitwiseNot,         // Binary
        BitwiseOr,          // Binary
        BitwiseXor,         // Binary
        BlackmanWindow,     //
        Cast,               // Unary
        CastLike,           //
        Ceil,               // Unary
        Celu,               //
        CenterCropPad,      //
        Clip,               // Unary
        Col2lm,
        Compress,
        Concat,
        ConcatFromSequence,
        ConstantOfShape,
        Conv,          // ComputationIntensive
        ConvInteger,   // ComputationIntensive
        ConvTranspose, // ComputationIntensive
        Cos,           // Unary
        Cosh,          // Unary
        CumSum,
        DFT,
        DeformConv, // ComputationIntensive
        DepthToSpace,
        DequantizeLinear,
        Det,
        Div, // Binary
        Dropout,
        DynamicQuantizeLinear,
        Einsum,
        Elu,
        Equal, // Compair
        Erf,   // Unary
        Exp,   // Unary
        Expand,
        EyeLike,
        Flatten,
        Floor, // Unary
        GRU,
        Gather,
        GatherElements,
        GatherND,
        Gemm,
        Gelu,              // Unary
        GlobalAveragePool, // GlobalPool
        GlobalLpPool,      // GlobalPool
        GlobalMaxPool,     // GlobalPool
        Greater,           // Compair
        GreaterOrEqual,    // Compair
        GridSample,
        GroupNormalization,
        HammingWindow,
        HannWindow,
        HardSigmoid,
        HardSwish,
        Hardmax,
        Identity,
        If,
        InstanceNormalization,
        IsInf,
        IsNaN,
        LRN,
        LSTM,
        LayerNormalization,
        LeakyRelu,
        Less,        // Compair
        LessOrEqual, // Compair
        Log,         // Unary
        LogSoftmax,
        Loop,
        LpNormalization,
        LpPool,
        MatMul,        // ComputationIntensive
        MatMulInteger, // ComputationIntensive
        Max,
        MaxPool,
        MaxRoiPool,
        MaxUnpool,
        Mean,
        MeanVarianceNormalization,
        MelWeightMatrix,
        Min,
        Mish,
        Mod,         // Binary
        Mul,         // Binary
        Multinomial, //
        Neg,         // Unary
        NegativeLogLikelihoodLoss,
        NonMaxSuppression,
        NonZero,
        Not, // Unary
        OneHot,
        Optional,
        OptionalGetElement,
        OptionalHasElement,
        Or,            // Binary
        PRelu,         //
        Pad,           //
        Pow,           // Binary
        QLinearConv,   // ComputationIntensive
        QLinearMatMul, // ComputationIntensive
        QuantizeLinear,
        RNN,
        RandomNormal,
        RandomNormalLike,
        RandomUniform,
        RandomUniformLike,
        Range,
        Reciprocal,
        ReduceL1,        // Reduce
        ReduceL2,        // Reduce
        ReduceLogSum,    // Reduce
        ReduceLogSumExp, // Reduce
        ReduceMax,       // Reduce
        ReduceMean,      // Reduce
        ReduceMin,       // Reduce
        ReduceProd,      // Reduce
        ReduceSum,       // Reduce
        ReduceSumSquare, // Reduce
        Relu,            // Unary
        Silu,            // Unary
        Reshape,
        Resize,
        ReverseSequence,
        RoiAlign,
        RoPE,    // Fusion
        Round,   // Unary
        RMSNorm, // Fusion
        STFT,
        Scan,
        Scatter,
        ScatterElements,
        ScatterND,
        Selu,
        SequenceAt,
        SequenceConstruct,
        SequenceEmpty,
        SequenceErase,
        SequenceInsert,
        SequenceLength,
        SequenceMap,
        Shape,
        Shrink,
        Sigmoid,
        Sign,
        Sin,  // Unary
        Sinh, // Unary
        Size,
        Slice,
        Softmax,
        SoftmaxCrossEntropyLoss,
        Softplus,
        Softsign,
        SpaceToDepth,
        Split,
        SplitToSequence,
        Sqrt,
        Squeeze,
        StringNormalizer,
        Sub,  // Binary
        Sum,  //
        Tan,  // Unary
        Tanh, // unary
        TfIdfVectorizer,
        ThresholdedRelu,
        Tile,
        TopK,
        Transpose,
        Trilu,
        Unique,
        Unsqueeze,
        Upsample,
        Where,
        Xor, // Binary
        // CUSTOM DEFINED
        G2BMM,
        GBMM,
        MemBound,
        // TODO
        ConvTransNHWC,
        ConvBackwardFilter,
        ReluBackward,
        SigmoidBackward,
        TanhBackward,
        Fill,
        Extend,
        MSELoss,
        Hardtanh,
        L2Loss,
        Rsqrt,
        FloorDiv,
        FloorMod,
        Square,
        SquaredDifference,
        // Communication Ops
        AllReduceSum,
        AllReduceProd,
        AllReduceMin,
        AllReduceMax,
        AllReduceAvg,
        AllGather,
        Broadcast,
        Send,
        Recv,
    } type;
    constexpr OpType(decltype(type) t) : type(t) {}
    constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {}
    constexpr underlying_t underlying() const { return type; }
    bool operator==(OpType others) const { return type == others.type; }
    bool operator!=(OpType others) const { return type != others.type; }
    bool operator<(OpType others) const { return type < others.type; }
    const char *toString() const;
    bool isUnary() const;
    bool isBinary() const;
    bool isElementWise() const;
    bool isCompair() const;
    bool isPool() const;
    bool isGlobalPool() const;
    bool isMatMulOrConv() const;
 };
 enum class ActType {
    None,
    Relu,
    Sigmoid,
    Tanh,
 };
 } // namespace infini
 #endif // OP_TYPE_H
--- a/include/core/operator.h
+++ b/include/core/operator.h
@ -1,231 +1,14 @@
 #pragma once
 #include "core/op_type.h"
 #include "core/tensor.h"
 namespace infini {
-
+using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
 enum class OpType {
    Unknown = 0,
    // linear
    Conv = 100,
    ConvBackwardFilter,
    ConvBackwardData,
    Matmul,
    ConvTrans,
    ConvTransNHWC,
    G2BMM,
    GBMM,
    Pad,
    Slice,
    Concat,
    Split,
    Transpose,
    Extend,
    MaxPool,
    AvgPool,
    Add,
    Sub,
    Mul,
    Div,
    Pow,
    Gather,
    ReduceMean,
    Reshape,
    Flatten,
    Identity,
    // element wise
    BatchNorm = 200,
    Softmax,
    Activation,
    Relu,
    ReluBackward,
    PRelu,
    Sigmoid,
    SigmoidBackward,
    Tanh,
    TanhBackward,
    Abs,
    Sin,
    Cos,
    Tan,
    ASin,
    ACos,
    ATan,
    SinH,
    CosH,
    TanH,
    ASinH,
    ACosH,
    ATanH,
    Resize,
    Arange,
    Shape,
    Copy,
    Ceil,
    Floor,
    Clip,
    Erf,
    Exp,
    Fill,
    Log,
    L2Loss,
    Maximum,
    Minimum,
    MSELoss,
    Neg,
    Power,
    Reciprocal,
    Sqrt,
    Rsqrt,
    Cast,
    FloorDiv,
    FloorMod,
    Det,
    Round,
    Square,
    SquaredDifference,
    Hardtanh,
    Equal,
    NotEqual,
    GreaterThan,
    GreaterEqual,
    LessThan,
    LessEqual,
    And,
    Or,
    Xor,
    Not,
    BitAnd,
    BitOr,
    BitXor,
    BitNot,
    BitLeftShift,
    BitRightShift,
    Dropout,
    //
    MemBound = 300,
 };
 using KernelAttrs = std::tuple<Device, OpType, DataType>;
 class OpRegistry {
  public:
    static std::string getOpName(OpType opType) {
 #define FOP(op)                                                                \
    case OpType::op:                                                           \
        return #op
        switch (opType) {
            FOP(Unknown);
            // linear
            FOP(Conv);
            FOP(ConvBackwardFilter);
            FOP(ConvBackwardData);
            FOP(Matmul);
            FOP(ConvTrans);
            FOP(G2BMM);
            FOP(GBMM);
            FOP(Pad);
            FOP(Slice);
            FOP(Concat);
            FOP(Split);
            FOP(Transpose);
            FOP(Extend);
            FOP(MaxPool);
            FOP(AvgPool);
            FOP(Add);
            FOP(Sub);
            FOP(Mul);
            FOP(Div);
            FOP(Pow);
            FOP(Gather);
            FOP(ReduceMean);
            FOP(Reshape);
            FOP(Identity);
            FOP(Shape);
            // element wise
            FOP(BatchNorm);
            FOP(Softmax);
            FOP(Activation);
            FOP(Relu);
            FOP(ReluBackward);
            FOP(PRelu);
            FOP(Sigmoid);
            FOP(SigmoidBackward);
            FOP(Tanh);
            FOP(TanhBackward);
            FOP(Abs);
            FOP(Sin);
            FOP(Cos);
            FOP(Tan);
            FOP(ASin);
            FOP(ACos);
            FOP(ATan);
            FOP(SinH);
            FOP(CosH);
            FOP(TanH);
            FOP(ASinH);
            FOP(ACosH);
            FOP(ATanH);
            FOP(Copy);
            FOP(Ceil);
            FOP(Floor);
            FOP(Clip);
            FOP(Erf);
            FOP(Exp);
            FOP(Fill);
            FOP(Log);
            FOP(L2Loss);
            FOP(Maximum);
            FOP(Minimum);
            FOP(MSELoss);
            FOP(Neg);
            FOP(Power);
            FOP(Reciprocal);
            FOP(Sqrt);
            FOP(Rsqrt);
            FOP(Cast);
            FOP(FloorDiv);
            FOP(FloorMod);
            FOP(Det);
            FOP(Round);
            FOP(Square);
            FOP(SquaredDifference);
            FOP(Hardtanh);
            FOP(Equal);
            FOP(NotEqual);
            FOP(GreaterThan);
            FOP(GreaterEqual);
            FOP(LessThan);
            FOP(LessEqual);
            FOP(And);
            FOP(Or);
            FOP(Xor);
            FOP(Not);
            FOP(BitAnd);
            FOP(BitOr);
            FOP(BitXor);
            FOP(BitNot);
            FOP(BitLeftShift);
            FOP(BitRightShift);
            //
            FOP(MemBound);
        default:
            IT_ASSERT(false);
            break;
        }
 #undef FOP
    }
 };
 enum class ActType {
    None,
    Relu,
    Sigmoid,
    Tanh,
 };
 struct OpPerfKey {
    HashType hash;
-    OpType opType;
+    OpType::underlying_t opType;
    vector<int> attrs;
  public:
@ -233,7 +16,7 @@ struct OpPerfKey {
    // https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
    OpPerfKey() = default;
    OpPerfKey(HashType hash, OpType opType, vector<int> attrs = {})
-        : hash(hash), opType(opType), attrs(attrs) {}
+        : hash(hash), opType(opType.underlying()), attrs(attrs) {}
    bool operator==(const OpPerfKey &rhs) const {
        if (hash != rhs.hash)
            return false;
@ -272,8 +55,7 @@ class OperatorObj : public Object {
  public:
    OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
-    virtual optional<vector<Shape>>
+    virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
    inferShape(const TensorVec &inputs) const = 0;
    virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
    /**
     * @brief Constructs outputs (if requried) and check whether the operator is
@ -290,16 +72,7 @@ class OperatorObj : public Object {
     */
    HashType hash() const;
-  public: // check Op type
+  public:
    bool isLinearOp() const;
    bool isElementWiseOp() const;
    bool isSplitOp() const;
    bool isConcatOp() const;
    bool isComputeOp() const;
    bool isTransposeOp() const;
    bool isReshapeOp() const;
    bool isMemBoundOp() const;
  public: // getter and setter
    const TensorVec &getInputs() const { return inputs; }
    const TensorVec &getOutputs() const { return outputs; }
@ -317,6 +90,7 @@ class OperatorObj : public Object {
    OpType getOpType() const { return type; }
    // HACK: set correct data type
    DataType getDType() const { return getInputs(0)->getDType(); }
    DataType getOutDType() const { return getOutput()->getDType(); }
    virtual int numInputs() const = 0;
    virtual int numOutputs() const = 0;
@ -331,7 +105,7 @@ class OperatorObj : public Object {
                           const TensorVec &newOutputs) const = 0;
  protected:
-    optional<vector<Shape>> inferShape() const;
+    optional<vector<Shape>> inferShape();
    vector<DataType> inferDataType() const;
  private:
--- a/include/core/perf_engine.h
+++ b/include/core/perf_engine.h
@ -2,8 +2,8 @@
 #include "core/graph.h"
 #include "core/kernel.h"
 #include <nlohmann/json_fwd.hpp>
 using json = nlohmann::json;
 namespace infini {
 using json = nlohmann::json;
 class PerfEngine {
  public:
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@ -1,5 +1,7 @@
 #pragma once
 #include "core/common.h"
 #include "core/communicator.h"
 #include "core/op_type.h"
 #include "core/ref.h"
 #include <memory>
@ -13,6 +15,7 @@ class GraphObj;
 class GraphHandlerObj;
 class RuntimeObj;
 class BlobObj;
 template <typename T> class WorkspaceObj;
 using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
@ -21,7 +24,7 @@ using Graph = Ref<GraphObj>;
 using GraphHandler = Ref<GraphHandlerObj>;
 using Runtime = Ref<RuntimeObj>;
 using Blob = Ref<BlobObj>;
-enum class OpType;
+template <typename T> using Workspace = Ref<WorkspaceObj<T>>;
 using TensorVec = vector<Tensor>;
 using OpVec = vector<Operator>;
@ -29,15 +32,17 @@ using OpLists = list<Operator>;
 using VType = uint32_t;
-enum class Device { CPU = 1, CUDA, BANG, INTELCPU };
+enum class Device { CPU = 1, CUDA, BANG, INTELCPU, KUNLUN };
 /***************** Forward declaration end *****************/
 class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
  protected:
    Device device;
    int deviceId;
  public:
-    RuntimeObj(Device device) : device(device) {}
+    explicit RuntimeObj(Device device, int deviceId = 0)
        : device(device), deviceId(deviceId) {}
    RuntimeObj(RuntimeObj &other) = delete;
    RuntimeObj &operator=(RuntimeObj const &) = delete;
    virtual ~RuntimeObj() {}
@ -69,6 +74,7 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
    }
    bool isCuda() const { return device == Device::CUDA; }
    bool isBang() const { return device == Device::BANG; }
    bool isKUNLUN() const { return device == Device::KUNLUN; }
    void copyBlob(const TensorObj *dst, const TensorObj *src) const;
    // TODO: unify these copy APIs
    virtual void copyBlobFromCPU(void *dst, const void *src,
@ -77,6 +83,12 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
                               size_t bytes) const = 0;
    virtual string toString() const = 0;
    int getDeviceId() const { return deviceId; }
    virtual void initComm(const string &name, int worldSize, int rank) = 0;
    virtual CommunicatorObj &getCommunicator() const = 0;
  protected:
    void printProfilingData(double totTime,
                            const std::map<OpType, double> &opTime,
@ -97,6 +109,9 @@ class CpuRuntimeObj : public RuntimeObj {
    void copyBlobToCPU(void *dst, const void *src, size_t bytes) const override;
    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override;
    void initComm(const string &, int, int) override { IT_TODO_HALT(); }
    CommunicatorObj &getCommunicator() const override { IT_TODO_HALT(); }
 };
 class NativeCpuRuntimeObj : public CpuRuntimeObj {
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@ -1,12 +1,17 @@
 #pragma once
 #include "core/tensor_base.h"
 #include "core/tensor_type.h"
 #include "utils/data_convert.h"
 #include <cmath>
 #include <cstring>
 #include <fstream>
 #if USE_CUDA
 #include "cuda/cuda_runtime.h"
 #endif
-
+#if USE_BANG
 #include "bang/bang_runtime.h"
 #endif
 namespace infini {
 // TODO: how to deal with this
@ -18,13 +23,7 @@ class TensorObj : public TensorBaseObj {
    size_t _size; // Cache of Π(shape).
    Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
                  // scratch have a new id.
-
+    TensorType tensorType = TensorType::others;
    void copyin(const void *ptr, size_t size) {
        runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
    }
    void copyout(void *ptr, size_t size) const {
        runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
    }
  public:
    TensorObj(Shape shape, DataType dtype, Runtime runtime);
@ -35,30 +34,74 @@ class TensorObj : public TensorBaseObj {
    size_t getBytes() const { return _size * dtype.getSize(); }
    Shape getDims() const { return shape; }
-    vector<size_t> getStride() const;
+    void setShape(Shape shape_);
    size_t getRank() const { return shape.size(); }
    Shape getStride() const;
    size_t getOffset(const vector<int> &ds) const;
    void dataMalloc();
    UidBaseType getFuid() const { return fuid; }
    bool isWeight() const { return tensorType == TensorType::weight; }
    bool isInput() const { return tensorType == TensorType::input; }
    bool isOutput() const { return tensorType == TensorType::output; }
    bool isOthers() const { return tensorType == TensorType::others; }
    void setWeight() { tensorType = TensorType::weight; }
    void setInput() {
        if (!this->isWeight()) {
            tensorType = TensorType::input;
        }
    }
    void setOutput() {
        if (!this->isWeight()) {
            tensorType = TensorType::output;
        }
    }
    string tensorTypeToString() const {
        switch (tensorType) {
        case TensorType::weight:
            return "weight";
            break;
        case TensorType::input:
            return "input";
            break;
        case TensorType::output:
            return "output";
            break;
        case TensorType::others:
            return "others";
            break;
        default:
            return "unknown tensor type";
            break;
        }
    }
    void load(std::string file_path);
    void save(std::string file_path);
    void copyin(const void *ptr, size_t size) {
        runtime->copyBlobFromCPU(getRawDataPtr<void *>(), ptr, size);
    }
    void copyout(void *ptr, size_t size) const {
        runtime->copyBlobToCPU(ptr, getRawDataPtr<void *>(), size);
    }
    // Copy elements from `data`.
    template <typename T> void copyin(const vector<T> &data) {
-        IT_ASSERT(DataType::get<T>() == dtype);
+        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
-        IT_ASSERT(data.size() >= _size);
+        IT_ASSERT(data.size() == _size);
        copyin(data.data(), getBytes());
    }
    // Copy all the elements to a vector.
    template <typename T> auto copyout() const {
-        IT_ASSERT(DataType::get<T>() == dtype);
+        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
        std::vector<T> ans(_size);
        copyout(ans.data(), getBytes());
        return ans;
    }
    // Copy the element at `pos`.
    template <typename T> auto copyOne(const vector<int> &pos) const {
-        IT_ASSERT(DataType::get<T>() == dtype);
+        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
        auto offset = getOffset(pos);
        auto bytes = dtype.getSize();
        T ans;
@ -70,10 +113,16 @@ class TensorObj : public TensorBaseObj {
    void copyData(const TensorObj *src);
    void copyData(const Tensor &src) { copyData(src.get()); }
    // TODO: Rename this function later, because it is confused that it will
    // change the field data, but actually it generates data and maybe copy to
    // device.
    // FIXME: std::fucntion copies the generator instead of passing it by ref.
    // Thus the internal state of generator cannot be updated.
    void setData(
        std::function<void(void *, size_t, DataType)> const &generator) const;
    void setDataBlob(const Blob &blob);
    Tensor clone() const {
        auto obj = make_ref<TensorObj>(*this);
        obj->freeData();
@ -95,11 +144,16 @@ class TensorObj : public TensorBaseObj {
    }
    void printData() const;
    void dumpData(std::ofstream &ofs) const;
    bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
    template <typename T> bool equalData(const vector<T> &dataVector) {
        IT_ASSERT(DataType::get<T>() == dtype);
        IT_ASSERT(size() == dataVector.size());
        if (dtype == DataType::Float16) {
            return equalDataImpl_fp16(getRawDataPtr<uint16_t *>(),
                                      (float *)dataVector.data(), size());
        }
        IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
        return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
    }
@ -139,19 +193,41 @@ class TensorObj : public TensorBaseObj {
    }
    template <typename T>
-    bool equalDataImpl(const T *a, const T *b, size_t size) const {
+    bool equalDataImpl(const T *a, const T *b, size_t size,
                       double relativeError = 1e-6) const {
        for (size_t i = 0; i < size; ++i) {
            if constexpr (std::is_integral_v<T>) {
                if (a[i] != b[i])
                    return false;
            } else if constexpr (std::is_floating_point_v<T>) {
-                if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
+                if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
-                    1e-6) {
+                    fabs(a[i] - b[i]) > relativeError) {
                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                    return false;
                } else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
                           fabs(a[i] - b[i]) /
                                   std::max(fabs(a[i]), fabs(b[i])) >
                               relativeError) {
                    printf("Error on %lu: %f %f\n", i, a[i], b[i]);
                    return false;
                }
-            } else
+            } else {
                static_assert(!sizeof(T), "Unsupported data type");
            }
        }
        return true;
    }
    bool equalDataImpl_fp16(const uint16_t *a, const float *b,
                            size_t size) const {
        for (size_t i = 0; i < size; ++i) {
            auto a_fp32 = fp16_to_float(a[i]);
            auto b_fp32 = b[i];
            if (fabs(a_fp32 - b_fp32) / std::max(fabs(a_fp32), fabs(b_fp32)) >
                1e-6) {
                printf("Error on %lu: %f %f\n", i, a_fp32, b_fp32);
                return false;
            }
        }
        return true;
    }
@ -172,8 +248,8 @@ class TensorObj : public TensorBaseObj {
    //         // std::cerr << "Init beginned " << std::endl;
    // #pragma omp parallel for
    //         for (size_t i = 0; i < iEnd; ++i)
-    //             data[i] = fastrand(random_seed[omp_get_thread_num() * 16]) %
+    //             data[i] = fastrand(random_seed[omp_get_thread_num() *
-    //             10000;
+    //             16]) % 10000;
    //         // std::cerr << "Init finished" << std::endl;
    //         computed = ComputedFull;
    //         return true;
@ -218,8 +294,8 @@ class TensorObj : public TensorBaseObj {
    //         auto nDim = dims.size();
    //         auto nBroadcastDim = ds.size() - nDim;
    //         for (size_t i = 0; i < nDim; ++i)
-    //             if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim + i] >=
+    //             if (ds[nBroadcastDim + i] < 0 || ds[nBroadcastDim +
-    //             dims[i])
+    //             i] >= dims[i])
    //                 return (size_t)-1;
    //         size_t idx = 0;
    //         for (size_t i = 0; i < nDim; ++i)
@ -278,12 +354,14 @@ class TensorObj : public TensorBaseObj {
    //         return (g_seed >> 16) & 0x7FFF;
    //     }
-    //     std::vector<std::vector<int>> const *getSplittingPoints() const {
+    //     std::vector<std::vector<int>> const *getSplittingPoints()
    //     const {
    //         assert(!splittingPoints.empty());
    //         return &splittingPoints;
    //     }
-    //     bool setSplittingPoints(std::vector<std::vector<int>> value) {
+    //     bool setSplittingPoints(std::vector<std::vector<int>> value)
    //     {
    //         assert(!value.empty());
    //         splittingPoints = value;
    //         return true;
@ -305,7 +383,7 @@ class TensorObj : public TensorBaseObj {
    //     }
    //     void initSplittingPoints() {
-    //     splittingPoints.resize(getDims().size()); }
+    //     splittingPoints.resize(getRank()); }
    //     void printShape();
 };
--- a/include/core/tensor_base.h
+++ b/include/core/tensor_base.h
@ -44,6 +44,7 @@ class TensorBaseObj : public Object {
    }
    DataType getDType() const { return dtype; }
    int getDTypeIndex() const { return dtype.getIndex(); }
    Runtime getRuntime() const { return runtime; }
    //     std::pair<Operator *, int> getOutputOfWithIndex();
--- a/include/core/tensor_type.h
+++ b/include/core/tensor_type.h
@ -0,0 +1,7 @@
 #pragma once
 namespace infini {
 enum class TensorType { weight, input, output, others };
 } // namespace infini
--- a/include/core/workspace.h
+++ b/include/core/workspace.h
@ -0,0 +1,42 @@
 #pragma once
 #include "core/runtime.h"
 namespace infini {
 template <class T> class WorkspaceObj {
  private:
    T workspace;           // workspace pointer
    size_t workspaceSize;  // Size of workspace
    size_t workspaceAlloc; // currently use workspace size
  public:
    WorkspaceObj(T workspace_, size_t workspaceSize_)
        : workspace(workspace_), workspaceSize(workspaceSize_) {
        workspaceAlloc = 0;
    }
    virtual ~WorkspaceObj() {
        // Dealloc workspace in RuntimeObj
        // Set workspace = nullptr here
        workspace = nullptr;
    }
    size_t getWorkspaceSize() const { return workspaceSize; }
    T getWorkspace(size_t size) {
        // Get unused workspace
        IT_ASSERT(size + workspaceAlloc <= workspaceSize);
        auto ret = (T)(static_cast<uint8_t *>(workspace) + workspaceAlloc);
        workspaceAlloc += size;
        return ret;
    }
    T getWorkspace() {
        // Override getWorkspace in order to dealloc in runtime
        return workspace;
    }
    void resetWorkspace() {
        // Reset workspaceAlloc every time end kernel
        workspaceAlloc = 0;
    }
    size_t getWorkspaceAlloc() const { return workspaceAlloc; }
 };
 } // namespace infini
--- a/include/cuda/cuda_attention_kvcache.h
+++ b/include/cuda/cuda_attention_kvcache.h
@ -0,0 +1,17 @@
 #pragma once
 #include "core/common.h"
 #include <cstdio>
 struct AttentionKVCacheMetadata {
    int dimSize[4];
    int stride[4];
 };
 namespace infini {
 void attention_kvcache_kernel(float *input_k_cache, float *input_v_cache,
                              float *input_q, float *input_k, float *input_v,
                              int *position_id, float *output_matmul,
                              const AttentionKVCacheMetadata &compMeta,
                              float *output_O_temp, float *output_sum_temp);
 } // namespace infini
--- a/include/cuda/cuda_common.h
+++ b/include/cuda/cuda_common.h
@ -5,17 +5,13 @@
 #include <cuda_profiler_api.h>
 #include <cudnn.h>
 #include <curand.h>
 #include <memory>
 // TODO: replace with Exception (IT_ASSERT)
 #define checkCudaError(call)                                                   \
-    {                                                                          \
+    if (auto err = call; err != cudaSuccess)                                   \
-        auto err = call;                                                       \
+    throw ::infini::Exception(std::string("[") + __FILE__ + ":" +              \
-        if (cudaSuccess != err) {                                              \
+                              std::to_string(__LINE__) + "] CUDA error (" +    \
-            fprintf(stderr, "Cuda error in %s:%i : %s.\n", __FILE__, __LINE__, \
+                              #call + "): " + cudaGetErrorString(err))
                    cudaGetErrorString(err));                                  \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 #define checkCUresult(call)                                                    \
    {                                                                          \
@ -39,14 +35,10 @@
    }
 #define checkCudnnError(call)                                                  \
-    {                                                                          \
+    if (auto err = call; err != CUDNN_STATUS_SUCCESS)                          \
-        auto err = call;                                                       \
+    throw ::infini::Exception(std::string("[") + __FILE__ + ":" +              \
-        if (CUDNN_STATUS_SUCCESS != err) {                                     \
+                              std::to_string(__LINE__) + "] cuDNN error (" +   \
-            fprintf(stderr, "cuDNN error in %s:%i : %s.\n", __FILE__,          \
+                              #call + "): " + cudnnGetErrorString(err))
                    __LINE__, cudnnGetErrorString(err));                       \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 #define checkCurandError(call)                                                 \
    {                                                                          \
@ -120,4 +112,20 @@ inline const char *curandGetErrorString(curandStatus_t error) {
 using CudaPtr = void *;
 class CUDAStream {
  public:
    CUDAStream(const CUDAStream &) = delete;
    CUDAStream(CUDAStream &&) = delete;
    void operator=(const CUDAStream &) = delete;
    void operator=(CUDAStream &&) = delete;
    static cudaStream_t getCurrentStream() { return _stream; }
    static void Init() { CUDAStream::_stream = 0; };
    static void createStream() { checkCudaError(cudaStreamCreate(&_stream)); }
    static void destroyStream() { checkCudaError(cudaStreamDestroy(_stream)); }
  private:
    CUDAStream(){};
    static cudaStream_t _stream;
 };
 } // namespace infini
--- a/include/cuda/cuda_element_wise.h
+++ b/include/cuda/cuda_element_wise.h
@ -1,8 +1,20 @@
 #pragma once
 namespace infini {
-void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+void div_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
-void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
+                int c2, int c3);
-                int b0, int b1, int b2, int b3, int c0, int c1, int c2, int c3);
+void add_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
                int c2, int c3);
 void pow_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
                int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
                int c2, int c3);
 void less_kernel(int dtypeIndex, void *a, void *b, void *c, int a0, int a1,
                 int a2, int a3, int b0, int b1, int b2, int b3, int c0, int c1,
                 int c2, int c3);
 void div_const_kernel(int dType, void *a, void *b, void *c, size_t n);
 void pow_const_kernel(int dType, void *a, void *b, void *c, size_t n);
 }; // namespace infini
--- a/include/cuda/cuda_expand.h
+++ b/include/cuda/cuda_expand.h
@ -0,0 +1,12 @@
 #pragma once
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
 void expandKernel(int dType, void *input, void *output, int nDims,
                  int outputsize, SmallArray inputShape,
                  SmallArray outputShape);
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                     int row_len);
 }; // namespace infini
--- a/include/cuda/cuda_layernorm.h
+++ b/include/cuda/cuda_layernorm.h
@ -0,0 +1,17 @@
 #pragma once
 #include "operators/unary.h"
 namespace infini {
 void LaynormKernel(const float *input, const float *scale, const float eps,
                   int size, int scaleSize, const int dimsize, const int stride,
                   float *output, const float *bias, int biasSize);
 void LaynormKernel(const float *input, const float *scale, const float eps,
                   int size, int scaleSize, const int dimsize, const int stride,
                   float *output);
 void LaynormKernel(const half *input, const half *scale, const half eps,
                   int size, int scaleSize, const int dimsize, const int stride,
                   half *output, const half *bias, int biasSize);
 void LaynormKernel(const half *input, const half *scale, const half eps,
                   int size, int scaleSize, const int dimsize, const int stride,
                   half *output);
 }; // namespace infini
--- a/include/cuda/cuda_pad_slice.h
+++ b/include/cuda/cuda_pad_slice.h
@ -10,10 +10,11 @@ typedef struct {
    int wholeNDim[MAX_DIM];  // dim size after padding or before slicing
    int partNDim[MAX_DIM];   // dim size before padding or after slicing
    int partStride[MAX_DIM]; // stride before padding or after slicing
    int DType;
 } TransMetaData;
 namespace infini {
-void pad_slice_kernel(float *partData, float *wholeData,
+void pad_slice_kernel(void *partData, void *wholeData,
                      const TransMetaData &metadata, int nDims, int num,
                      bool isPad);
 } // namespace infini
--- a/include/cuda/cuda_rmsnorm.h
+++ b/include/cuda/cuda_rmsnorm.h
@ -0,0 +1,10 @@
 #pragma once
 #include "operators/rms_norm.h"
 namespace infini {
 void rmsnorm_kernel(int dType, void *input, void *weight, void *output,
                    int num_tokens, int hidden_size);
 }; // namespace infini
--- a/include/cuda/cuda_rope.h
+++ b/include/cuda/cuda_rope.h
@ -0,0 +1,12 @@
 #pragma once
 #include "operators/rope.h"
 #include "utils/small_array.h"
 namespace infini {
 void rope_kernel(int dType, int *pos, void *input, void *output, int size,
                 int dim_model, int dim_head, int hidden_stride,
                 int pos_stride);
 }; // namespace infini
--- a/include/cuda/cuda_runtime.h
+++ b/include/cuda/cuda_runtime.h
@ -1,6 +1,9 @@
 #pragma once
 #include "core/runtime.h"
 #include "cuda/cuda_common.h"
 #ifdef INFINI_USE_NCCL
 #include "cuda/nccl_communicator.h"
 #endif
 namespace infini {
@ -8,21 +11,34 @@ class CudaRuntimeObj : public RuntimeObj {
  private:
    cudnnHandle_t cudnn;
    cublasHandle_t cublas;
    std::unique_ptr<CommunicatorObj> comm;
    CudaPtr workspace;
    size_t workspaceSize;
    bool isCudaGraphCreated;
    cudaGraph_t cudaGraph;
    cudaGraphExec_t cudaGraphInstance;
  public:
-    CudaRuntimeObj() : RuntimeObj(Device::CUDA) {
+    explicit CudaRuntimeObj(int deviceId = 0)
        : RuntimeObj(Device::CUDA, deviceId) {
        checkCudaError(cudaSetDevice(deviceId));
        checkCudnnError(cudnnCreate(&cudnn));
        checkCublasError(cublasCreate(&cublas));
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        workspaceSize = 7ll << 30; // 7 GB
        workspace = alloc(workspaceSize);
        isCudaGraphCreated = false;
        CUDAStream::Init();
    }
    virtual ~CudaRuntimeObj() {
        try {
            if (isCudaGraphCreated) {
                checkCudaError(cudaGraphExecDestroy(cudaGraphInstance));
                checkCudaError(cudaGraphDestroy(cudaGraph));
                CUDAStream::destroyStream();
            }
            dealloc(workspace);
            checkCudnnError(cudnnDestroy(cudnn));
            checkCublasError(cublasDestroy(cublas));
@ -69,6 +85,13 @@ class CudaRuntimeObj : public RuntimeObj {
    void runWithoutSync(const Graph &graph) const;
    void runWithCudaGraph(const Graph &graph);
    // init communicator
    void initComm(const string &name, int worldSize, int rank) final;
    CommunicatorObj &getCommunicator() const final { return *comm; }
  private:
    void tune(const Graph &graph, bool profiling) const;
 };
--- a/include/cuda/cuda_softmax.h
+++ b/include/cuda/cuda_softmax.h
@ -0,0 +1,8 @@
 #pragma once
 #include "utils/small_array.h"
 namespace infini {
 void softmax_kernel(int num_blocks, float *input, float *output, int size,
                    int dimsize, int stride);
 void softmax_kernel(int num_blocks, half *input, half *output, int size,
                    int dimsize, int stride);
 } // namespace infini
--- a/include/cuda/cuda_split_concat.h
+++ b/include/cuda/cuda_split_concat.h
@ -3,13 +3,13 @@
 #include <cstdio>
 const int BATCH_SIZE = 32; // parallel tensor number.
-const int DIM_MAX_SIZE = 4;
+const int DIM_MAX_SIZE = 8;
 // Concat operator acts like element tensors composing to one big tensor,and
 // split operator acts like one big tensor being composed by element
 // tensors.
-struct ElementTensorMetadata {
+template <typename T> struct ElementTensorMetadata {
-    float *data[BATCH_SIZE];
+    T *data[BATCH_SIZE];
    int dimBgNo[BATCH_SIZE]; // the dimention begin no of the element tensor in
                             // the composed tensor.
    int dimSize[BATCH_SIZE]; // the dimention size of the element tensor.
@ -20,16 +20,17 @@ struct ElementTensorMetadata {
                   data[i], dimBgNo[i], dimSize[i], nElements[i]);
    }
 };
-
+template <typename T> struct ComposedTensorMetadata {
 struct ComposedTensorMetadata {
    int dimSize[DIM_MAX_SIZE];
    int stride[DIM_MAX_SIZE];
-    float *data;
+    T *data;
 };
 namespace infini {
-void split_concat_kernel(const ElementTensorMetadata &eleMeta,
+void split_concat_kernel(const ElementTensorMetadata<float> &eleMeta,
-                         const ComposedTensorMetadata &compMeta, int dim,
+                         const ComposedTensorMetadata<float> &compMeta, int dim,
                         int batchSize, int nDims, bool isSplit);
 void split_concat_kernel(const ElementTensorMetadata<half> &eleMeta,
                         const ComposedTensorMetadata<half> &compMeta, int dim,
                         int batchSize, int nDims, bool isSplit);
 } // namespace infini
--- a/include/cuda/cuda_transpose.h
+++ b/include/cuda/cuda_transpose.h
@ -0,0 +1,11 @@
 #pragma once
 #include "operators/transpose.h"
 #include "utils/small_array.h"
 namespace infini {
 void transpose_kernel(int dType, void *input, void *output, int nDims, int size,
                      SmallArray strides, SmallArray outputShape);
 }; // namespace infini
--- a/include/cuda/cuda_unary.h
+++ b/include/cuda/cuda_unary.h
@ -3,31 +3,22 @@
 #include "operators/unary.h"
 namespace infini {
-void softmax_kernel(float *input, float *output, int num);
+template <typename T> void softmax_kernel(T *input, T *output, size_t num);
-void relu_kernel(float *input, float *output, int num);
+template <typename T> void relu_kernel(T *input, T *output, size_t num);
-void sigmoid_kernel(float *input, float *output, int num);
+template <typename T> void silu_kernel(T *input, T *output, size_t num);
-void tanh_kernel(float *input, float *output, int num);
+template <typename T> void sigmoid_kernel(T *input, T *output, size_t num);
-void abs_kernel(float *input, float *output, int num);
+template <typename T> void tanh_kernel(T *input, T *output, size_t num);
 template <typename T> void abs_kernel(T *input, T *output, size_t num);
 template <typename T> void sqrt_kernel(T *input, T *output, size_t num);
 template <typename T> void neg_kernel(T *input, T *output, size_t num);
 template <typename T> void gelu_kernel(T *input, T *output, size_t num);
 template <typename T> void erf_kernel(T *input, T *output, size_t num);
 template <typename T> void hard_sigmoid_kernel(T *input, T *output, size_t num);
 template <typename T> void hard_swish_kernel(T *input, T *output, size_t num);
-void unary_kernel(const Operator &_op) {
+template <typename INPUT, typename OUTPUT>
-    auto op = as<UnaryObj>(_op);
+void cast_kernel(INPUT *input, OUTPUT *output, size_t num);
    float *const inputData = (op->getInputs(0)->getRawDataPtr<float *>());
    float *const outputData = (op->getOutput()->getRawDataPtr<float *>());
-    auto dim = op->getInputs(0)->getDims();
+void unary_kernel(const Operator &_op);
    int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
    if (op->getOpType() == OpType::Softmax)
        softmax_kernel(inputData, outputData, n * c * h * w);
    else if (op->getOpType() == OpType::Relu)
        relu_kernel(inputData, outputData, n * c * h * w);
    else if (op->getOpType() == OpType::Sigmoid)
        sigmoid_kernel(inputData, outputData, n * c * h * w);
    else if (op->getOpType() == OpType::Tanh)
        tanh_kernel(inputData, outputData, n * c * h * w);
    else if (op->getOpType() == OpType::Abs)
        abs_kernel(inputData, outputData, n * c * h * w);
    else
        IT_TODO_HALT();
 }
 }; // namespace infini
--- a/include/cuda/cuda_utility.h
+++ b/include/cuda/cuda_utility.h
@ -1,11 +1,29 @@
 #pragma once
 #include "core/tensor.h"
 #include "cuda/cuda_common.h"
 namespace infini {
 void cudaPrintFloat(float *x, int len);
-void cudaPrintTensor(const Tensor &tensor) {
+void cudaPrintTensor(const Tensor &tensor);
    cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
 }
 cudnnDataType_t cudnnDataTypeConvert(DataType dataType);
 cudaDataType cublasDataTypeConvert(DataType);
 template <int index> struct DT_CUDA {};
 template <> struct DT_CUDA<0> { using t = bool; };
 template <> struct DT_CUDA<1> { using t = float; };
 template <> struct DT_CUDA<2> { using t = unsigned char; };
 template <> struct DT_CUDA<3> { using t = char; };
 template <> struct DT_CUDA<4> { using t = unsigned short; };
 template <> struct DT_CUDA<5> { using t = short; };
 template <> struct DT_CUDA<6> { using t = int; };
 template <> struct DT_CUDA<7> { using t = long long; };
 template <> struct DT_CUDA<9> { using t = bool; };
 template <> struct DT_CUDA<10> { using t = half; };
 template <> struct DT_CUDA<11> { using t = double; };
 template <> struct DT_CUDA<12> { using t = unsigned int; };
 template <> struct DT_CUDA<13> { using t = unsigned long long; };
 template <> struct DT_CUDA<16> { using t = nv_bfloat16; };
 } // namespace infini
--- a/include/cuda/cuda_where.h
+++ b/include/cuda/cuda_where.h
@ -0,0 +1,17 @@
 #pragma once
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
 void whereKernel(const float *inputX, const float *inputY,
                 const uint8_t *condition, float *output, int nDims,
                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                 SmallArray conditionShape, SmallArray outputShape, int xSize,
                 int ySize, int cSize);
 void whereKernel(const half *inputX, const half *inputY,
                 const uint8_t *condition, half *output, int nDims,
                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                 SmallArray conditionShape, SmallArray outputShape, int xSize,
                 int ySize, int cSize);
 }; // namespace infini
--- a/include/cuda/gather.h
+++ b/include/cuda/gather.h
@ -1,17 +1,61 @@
 #pragma once
-
+#include "core/data_type.h"
-typedef struct {
+#include "core/operator.h"
-    int *indexValue;
+#include "operators/gather.h"
    int axis;
    int inNDim;
    int outNDim;
    int idxNDim;
    int outDim[4];
    int idxDim[4];
    int idxStride[4];
    int inStride[4];
 } GatherMetaData;
 namespace infini {
-void gather_kernel(float *in, float *out, GatherMetaData metaData, int num);
+struct GatherMetaData {
    // Pointer to indices
    void *indexValue;
    // Type of index values
    DataType indexType;
    // Type of input and output data
    DataType dataType;
    // Axis of the gather operation
    int axis;
    // Rank of input
    int inNDim;
    // Rank of output
    int outNDim;
    // Rank of indices
    int idxNDim;
    // Shape of output
    int outDim[4];
    // Shape of indices
    int idxDim[4];
    // Strides of indices
    int idxStride[4];
    // Strides of input
    int inStride[4];
 };
 inline void initGatherMetaData(GatherMetaData &metaData,
                               const Ref<OperatorObj> &_op) {
    memset(&metaData, 0, sizeof(metaData));
    auto op = as<GatherBaseObj>(_op);
    Ref<TensorObj> in = op->getInputs(0);
    Ref<TensorObj> index = op->getInputs(1);
    Ref<TensorObj> out = op->getOutput();
    metaData.indexValue = index->getRawDataPtr<void *>();
    metaData.indexType = index->getDType();
    metaData.dataType = in->getDType();
    metaData.axis = op->getAxis();
    metaData.inNDim = in->getRank();
    metaData.outNDim = out->getRank();
    metaData.idxNDim = index->getRank();
    for (int i = 0; i < metaData.outNDim; ++i)
        metaData.outDim[i] = out->getDims()[i];
    for (int i = 0; i < metaData.idxNDim; ++i) {
        metaData.idxDim[i] = index->getDims()[i];
        metaData.idxStride[i] = index->getStride()[i];
    }
    for (int i = 0; i < metaData.inNDim; ++i) {
        metaData.inStride[i] = in->getStride()[i];
    }
 }
 template <typename T>
 void gather_kernel(T *in, T *out, GatherMetaData metaData, size_t num);
 void gather_elements_kernel(void *in, void *out, GatherMetaData metaData,
                            size_t num);
 } // namespace infini
--- a/include/cuda/nccl_communicator.h
+++ b/include/cuda/nccl_communicator.h
@ -0,0 +1,70 @@
 #pragma once
 #include "core/communicator.h"
 #include <chrono>
 #include <cstdlib>
 #include <filesystem>
 #include <fstream>
 #include <nccl.h>
 #include <thread>
 #define checkNcclError(call)                                                   \
    {                                                                          \
        auto err = call;                                                       \
        if (ncclSuccess != err) {                                              \
            fprintf(stderr, "NCCL error in %s:%i : %s.\n", __FILE__, __LINE__, \
                    ncclGetErrorString(err));                                  \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 namespace infini {
 class NcclCommunicatorObj final : public CommunicatorObj {
  private:
    ncclComm_t comm;
  public:
    NcclCommunicatorObj(const string &name, int worldSize, int rank)
        : CommunicatorObj(worldSize, rank) {
        const std::string filePath("./" + name + "_nccl_id.bin");
        ncclUniqueId commId;
        if (rank == 0) {
            checkNcclError(ncclGetUniqueId(&commId));
            std::ofstream ofs(filePath, std::ios::binary);
            ofs.write((char *)&commId, sizeof(ncclUniqueId));
        } else {
            auto begin = std::chrono::steady_clock::now();
            while (!std::filesystem::exists(filePath)) {
                auto now = std::chrono::steady_clock::now();
                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
                             "time limit (10s) exceeded.");
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
            std::ifstream ifs(filePath, std::ios::binary);
            ifs.read((char *)&commId, sizeof(ncclUniqueId));
        }
        checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));
        if (rank == 0) {
            std::filesystem::remove(filePath);
        }
    }
    // Get the actual ncclComm_t
    ncclComm_t getNcclComm() { return comm; }
    void finalize() { checkNcclError(ncclCommFinalize(comm)); }
    ~NcclCommunicatorObj() final {
        finalize();
        checkNcclError(ncclCommDestroy(comm));
    }
    virtual string toString() const final {
        std::ostringstream oss;
        oss << "NCCL communicator";
        return oss.str();
    }
 };
 } // namespace infini
--- a/include/cuda/softmax.h
+++ b/include/cuda/softmax.h
@ -1,6 +0,0 @@
 #pragma once
 namespace infini {
 void softmax_kernel(int max_threadblock_size, int batch_size, float *x,
                    float *y, int dim, int stride);
 }
--- a/include/kunlun/kunlun_act_type.h
+++ b/include/kunlun/kunlun_act_type.h
@ -0,0 +1,23 @@
 #include "core/op_type.h"
 #include "kunlun/kunlun_common.h"
 namespace infini {
 using KunlunActType = xdnn::Activation_t;
 KunlunActType parseActType(ActType act) {
    switch (act) {
    case ActType::None:
        return KunlunActType::LINEAR;
    case ActType::Tanh:
        return KunlunActType::TANH;
    case ActType::Sigmoid:
        return KunlunActType::SIGMOID;
    case ActType::Relu:
        return KunlunActType::RELU6;
    default:
        fprintf(stderr, "Activation Type not support yet!\n");
        break;
    }
    return KunlunActType::LINEAR;
 }
 }; // namespace infini
--- a/include/kunlun/kunlun_common.h
+++ b/include/kunlun/kunlun_common.h
@ -0,0 +1,22 @@
 #pragma once
 #include "core/common.h"
 #include "xpu/runtime_ex.h"
 #include "xpu/xdnn.h"
 namespace xdnn = baidu::xpu::api;
 #define checkKUNLUNError(call)                                                 \
    {                                                                          \
        auto err = call;                                                       \
        if (XPU_SUCCESS != err) {                                              \
            fprintf(stderr, "KUNLUN error in %s:%i : %s.\n", __FILE__,         \
                    __LINE__, xpu_strerror(err));                              \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 namespace infini {
 using KUNLUNPtr = void *;
 } // namespace infini
--- a/include/kunlun/kunlun_kernel_without_config.h
+++ b/include/kunlun/kunlun_kernel_without_config.h
@ -0,0 +1,24 @@
 #pragma once
 #include "core/kernel.h"
 #include "kunlun/kunlun_runtime.h"
 namespace infini {
 class KUNLUNKernelWithoutConfig : public Kernel {
  public:
    virtual void compute(const Operator &op, const PerfRecord &record,
                         const RuntimeObj *context) const {
        compute(op, context);
    }
    virtual void compute(const Operator &op,
                         const RuntimeObj *context) const = 0;
    // Premise: op is idempotent since it is called multiple times.
    virtual PerfRecord tune(const Operator &op,
                            const RuntimeObj *_context) const {
        auto context = dynamic_cast<const KUNLUNRuntimeObj *>(_context);
        return make_ref<PerfRecordObj>(timeit([&]() { compute(op, _context); },
                                              [&]() { context->sync(); }));
    }
 };
 } // namespace infini
--- a/include/kunlun/kunlun_runtime.h
+++ b/include/kunlun/kunlun_runtime.h
@ -0,0 +1,81 @@
 #pragma once
 #include "core/runtime.h"
 #include "core/workspace.h"
 #include "kunlun/kunlun_common.h"
 #ifdef INFINI_USE_XCCL
 #include "kunlun/xccl_communicator.h"
 #endif
 namespace infini {
 class KUNLUNRuntimeObj : public RuntimeObj {
  private:
    xdnn::Context *ctx;
    std::unique_ptr<CommunicatorObj> comm;
    // KUNLUNPtr workspace;
    // size_t workspaceSize;
    Workspace<KUNLUNPtr> workspace;
  public:
    KUNLUNRuntimeObj(int deviceId = 0) : RuntimeObj(Device::KUNLUN) {
        xpu_set_device(deviceId);
        ctx = xdnn::create_context();
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
        size_t workspaceSize = 2llu << 30; // 2 GB
        KUNLUNPtr wkspacePtr = alloc(workspaceSize);
        workspace =
            make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
    }
    virtual ~KUNLUNRuntimeObj() {
        KUNLUNPtr wkspacePtr = workspace->getWorkspace();
        dealloc(wkspacePtr);
        xdnn::destroy_context(ctx);
    }
    string toString() const override;
    void run(const Graph &graph, bool tune = false,
             bool profiling = false) const;
    // double runEvaluation(const Graph &graph, int nWarmups,
    //                      int nEvaluations) const;
    void sync() const;
    KUNLUNPtr alloc(size_t size) override {
        void *ptr;
        checkKUNLUNError(
            xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
        return ptr;
    }
    void dealloc(void *ptr) override { xpu_free(ptr); }
    xdnn::Context *KUNLUNHandle() const { return ctx; }
    // Get $size workspace by bytes
    KUNLUNPtr getWorkspace(size_t size) const {
        auto ret = workspace->getWorkspace(size);
        return ret;
    }
    Workspace<KUNLUNPtr> getWorkspaceObj() const { return workspace; }
    void copyBlobFromCPU(void *dst, const void *src,
                         size_t bytes) const override {
        xpu_memcpy(dst, const_cast<void *>(src), bytes,
                   XPUMemcpyKind::XPU_HOST_TO_DEVICE);
    }
    void copyBlobToCPU(void *dst, const void *src,
                       size_t bytes) const override {
        xpu_memcpy(dst, const_cast<void *>(src), bytes,
                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    }
    void copyBlobInsideRuntime(void *dst, const void *src,
                               size_t bytes) const override {
        xpu_memcpy(dst, const_cast<void *>(src), bytes,
                   XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
    }
    void initComm(const string &name, int worldSize, int rank) final;
    CommunicatorObj &getCommunicator() const final { return *comm; }
  private:
    void runWithoutSync(const Graph &graph, bool tune, bool profiling) const;
 };
 } // namespace infini
--- a/include/kunlun/operator_timer.h
+++ b/include/kunlun/operator_timer.h
@ -0,0 +1,10 @@
 #pragma once
 namespace infini {
 namespace opTimer {
 double getPerfConvXdnn(int n, int c, int h, int w, int f, int r, int s,
                       int padh, int padw, int strideh, int stridew,
                       int dilationh, int dilationw, int group,
                       const char *name);
 double getPerfMatmulXdnn(int b, int m, int n, int k, const char *name);
 } // namespace opTimer
 } // namespace infini
--- a/include/kunlun/xccl_communicator.h
+++ b/include/kunlun/xccl_communicator.h
@ -0,0 +1,60 @@
 #pragma once
 #include "core/communicator.h"
 #include "xpu/bkcl.h"
 #include <chrono>
 #include <filesystem>
 #include <fstream>
 #include <thread>
 #define checkXcclError(call)                                                   \
    {                                                                          \
        auto err = call;                                                       \
        if (BKCL_SUCCESS != err) {                                             \
            fprintf(stderr, "XCCL error in %s:%i.\n", __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    }
 namespace infini {
 class XcclCommunicatorObj final : public CommunicatorObj {
  private:
    BKCLContext_t comm;
  public:
    XcclCommunicatorObj(const string &name, int worldSize, int rank)
        : CommunicatorObj(worldSize, rank) {
        const std::string filePath("./" + name + "_xccl_id.bin");
        BKCLUniqueId commId;
        if (rank == 0) {
            checkXcclError(bkcl_get_unique_id(&commId));
            std::ofstream ofs(filePath, std::ios::binary);
            ofs.write((char *)&commId, sizeof(BKCLUniqueId));
        } else {
            auto begin = std::chrono::steady_clock::now();
            while (!std::filesystem::exists(filePath)) {
                auto now = std::chrono::steady_clock::now();
                _IT_ASSERT_2(now < begin + std::chrono::seconds(100),
                             "time limit (100s) exceeded.");
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
            std::ifstream ifs(filePath, std::ios::binary);
            ifs.read((char *)&commId, sizeof(BKCLUniqueId));
        }
        checkXcclError(bkcl_init_rank(&comm, rank, worldSize, &commId));
        if (rank == 0) {
            std::filesystem::remove(filePath);
        }
    }
    BKCLContext_t getXcclComm() { return comm; }
    ~XcclCommunicatorObj() final { checkXcclError(bkcl_destroy_context(comm)); }
    virtual string toString() const final {
        std::ostringstream oss;
        oss << "XCCL communicator";
        return oss.str();
    }
 };
 } // namespace infini
--- a/include/nnet/test.h
+++ b/include/nnet/test.h
@ -24,7 +24,7 @@
 // clang-format on
 namespace nnet {
-int matchExprResult(Derivator &derivator, string fn);
+int matchExprResult(Derivator &derivator, string pathRelativeToProjectHome);
-bool checkExprLogSame(string fnPrefix, int start, int end);
+bool checkExprLogSame(string pathRelativeToProjectHome, int start, int end);
 bool checkExprsEquvivalence(VecExpr exprs);
 } // namespace nnet
--- a/include/operators/G2BMM.h
+++ b/include/operators/G2BMM.h
@ -35,7 +35,7 @@ class G2BMMObj : public OperatorObj {
    OP_CLONE(G2BMMObj);
    std::string toString() const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
--- a/include/operators/GBMM.h
+++ b/include/operators/GBMM.h
@ -33,7 +33,7 @@ class GBMMObj : public OperatorObj {
    OP_CLONE(GBMMObj);
    std::string toString() const override;
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
--- a/include/operators/activation_backward.h
+++ b/include/operators/activation_backward.h
@ -7,7 +7,7 @@ class ActivationBackwardObj : public OperatorObj {
    ActivationBackwardObj(OpType type, GraphObj *graph, Tensor y, Tensor diff_y,
                          Tensor x, Tensor diff_x);
    OP_CLONE(ActivationBackwardObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 3; }
--- a/include/operators/all_gather.h
+++ b/include/operators/all_gather.h
@ -0,0 +1,44 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 /**
 * @brief The AllGather operation gathers N values from k ranks into
 * an output of size k*N, and distributes that result to all ranks.
 * The output is ordered by rank index.
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allgather
 */
 class AllGatherObj : public OperatorObj {
  public:
    /**
     * @brief Construct a new AllGather object
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor from this rank.
     * @param outputs A list of output tensors collected from all ranks.
     * @param world_size Total number of ranks.
     */
    AllGatherObj(GraphObj *graph, Tensor input, std::optional<TensorVec>,
                 int world_size);
    OP_CLONE(AllGatherObj);
    int numInputs() const override { return 1; }
    int numOutputs() const override { return world_size; }
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int getWorldSize() const { return world_size; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override;
  protected:
    int world_size;
 };
 } // namespace infini
--- a/include/operators/all_reduce.h
+++ b/include/operators/all_reduce.h
@ -0,0 +1,75 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 /**
 * @brief The AllReduce operation is performing reductions on data (sum, min,
 * max, avg, or div) across devices and writing the result in the
 * receive buffers of every rank. For example, in an allreduce operation between
 * k ranks and performing a sum, each rank will provide an array Vk of N values,
 * and receive an identical arrays S of N values, where S[i] =
 * V0[i]+V1[i]+…+Vk-1[i].
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce
 */
 class AllReduceBaseObj : public OperatorObj {
  public:
    /**
     * @brief Construct a new AllReduce base object. Should be called by every
     * child class constructor, but not directly.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param opType The operation type. This param is taken care of by child
     * classes.
     * @param input The input tensor from this rank.
     * @param output The output tensor, same size as input.
     */
    AllReduceBaseObj(GraphObj *graph, OpType opType, Tensor input,
                     Tensor output);
    OP_CLONE(AllReduceBaseObj);
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
        return {{inputs[0]->getDims()}};
    };
    std::string toString() const override;
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override {
        return {inputs[0]->getDType()};
    };
 };
 class AllReduceSumObj : public AllReduceBaseObj {
  public:
    AllReduceSumObj(GraphObj *graph, Tensor input, Tensor output);
 };
 class AllReduceProdObj : public AllReduceBaseObj {
  public:
    AllReduceProdObj(GraphObj *graph, Tensor input, Tensor output);
 };
 class AllReduceMinObj : public AllReduceBaseObj {
  public:
    AllReduceMinObj(GraphObj *graph, Tensor input, Tensor output);
 };
 class AllReduceMaxObj : public AllReduceBaseObj {
  public:
    AllReduceMaxObj(GraphObj *graph, Tensor input, Tensor output);
 };
 class AllReduceAvgObj : public AllReduceBaseObj {
  public:
    AllReduceAvgObj(GraphObj *graph, Tensor input, Tensor output);
 };
 } // namespace infini
--- a/include/operators/attention_kvcache.h
+++ b/include/operators/attention_kvcache.h
@ -0,0 +1,43 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 /**
 * @brief Fused Attention with KVCache input operator. All the input and output
 * tensors should have the same rank except for the position_id.
 *
 */
 class AttentionKVCacheObj : public OperatorObj {
    int dim;
  public:
    /**
     * @brief Construct a new AttentionKVCache object.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input_k_cache The k_cache input tensor.
     * @param input_v_cache The v_cache input tensor.
     * @param input_q The query input tensor.
     * @param input_k The key input tensor.
     * @param input_v The value input tensor.
     * @param position_id The positon id of the query,
     * @param output_matmul The query output tensor.
     */
    AttentionKVCacheObj(GraphObj *graph, Tensor input_k_cache,
                        Tensor input_v_cache, Tensor input_q, Tensor input_k,
                        Tensor input_v, Tensor position_id,
                        Tensor output_matmul);
    OP_CLONE(AttentionKVCacheObj);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 6; }
    int numOutputs() const override { return 1; }
    int getDim() const { return dim; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/include/operators/batch_norm.h
+++ b/include/operators/batch_norm.h
@ -34,7 +34,7 @@ class BatchNormObj : public OperatorObj {
                 Tensor var, Tensor scale, Tensor bias, float momentum = 0.9,
                 float eps = 1e-5, bool trainingMode = false);
    OP_CLONE(BatchNormObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    // output size will be 3 when training
--- a/include/operators/broadcast.h
+++ b/include/operators/broadcast.h
@ -0,0 +1,49 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 /**
 * @brief The Broadcast operation copies an N-element buffer on the root rank to
 * all ranks.
 *
 * For more details:
 * https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#broadcast
 */
 class BroadcastObj : public OperatorObj {
  public:
    /**
     * @brief Construct a new Broadcast object.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor. Only root needs to initialize it with
     * data.
     * @param output The output tensor, same size as input.
     * @param root The root rank who performs the broadcast.
     */
    BroadcastObj(GraphObj *graph, Tensor input, Tensor output, int root);
    OP_CLONE(BroadcastObj);
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override {
        return {{inputs[0]->getDims()}};
    };
    std::string toString() const override;
    int getRoot() const { return root; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override {
        return {inputs[0]->getDType()};
    };
  protected:
    // The rank who broadcasts data among this communication group
    int root;
 };
 } // namespace infini
--- a/include/operators/concat.h
+++ b/include/operators/concat.h
@ -22,7 +22,7 @@ class ConcatObj : public OperatorObj {
    ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
    OP_CLONE(ConcatObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return inputs.size(); }
--- a/include/operators/conv.h
+++ b/include/operators/conv.h
@ -142,7 +142,7 @@ class ConvObj : public ConvBaseObj {
            ActType act = ActType::None);
    OP_CLONE(ConvObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int getNumGroups() const override { return c / getChannelPerGroup(); }
  private:
@ -164,7 +164,7 @@ class ConvBackwardFilterObj : public ConvBaseObj {
                          int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                          Tensor bias = nullptr, ActType act = ActType::None);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    ActType getAct() const { return act; }
    int getNumGroups() const override { return c / getChannelPerGroup(); }
@ -191,7 +191,7 @@ class ConvTransposed2dObj : public ConvBaseObj {
                        Tensor bias = nullptr, ActType act = ActType::None);
    OP_CLONE(ConvTransposed2dObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int getNumGroups() const override { return group; }
    std::pair<int, int> getOutputPadding() const { return {oph, opw}; }
@ -218,7 +218,7 @@ class ConvTransposed2dNHWCObj : public ConvBaseObj {
                            Tensor bias = nullptr, ActType act = ActType::None);
    OP_CLONE(ConvTransposed2dNHWCObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int getNumGroups() const override { return group; }
  private:
--- a/include/operators/det.h
+++ b/include/operators/det.h
@ -7,7 +7,7 @@ class DetObj : public OperatorObj {
    enum Mode { NormalDet = 0, LogDet };
    DetObj(GraphObj *graph, Tensor input, Tensor output, Mode mode);
    OP_CLONE(DetObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/dropout.h
+++ b/include/operators/dropout.h
@ -37,7 +37,7 @@ class DropoutObj : public OperatorObj {
    DropoutObj(GraphObj *graph, Tensor data, Tensor output, Tensor mask,
               float ratio, bool training_mode);
    OP_CLONE(DropoutObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/element_wise.h
+++ b/include/operators/element_wise.h
@ -21,7 +21,7 @@ class ElementWiseObj : public OperatorObj {
     */
    ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
                   Tensor output);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 2; }
@ -38,7 +38,7 @@ class MSELossObj : public OperatorObj {
    MSELossObj(GraphObj *graph, Tensor input0, Tensor input1,
               Reduction reduction, Tensor output);
    OP_CLONE(MSELossObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    Reduction getReduction() const { return reductionMode; }
    std::string toString() const override;
@ -65,26 +65,24 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
 DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
 DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
 DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
-DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Maximum)
+DEFINE_ELEMENT_WISE_OBJ(Maximum, OpType::Max)
-DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Minimum)
+DEFINE_ELEMENT_WISE_OBJ(Minimum, OpType::Min)
-DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Power)
+DEFINE_ELEMENT_WISE_OBJ(Power, OpType::Pow)
 DEFINE_ELEMENT_WISE_OBJ(FloorDiv, OpType::FloorDiv)
 DEFINE_ELEMENT_WISE_OBJ(FloorMod, OpType::FloorMod)
 DEFINE_ELEMENT_WISE_OBJ(SquaredDifference, OpType::SquaredDifference)
 DEFINE_ELEMENT_WISE_OBJ(Equal, OpType::Equal)
-DEFINE_ELEMENT_WISE_OBJ(NotEqual, OpType::NotEqual)
+DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::Greater)
-DEFINE_ELEMENT_WISE_OBJ(GreaterThan, OpType::GreaterThan)
+DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterOrEqual)
-DEFINE_ELEMENT_WISE_OBJ(GreaterEqual, OpType::GreaterEqual)
+DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::Less)
-DEFINE_ELEMENT_WISE_OBJ(LessThan, OpType::LessThan)
+DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessOrEqual)
 DEFINE_ELEMENT_WISE_OBJ(LessEqual, OpType::LessEqual)
 DEFINE_ELEMENT_WISE_OBJ(And, OpType::And)
 DEFINE_ELEMENT_WISE_OBJ(Or, OpType::Or)
 DEFINE_ELEMENT_WISE_OBJ(Xor, OpType::Xor)
 DEFINE_ELEMENT_WISE_OBJ(Not, OpType::Not)
-DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitAnd)
+DEFINE_ELEMENT_WISE_OBJ(BitAnd, OpType::BitwiseAnd)
-DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitOr)
+DEFINE_ELEMENT_WISE_OBJ(BitOr, OpType::BitwiseOr)
-DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitXor)
+DEFINE_ELEMENT_WISE_OBJ(BitXor, OpType::BitwiseXor)
-DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitNot)
+DEFINE_ELEMENT_WISE_OBJ(BitNot, OpType::BitwiseNot)
-DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitLeftShift)
+DEFINE_ELEMENT_WISE_OBJ(BitLeftShift, OpType::BitShift)
 DEFINE_ELEMENT_WISE_OBJ(BitRightShift, OpType::BitRightShift)
 }; // namespace infini
--- a/include/operators/expand.h
+++ b/include/operators/expand.h
@ -0,0 +1,36 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 /**
 *  @brief Broadcast the input tensor following the given shape and the
 * broadcast rule.
 *
 */
 class ExpandObj : public OperatorObj {
    Shape dims;
  public:
    /**
     * @brief Construct a new Expand object.
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor.
     * @param output The output tensor.
     * @param dims The shape you want to expand to, following the broadcast
     * rule.
     */
    ExpandObj(GraphObj *graph, Tensor input, Tensor output, Shape dims);
    OP_CLONE(ExpandObj);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
    int numOutputs() const override { return 1; }
    Shape getShape() const { return dims; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/include/operators/extend.h
+++ b/include/operators/extend.h
@ -23,7 +23,7 @@ class ExtendObj : public OperatorObj {
    ExtendObj(GraphObj *graph, Tensor input, Tensor output, int dim,
              int num = 1);
    OP_CLONE(ExtendObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return 1; }
--- a/include/operators/gather.h
+++ b/include/operators/gather.h
@ -3,14 +3,28 @@
 #include "core/operator.h"
 namespace infini {
 class GatherBaseObj : public OperatorObj {
  protected:
    int axis;
  public:
    GatherBaseObj(OpType opType, TensorVec inputs, TensorVec outputs, int axis)
        : OperatorObj(opType, inputs, outputs), axis(axis) {}
    virtual ~GatherBaseObj() {}
    int numInputs() const override { return 2; }
    int numOutputs() const override { return 1; }
    int getAxis() const { return axis; }
 };
 /**
 * @brief Gather and concatenate given positions on a certain dimension of the
 * input tensor using an index tensor.
 *
 */
-class GatherObj : public OperatorObj {
+class GatherObj : public GatherBaseObj {
    int axis;
  public:
    /**
     * @brief Construct a new Gather object.
@ -25,10 +39,7 @@ class GatherObj : public OperatorObj {
              int axis);
    OP_CLONE(GatherObj);
    std::string toString() const override;
-    int numInputs() const override { return 2; }
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    int numOutputs() const override { return 1; }
    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
    int getAxis() const { return axis; }
    vector<DataType> inferDataType(const TensorVec &inputs) const override;
  private:
@ -36,4 +47,33 @@ class GatherObj : public OperatorObj {
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 /**
 * @brief GatherElements takes two inputs data and indices of the
 * same rank r >= 1 and an optional attribute axis that identifies
 * an axis of data.
 *
 */
 class GatherElementsObj : public GatherBaseObj {
  public:
    /**
     * @brief Construct a new GatherElements object.
     *
     * @param graph The computation graph that this operator belongs to.
     * @param input The input tensor.
     * @param indices The index tensor.
     * @param output The output tensor. Same shape as indices.
     * @param axis The axis to gather on.
     */
    GatherElementsObj(GraphObj *graph, Tensor input, Tensor indices,
                      Tensor output, int axis);
    OP_CLONE(GatherElementsObj);
    std::string toString() const override;
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override;
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/include/operators/layer_norm.h
+++ b/include/operators/layer_norm.h
@ -0,0 +1,30 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 class LayerNormObj : public OperatorObj {
    float eps;
    int axis, stash_type;
  public:
    LayerNormObj(GraphObj *graph, Tensor input, Tensor scale, Tensor output,
                 Tensor bias = nullptr, float eps = 1e-5, int axis = -1,
                 int stash_type = 1);
    OP_CLONE(LayerNormObj);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    Tensor getBias() const { return inputs.size() > 2 ? inputs[2] : nullptr; }
    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return outputs.size(); }
    float getEps() const { return eps; }
    int getAxis() const { return axis; }
    int getStashType() const { return stash_type; }
  private:
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
    vector<DataType> inferDataType(const TensorVec &inputs) const override;
 };
 } // namespace infini
--- a/include/operators/lrn.h
+++ b/include/operators/lrn.h
@ -0,0 +1,29 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 class LRNObj : public OperatorObj {
  public:
    LRNObj(GraphObj *graph, Tensor inputX, Tensor inputY, float alpha,
           float beta, float bias, int size);
    OP_CLONE(LRNObj);
    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
    std::string toString() const override;
    int numInputs() const override { return inputs.size(); }
    int numOutputs() const override { return 1; }
    auto getAlphaBetaBias() const {
        return tuple(alpha_value, beta_value, bias_value);
    }
    auto getSize() const { return size_value; }
  private:
    float alpha_value, beta_value, bias_value;
    int size_value;
    vector<int> getWorkloadVector() const override;
    vector<int> getOpAttrVector() const override;
 };
 } // namespace infini
--- a/Show More
+++ b/Show More
		`@ -1 +0,0 @@`
			`Subproject commit d6ac8c8c73bf83833a71b41e95820d4eb7741fa9`
		`@ -0,0 +1 @@`
							`Subproject commit 51d3105277f3774ed31c02ed4cd11fa92925af77`
		`@ -0,0 +1 @@`
							`Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65`